xref: /xnu-12377.61.12/bsd/kern/mem_acct.c (revision 4d495c6e23c53686cf65f45067f79024cf5dcee8)
1 /*
2  * Copyright (c) 2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. Please obtain a copy of the License at
10  * http://www.opensource.apple.com/apsl/ and read it before using this
11  * file.
12  *
13  * The Original Code and all software distributed under the License are
14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18  * Please see the License for the specific language governing rights and
19  * limitations under the License.
20  *
21  * @APPLE_LICENSE_HEADER_END@
22  */
23 
24 #include <kern/cpu_data.h>
25 #include <kern/kalloc.h>
26 #include <kern/locks.h>
27 #include <kern/mem_acct.h>
28 #include <kern/percpu.h>
29 
30 #include <os/atomic_private.h>
31 #include <os/log.h>
32 #include <os/ptrtools.h>
33 
34 #include <sys/mem_acct_private.h>
35 #include <sys/param.h>
36 #include <sys/sysctl.h>
37 
38 #include <net/net_sysctl.h>
39 
40 struct mem_acct {
41 	int64_t _Atomic ma_allocated; /* Amount of memory accounted towards this subsystem (ignore temporary per-CPU accounting from below) */
42 	int32_t *__zpercpu ma_percpu; /* Per-CPU "bounce-buffer" of accounting that will be folded in to `ma_allocated` */
43 	uint64_t ma_hardlimit; /* hard limit that will not be exceeded */
44 	uint8_t ma_percent; /* Percent of hard-limit we should start soft-limiting (if != 100 && != 0) */
45 	uint64_t _Atomic ma_peak;
46 	char ma_name[MEM_ACCT_NAME_LENGTH]; /* Name of the subsystem using this instance of memory-accounting module */
47 };
48 
49 #define MEM_ACCT_PCPU_MAX 1024 * 1024 /* Update global var after 1MB in the per-cpu var */
50 
51 static struct mem_acct *memacct[MEM_ACCT_MAX];
52 
/*
 * Soft limit: the configured percentage of the hard limit
 * (integer arithmetic, rounds down).
 */
static uint64_t
mem_acct_softlimit(uint64_t hardlimit, uint8_t percent)
{
	const uint64_t scaled = hardlimit * percent;

	return scaled / 100;
}
58 
/*
 * Pre-soft limit: `percent` percent of the soft limit, i.e. the hard
 * limit scaled by the percentage twice (rounding down at each step).
 */
static uint64_t
mem_acct_presoftlimit(uint64_t hardlimit, uint8_t percent)
{
	uint64_t threshold = (hardlimit * percent) / 100;   /* soft limit */

	threshold = (threshold * percent) / 100;            /* percent of the soft limit */
	return threshold;
}
64 
65 int
mem_acct_limited(const struct mem_acct * macct)66 mem_acct_limited(const struct mem_acct *macct)
67 {
68 	uint64_t hardlimit;
69 	int64_t allocated;
70 	uint8_t percent;
71 
72 	allocated = os_atomic_load(&macct->ma_allocated, relaxed);
73 	if (allocated < 0) {
74 		return 0;
75 	}
76 
77 	hardlimit = os_access_once(macct->ma_hardlimit);
78 	if (hardlimit && allocated > hardlimit) {
79 		return MEMACCT_HARDLIMIT;
80 	}
81 
82 	percent = os_access_once(macct->ma_percent);
83 	if (percent) {
84 		if (allocated > mem_acct_softlimit(hardlimit, percent)) {
85 			return MEMACCT_SOFTLIMIT;
86 		}
87 
88 		if (allocated > mem_acct_presoftlimit(hardlimit, percent)) {
89 			return MEMACCT_PRESOFTLIMIT;
90 		}
91 	}
92 
93 	return 0;
94 }
95 
/*
 * Account `size` bytes towards `macct`.  `size` may be negative when
 * memory is returned.  The delta is staged in a per-CPU counter and only
 * folded into the shared `ma_allocated` once it exceeds
 * +/-MEM_ACCT_PCPU_MAX, keeping atomics off the common path.
 */
void
_mem_acct_add(struct mem_acct *macct, int size)
{
	int *pcpu;

	/*
	 * Yes, the accounting is not 100% accurate with the per-cpu
	 * "bounce-buffer" storing intermediate results. For example, we may
	 * report "hard-limit" even though all the per-cpu counters may bring us
	 * below the limit. But honestly, we don't care... If we hit hard-limit
	 * the system is gonna be in a bad state anyways until we have given
	 * away enough memory.
	 *
	 * The same counts for softlimit, but softlimit still allows us to
	 * account memory and just makes us a bit more aggressive at freeing
	 * stuff.
	 */

	/* Now, add the size to the per-cpu variable */
	disable_preemption();
	pcpu = zpercpu_get(macct->ma_percpu);
	*pcpu += size;

	/* If we added enough to the pcpu variable, fold it into the global variable */
	if (*pcpu > MEM_ACCT_PCPU_MAX || *pcpu < -MEM_ACCT_PCPU_MAX) {
		int limited, newlimited;
		int64_t allocated;

		/* Sample the limit state before the fold so transitions can be logged below */
		limited = mem_acct_limited(macct);

		allocated = os_atomic_add(&macct->ma_allocated, *pcpu, relaxed);

		/*
		 * Can be temporarily < 0 if the CPU freeing memory hits
		 * MEM_ACCT_PCPU_MAX first.
		 */
		if (allocated > 0) {
			os_atomic_max(&macct->ma_peak, allocated, relaxed);
		}

		newlimited = mem_acct_limited(macct);
		if (limited != newlimited) {
			os_log(OS_LOG_DEFAULT,
			    "memacct: %s goes from %u to %u for its limit",
			    macct->ma_name, limited, newlimited);
		}

		/* Delta has been folded into ma_allocated; restart per-CPU accumulation */
		*pcpu = 0;
	}
	enable_preemption();
}
147 
148 static LCK_GRP_DECLARE(mem_acct_mtx_grp, "mem_acct");
149 static LCK_MTX_DECLARE(mem_acct_mtx, &mem_acct_mtx_grp);
150 
151 struct mem_acct *
mem_acct_register(const char * __null_terminated name,uint64_t hardlimit,uint8_t percent)152 mem_acct_register(const char *__null_terminated name,
153     uint64_t hardlimit, uint8_t percent)
154 {
155 	struct mem_acct *acct = NULL;
156 	int i, index = -1;
157 
158 	if (percent > 100) {
159 		os_log(OS_LOG_DEFAULT,
160 		    "memacct: percentage for softlimit is out-of-bounds\n");
161 		return NULL;
162 	}
163 
164 	lck_mtx_lock(&mem_acct_mtx);
165 
166 	/* Find an empty slot in the accounting array and check for name uniqueness */
167 	for (i = 0; i < MEM_ACCT_MAX; i++) {
168 		if (memacct[i] == NULL) {
169 			if (index == -1) {
170 				index = i;
171 			}
172 
173 			continue;
174 		}
175 
176 		if (strlcmp(memacct[i]->ma_name, name, MEM_ACCT_NAME_LENGTH - 1) == 0) {
177 			os_log(OS_LOG_DEFAULT,
178 			    "memacct: subsystem %s already exists", name);
179 			goto exit;
180 		}
181 	}
182 
183 	if (index == -1) {
184 		os_log(OS_LOG_DEFAULT, "memacct: No space for additional subsystem");
185 		goto exit;
186 	}
187 
188 	memacct[index] = kalloc_type(struct mem_acct, Z_WAITOK_ZERO_NOFAIL);
189 
190 	acct = memacct[index];
191 
192 	strlcpy(acct->ma_name, name, MEM_ACCT_NAME_LENGTH);
193 	acct->ma_hardlimit = hardlimit;
194 	if (percent >= 100) {
195 		os_log(OS_LOG_DEFAULT,
196 		    "memacct: percent is > 100");
197 
198 		memacct[index] = NULL;
199 		kfree_type(struct mem_acct, acct);
200 		acct = NULL;
201 
202 		goto exit;
203 	}
204 	acct->ma_percent = percent;
205 	acct->ma_percpu = zalloc_percpu_permanent_type(int32_t);
206 
207 exit:
208 	lck_mtx_unlock(&mem_acct_mtx);
209 
210 	return acct;
211 }
212 
/*
 *	Memory Accounting sysctl handlers
 */

/*
 * Per-request context handed to the sub-handlers: the decoded operation,
 * the target subsystem slot index, and the originating sysctl request.
 */
struct walkarg {
	int     w_op, w_sub;
	struct sysctl_req *w_req;
};

/* sysctls on a per-subsystem basis */
static int sysctl_subsystem_peak(struct walkarg *w);
static int sysctl_subsystem_soft_limit(struct walkarg *w);
static int sysctl_subsystem_hard_limit(struct walkarg *w);
static int sysctl_subsystem_allocated(struct walkarg *w);
static int sysctl_all_subsystem_statistics(struct walkarg *w);

/* sysctls for all active subsystems */
static int sysctl_all_statistics(struct sysctl_req *);
static int sysctl_mem_acct_subsystems(struct sysctl_req *);

/* Handler function for all Memory Accounting sysctls */
static int sysctl_mem_acct SYSCTL_HANDLER_ARGS;

/* Helper functions */
static void memacct_copy_stats(struct memacct_statistics *s, struct mem_acct *a);

/* kern.memacct: every request below this node is dispatched via sysctl_mem_acct */
SYSCTL_NODE(_kern, OID_AUTO, memacct,
    CTLFLAG_RW | CTLFLAG_LOCKED, sysctl_mem_acct, "Memory Accounting");
241 
242 static int
243 sysctl_mem_acct SYSCTL_HANDLER_ARGS
244 {
245 #pragma unused(oidp)
246 	DECLARE_SYSCTL_HANDLER_ARG_ARRAY(int, 2, name, namelen);
247 	int error = EINVAL;
248 	struct walkarg w;
249 
250 	/* Verify the specified subsystem index is valid */
251 	if (name[1] >= MEM_ACCT_MAX || name[1] < 0) {
252 		return EINVAL;
253 	}
254 
255 	bzero(&w, sizeof(w));
256 	w.w_req = req;
257 	w.w_op = name[0];
258 	w.w_sub = name[1];
259 
260 	switch (w.w_op) {
261 	case MEM_ACCT_PEAK:
262 		error = sysctl_subsystem_peak(&w);
263 		break;
264 	case MEM_ACCT_SOFT_LIMIT:
265 		error = sysctl_subsystem_soft_limit(&w);
266 		break;
267 	case MEM_ACCT_HARD_LIMIT:
268 		error = sysctl_subsystem_hard_limit(&w);
269 		break;
270 	case MEM_ACCT_ALLOCATED:
271 		error = sysctl_subsystem_allocated(&w);
272 		break;
273 	case MEM_ACCT_SUBSYSTEMS:
274 		error = sysctl_mem_acct_subsystems(req);
275 		break;
276 	case MEM_ACCT_ALL_SUBSYSTEM_STATISTICS:
277 		error = sysctl_all_subsystem_statistics(&w);
278 		break;
279 	case MEM_ACCT_ALL_STATISTICS:
280 		error = sysctl_all_statistics(req);
281 		break;
282 	}
283 
284 	return error;
285 }
286 
287 static int
sysctl_subsystem_peak(struct walkarg * w)288 sysctl_subsystem_peak(struct walkarg *w)
289 {
290 	int error;
291 	uint64_t value;
292 	int changed = 0;
293 	struct mem_acct *acct = memacct[w->w_sub];
294 
295 	if (acct == NULL) {
296 		return ENOENT;
297 	}
298 
299 	value = os_atomic_load(&acct->ma_peak, relaxed);
300 	error = sysctl_io_number(w->w_req, value, sizeof(value), &value, &changed);
301 	if (error || !changed) {
302 		return error;
303 	}
304 
305 	os_atomic_store(&acct->ma_peak, value, relaxed);
306 	return 0;
307 }
308 
309 static int
sysctl_subsystem_soft_limit(struct walkarg * w)310 sysctl_subsystem_soft_limit(struct walkarg *w)
311 {
312 	int error;
313 	uint64_t hardlimit, value;
314 	int changed = 0;
315 	struct mem_acct *acct = memacct[w->w_sub];
316 
317 	if (acct == NULL) {
318 		return ENOENT;
319 	}
320 
321 	hardlimit = os_atomic_load(&acct->ma_hardlimit, relaxed);
322 	if (acct->ma_percent) {
323 		value = mem_acct_softlimit(hardlimit, acct->ma_percent);
324 	} else {
325 		value = hardlimit;
326 	}
327 	error = sysctl_io_number(w->w_req, value, sizeof(value), &value, &changed);
328 	if (error || !changed) {
329 		return error;
330 	}
331 
332 	return EPERM;
333 }
334 
335 static int
sysctl_subsystem_hard_limit(struct walkarg * w)336 sysctl_subsystem_hard_limit(struct walkarg *w)
337 {
338 	int error;
339 	uint64_t value;
340 	int changed = 0;
341 	struct mem_acct *acct = memacct[w->w_sub];
342 
343 	if (acct == NULL) {
344 		return ENOENT;
345 	}
346 
347 	value = os_atomic_load(&acct->ma_hardlimit, relaxed);
348 	error = sysctl_io_number(w->w_req, value, sizeof(value), &value, &changed);
349 	if (error || !changed) {
350 		return error;
351 	}
352 
353 	acct->ma_hardlimit = value;
354 	return 0;
355 }
356 
357 static int
sysctl_subsystem_allocated(struct walkarg * w)358 sysctl_subsystem_allocated(struct walkarg *w)
359 {
360 	int64_t value;
361 	struct mem_acct *acct = memacct[w->w_sub];
362 
363 	lck_mtx_lock(&mem_acct_mtx);
364 
365 	if (acct == NULL) {
366 		return ENOENT;
367 	}
368 
369 	value = os_atomic_load(&acct->ma_allocated, relaxed);
370 	zpercpu_foreach(v, acct->ma_percpu) {
371 		value += *v;
372 	}
373 
374 	lck_mtx_unlock(&mem_acct_mtx);
375 
376 	return sysctl_io_number(w->w_req, value, sizeof(value), NULL, NULL);
377 }
378 
379 static int
sysctl_all_subsystem_statistics(struct walkarg * w)380 sysctl_all_subsystem_statistics(struct walkarg *w)
381 {
382 	/* Returns a single memacct_statistics struct for the specified subsystem */
383 	struct memacct_statistics stats = {};
384 	struct mem_acct *acct = memacct[w->w_sub];
385 
386 	lck_mtx_lock(&mem_acct_mtx);
387 
388 	if (acct == NULL) {
389 		return ENOENT;
390 	}
391 
392 	memacct_copy_stats(&stats, acct);
393 
394 	lck_mtx_unlock(&mem_acct_mtx);
395 
396 	return sysctl_io_opaque(w->w_req, &stats, sizeof(stats), NULL);
397 }
398 
399 static int
sysctl_all_statistics(struct sysctl_req * req)400 sysctl_all_statistics(struct sysctl_req *req)
401 {
402 	/* Returns an array of memacct_statistics structs for all active subsystems */
403 	int i, error;
404 	int count = 0;
405 
406 	lck_mtx_lock(&mem_acct_mtx);
407 
408 	for (i = 0; i < MEM_ACCT_MAX; i++) {
409 		if (memacct[i] == NULL) {
410 			break;
411 		}
412 		count++;
413 	}
414 
415 	struct memacct_statistics *memstats = kalloc_data(sizeof(struct memacct_statistics) * count, Z_WAITOK_ZERO_NOFAIL);
416 
417 	for (i = 0; i < count; i++) {
418 		struct mem_acct *acct;
419 		struct memacct_statistics *stats;
420 
421 		acct = memacct[i];
422 		stats = &memstats[i];
423 
424 		memacct_copy_stats(stats, acct);
425 	}
426 
427 	lck_mtx_unlock(&mem_acct_mtx);
428 
429 	error = sysctl_io_opaque(req, memstats, sizeof(struct memacct_statistics) * count, NULL);
430 	if (error) {
431 		kfree_data(memstats, sizeof(struct memacct_statistics) * count);
432 		return error;
433 	}
434 
435 	kfree_data(memstats, sizeof(struct memacct_statistics) * count);
436 	return 0;
437 }
438 
439 static int
sysctl_mem_acct_subsystems(struct sysctl_req * req)440 sysctl_mem_acct_subsystems(struct sysctl_req *req)
441 {
442 	/* Returns an array names for all active subsystems */
443 	int i, j, error;
444 	int count = 0;
445 	int totalCharCount = 0;
446 
447 	lck_mtx_lock(&mem_acct_mtx);
448 
449 	for (i = 0; i < MEM_ACCT_MAX; i++) {
450 		if (memacct[i] == NULL) {
451 			break;
452 		}
453 		count++;
454 	}
455 
456 	char *names = kalloc_data(count * MEM_ACCT_NAME_LENGTH, Z_WAITOK_ZERO_NOFAIL);
457 
458 	for (i = 0; i < count; i++) {
459 		struct mem_acct *acct = memacct[i];
460 		char acct_name[MEM_ACCT_NAME_LENGTH];
461 
462 		strbufcpy(acct_name, acct->ma_name);
463 
464 		for (j = 0; j < MEM_ACCT_NAME_LENGTH; j++) {
465 			names[totalCharCount++] = acct_name[j];
466 		}
467 	}
468 
469 	lck_mtx_unlock(&mem_acct_mtx);
470 
471 	error = sysctl_io_opaque(req, names, sizeof(char) * count * MEM_ACCT_NAME_LENGTH, NULL);
472 	if (error) {
473 		kfree_data(names, sizeof(char) * count * MEM_ACCT_NAME_LENGTH);
474 		return error;
475 	}
476 
477 	kfree_data(names, sizeof(char) * count * MEM_ACCT_NAME_LENGTH);
478 	return 0;
479 }
480 
481 static void
memacct_copy_stats(struct memacct_statistics * s,struct mem_acct * a)482 memacct_copy_stats(struct memacct_statistics *s, struct mem_acct *a)
483 {
484 	s->peak = os_atomic_load(&a->ma_peak, relaxed);
485 	s->allocated = os_atomic_load(&a->ma_allocated, relaxed);
486 	zpercpu_foreach(v, a->ma_percpu) {
487 		s->allocated += *v;
488 	}
489 	if (a->ma_percent) {
490 		s->softlimit = mem_acct_softlimit(a->ma_hardlimit, a->ma_percent);
491 	} else {
492 		s->softlimit = a->ma_hardlimit;
493 	}
494 	s->hardlimit = a->ma_hardlimit;
495 	strbufcpy(s->ma_name, a->ma_name);
496 }
497