1 /*
2 * Copyright (c) 2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24 #include <kern/cpu_data.h>
25 #include <kern/kalloc.h>
26 #include <kern/locks.h>
27 #include <kern/mem_acct.h>
28 #include <kern/percpu.h>
29
30 #include <os/atomic_private.h>
31 #include <os/log.h>
32 #include <os/ptrtools.h>
33
34 #include <sys/mem_acct_private.h>
35 #include <sys/param.h>
36 #include <sys/sysctl.h>
37
38 #include <net/net_sysctl.h>
39
/*
 * Per-subsystem memory-accounting state.  One instance per registered
 * subsystem; created by mem_acct_register() and stored in the memacct[]
 * registry.  Entries are never unregistered.
 */
struct mem_acct {
	int64_t _Atomic ma_allocated; /* Amount of memory accounted towards this subsystem (ignore temporary per-CPU accounting from below) */
	int32_t *__zpercpu ma_percpu; /* Per-CPU "bounce-buffer" of accounting that will be folded in to `ma_allocated` */
	uint64_t ma_hardlimit; /* hard limit that will not be exceeded */
	uint8_t ma_percent; /* Percent of hard-limit we should start soft-limiting (if != 100 && != 0) */
	uint64_t _Atomic ma_peak; /* High-water mark of ma_allocated (updated via os_atomic_max in _mem_acct_add; resettable through sysctl) */
	char ma_name[MEM_ACCT_NAME_LENGTH]; /* Name of the subsystem using this instance of memory-accounting module */
};
48
/*
 * Fold the per-CPU counter into the global variable once its absolute value
 * exceeds 1 MiB.  Parenthesized so the expansion is precedence-safe in any
 * arithmetic context (the unparenthesized `1024 * 1024` relied on `*`
 * binding tighter than the surrounding operators).
 */
#define MEM_ACCT_PCPU_MAX (1024 * 1024)
50
/*
 * Registry of all registered accounting subsystems.  Slots are filled
 * front-to-back by mem_acct_register() under mem_acct_mtx and are never
 * vacated, so the active entries form a contiguous prefix.
 */
static struct mem_acct *memacct[MEM_ACCT_MAX];
52
/*
 * Compute the soft limit: `percent` percent of `hardlimit`.
 *
 * Split the computation so that the multiplication cannot overflow 64 bits
 * for very large hard limits (the naive `(hardlimit * percent) / 100`
 * overflows once hardlimit exceeds UINT64_MAX / percent).  The split is
 * exact: with hardlimit = 100q + r, (hardlimit * percent) / 100
 * == q * percent + (r * percent) / 100.
 */
static uint64_t
mem_acct_softlimit(uint64_t hardlimit, uint8_t percent)
{
	uint64_t quot = hardlimit / 100;
	uint64_t rem = hardlimit % 100;

	return quot * percent + (rem * percent) / 100;
}
58
/*
 * Compute the pre-soft limit: `percent` percent of the soft limit (i.e.
 * percent^2 of the hard limit), the earliest warning threshold reported by
 * mem_acct_limited().
 *
 * Uses the same overflow-free split as mem_acct_softlimit() so the
 * intermediate multiplication cannot wrap for very large limits; the result
 * is identical to `(softlimit * percent) / 100` in exact arithmetic.
 */
static uint64_t
mem_acct_presoftlimit(uint64_t hardlimit, uint8_t percent)
{
	uint64_t soft = mem_acct_softlimit(hardlimit, percent);

	return (soft / 100) * percent + ((soft % 100) * percent) / 100;
}
64
/*
 * Report the most severe limit `macct` currently exceeds:
 * MEMACCT_HARDLIMIT, MEMACCT_SOFTLIMIT, MEMACCT_PRESOFTLIMIT, or 0 when no
 * limit is hit.  Only consults the folded global counter (ma_allocated);
 * deltas still sitting in the per-CPU bounce-buffers are ignored (see the
 * accuracy comment in _mem_acct_add()).
 */
int
mem_acct_limited(const struct mem_acct *macct)
{
	uint64_t hardlimit;
	int64_t allocated;
	uint8_t percent;

	allocated = os_atomic_load(&macct->ma_allocated, relaxed);
	if (allocated < 0) {
		/*
		 * Can be transiently negative when a CPU folds frees before
		 * another folds the matching allocations; never limited then.
		 */
		return 0;
	}

	hardlimit = os_access_once(macct->ma_hardlimit);
	/*
	 * hardlimit == 0 means "no hard limit".  allocated >= 0 here, so the
	 * implicit signed/unsigned conversion in the comparison is safe.
	 */
	if (hardlimit && allocated > hardlimit) {
		return MEMACCT_HARDLIMIT;
	}

	percent = os_access_once(macct->ma_percent);
	if (percent) {
		/* percent != 0 enables the derived soft thresholds */
		if (allocated > mem_acct_softlimit(hardlimit, percent)) {
			return MEMACCT_SOFTLIMIT;
		}

		if (allocated > mem_acct_presoftlimit(hardlimit, percent)) {
			return MEMACCT_PRESOFTLIMIT;
		}
	}

	return 0;
}
95
/*
 * Account `size` bytes towards `macct` (negative sizes account frees).
 * Only the local CPU's bounce-buffer is touched until its magnitude
 * exceeds MEM_ACCT_PCPU_MAX, at which point it is folded into the shared
 * ma_allocated counter.
 */
void
_mem_acct_add(struct mem_acct *macct, int size)
{
	int *pcpu;

	/*
	 * Yes, the accounting is not 100% accurate with the per-cpu
	 * "bounce-buffer" storing intermediate results. For example, we may
	 * report "hard-limit" even though all the per-cpu counters may bring us
	 * below the limit. But honestly, we don't care... If we hit hard-limit
	 * the system is gonna be in a bad state anyways until we have given
	 * away enough memory.
	 *
	 * The same counts for softlimit, but softlimit still allows us to
	 * account memory and just makes us a bit more aggressive at freeing
	 * stuff.
	 */

	/* Now, add the size to the per-cpu variable (preemption disabled so we stay on this CPU) */
	disable_preemption();
	pcpu = zpercpu_get(macct->ma_percpu);
	*pcpu += size;

	/* If we added enough to the pcpu variable, fold it into the global variable */
	if (*pcpu > MEM_ACCT_PCPU_MAX || *pcpu < -MEM_ACCT_PCPU_MAX) {
		int limited, newlimited;
		int64_t allocated;

		/* Sample the limit state before folding so transitions can be logged below */
		limited = mem_acct_limited(macct);

		allocated = os_atomic_add(&macct->ma_allocated, *pcpu, relaxed);

		/*
		 * Can be temporarily < 0 if the CPU freeing memory hits
		 * MEM_ACCT_PCPU_MAX first.
		 */
		if (allocated > 0) {
			/* Track the high-water mark of the folded counter */
			os_atomic_max(&macct->ma_peak, allocated, relaxed);
		}

		newlimited = mem_acct_limited(macct);
		if (limited != newlimited) {
			os_log(OS_LOG_DEFAULT,
			    "memacct: %s goes from %u to %u for its limit",
			    macct->ma_name, limited, newlimited);
		}

		/* Reset the local bounce-buffer now that it has been folded in */
		*pcpu = 0;
	}
	enable_preemption();
}
147
/* Protects the memacct[] registry: registration and the sysctl walkers */
static LCK_GRP_DECLARE(mem_acct_mtx_grp, "mem_acct");
static LCK_MTX_DECLARE(mem_acct_mtx, &mem_acct_mtx_grp);
150
151 struct mem_acct *
mem_acct_register(const char * __null_terminated name,uint64_t hardlimit,uint8_t percent)152 mem_acct_register(const char *__null_terminated name,
153 uint64_t hardlimit, uint8_t percent)
154 {
155 struct mem_acct *acct = NULL;
156 int i, index = -1;
157
158 if (percent > 100) {
159 os_log(OS_LOG_DEFAULT,
160 "memacct: percentage for softlimit is out-of-bounds\n");
161 return NULL;
162 }
163
164 lck_mtx_lock(&mem_acct_mtx);
165
166 /* Find an empty slot in the accounting array and check for name uniqueness */
167 for (i = 0; i < MEM_ACCT_MAX; i++) {
168 if (memacct[i] == NULL) {
169 if (index == -1) {
170 index = i;
171 }
172
173 continue;
174 }
175
176 if (strlcmp(memacct[i]->ma_name, name, MEM_ACCT_NAME_LENGTH - 1) == 0) {
177 os_log(OS_LOG_DEFAULT,
178 "memacct: subsystem %s already exists", name);
179 goto exit;
180 }
181 }
182
183 if (index == -1) {
184 os_log(OS_LOG_DEFAULT, "memacct: No space for additional subsystem");
185 goto exit;
186 }
187
188 memacct[index] = kalloc_type(struct mem_acct, Z_WAITOK_ZERO_NOFAIL);
189
190 acct = memacct[index];
191
192 strlcpy(acct->ma_name, name, MEM_ACCT_NAME_LENGTH);
193 acct->ma_hardlimit = hardlimit;
194 if (percent >= 100) {
195 os_log(OS_LOG_DEFAULT,
196 "memacct: percent is > 100");
197
198 memacct[index] = NULL;
199 kfree_type(struct mem_acct, acct);
200 acct = NULL;
201
202 goto exit;
203 }
204 acct->ma_percent = percent;
205 acct->ma_percpu = zalloc_percpu_permanent_type(int32_t);
206
207 exit:
208 lck_mtx_unlock(&mem_acct_mtx);
209
210 return acct;
211 }
212
213 /*
214 * Memory Accounting sysctl handlers
215 */
216
/* Arguments threaded through the per-subsystem sysctl helper functions */
struct walkarg {
	int w_op, w_sub; /* w_op: MEM_ACCT_* operation selector; w_sub: index into memacct[] */
	struct sysctl_req *w_req; /* the sysctl request to answer */
};
221
222 /* sysctls on a per-subsystem basis */
223 static int sysctl_subsystem_peak(struct walkarg *w);
224 static int sysctl_subsystem_soft_limit(struct walkarg *w);
225 static int sysctl_subsystem_hard_limit(struct walkarg *w);
226 static int sysctl_subsystem_allocated(struct walkarg *w);
227 static int sysctl_all_subsystem_statistics(struct walkarg *w);
228
229 /* sysctls for all active subsystems */
230 static int sysctl_all_statistics(struct sysctl_req *);
231 static int sysctl_mem_acct_subsystems(struct sysctl_req *);
232
233 /* Handler function for all Memory Accounting sysctls */
234 static int sysctl_mem_acct SYSCTL_HANDLER_ARGS;
235
236 /* Helper functions */
237 static void memacct_copy_stats(struct memacct_statistics *s, struct mem_acct *a);
238
239 SYSCTL_NODE(_kern, OID_AUTO, memacct,
240 CTLFLAG_RW | CTLFLAG_LOCKED, sysctl_mem_acct, "Memory Accounting");
241
242 static int
243 sysctl_mem_acct SYSCTL_HANDLER_ARGS
244 {
245 #pragma unused(oidp)
246 DECLARE_SYSCTL_HANDLER_ARG_ARRAY(int, 2, name, namelen);
247 int error = EINVAL;
248 struct walkarg w;
249
250 /* Verify the specified subsystem index is valid */
251 if (name[1] >= MEM_ACCT_MAX || name[1] < 0) {
252 return EINVAL;
253 }
254
255 bzero(&w, sizeof(w));
256 w.w_req = req;
257 w.w_op = name[0];
258 w.w_sub = name[1];
259
260 switch (w.w_op) {
261 case MEM_ACCT_PEAK:
262 error = sysctl_subsystem_peak(&w);
263 break;
264 case MEM_ACCT_SOFT_LIMIT:
265 error = sysctl_subsystem_soft_limit(&w);
266 break;
267 case MEM_ACCT_HARD_LIMIT:
268 error = sysctl_subsystem_hard_limit(&w);
269 break;
270 case MEM_ACCT_ALLOCATED:
271 error = sysctl_subsystem_allocated(&w);
272 break;
273 case MEM_ACCT_SUBSYSTEMS:
274 error = sysctl_mem_acct_subsystems(req);
275 break;
276 case MEM_ACCT_ALL_SUBSYSTEM_STATISTICS:
277 error = sysctl_all_subsystem_statistics(&w);
278 break;
279 case MEM_ACCT_ALL_STATISTICS:
280 error = sysctl_all_statistics(req);
281 break;
282 }
283
284 return error;
285 }
286
287 static int
sysctl_subsystem_peak(struct walkarg * w)288 sysctl_subsystem_peak(struct walkarg *w)
289 {
290 int error;
291 uint64_t value;
292 int changed = 0;
293 struct mem_acct *acct = memacct[w->w_sub];
294
295 if (acct == NULL) {
296 return ENOENT;
297 }
298
299 value = os_atomic_load(&acct->ma_peak, relaxed);
300 error = sysctl_io_number(w->w_req, value, sizeof(value), &value, &changed);
301 if (error || !changed) {
302 return error;
303 }
304
305 os_atomic_store(&acct->ma_peak, value, relaxed);
306 return 0;
307 }
308
309 static int
sysctl_subsystem_soft_limit(struct walkarg * w)310 sysctl_subsystem_soft_limit(struct walkarg *w)
311 {
312 int error;
313 uint64_t hardlimit, value;
314 int changed = 0;
315 struct mem_acct *acct = memacct[w->w_sub];
316
317 if (acct == NULL) {
318 return ENOENT;
319 }
320
321 hardlimit = os_atomic_load(&acct->ma_hardlimit, relaxed);
322 if (acct->ma_percent) {
323 value = mem_acct_softlimit(hardlimit, acct->ma_percent);
324 } else {
325 value = hardlimit;
326 }
327 error = sysctl_io_number(w->w_req, value, sizeof(value), &value, &changed);
328 if (error || !changed) {
329 return error;
330 }
331
332 return EPERM;
333 }
334
335 static int
sysctl_subsystem_hard_limit(struct walkarg * w)336 sysctl_subsystem_hard_limit(struct walkarg *w)
337 {
338 int error;
339 uint64_t value;
340 int changed = 0;
341 struct mem_acct *acct = memacct[w->w_sub];
342
343 if (acct == NULL) {
344 return ENOENT;
345 }
346
347 value = os_atomic_load(&acct->ma_hardlimit, relaxed);
348 error = sysctl_io_number(w->w_req, value, sizeof(value), &value, &changed);
349 if (error || !changed) {
350 return error;
351 }
352
353 acct->ma_hardlimit = value;
354 return 0;
355 }
356
357 static int
sysctl_subsystem_allocated(struct walkarg * w)358 sysctl_subsystem_allocated(struct walkarg *w)
359 {
360 int64_t value;
361 struct mem_acct *acct = memacct[w->w_sub];
362
363 lck_mtx_lock(&mem_acct_mtx);
364
365 if (acct == NULL) {
366 return ENOENT;
367 }
368
369 value = os_atomic_load(&acct->ma_allocated, relaxed);
370 zpercpu_foreach(v, acct->ma_percpu) {
371 value += *v;
372 }
373
374 lck_mtx_unlock(&mem_acct_mtx);
375
376 return sysctl_io_number(w->w_req, value, sizeof(value), NULL, NULL);
377 }
378
379 static int
sysctl_all_subsystem_statistics(struct walkarg * w)380 sysctl_all_subsystem_statistics(struct walkarg *w)
381 {
382 /* Returns a single memacct_statistics struct for the specified subsystem */
383 struct memacct_statistics stats = {};
384 struct mem_acct *acct = memacct[w->w_sub];
385
386 lck_mtx_lock(&mem_acct_mtx);
387
388 if (acct == NULL) {
389 return ENOENT;
390 }
391
392 memacct_copy_stats(&stats, acct);
393
394 lck_mtx_unlock(&mem_acct_mtx);
395
396 return sysctl_io_opaque(w->w_req, &stats, sizeof(stats), NULL);
397 }
398
399 static int
sysctl_all_statistics(struct sysctl_req * req)400 sysctl_all_statistics(struct sysctl_req *req)
401 {
402 /* Returns an array of memacct_statistics structs for all active subsystems */
403 int i, error;
404 int count = 0;
405
406 lck_mtx_lock(&mem_acct_mtx);
407
408 for (i = 0; i < MEM_ACCT_MAX; i++) {
409 if (memacct[i] == NULL) {
410 break;
411 }
412 count++;
413 }
414
415 struct memacct_statistics *memstats = kalloc_data(sizeof(struct memacct_statistics) * count, Z_WAITOK_ZERO_NOFAIL);
416
417 for (i = 0; i < count; i++) {
418 struct mem_acct *acct;
419 struct memacct_statistics *stats;
420
421 acct = memacct[i];
422 stats = &memstats[i];
423
424 memacct_copy_stats(stats, acct);
425 }
426
427 lck_mtx_unlock(&mem_acct_mtx);
428
429 error = sysctl_io_opaque(req, memstats, sizeof(struct memacct_statistics) * count, NULL);
430 if (error) {
431 kfree_data(memstats, sizeof(struct memacct_statistics) * count);
432 return error;
433 }
434
435 kfree_data(memstats, sizeof(struct memacct_statistics) * count);
436 return 0;
437 }
438
439 static int
sysctl_mem_acct_subsystems(struct sysctl_req * req)440 sysctl_mem_acct_subsystems(struct sysctl_req *req)
441 {
442 /* Returns an array names for all active subsystems */
443 int i, j, error;
444 int count = 0;
445 int totalCharCount = 0;
446
447 lck_mtx_lock(&mem_acct_mtx);
448
449 for (i = 0; i < MEM_ACCT_MAX; i++) {
450 if (memacct[i] == NULL) {
451 break;
452 }
453 count++;
454 }
455
456 char *names = kalloc_data(count * MEM_ACCT_NAME_LENGTH, Z_WAITOK_ZERO_NOFAIL);
457
458 for (i = 0; i < count; i++) {
459 struct mem_acct *acct = memacct[i];
460 char acct_name[MEM_ACCT_NAME_LENGTH];
461
462 strbufcpy(acct_name, acct->ma_name);
463
464 for (j = 0; j < MEM_ACCT_NAME_LENGTH; j++) {
465 names[totalCharCount++] = acct_name[j];
466 }
467 }
468
469 lck_mtx_unlock(&mem_acct_mtx);
470
471 error = sysctl_io_opaque(req, names, sizeof(char) * count * MEM_ACCT_NAME_LENGTH, NULL);
472 if (error) {
473 kfree_data(names, sizeof(char) * count * MEM_ACCT_NAME_LENGTH);
474 return error;
475 }
476
477 kfree_data(names, sizeof(char) * count * MEM_ACCT_NAME_LENGTH);
478 return 0;
479 }
480
/*
 * Fill `s` with a point-in-time snapshot of `a`: peak, allocated
 * (including per-CPU deltas not yet folded into ma_allocated), soft
 * limit (reported as the hard limit when ma_percent == 0, i.e. soft
 * limiting disabled), hard limit, and name.  All current callers hold
 * mem_acct_mtx while calling this.
 */
static void
memacct_copy_stats(struct memacct_statistics *s, struct mem_acct *a)
{
	s->peak = os_atomic_load(&a->ma_peak, relaxed);
	s->allocated = os_atomic_load(&a->ma_allocated, relaxed);
	zpercpu_foreach(v, a->ma_percpu) {
		/* fold in per-CPU deltas that have not yet hit ma_allocated */
		s->allocated += *v;
	}
	if (a->ma_percent) {
		s->softlimit = mem_acct_softlimit(a->ma_hardlimit, a->ma_percent);
	} else {
		/* no soft limiting configured: report the hard limit */
		s->softlimit = a->ma_hardlimit;
	}
	s->hardlimit = a->ma_hardlimit;
	strbufcpy(s->ma_name, a->ma_name);
}
497