1 /*
2 * Copyright (c) 2000-2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /* Guard header includes, so that the userspace test can include this file. */
30 #include <os/atomic_private.h>
31 #ifndef VM_MTE_FF_VERIFY
32 #include <debug.h>
33 #include <mach_assert.h>
34
35 #include <kern/bits.h>
36 #include <kern/kcdata.h>
37 #include <kern/queue.h>
38
39 #include <mach/sdt.h>
40
41 #include <vm/pmap.h>
42 #include <vm/vm_compressor_internal.h>
43 #include <vm/vm_kern.h>
44 #include <vm/vm_object_internal.h>
45 #include <vm/vm_page_internal.h>
46 #include <vm/vm_pageout.h>
47 #include <vm/vm_mteinfo_internal.h>
48
49 extern lck_grp_t vm_page_lck_grp_bucket;
50
51 #endif /* VM_MTE_FF_VERIFY */
52 #pragma mark Documentation
53 #if HAS_MTE
54
55 /*
56 * VM MTE Info
57 * ===========
58 *
59 * The top level goal of this code is to implement the policies managing the
60 * selection of tag storage pages on the system, in order to:
61 * - Minimize the number of live tag storage pages at any given time;
62 * - Maximize occupancy (the number of covered pages using MTE compared to tag
63 * storage pages actually being used for tag storage).
64 *
65 *
66 * Physical Memory Layout
67 * ----------------------
68 *
69 * The diagram below describes the general layout of the physical memory. iBoot
70 * will determine the placement of the tag storage region, at the end of the
71 * managed address space.
72 *
73 * As a result, the tag storage space is always part of the vm_pages array.
74 * However, several things should be noted:
75 *
76 * - The last tag storage pages cover unmanaged DRAM at the end of physical
77 * memory, as well as the tag storage space itself, and will never be used as
78 * tag storage memory by the system (the unmanaged space will not be MTE'd,
79 * and the tag storage space will never itself use MTE).
80 *
81 * - The first tag storage pages also cover unmanaged DRAM space at the
82 * beginning of physical memory, but might be used for tagging due to early
83 * boot code. However, these first tag storage pages will not be used for
84 * tag storage space dynamically by the system.
85 *
86 * - The beginning of the tag region space is always aligned to a 32 page
87 * boundary; however the start of the vm_pages array is not. As a result,
88 * there is a cluster of 32 pages that possibly crosses this boundary. This
89 * is relevant because dynamic tag storage management only functions for
90 * taggable pages inside the vm_pages array.
91 *
92 *
93 * ┌────────────┐─╮
94 * │ P_n+31 │ │
95 * ├────────────┤ │
96 * ╎ ... ╎ │
97 * ├────────────┤ │
98 * │ P_n │ │
99 * ├────────────┤─╯
100 * │ │
101 * ╎ ╎
102 * ╎ ... ╎
103 * ╎ ╎
104 * │ │
105 * mte_tag_storage_end ─ ─ ─├────────────┤ ─ ─ ─ vm_pages_end
106 * ┬ │TTTTTTTTTTTT│ Tag storage for pages [n:n+31]
107 * │ ├────────────┤
108 * │ │ │
109 * │ ╎ ... ╎
110 * │ │ │
111 * │ ├────────────┤
112 * 1/32 │ │TTTTTTTTTTTT│ Tag storage for pages [i:i+31]
113 * of DRAM │ ├────────────┤
114 * │ │ │
115 * │ ╎ ... ╎
116 * │ │ │
117 * │ ├────────────┤
118 * │ │TTTTTTTTTTTT│ Tag storage for pages [32:63]
119 * │ ├────────────┤
120 * ┴ │TTTTTTTTTTTT│ Tag storage for pages [0:31]
121 * mte_tag_storage_start ─ ─ ─├────────────┤─╮
122 * │ P_i+31 │ │
123 * ├────────────┤ │
124 * ╎ ... ╎ │
125 * ├────────────┤ │
126 * │ P_i │ │
127 * ├────────────┤─╯
128 * │ │
129 * ╎ ╎
130 * ╎ ... ╎
131 * ╎ ╎
132 * │ │
133 * ├────────────┤─╮
134 * │ │ │
135 * ╎ ... ╎ │
136 * ├────────────┤ │ ─ ─ vm_pages
137 * ╎ ... ╎ │
138 * │ │ │
 *                           ├────────────┤─╯
140 * │ │
141 * ╎ ╎
142 * ╎ ... ╎
143 * ╎ ╎
144 * │ │
145 * ├────────────┤─╮
146 * │ P_31 │ │
147 * ├────────────┤ │
148 * ╎ ... ╎ │
149 * ├────────────┤ │
150 * │ P_0 │ │
151 * pmap_first_pnum ─ ─└────────────┘─╯ ─ ─ gDramBase
152 * Physical Memory
153 *
154 *
155 * Tag storage and cells
156 * ~~~~~~~~~~~~~~~~~~~~~
157 *
158 * Tag storage pages require metadata to track their state machine, in order to
159 * not grow the vm_page_t data structure for all pages on the system when only
160 * 1/32 of them are tag storage.
161 *
162 * The metadata is stored into a data structure called the MTE cell
163 * (@see cell_t) which is queued into the so called MTE Info data structure
164 * (@see @c mte_info_lists).
165 *
166 * The documentation of this file happily calls a cell a tag storage page and
167 * vice versa as result, since the mapping is 1:1.
168 *
169 *
170 * Tag storage state machine
171 * ~~~~~~~~~~~~~~~~~~~~~~~~~
172 *
173 * Disabled is a special state: this is the state cells start in,
174 * and never transition back to unless there is an ECC error.
175 *
176 * The state diagram involving "Disabled" looks like this:
177 *
178 * ╭──────────────╮ ╭───╴K.3╶──╮ ╔══════════════╗
179 * │ RECLAIMING ┼───╮ │ v ╭───>║ ACTIVE ║
180 * ╰──────────────╯ K.1 ╔═╪════════════╗ I.1 ╚══════════════╝
181 * ├───>║ DISABLED ╫───┤
182 * ╔═════════════╗ K.2 ╚══════════════╝ I.2 ╔══════════════╗
183 * ║ CLAIMED ╫───╯ ^ ^ ╰───>║ INACTIVE ║
184 * ╚═══════════╪═╝ │ │ ╚═╪════════════╝
185 * ╰────╴U.1╶───╯ ╰───╴U.2╶────╯
186 *
187 * ╔═╗ Double bar square boxes ╭─╮ Single bar round boxes
188 * ╚═╝ denote stable states. ╰─╯ denote transitionary states.
189 *
190 *
191 * Initialization (I.1, I.2)
192 *
193 * This is performed by mteinfo_tag_storage_release_startup()
194 * This function might decide to leave pages as disabled.
195 *
196 * Unmanaged discovery (U.1, U.2)
197 *
198 * This is performed at lockdown by mteinfo_tag_storage_unmanaged_discover()
199 * to discover tag storage that covers pages that will never have a canonical
200 * vm_page_t made for them, which are effectively unmanaged.
201 *
202 * Retirement (K.1, K.2, K.3)
203 *
204 * This is performed by mteinfo_tag_storage_set_retired(),
205 * itself called by vm_page_retire() which can only happen
206 * for pages that were never created (the cell will be DISABLED),
207 * or on the tag storage claimed page free path (the cell
208 * will either be RECLAIMING or CLAIMED).
209 *
210 *
211 * The rest of the tag storage state machine looks like this:
212 *
213 * ╭──────────────╮
214 * ╭────╴D.2╶───┼ DEACTIVATING │<───╴D.1╶────╮
215 * │ a ╰──────────────╯ a │
216 * v │
217 * ╔══════════════╗ ╭──────────────╮ ╔═╪════════════╗
218 * ║ INACTIVE ╫──╴A.1╶──>│ ACTIVATING ┼───╴A.2╶──>║ ACTIVE ║<─╮
219 * ╚════════════╪═╝ i/a ╰──────────────╯ i/a ╚══════════════╝ │
220 * ^ │ │
221 * │ │ │
222 * │ │ ╔════════════╗ │
223 * │ │ ╭───╴B.2╶───╫ PINNED ║<───╴B.1╶───╮ │
224 * │ │ │ i ╚════════════╝ a │ R.2
225 * │ │ │ │ a
226 * │ │ │ ╭─────╴R.x╶─────╮ │ │
227 * │ │ v v a │ │ │
228 * │ │ ╔═════════════╗ ╭─┼──────────┼─╮ │
229 * │ ╰────╴C.1╶──>║ CLAIMED ╫────╴R.1╶──>│ RECLAIMING ┼──╯
230 * │ i ╚═╪═══════════╝ a ╰─┼────────────╯
231 * │ │ │
232 * ╰──────────╴F.1╶──────────╯<─────────╴F.2╶───────────╯
233 * i i
234 *
235 * ╔═╗ Double bar square boxes ╭─╮ Single bar round boxes
236 * ╚═╝ denote stable states. ╰─╯ denote transitionary states.
237 *
238 * a the transition can be done by the refill thread (async)
239 * i the transition can be done inline by any thread.
240 *
241 *
242 * Activation (A.1, A.2)
243 *
244 * [A.1 inline] is performed by mteinfo_tag_storage_try_activate() by
245 * vm_page_grab_slow() if the current grab would deplete the taggable
246 * space too much and that there seem to be an ample reserve of free
247 * pages.
248 *
249 * This path however will limit itself to pages that are really worth
250 * activating (17+ free associated pages, which coincide with the first 3
251 * mteinfo buckets for MTE_STATE_INACTIVE).
252 *
253 *
254 * [A.1 async] is performed by mteinfo_tag_storage_active_refill() when it
255 * decides that activating pages is the best strategy to get more taggable
256 * pages. It will only do so if [R.1 async] isn't more profitable.
257 *
258 *
259 * [A.2 inline/async] is performed by mteinfo_tag_storage_activate_locked()
260 * on the results of [A.1 inline/async]. The most notable thing to mention
261 * is until the tag pages are fully activated, no tagged page can be
262 * allocated, and if the thread doing this operation inline is a low priority
263 * thread, this could cause starvation due to priority inversions.
264 *
265 * To prevent this issue, turnstiles are used for the inline case so that
266 * there's a single activator at a time with priority inversion avoidance.
267 * The async path doesn't use this as it is a very high priority thread,
268 * and is meant to run in case of emergencies.
269 *
270 *
271 * Deactivation (D.1, D.2)
272 *
273 * [D.1 async] is performed by mteinfo_tag_storage_drain(). The refill
274 * thread will invoke this function after it is done with activations.
275 *
276 * This phase will only drain active(0.0) pages, meaning pages that are active
277 * but have no free pages associated with it nor MTE pages. Having such pages
278 * on the system is a sign of untagged memory pressure, and it's probably
279 * a good idea to free that tag storage page so it can be used for untagged
280 * purposes (i.e., become claimed).
281 *
282 * It will drain pages until the @c mte_claimable_queue has a healthy level.
283 *
284 * This transition is triggered lazily from the @c mteinfo_free_queue_grab()
285 * path when untagged pages have been allocated and tapped into the taggable
286 * space, and that system conditions permit
287 * (see @c mteinfo_tag_storage_should_drain()).
288 *
289 * [D.2 async] is performed by mteinfo_tag_storage_drain_flush(),
290 * which is called by mteinfo_tag_storage_drain() on the results
291 * of [D.1 async]
292 *
293 *
294 * Allocation/Claiming (C.1)
295 *
296 * [C.1 inline] is performed by @c mteinfo_tag_storage_claimable_refill()
297 * from the context of any @c mteinfo_free_queue_grab() (tagged or regular).
298 * The path will opportunistically determine that there are enough pages
299 * on the @c mte_claimable_queue that amortizing the cost of taking
300 * the spinlock protecting the per-cpu queue is worth it.
301 *
302 * It is done unconditionally otherwise, as the reclaim thread can steal
303 * from these queues. The @c vm_page_grab_options() fastpath knows how
304 * to draw from this directly.
305 *
306 *
307 * Freeing (F.1, F.2)
308 *
309 * [F.1 inline] is performed by page free paths who eventually call into
310 * @c vm_page_free_queue_enter(VM_MEMORY_CLASS_TAG_STORAGE).
311 *
312 * [F.2 inline] is the exact same transition but for the case when the refill
313 * thread was attempting to reclaim this page (it had performed [R.1 async]).
 *   It is worth noting that on paper, the [C.1 inline] transition could happen
315 * again before the refill thread notices.
316 *
317 *
318 * Reclaiming (R.1, R.2, R.x, B.1, B.2)
319 *
320 * [R.1 async] is performed by mteinfo_tag_storage_active_refill() when it
321 * decides that reclaiming (stealing) pages is the best strategy to get more
322 * taggable pages. It will only do so if [A.1 async] isn't more profitable.
323 *
324 * Once pages have been marked as reclaiming, it will attempt to either steal
325 * the page from the cpu free queue, or attempt a relocation.
326 *
327 * [R.2 async] is exactly the same as [A.2 async], being performed by
328 * mteinfo_tag_storage_activate_locked() on the results of [R.1 async].
329 * The major difference however is that it is done one page at a time.
330 *
331 * [B.1 async] is performed by @c mteinfo_reclaim_tag_storage_page() when
 *   the relocation of a claimed page failed due to the page being pinned.
333 * In which case, the tag storage page is marked with @c vmp_ts_wanted bit.
334 *
335 * [B.2 inline] is performed by @c mteinfo_tag_storage_wakeup() when threads
336 * notice that @c vmp_ts_wanted is set and that the condition causing it to be
337 * set has cleared.
338 *
339 * [R.x async] is performed when stealing the page was otherwise not
340 * successful (in @c mteinfo_reclaim_tag_storage_page() or
341 * @c mteinfo_tag_storage_flush_reclaiming()).
342 */
343
344
345 #pragma mark Types
346
/*!
 * @typedef cell_state_mask_t
 *
 * @abstract
 * Mask/bit-field version of the @c mte_cell_state_t bit in order to do assertions.
 *
 * @discussion
 * Each mask value is the bit at the position of the corresponding
 * @c mte_cell_state_t value, so that membership of a state in a set
 * of states can be checked with a single bitwise AND.
 */
__options_decl(cell_state_mask_t, uint32_t, {
	MTE_MASK_DISABLED = BIT(MTE_STATE_DISABLED),
	MTE_MASK_PINNED = BIT(MTE_STATE_PINNED),
	MTE_MASK_DEACTIVATING = BIT(MTE_STATE_DEACTIVATING),
	MTE_MASK_CLAIMED = BIT(MTE_STATE_CLAIMED),
	MTE_MASK_INACTIVE = BIT(MTE_STATE_INACTIVE),
	MTE_MASK_RECLAIMING = BIT(MTE_STATE_RECLAIMING),
	MTE_MASK_ACTIVATING = BIT(MTE_STATE_ACTIVATING),
	MTE_MASK_ACTIVE = BIT(MTE_STATE_ACTIVE),
});

#define MTE_FF_CELL_INDEX_BITS 24 /* Number of bits for a cell index */
#define MTE_FF_CELL_PAGE_COUNT_BITS 6 /* Number of bits for a page count */
#define MTE_FF_CELL_STATE_BITS 3 /* Number of bits for a cell state */
367
368 /*!
369 * @typedef cell_idx_t
370 *
371 * @abstract
372 * Represents the index of a cell in the cell array (when positive), or a queue
373 * head (when negative).
374 *
375 * @discussion
376 * This type only has @c MTE_FF_CELL_INDEX_BITS worth of significant bits.
377 * Given that one bit is used to denote queues, it means we can support systems
378 * with up to:
379 * - 2^(MTE_FF_CELL_INDEX_BITS - 1) tag storage pages,
380 * - 2^(MTE_FF_CELL_INDEX_BITS + 4) pages,
381 * - 2^(MTE_FF_CELL_INDEX_BITS + 4 + PAGE_SHIFT) bytes.
382 *
383 * On a 16KB system (PAGE_SHIFT == 14) and with MTE_FF_CELL_INDEX_BITS == 24,
384 * this covers 2^42 == 4TB of physical memory.
385 */
386 typedef int32_t cell_idx_t;
387
388 typedef uint32_t cell_count_t;
389
390 /*!
391 * @typedef cell_t
392 *
393 * @abstract
394 * This data structure contains the metadata associated with a tag storage page,
395 * and its covered pages in the mteinfo tracking data structure.
396 *
397 * @discussion
398 * Here are some important invariants for this data structure:
399 * - mte_page_count + popcount(free_mask) <= MTE_PAGES_PER_TAG_PAGE
400 * - mte_page_count must be 0 unless state is DISABLED or ACTIVE.
401 *
402 * @field prev
403 * Linkage to the prev cell (as an index in the cell array).
404 *
405 * @field next
406 * Linkage to the next cell (as an index in the cell array).
407 *
408 * @field enqueue_pos
409 * If @c free_mask isn't 0, this contains the index of the free covered page
410 * which represents this cell in the mte free queues (@see @c mte_free_queues[]).
411 *
412 * @field mte_page_count
413 * The number of pages covered with this tag storage page, that are currently
414 * used and tagged.
415 *
416 * @field state
417 * The current state of the tag storage page this cell represents.
418 * @see mte_cell_state_t.
419 *
420 * @field free_mask
421 * A bitmask where each bit set corresponds to an associated covered page that
422 * is free (tagged or not).
423 *
424 * @field cell_count
425 * When the cell is a queue head, the number of cells enqueued on this bucket.
426 */
#pragma pack(4)
typedef struct {
	/* Queue linkage as cell array indices (negative => queue head). */
	cell_idx_t prev : MTE_FF_CELL_INDEX_BITS;
	cell_idx_t next : MTE_FF_CELL_INDEX_BITS;
	/* Free covered page chosen to represent this cell on the free queues. */
	cell_count_t enqueue_pos : MTE_FF_CELL_PAGE_COUNT_BITS;
	/* Covered pages currently used and tagged. */
	cell_count_t mte_page_count : MTE_FF_CELL_PAGE_COUNT_BITS;
	/* Current cell state (@see mte_cell_state_t). */
	mte_cell_state_t state : MTE_FF_CELL_STATE_BITS;
	uint8_t __unused_bits : 1;
	union {
		uint32_t free_mask;  /* cells: one bit per free covered page */
		uint32_t cell_count; /* queue heads: number of enqueued cells */
	};
} cell_t;
#pragma pack()

/* The packed layout above must remain exactly 12 bytes. */
static_assert(sizeof(cell_t) == 12);
/* Every state value must fit in the state bit-field. */
static_assert(MTE_STATE_ACTIVE < (1u << MTE_FF_CELL_STATE_BITS));
/* Page counts up to MTE_PAGES_PER_TAG_PAGE must fit in the count bit-fields. */
static_assert(MTE_PAGES_PER_TAG_PAGE <= (1 << MTE_FF_CELL_PAGE_COUNT_BITS));
445
446 /*!
447 * @typedef mte_cell_queue_t
448 *
449 * @abstract
450 * This data structure represents a particular queue/bucket of cells.
451 */
452 typedef struct mte_cell_queue_head {
453 cell_t head;
454 } *mte_cell_queue_t;
455
456 /*!
457 * @typedef mte_cell_bucket_t
458 *
459 * @abstract
460 * Represents the index of a bucket inside of a list.
461 */
462 __enum_decl(mte_cell_bucket_t, uint32_t, {
463 MTE_BUCKET_0,
464 MTE_BUCKET_1_8,
465 MTE_BUCKET_9_16,
466 MTE_BUCKET_17_24,
467 MTE_BUCKET_25_32,
468
469 _MTE_BUCKET_COUNT,
470 });
471
472 static_assert(_MTE_BUCKET_COUNT == MTE_BUCKETS_COUNT_MAX);
473
474 #define MTE_QUEUES_COUNT \
475 (1 /* disabled */ + \
476 1 /* pinned */ + \
477 MTE_BUCKETS_COUNT_MAX /* claimed */ + \
478 MTE_BUCKETS_COUNT_MAX /* inactive */ + \
479 1 /* deactivating */ + \
480 1 /* reclaiming */ + \
481 1 /* activating */ + \
482 MTE_BUCKETS_COUNT_MAX /* active_0 */ + \
483 1 /* active */ )
484
485
486 #pragma mark Behavioral boot-args
487
488 /*
489 * Boot-arg to enable/disable the interface for grabbing tag storage pages.
490 * This exists in case tunables or settings for tag storage management expose
491 * us to page shortages or system hangs due to wired tag storage pages. This
492 * boot-arg should allow us to bypass any such issues.
493 */
494 static TUNABLE(bool, vm_mte_enable_tag_storage_grab, "mte_ts_grab", true);
495
496 /*
497 * Boot-args controlling the draining down of tag storage space
498 *
499 * @var vm_page_tag_storage_reserved
500 * How many tag storage pages the inactive_0 queue needs to preserve
501 * at all times.
502 */
503 TUNABLE(uint32_t, vm_page_tag_storage_reserved, "mte_ts_grab_rsv", 100);
504
505 /*
506 * Boot-arg to enable/disable grabbing tag storage pages for the compressor
507 * pool.
508 */
509 TUNABLE(bool, vm_mte_tag_storage_for_compressor, "mte_ts_compressor", true);
510
511 #ifndef VM_MTE_FF_VERIFY
512 /*
513 * Boot-arg to enable/disable grabbing tag storage pages for specific VM tags.
514 * Note that the string length was somewhat arbitrarily chosen, so if the use
515 * case arises, we may need to bump that up...
516 *
517 * Currently, we allow allocations with VM tags of VM_MEMORY_MALLOC_SMALL (2),
518 * VM_MEMORY_MALLOC_TINY (7), and VM_MEMORY_MALLOC_NANO (11) to use tag storage
519 * pages. See vm_statistics.h for other potential candidates.
520 * In particular, VM_MEMORY_STACK (30) is promising.
521 */
522 static TUNABLE_STR(vm_mte_tag_storage_for_vm_tags, 256, "mte_ts_vmtag", "2,7,11");
523 #endif /* VM_MTE_FF_VERIFY */
524
525 #pragma mark Counters and Globals
526
527 struct mte_cell_list mte_info_lists[MTE_LISTS_COUNT];
528
529 static SECURITY_READ_ONLY_LATE(cell_t *) mte_info_cells;
530
531 #ifndef VM_MTE_FF_VERIFY
532 /*
533 * Fill thread state. The wake state of the thread is tracked to minimize
534 * scheduler interactions. Guarded with the free page lock.
535 */
536 static sched_cond_atomic_t fill_thread_cond = SCHED_COND_INIT;
537 static SECURITY_READ_ONLY_LATE(thread_t) vm_mte_fill_thread = THREAD_NULL;
538 static thread_t vm_mte_activator = THREAD_NULL;
539 static bool vm_mte_activator_waiters = false;
540
541 struct mte_pcpu PERCPU_DATA(mte_pcpu);
542 SCALABLE_COUNTER_DEFINE(vm_cpu_free_tagged_count);
543 SCALABLE_COUNTER_DEFINE(vm_cpu_free_claimed_count);
544 #endif
545
546 /*
547 * Free taggable pages queue, per-cpu queues, and its counters.
548 *
549 * guarded by the free page lock
550 */
551 uint32_t vm_page_free_taggable_count;
552 uint32_t vm_page_free_unmanaged_tag_storage_count;
553 uint32_t vm_page_tagged_count; /* Total tagged covered pages. */
554 uint32_t vm_page_free_wanted_tagged = 0;
555 uint32_t vm_page_free_wanted_tagged_privileged = 0;
556
557 /*
558 * Counters for tag storage pages we will just give to the system permanently
559 * for use as regular memory. These could technically be a subset of the
560 * claimed tag storage, but counting them separately is useful because they
561 * will have a different page lifecycle than the claimed tag storage pages...
562 * as when freed, these pages will go to the regular free queues.
563 *
564 * These shouldn't be mutated after bootstrap... so they have no lock.
565 */
566 uint32_t vm_page_recursive_tag_storage_count;
567 uint32_t vm_page_retired_tag_storage_count;
568 uint32_t vm_page_unmanaged_tag_storage_count;
569
570 /*
571 * The wired tag storage page count is guarded by the page queues lock. This
572 * counter is diagnostic; it exists to inform investigations about reclaim
573 * efficiency.
574 */
575 uint32_t vm_page_wired_tag_storage_count;
576
577 /*
578 * Diagnostic counters for reclamation; describes how many times reclamation
579 * attempts have succeeded or failed (as well as a breakout for failures due to
580 * the page being wired). Guarded by the free page lock.
581 */
582 uint64_t vm_mte_refill_thread_wakeups;
583 uint64_t vm_page_tag_storage_activation_count;
584 uint64_t vm_page_tag_storage_deactivation_count;
585 uint64_t vm_page_tag_storage_reclaim_from_cpu_count;
586 uint64_t vm_page_tag_storage_reclaim_success_count;
587 uint64_t vm_page_tag_storage_reclaim_failure_count;
588 uint64_t vm_page_tag_storage_reclaim_wired_failure_count;
589 uint64_t vm_page_tag_storage_wire_relocation_count;
590 uint64_t vm_page_tag_storage_reclaim_compressor_failure_count;
591 uint64_t vm_page_tag_storage_compressor_relocation_count;
592
593 #ifndef VM_MTE_FF_VERIFY
594 /*
595 * Diagnostic counter for reclamation describing the number of tag storage
596 * pages that have ever been allocated as claimed. Note that this value
597 * only increases.
598 */
599 SCALABLE_COUNTER_DEFINE(vm_cpu_claimed_count);
600 #endif /* VM_MTE_FF_VERIFY */
601
602 /*
603 * Array of 4 64-bit masks for which VM tags can use tag storage.
604 * There are a total of 256 VM tags.
605 * This shouldn't be mutated after bootstrap... so it has no lock.
606 */
607 bitmap_t vm_mte_tag_storage_for_vm_tags_mask[BITMAP_LEN(VM_MEMORY_COUNT)];
608
609 #pragma mark cell_idx_t
610
__pure2
static bool
cell_idx_is_queue(cell_idx_t idx)
{
	/* Negative indices denote queue heads (@see cell_idx_t). */
	return idx < 0;
}
617
__pure2
static cell_t *
cell_from_idx(cell_idx_t idx)
{
	/*
	 * Resolve a cell index to its cell.  Queue-head (negative) indices
	 * resolve relative to the start of the cell array as well.
	 */
	return &mte_info_cells[idx];
}
624
__pure2
__attribute__((overloadable))
static cell_idx_t
cell_idx(const cell_t *cell)
{
	/* Index of @c cell within the cell array (pointer arithmetic). */
	return (cell_idx_t)(cell - mte_info_cells);
}
632
__pure2
__attribute__((overloadable))
static cell_idx_t
cell_idx(mte_cell_queue_t queue)
{
	/* Queue heads are indexed like cells (@see cell_idx_is_queue()). */
	return cell_idx(&queue->head);
}
640
641 __pure2
642 static cell_count_t
cell_free_page_count(cell_t cell)643 cell_free_page_count(cell_t cell)
644 {
645 return __builtin_popcountll(cell.free_mask);
646 }
647
__pure2
static ppnum_t
cell_first_covered_pnum(const cell_t *cell)
{
	/*
	 * Each cell covers MTE_PAGES_PER_TAG_PAGE consecutive physical
	 * pages, starting at the first managed page number.
	 */
	return pmap_first_pnum + cell_idx(cell) * MTE_PAGES_PER_TAG_PAGE;
}
654
655
656 #pragma mark mte_cell_queue_t
657
658 /*
659 * Based on the existing queue code in XNU. Look at <kern/queue.h> for the
660 * original code; done here due to the custom linkages.
661 */
662
static cell_idx_t
cell_queue_first_idx(mte_cell_queue_t queue)
{
	/* head.next links to the front of the (circular) queue. */
	return queue->head.next;
}
668
static cell_idx_t
cell_queue_last_idx(mte_cell_queue_t queue)
{
	/* head.prev links to the back of the (circular) queue. */
	return queue->head.prev;
}
674
static cell_t *
cell_queue_first(mte_cell_queue_t queue)
{
	/*
	 * Pointer form of cell_queue_first_idx().  For an empty queue this
	 * resolves to the head itself (@see cell_queue_foreach()).
	 */
	return cell_from_idx(cell_queue_first_idx(queue));
}
680
static uint32_t
cell_queue_count(mte_cell_queue_t queue)
{
	/* Queue heads reuse the cell_count union member (@see cell_t). */
	return queue->head.cell_count;
}
686
687
/*!
 * @abstract
 * Inserts @c cell at the tail of @c queue.
 *
 * @returns true if the queue was empty prior to insertion.
 */
static bool
cell_queue_insert_tail(mte_cell_queue_t queue, cell_t *cell)
{
	cell_idx_t qidx = cell_idx(queue);
	cell_idx_t tidx = cell_queue_last_idx(queue);
	cell_t *tail = cell_from_idx(tidx);

	/* Sanity check: the current tail must point back at the head. */
	if (tail->next != qidx) {
		__queue_element_linkage_invalid(tail);
	}

	/* Link @c cell between the old tail and the queue head. */
	cell->next = qidx;
	cell->prev = tidx;
	queue->head.prev = tail->next = cell_idx(cell);

	/* If the original tail was the queue, then it was empty. */
	return cell_idx_is_queue(tidx);
}
706
/*!
 * @abstract
 * Removes @c cell from the queue it is currently enqueued on.
 *
 * @returns true if the queue became empty as a result.
 */
static bool
cell_queue_remove(cell_t *cell)
{
	cell_idx_t pidx = cell->prev;
	cell_idx_t nidx = cell->next;
	cell_idx_t cidx = cell_idx(cell);
	cell_t *prev = cell_from_idx(pidx);
	cell_t *next = cell_from_idx(nidx);

	/* Sanity check: both neighbors must point at the cell being removed. */
	if (prev->next != cidx || next->prev != cidx) {
		__queue_element_linkage_invalid(cell);
	}

	next->prev = pidx;
	prev->next = nidx;
	/* No linkage cleanup because cells are never dequeued at rest. */

	/*
	 * If the prev and next indices are the same, then this is the head
	 * index, and the queue became empty
	 */

	return pidx == nidx;
}
731
/* Iterates over the cells of @c q; @c it must not be dequeued by the body. */
#define cell_queue_foreach(it, q) \
	for (cell_t *it = cell_queue_first(q); \
	    it != &(q)->head; \
	    it = cell_from_idx(it->next))

/* Removal-safe variant: the next cell is cached before the body runs. */
#define cell_queue_foreach_safe(it, q) \
	for (cell_t *__next_it, *it = cell_queue_first(q); \
	    it != &(q)->head && (__next_it = cell_from_idx(it->next), 1); \
	    it = __next_it)
741
742
743 #pragma mark MTE free queue
744
745 /*
746 * The MTE free queue is a multi-dimensioned queue that replaces the
747 * vm_page_free_queue for covered pages on MTE targets.
748 *
749 * It is an array of colored free queues indexed by @c mte_free_queue_idx_t.
750 *
751 *
752 * A queue of tag storage pages
753 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
754 *
755 * When a tag storage page has no associated free covered pages, no page is
756 * enqueued on the mte free queue. However when a tag storage page has one or
757 * more free covered pages associated then there is one and only one of these
758 * pages enqueued on the mte free queues.
759 *
760 * The chosen representative for the cell is remembered on the cell of the
761 * associated tag storage @c cell_t::enqueue_pos value.
762 *
763 *
764 * Enqueue / dequeue algorithm
765 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
766 *
767 * This chosen representative makes the cluster available for its page color,
768 * and only this color, despite other colors being possibly available for this
769 * tag storage page.
770 *
771 * When removing a free page from the MTE queue, if the page being grabbed
772 * was the enqueued candidate, then the next enqueued candidate is chosen
773 * as the next free page in bitmask "circular" order
774 * (@see mteinfo_free_queue_next_bit()).
775 *
776 * As a result, by "pushing" the page forward this way, the tag storage page
777 * will be made available through all colors that it can provide.
778 *
779 *
780 * Allocation stability and bucket selection
781 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
782 *
783 * The free queues are in that order:
784 *
785 * {claimed/disabled} -> {inactive_0, inactive_1} ->
786 * {active_0, active_1, active_2, active_3} -> {activating}
787 *
788 * This is selected carefully to have the following crucial properties:
789 *
790 * - allocating untagged pages chooses buckets "left to right"
791 * (in increasing free queue index order).
792 *
793 * - allocating tagged pages chooses active buckets "right to left"
794 * (in decreasing free queue index from the active_* queues).
795 *
796 * - when allocating untagged pages, the impact on the tag storage page will
797 * be that it stays in the same free queue or moves "down" in the free queue
798 * indices order.
799 *
800 * - when allocating tagged pages, the impact on the tag storage page will
801 * be that it stays in the same free queue or moves "up" in the free queue
802 * indices order.
803 *
804 * This is important and allows for a nice optimization: if a tag storage page
805 * was found to be a good candidate for a given grab operation, it always will
806 * stay a "best" candidate until it has no free pages left, which allows for
807 * allocations of contiguous spans of pages at once
808 * (@see mteinfo_free_queue_grab()).
809 *
810 * Lastly, in order to find the first free bucket quickly,
811 * @c mte_claimable_queue is a bitmask where a bit being set means that the
812 * corresponding bucket has at least one queue non empty.
813 *
814 *
815 * Tag Storage Free queue
816 * ~~~~~~~~~~~~~~~~~~~~~~
817 *
818 * Tag storage pages can only be claimed if they are inactive with the [C.1]
819 * transition. Getting pages to inactive is done via the Deactivation [D.*].
820 *
821 * However, as we mentioned the MTE free queue is only about covered pages
822 * proper, and do not contain the tag storage pages. Another point is that
823 * we do not want to claim pages too aggressively as it could get in the way
824 * of the Activation [A.*] transition when tagged pages are required.
825 *
826 * To solve this tension, the @c mte_claimable_queue holds inactive tag storage
827 * pages that have 8 free pages or less at any given time. These are unlikely
828 * to be profitable activation candidates, but also demonstrate that there is
829 * enough untagged memory pressure on the system that we have clusters of
830 * covered pages in use.
831 *
832 * The @c mteinfo_free_queue_grab() code will promote these to a per-cpu
833 * free queue that in turn the @c vm_page_grab_options() fastpath can tap into
834 * as another opportunistic source of pages.
835 */
836 struct vm_page_free_queue mte_free_queues[MTE_FREE_NOT_QUEUED];
837 struct vm_page_free_queue mte_claimable_queue;
838 static uint32_t mte_free_queue_mask;
839
840 /*!
841 * @abstract
842 * Computes the proper mte free queue index for a given cell.
843 */
__pure2
static mte_free_queue_idx_t
mteinfo_free_queue_idx(cell_t cell)
{
	uint32_t free = cell_free_page_count(cell);
	uint32_t tagged = cell.mte_page_count;
	uint32_t used = MTE_PAGES_PER_TAG_PAGE - free - tagged;
	uint32_t n;

	/* Cells with no free covered page are never enqueued. */
	if (cell.free_mask == 0) {
		return MTE_FREE_NOT_QUEUED;
	}

	switch (cell.state) {
	case MTE_STATE_DISABLED:
	case MTE_STATE_PINNED:
	case MTE_STATE_DEACTIVATING:
		return MTE_FREE_UNTAGGABLE_0;

	case MTE_STATE_CLAIMED:
	case MTE_STATE_INACTIVE:
		/*
		 * This is "clever" code to map:
		 *    MTE_FREE_UNTAGGABLE_0: Claimed[0-16]
		 *    MTE_FREE_UNTAGGABLE_1: Claimed[16-32], Inactive[0-16]
		 *    MTE_FREE_UNTAGGABLE_2: Inactive[16-32]
		 */
		n = MTE_FREE_UNTAGGABLE_0 + cell.state - MTE_STATE_CLAIMED;
		static_assert(MTE_STATE_INACTIVE == MTE_STATE_CLAIMED + 1);
		/* Bump one bucket when more than half the covered pages are free. */
		return n + (free > MTE_PAGES_PER_TAG_PAGE / 2);

	case MTE_STATE_RECLAIMING:
	case MTE_STATE_ACTIVATING:
		return MTE_FREE_UNTAGGABLE_ACTIVATING;

	case MTE_STATE_ACTIVE:
		break;
	}

	/*
	 * Empirically this seems to give decent fragmentation results
	 * with alternating MTE/non-MTE workloads.
	 *
	 * This tries to find a balance between favoring buckets with mte pages
	 * allocated and to penalize buckets with untagged pages allocated,
	 * while keeping buckets with the most free pages on the fence.
	 *
	 * The distribution it generates can be printed by running the
	 * "active_buckets" subtest of tests/vm/vm_mteinfo.c
	 */

	n = tagged + free / 5;
	n -= MIN(n, used) / 3;
	return MTE_FREE_ACTIVE_0 + fls(n / 4);
}
899
900 static vm_page_queue_t
mteinfo_free_queue_head(mte_free_queue_idx_t idx,uint32_t color)901 mteinfo_free_queue_head(mte_free_queue_idx_t idx, uint32_t color)
902 {
903 return &mte_free_queues[idx].vmpfq_queues[color].qhead;
904 }
905
906 /*!
907 * @abstract
908 * Computes the next bit in "circular" mask order
909 *
910 * @discussion
911 * This computes the next bit set in @c mask that is larger or equal
912 * to @c bit, or if none exist, then the smallest bit set in @c mask.
913 *
914 * This means that for a mask with positions mask={1, 5, 6, 10} set,
915 * the "next" bit for:
916 * - 4 is 5,
917 * - 10 is 10,
918 * - 12 is 1.
919 *
920 * @param mask The mask to scan. The mask must be non 0.
921 * @param bit The bit to scan from.
922 * @returns The next bit set in "circular" order.
923 */
924 static cell_count_t
mteinfo_free_queue_next_bit(uint32_t mask,cell_count_t bit)925 mteinfo_free_queue_next_bit(uint32_t mask, cell_count_t bit)
926 {
927 cell_count_t cur = bit % MTE_PAGES_PER_TAG_PAGE;
928
929 mask = (mask >> cur) | (mask << (32 - cur));
930 bit += ffs(mask) - 1;
931
932 return bit % MTE_PAGES_PER_TAG_PAGE;
933 }
934
935 /*!
936 * @abstract
937 * Backend for CELL_UPDATE() to manage update/requeues to the mte free queue.
938 *
939 * @param cell The new state of the cell.
940 * @param orig The original state of the cell.
941 * @param oidx The original free queue index for the cell.
942 * @param nidx The new free queue index for the cell.
943 */
__attribute__((noinline))
static void
mteinfo_free_queue_requeue(
	cell_t *cell,
	const cell_t orig,
	mte_free_queue_idx_t oidx,
	mte_free_queue_idx_t nidx)
{
	ppnum_t first_pnum = cell_first_covered_pnum(cell);
	vm_page_queue_t queue;
	cell_count_t bit = orig.enqueue_pos;
	vm_page_t mem;

	/* Not queued before nor after: just record nothing is enqueued. */
	if (oidx == MTE_FREE_NOT_QUEUED && nidx == MTE_FREE_NOT_QUEUED) {
		cell->enqueue_pos = -1;
		return;
	}

	if (oidx != MTE_FREE_NOT_QUEUED) {
		/* Dequeue the page that represented this cell on the old queue. */
		mem = vm_page_find_canonical(first_pnum + bit);
		queue = mteinfo_free_queue_head(oidx,
		    (first_pnum + bit) & vm_color_mask);
		assert(bit_test(orig.free_mask, bit));

		vm_page_queue_remove(queue, mem, vmp_pageq);
		VM_COUNTER_DEC(&mte_free_queues[oidx].vmpfq_count);
		if (mte_free_queues[oidx].vmpfq_count == 0) {
			/* Queue went empty: drop it from the summary mask. */
			bit_clear(mte_free_queue_mask, oidx);
		}
	}

	if (nidx == MTE_FREE_NOT_QUEUED) {
		cell->enqueue_pos = -1;
	} else {
		/*
		 * Choose the next free page of the cell in circular order
		 * as the cell's representative on the new queue.
		 */
		bit = mteinfo_free_queue_next_bit(cell->free_mask, bit);
		mem = vm_page_find_canonical(first_pnum + bit);
		queue = mteinfo_free_queue_head(nidx,
		    (first_pnum + bit) & vm_color_mask);
		assert(bit_test(cell->free_mask, bit));

		cell->enqueue_pos = bit;
		vm_page_queue_enter_first(queue, mem, vmp_pageq);
		if (mte_free_queues[nidx].vmpfq_count == 0) {
			/* Queue was empty: mark it in the summary mask. */
			bit_set(mte_free_queue_mask, nidx);
		}
		VM_COUNTER_INC(&mte_free_queues[nidx].vmpfq_count);
	}
}
992
993
994 #pragma mark mte_cell_list_t
995
996 __pure2
997 static mte_cell_bucket_t
cell_list_idx_buckets(mte_cell_list_idx_t idx)998 cell_list_idx_buckets(mte_cell_list_idx_t idx)
999 {
1000 switch (idx) {
1001 case MTE_LIST_INACTIVE_IDX:
1002 case MTE_LIST_CLAIMED_IDX:
1003 case MTE_LIST_ACTIVE_0_IDX:
1004 return MTE_BUCKETS_COUNT_MAX;
1005 default:
1006 return 1;
1007 }
1008 }
1009
1010 __pure2
1011 static mte_cell_list_idx_t
cell_list_idx(const cell_t cell)1012 cell_list_idx(const cell_t cell)
1013 {
1014 if (cell.state != MTE_STATE_ACTIVE || cell.mte_page_count == 0) {
1015 return (mte_cell_list_idx_t)cell.state;
1016 }
1017
1018 return MTE_LIST_ACTIVE_IDX;
1019 }
1020
1021 __pure2
1022 static mte_cell_bucket_t
cell_list_bucket(const cell_t cell)1023 cell_list_bucket(const cell_t cell)
1024 {
1025 if (cell_list_idx_buckets(cell_list_idx(cell)) > 1) {
1026 return (cell_free_page_count(cell) + 7) / 8;
1027 }
1028 return 0;
1029 }
1030
1031 __pure2
1032 static inline bool
cell_on_claimable_queue(const cell_t cell)1033 cell_on_claimable_queue(const cell_t cell)
1034 {
1035 if (cell.state == MTE_STATE_INACTIVE) {
1036 return cell_list_bucket(cell) <= MTE_BUCKET_1_8;
1037 }
1038 return false;
1039 }
1040
__attribute__((noinline))
static void
cell_list_requeue(
	cell_t *cell,
	vm_page_t tag_page,
	mte_cell_list_idx_t oidx,
	mte_cell_bucket_t obucket,
	mte_cell_list_idx_t nidx,
	mte_cell_bucket_t nbucket,
	int claim_requeue)
{
	mte_cell_list_t olist = &mte_info_lists[oidx];
	mte_cell_list_t nlist = &mte_info_lists[nidx];

	/* Old bucket queue went empty: clear its bit from the list mask. */
	if (cell_queue_remove(cell)) {
		bit_clear(olist->mask, obucket);
	}

	/* New bucket queue was empty: set its bit in the list mask. */
	if (cell_queue_insert_tail(&nlist->buckets[nbucket], cell)) {
		bit_set(nlist->mask, nbucket);
	}

	olist->buckets[obucket].head.cell_count--;
	nlist->buckets[nbucket].head.cell_count++;

	if (olist != nlist) {
		olist->count--;
		nlist->count++;
	}

	/*
	 * claim_requeue is __ncq - __ocq (see CELL_UPDATE()):
	 *  +1: the cell just became claimable, enqueue the tag page,
	 *  -1: the cell no longer is claimable, dequeue the tag page,
	 *   0: no change of claimability.
	 */
	if (claim_requeue) {
#ifndef VM_MTE_FF_VERIFY
		uint32_t color = VM_PAGE_GET_COLOR(tag_page);
		vm_page_queue_t queue;

		queue = &mte_claimable_queue.vmpfq_queues[color].qhead;
		if (claim_requeue > 0) {
			vm_page_queue_enter(queue, tag_page, vmp_pageq);
		} else {
			vm_page_queue_remove(queue, tag_page, vmp_pageq);
		}
		VM_COUNTER_DELTA(&mte_claimable_queue.vmpfq_count, claim_requeue);
#endif /* VM_MTE_FF_VERIFY */
	}
}
1086
1087 /*!
1088 * @abstract
 * Find the first cell in the highest non-empty bucket whose index is at
 * least the specified minimum bucket.
1091 *
1092 * @param lidx The list index to scan.
1093 * @param min_bucket The minimum bucket index to consider.
1094 * @param tag_page The tag page associated with the returned cell.
1095 * @returns The cell that was found or NULL.
1096 */
1097 static cell_t *
cell_list_find_last_page(mte_cell_list_idx_t lidx,mte_cell_bucket_t min_bucket,vm_page_t * tag_page)1098 cell_list_find_last_page(
1099 mte_cell_list_idx_t lidx,
1100 mte_cell_bucket_t min_bucket,
1101 vm_page_t *tag_page)
1102 {
1103 mte_cell_list_t list = &mte_info_lists[lidx];
1104 uint32_t mask = list->mask & ~mask(min_bucket);
1105 mte_cell_queue_t queue;
1106
1107 if (__improbable(mask == 0)) {
1108 *tag_page = VM_PAGE_NULL;
1109 return NULL;
1110 }
1111
1112 queue = &list->buckets[fls(mask) - 1];
1113 *tag_page = vm_tag_storage_page_get(cell_queue_first_idx(queue));
1114 return cell_queue_first(queue);
1115 }
1116
1117
1118 #pragma mark Tag storage space state machine
1119
/*!
 * Assert that a cell is in one of the states specified by the mask.
 *
 * @c mask is a bitmask of (1 << MTE_STATE_*) bits, typically built
 * by or-ing MTE_MASK_* constants (see the callers below).
 */
#define assert_cell_state(cell, mask) \
	release_assert(((mask) & (1 << (cell)->state)) != 0)
1125
/*!
 * Perform an arbitrary update on a cell, and update the MTE info queues
 * accordingly.
 *
 * The macro snapshots the cell's derived positions (cell list index and
 * bucket, claimable-queue membership, mte free queue index) before running
 * the mutation, recomputes them afterwards, and requeues the cell on
 * whichever queues changed:
 * - cell_list_requeue() when the list or bucket changed,
 * - mteinfo_free_queue_requeue() when the free queue index changed,
 *   or when @c cleared_bit forces a requeue (the enqueued page of the
 *   cell may have been the one whose free bit was cleared).
 *
 * This should be used this way:
 *
 * <code>
 * // Preflights and asserts here
 * assert_cell_state(cell_var, ...);
 *
 * CELL_UPDATE(cell_var, tag_page, cleared_bit, {
 *     // Mutations of cell_var here
 *     cell_var->state = ...;
 * });
 * </code>
 *
 * @param cell The cell to update.
 * @param tag_page The tag page corresponding to @c cell.
 * @param cleared_bit The bit that was cleared or -1
 * @param mut Code that mutates its argument, and performs the
 * required update.
 */
#define CELL_UPDATE(cell, tag_page, cleared_bit, ...)  ({ \
	mte_cell_list_idx_t __ol, __nl; \
	mte_cell_bucket_t __ob, __nb; \
	mte_free_queue_idx_t __oi, __ni; \
	int __ocq, __ncq; \
	cell_t *__cell = (cell); \
	cell_t __orig = *__cell; \
	\
	__ol = cell_list_idx(__orig); \
	__ob = cell_list_bucket(__orig); \
	__ocq = cell_on_claimable_queue(__orig); \
	__oi = mteinfo_free_queue_idx(__orig); \
	\
	__VA_ARGS__; \
	\
	__nl = cell_list_idx(*__cell); \
	__nb = cell_list_bucket(*__cell); \
	__ncq = cell_on_claimable_queue(*__cell); \
	__ni = mteinfo_free_queue_idx(*__cell); \
	\
	if (__ol != __nl || __ob != __nb) { \
	        cell_list_requeue(__cell, tag_page, __ol, __ob, __nl, __nb, \
	            __ncq - __ocq); \
	} \
	if (__oi != __ni || (cleared_bit)) { \
	        mteinfo_free_queue_requeue(__cell, __orig, __oi, __ni); \
	} \
})
1176
1177 __pure2
1178 static cell_t *
cell_from_tag_storage_page(const struct vm_page * page)1179 cell_from_tag_storage_page(const struct vm_page *page)
1180 {
1181 cell_idx_t pidx;
1182
1183 pidx = (cell_idx_t)(page - vm_pages_tag_storage_array_internal());
1184 return cell_from_idx(pidx);
1185 }
1186
1187 __pure2
1188 __attribute__((overloadable))
1189 static cell_t *
cell_from_covered_ppnum(ppnum_t pnum)1190 cell_from_covered_ppnum(ppnum_t pnum)
1191 {
1192 cell_idx_t cidx = (pnum - pmap_first_pnum) / MTE_PAGES_PER_TAG_PAGE;
1193
1194 return cell_from_idx(cidx);
1195 }
1196
1197 __pure2
1198 __attribute__((overloadable))
1199 static cell_t *
cell_from_covered_ppnum(ppnum_t pnum,vm_page_t * tag_page)1200 cell_from_covered_ppnum(ppnum_t pnum, vm_page_t *tag_page)
1201 {
1202 cell_idx_t cidx = (pnum - pmap_first_pnum) / MTE_PAGES_PER_TAG_PAGE;
1203
1204 *tag_page = vm_tag_storage_page_get(cidx);
1205 return cell_from_idx(cidx);
1206 }
1207
1208 /*!
1209 * @function mteinfo_tag_storage_set_active()
1210 *
1211 * @abstract
1212 * Mark a tag storage page as active.
1213 *
1214 * @discussion
1215 * The page should be disabled (initial activation) or activating.
1216 *
1217 * @param tag_page The pointer to a page inside the tag storage space.
1218 * @param mte_count How many covered pages are used and tagged for @c tag_page.
1219 * @param init Whether this is the initial transition.
1220 * @returns The number of covered pages this made taggable.
1221 */
static uint32_t
mteinfo_tag_storage_set_active(vm_page_t tag_page, uint32_t mte_count, bool init)
{
	cell_t *cell = cell_from_tag_storage_page(tag_page);
	cell_count_t free_page_count = cell_free_page_count(*cell);

	/* Tagged + free pages can never exceed the cell's coverage. */
	assert(mte_count + free_page_count <= MTE_PAGES_PER_TAG_PAGE);
	if (init) {
		assert_cell_state(cell,
		    /* [I.1] */ MTE_MASK_DISABLED);
	} else {
		assert_cell_state(cell,
		    /* [R.2] */ MTE_MASK_RECLAIMING |
		    /* [A.2] */ MTE_MASK_ACTIVATING);
	}

	/* Every free covered page becomes taggable once the cell is active. */
	VM_COUNTER_ADD(&vm_page_free_taggable_count, free_page_count);
	vm_page_tag_storage_activation_count++;

	CELL_UPDATE(cell, tag_page, false, {
		cell->state = MTE_STATE_ACTIVE;
		cell->mte_page_count = mte_count;
	});

	return free_page_count;
}
1248
1249 bool
mteinfo_tag_storage_disabled(const struct vm_page * tag_page)1250 mteinfo_tag_storage_disabled(const struct vm_page *tag_page)
1251 {
1252 return cell_from_tag_storage_page(tag_page)->state == MTE_STATE_DISABLED;
1253 }
1254
1255 bool
mteinfo_tag_storage_is_active(const struct vm_page * tag_page)1256 mteinfo_tag_storage_is_active(const struct vm_page *tag_page)
1257 {
1258 return cell_from_tag_storage_page(tag_page)->state == MTE_STATE_ACTIVE;
1259 }
1260
/*!
 * @abstract
 * Mark a tag storage page as retired, moving its cell back to
 * @c MTE_STATE_DISABLED and bumping the retired page counter.
 *
 * @discussion
 * The cell must have no tagged pages, and be disabled, claimed,
 * or reclaiming.
 *
 * @param tag_page The pointer to a page inside the tag storage space.
 */
void
mteinfo_tag_storage_set_retired(vm_page_t tag_page)
{
	cell_t *cell = cell_from_tag_storage_page(tag_page);

	assert(cell->mte_page_count == 0);
	assert_cell_state(cell,
	    /* [K.3] */ MTE_MASK_DISABLED |
	    /* [K.2] */ MTE_MASK_CLAIMED |
	    /* [K.1] */ MTE_MASK_RECLAIMING);

	VM_COUNTER_INC(&vm_page_retired_tag_storage_count);

	CELL_UPDATE(cell, tag_page, false, {
		cell->state = MTE_STATE_DISABLED;
	});
}
1278
1279 #ifndef VM_MTE_FF_VERIFY
1280 /*!
1281 * @function mteinfo_tag_storage_set_unmanaged()
1282 *
1283 * @abstract
1284 * Mark a tag storage page as actually being disabled-unmanaged
1285 *
1286 * @discussion
1287 * The tag storage page must be claimed or inactive.
1288 *
1289 * @param cell The cell to mark as disabled.
1290 * @param tag_page The tag page corresponding to @c cell.
1291 */
static void
mteinfo_tag_storage_set_unmanaged(cell_t *cell, vm_page_t tag_page)
{
	/*
	 * Remember whether the cell was inactive: only then is the page
	 * handed to the dead tag storage free queue below.
	 */
	bool queue = cell->state == MTE_STATE_INACTIVE;

	assert(cell->mte_page_count == 0);
	assert(cell->free_mask == 0);

	assert_cell_state(cell,
	    /* [U.1] */ MTE_MASK_CLAIMED |
	    /* [U.2] */ MTE_MASK_INACTIVE);

	VM_COUNTER_INC(&vm_page_unmanaged_tag_storage_count);

	CELL_UPDATE(cell, tag_page, false, {
		cell->state = MTE_STATE_DISABLED;
	});

	if (queue) {
		vm_page_free_queue_enter(VM_MEMORY_CLASS_DEAD_TAG_STORAGE,
		    tag_page, VM_PAGE_GET_PHYS_PAGE(tag_page));
	}
}
1315 #endif /* VM_MTE_FF_VERIFY */
1316
/*!
 * @abstract
 * Mark a tag storage page as inactive.
 *
 * @discussion
 * The cell must have no tagged pages.  Unless this is the initial
 * transition, the cell must be deactivating, claimed, or reclaiming.
 *
 * @param tag_page The pointer to a page inside the tag storage space.
 * @param init Whether this is the initial transition.
 */
void
mteinfo_tag_storage_set_inactive(vm_page_t tag_page, bool init)
{
	cell_t *cell = cell_from_tag_storage_page(tag_page);

	assert(cell->mte_page_count == 0);
	if (init) {
		assert_cell_state(cell,
		    /* [I.2] */ MTE_MASK_DISABLED);
	} else {
		assert_cell_state(cell,
		    /* [D.2] */ MTE_MASK_DEACTIVATING |
		    /* [F.1] */ MTE_MASK_CLAIMED |
		    /* [F.2] */ MTE_MASK_RECLAIMING);
	}

#ifndef VM_MTE_FF_VERIFY
	if (cell->state == MTE_STATE_CLAIMED) {
		/*
		 * This is to account for [F.1].
		 * For [F.2], we already decremented due to [R.1]
		 */
		counter_dec(&vm_cpu_claimed_count);
	}
#endif /* VM_MTE_FF_VERIFY */

	CELL_UPDATE(cell, tag_page, false, {
		cell->state = MTE_STATE_INACTIVE;
	});
}
1347
/*!
 * @abstract
 * Mark a tag storage page as claimed.
 *
 * @discussion
 * The cell must have no tagged pages and be inactive or reclaiming.
 *
 * @param tag_page The pointer to a page inside the tag storage space.
 */
void
mteinfo_tag_storage_set_claimed(vm_page_t tag_page)
{
	cell_t *cell = cell_from_tag_storage_page(tag_page);

	assert(cell->mte_page_count == 0);
	assert_cell_state(cell,
	    /* [C.1] */ MTE_MASK_INACTIVE |
	    /* [R.x] */ MTE_MASK_RECLAIMING);

#ifndef VM_MTE_FF_VERIFY
	if (cell->state == MTE_STATE_RECLAIMING) {
		/*
		 * Coming back from reclaiming restores the claimed count
		 * that mteinfo_tag_storage_set_reclaiming() dropped.
		 */
		counter_inc(&vm_cpu_claimed_count);
	}
#endif /* VM_MTE_FF_VERIFY */

	CELL_UPDATE(cell, tag_page, false, {
		cell->state = MTE_STATE_CLAIMED;
	});
}
1368
1369 /*!
1370 * @function mteinfo_tag_storage_set_reclaiming()
1371 *
1372 * @abstract
1373 * Mark a tag storage page as being reclaimed.
1374 *
1375 * @discussion
1376 * The tag storage page must be claimed.
1377 *
1378 * @param cell The cell to mark as reclaiming
1379 * @param tag_page The tag page corresponding to @c cell.
1380 */
static void
mteinfo_tag_storage_set_reclaiming(cell_t *cell, vm_page_t tag_page)
{
	assert(cell->mte_page_count == 0);
	assert_cell_state(cell, /* [R.1] */ MTE_MASK_CLAIMED);

	CELL_UPDATE(cell, tag_page, false, {
		cell->state = MTE_STATE_RECLAIMING;
	});

#ifndef VM_MTE_FF_VERIFY
	/* The page no longer counts as claimed while being reclaimed. */
	counter_dec(&vm_cpu_claimed_count);
#endif /* VM_MTE_FF_VERIFY */
}
1395
1396 /*!
1397 * @function mteinfo_tag_storage_flush_reclaiming()
1398 *
1399 * @abstract
1400 * Empties the reclaiming queue, moving all pages on it back to claimed.
1401 */
static void
mteinfo_tag_storage_flush_reclaiming(void)
{
	mte_cell_list_t list = &mte_info_lists[MTE_LIST_RECLAIMING_IDX];
	mte_cell_queue_t queue = &list->buckets[0];
	uint32_t batch = VMP_FREE_BATCH_SIZE;

	while (cell_queue_count(queue) > 0) {
		cell_idx_t idx = cell_queue_first_idx(queue);
		vm_page_t tag_page = vm_tag_storage_page_get(idx);
		cell_t *cell = cell_from_idx(idx);

		assert_cell_state(cell, /* [R.x] */ MTE_MASK_RECLAIMING);
		/* Moving back to claimed requeues the cell off this list. */
		CELL_UPDATE(cell, tag_page, false, {
			cell->state = MTE_STATE_CLAIMED;
		});

#ifndef VM_MTE_FF_VERIFY
		counter_inc(&vm_cpu_claimed_count);
#endif /* VM_MTE_FF_VERIFY */

		/*
		 * Drop and retake the free page lock every
		 * VMP_FREE_BATCH_SIZE cells to bound lock hold time.
		 */
		if (--batch == 0 && cell_queue_count(queue)) {
#ifndef VM_MTE_FF_VERIFY
			vm_free_page_unlock();
			vm_free_page_lock_spin();
#endif /* VM_MTE_FF_VERIFY */
			batch = VMP_FREE_BATCH_SIZE;
		}
	}
}
1432
1433 #ifndef VM_MTE_FF_VERIFY
1434
/*!
 * @abstract
 * Wake up a pinned tag storage page, moving its cell back to claimed.
 *
 * @discussion
 * The cell must be pinned and the page must be marked wanted.
 *
 * @param tag_page The pointer to a page inside the tag storage space.
 * @param fq_locked Whether the caller already holds the free page lock.
 */
void
mteinfo_tag_storage_wakeup(vm_page_t tag_page, bool fq_locked)
{
	cell_t *cell = cell_from_tag_storage_page(tag_page);

	if (!fq_locked) {
		vm_free_page_lock_spin();
	}

	/* Clear the "wanted" mark now that the wakeup is happening. */
	assert(tag_page->vmp_ts_wanted);
	tag_page->vmp_ts_wanted = false;

	assert_cell_state(cell, /* [B.2] */ MTE_MASK_PINNED);
	CELL_UPDATE(cell, tag_page, false, {
		cell->state = MTE_STATE_CLAIMED;
	});

	/*
	 * If the cell has free pages and threads are waiting for tagged
	 * pages, kick the fill thread to make use of them.
	 */
	if (cell->free_mask != 0 &&
	    (vm_page_free_wanted_tagged_privileged || vm_page_free_wanted_tagged)) {
		mteinfo_wake_fill_thread();
	}

	if (!fq_locked) {
		vm_free_page_unlock();
	}

	counter_inc(&vm_cpu_claimed_count);
}
1463
1464 #endif /* VM_MTE_FF_VERIFY */
1465 #pragma mark Covered pages state machine
1466
1467 bool
mteinfo_covered_page_taggable(ppnum_t pnum)1468 mteinfo_covered_page_taggable(ppnum_t pnum)
1469 {
1470 return cell_from_covered_ppnum(pnum)->state == MTE_STATE_ACTIVE;
1471 }
1472
/*!
 * @abstract
 * Mark the covered page @c pnum as free, dropping its tagged
 * accounting when @c tagged is set.
 */
void
mteinfo_covered_page_set_free(ppnum_t pnum, bool tagged)
{
	vm_page_t tag_page;
	cell_t *cell = cell_from_covered_ppnum(pnum, &tag_page);
	int bit = pnum % MTE_PAGES_PER_TAG_PAGE;

	assert(cell->mte_page_count >= tagged);
	assert(!bit_test(cell->free_mask, bit));

	VM_COUNTER_INC(&vm_page_free_count);
	if (cell->state == MTE_STATE_ACTIVE) {
		/* Free pages of an active cell are taggable. */
		VM_COUNTER_INC(&vm_page_free_taggable_count);
	}
	if (tagged) {
		VM_COUNTER_DEC(&vm_page_tagged_count);
	}

	CELL_UPDATE(cell, tag_page, false, {
		cell->mte_page_count -= tagged;
		bit_set(cell->free_mask, bit);
	});
}
1496
/*!
 * @abstract
 * Mark the covered page @c pnum as used, adding it to the tagged
 * accounting when @c tagged is set.
 */
void
mteinfo_covered_page_set_used(ppnum_t pnum, bool tagged)
{
	vm_page_t tag_page;
	cell_t *cell = cell_from_covered_ppnum(pnum, &tag_page);
	int bit = pnum % MTE_PAGES_PER_TAG_PAGE;

	assert(cell->mte_page_count + tagged <= MTE_PAGES_PER_TAG_PAGE);
	assert(bit_test(cell->free_mask, bit));

	VM_COUNTER_DEC(&vm_page_free_count);
	if (cell->state == MTE_STATE_ACTIVE) {
		VM_COUNTER_DEC(&vm_page_free_taggable_count);
	}
	if (tagged) {
		VM_COUNTER_INC(&vm_page_tagged_count);
	}

	/*
	 * cleared_bit is true: the page being allocated may be the one
	 * representing the cell on its free queue, so force a requeue
	 * even if the free queue index did not change.
	 */
	CELL_UPDATE(cell, tag_page, true, {
		bit_clear(cell->free_mask, bit);
		cell->mte_page_count += tagged;
	});
}
1520
/*!
 * @abstract
 * Account a covered page stolen during startup as tagged.
 */
__startup_func
void
mteinfo_covered_page_set_stolen_tagged(ppnum_t pnum)
{
	vm_page_t tag_page;
	cell_t *cell = cell_from_covered_ppnum(pnum, &tag_page);

	assert(cell->mte_page_count < MTE_PAGES_PER_TAG_PAGE);
	/* The page must not be on the cell's free mask. */
	assert(!bit_test(cell->free_mask, pnum % MTE_PAGES_PER_TAG_PAGE));

	CELL_UPDATE(cell, tag_page, false, {
		cell->mte_page_count++;
	});
}
1535
/*!
 * @abstract
 * Drop the tagged accounting for the (in-use) covered page @c pnum.
 */
void
mteinfo_covered_page_clear_tagged(ppnum_t pnum)
{
	vm_page_t tag_page;
	cell_t *cell = cell_from_covered_ppnum(pnum, &tag_page);

	assert(cell->mte_page_count > 0);
	/* The page must be in use (not on the cell's free mask). */
	assert(!bit_test(cell->free_mask, pnum % MTE_PAGES_PER_TAG_PAGE));

	CELL_UPDATE(cell, tag_page, false, {
		cell->mte_page_count--;
	});
}
1549
1550 #if DEBUG || DEVELOPMENT
1551 vm_page_t
mteinfo_tag_page_from_covered_page(ppnum_t pnum,vm_offset_t * offset_to_tag_data)1552 mteinfo_tag_page_from_covered_page(ppnum_t pnum, vm_offset_t * offset_to_tag_data)
1553 {
1554 cell_idx_t cidx;
1555 cell_t *cell;
1556
1557 if (!mteinfo_covered_page_taggable(pnum)) {
1558 return NULL;
1559 }
1560
1561 cidx = (pnum - pmap_first_pnum) / MTE_PAGES_PER_TAG_PAGE;
1562 cell = cell_from_idx(cidx);
1563
1564 vm_page_t tag_page = vm_tag_storage_page_get(cidx);
1565 assert(vm_page_in_tag_storage_array(tag_page));
1566
1567 *offset_to_tag_data =
1568 (PAGE_SIZE / MTE_PAGES_PER_TAG_PAGE) * /* size of tag data */
1569 ((pnum - pmap_first_pnum) % MTE_PAGES_PER_TAG_PAGE); /* index within cell */
1570
1571 return tag_page;
1572 }
1573 #endif /* DEBUG || DEVELOPMENT */
1574
1575 #pragma mark Activate
1576 #ifndef VM_MTE_FF_VERIFY
1577
1578 /*!
1579 * @function mteinfo_tag_storage_wire_locked()
1580 *
1581 * @abstract
1582 * Wire the given tag storage page.
1583 *
1584 * @discussion
1585 * The page will be wired as part of mte_tags_object.
1586 *
1587 * This must be called with the object lock and the page queues lock held.
1588 *
1589 * @param tag_page
1590 * A tag storage page.
1591 */
static void
mteinfo_tag_storage_wire_locked(vm_page_t tag_page)
{
	/* The page is inserted at the offset of its physical address. */
	vm_object_offset_t page_addr = ptoa(VM_PAGE_GET_PHYS_PAGE(tag_page));

	assert(tag_page->vmp_wire_count == 0);
	vm_page_wire(tag_page, VM_KERN_MEMORY_MTAG,
	    /* Don't check memory status. */ FALSE);

	vm_page_insert_internal(tag_page, mte_tags_object, page_addr,
	    VM_KERN_MEMORY_MTAG,
	    /* We already hold the queue locks. */ TRUE,
	    /* Add this page to the hash. */ TRUE,
	    /* Don't bother batching pmap operations. */ FALSE,
	    /* Don't bother batching accounting. */ FALSE,
	    /* Don't bother with delayed ledger updates. */ NULL);
}
1609
1610 /*!
1611 * @function mteinfo_tag_storage_select_activating()
1612 *
1613 * @abstract
1614 * Select tag storage pages to activate toward a certain number of free covered
1615 * pages to make taggable.
1616 *
1617 * @discussion
1618 * The caller must make sure there's at least one page to activate for the
1619 * selected buckets.
1620 *
1621 * @param target how many covered taggable free pages to try to generate
1622 * as a result of this activation.
1623 * @param bucket which inactive bucket to start drawing from
1624 *
1625 * @returns the list of tag storage pages to activate
1626 * with mteinfo_tag_storage_activate_locked().
1627 */
static vm_page_list_t
mteinfo_tag_storage_select_activating(uint32_t target, mte_cell_bucket_t bucket)
{
	vm_page_list_t list = { };
	vm_page_t tag_page = VM_PAGE_NULL;
	cell_t *cell = NULL;
	uint32_t total = 0;
	uint32_t covered = 0;

	/*
	 * Convert the lock hold into a mutex, to signal to waiters that the
	 * lock may be held for longer.
	 */
	vm_free_page_lock_convert();

	do {
		/* Draw from the highest inactive bucket first. */
		cell = cell_list_find_last_page(MTE_LIST_INACTIVE_IDX,
		    bucket, &tag_page);
		if (tag_page == VM_PAGE_NULL) {
			/* No inactive cell left at or above @c bucket. */
			break;
		}

		assert_cell_state(cell, /* [A.1] */ MTE_MASK_INACTIVE);
		CELL_UPDATE(cell, tag_page, false, {
			cell->state = MTE_STATE_ACTIVATING;
		});

		/* Count the free covered pages this activation will yield. */
		covered = cell_free_page_count(*cell);
		total += covered;

		KDBG(VMDBG_CODE(DBG_VM_TAG_PAGE_INACTIVE) | DBG_FUNC_NONE,
		    VM_KERNEL_ADDRHIDE(tag_page), covered);

		/* Move the tag page onto the caller's local list. */
		tag_page->vmp_q_state = VM_PAGE_NOT_ON_Q;
		vm_page_list_push(&list, tag_page);
	} while (total < target);

	return list;
}
1667
1668 /*!
1669 * @function mteinfo_tag_storage_activate_locked()
1670 *
1671 * @abstract
1672 * Activate a list of tag storage pages in reclaiming or activating state.
1673 *
1674 * @discussion
1675 * The page free queue lock must be held, however it is dropped and retaken by
1676 * this function.
1677 *
1678 * @param list the list of pages to activate.
1679 * @param spin_mode whether to take the free page queue lock in spin mode.
1680 *
1681 * @returns how many covered pages have been made taggable.
1682 */
static uint32_t
mteinfo_tag_storage_activate_locked(vm_page_list_t list, bool spin_mode)
{
	vm_page_t tag_page = VM_PAGE_NULL;
	uint32_t result, total;

	vm_free_page_unlock();

	/*
	 * First, retype the pages and add them to the MTE object.
	 */

	vm_page_list_foreach(tag_page, list) {
		ppnum_t tag_pnum = VM_PAGE_GET_PHYS_PAGE(tag_page);

		assert(vm_page_is_tag_storage_pnum(tag_page, tag_pnum));
		pmap_make_tag_storage_page(tag_pnum);
	}

	vm_object_lock(mte_tags_object);
	vm_page_lock_queues();
	vm_page_list_foreach(tag_page, list) {
		/*
		 * The wiring path expects the page linkage to be NULL,
		 * so transiently clear it and restore it afterwards
		 * (same trick as mteinfo_tag_storage_drain_flush()).
		 */
		vm_page_t save_snext = NEXT_PAGE(tag_page);

		NEXT_PAGE(tag_page) = VM_PAGE_NULL;
		mteinfo_tag_storage_wire_locked(tag_page);
		NEXT_PAGE(tag_page) = save_snext;
	}
	vm_page_unlock_queues();
	vm_object_unlock(mte_tags_object);

	if (spin_mode) {
		vm_free_page_lock_spin();
	} else {
		vm_free_page_lock();
	}

	/*
	 * Second, mark all the pages as active now, which makes the
	 * covered pages available for taggable allocation.
	 *
	 * And recompute how many taggable pages we really freed,
	 * as allocations/free of untagged pages could have made
	 * progress while we dropped the free page queue lock.
	 */

	total = 0;
	vm_page_list_foreach_consume(tag_page, &list) {
		total += mteinfo_tag_storage_set_active(tag_page, 0, false);
	}
	result = total;


	/*
	 * Last perform wakeups.
	 *
	 * 1. wake up other activators
	 * 2. wake up privileged waiters
	 * 3. wake up regular waiters
	 *
	 * We do not need to consider secluded pools, or other waiters because
	 * we never prevent them from allocating the pages associated with
	 * the tag storage we are activating during this process. Which is why
	 * we don't use vm_page_free_queue_handle_wakeups_and_unlock() but
	 * instead have this simplified implementation.
	 */

	if (vm_mte_activator_waiters) {
		vm_mte_activator_waiters = false;
		wakeup_all_with_inheritor(&vm_mte_activator_waiters,
		    THREAD_AWAKENED);
	}

	/* Privileged waiters consume the freed pages first. */
	if (vm_page_free_wanted_tagged_privileged && total) {
		if (total < vm_page_free_wanted_tagged_privileged) {
			vm_page_free_wanted_tagged_privileged -= total;
			total = 0;
		} else {
			total -= vm_page_free_wanted_tagged_privileged;
			vm_page_free_wanted_tagged_privileged = 0;
		}
		vm_page_free_wakeup(&vm_page_free_wanted_tagged_privileged,
		    UINT32_MAX);
	}

	/* Regular waiters take whatever is left. */
	if (vm_page_free_wanted_tagged && total) {
		uint32_t wakeup = 0;

		if (total < vm_page_free_wanted_tagged) {
			wakeup = total;
			vm_page_free_wanted_tagged -= total;
			total = 0;
		} else {
			total -= vm_page_free_wanted_tagged;
			vm_page_free_wanted_tagged = 0;
			wakeup = UINT32_MAX;
		}
		vm_page_free_wakeup(&vm_page_free_wanted_tagged, wakeup);
	}

	return result;
}
1785
/*!
 * @abstract
 * Try to activate enough tag storage to make @c target covered pages
 * taggable, drawing only from well-filled inactive buckets.
 *
 * @param target How many covered taggable free pages to aim for.
 * @param spin_mode Whether the free page lock is held in spin mode.
 * @returns Whether activation happened (or was waited upon).
 */
bool
mteinfo_tag_storage_try_activate(uint32_t target, bool spin_mode)
{
	mte_cell_bucket_t first_bucket = MTE_BUCKET_17_24;
	thread_t thread_self = current_thread();
	vm_page_list_t list = { };

	/*
	 * We only draw from buckets covering more than half of the pages free.
	 * We do not want to do buckets that are less full, as this is too slow
	 * for the inline path and will rely on the refill thread instead.
	 */

	if (mte_info_lists[MTE_LIST_INACTIVE_IDX].mask < BIT(first_bucket)) {
		return false;
	}

	if (vm_mte_activator) {
		/*
		 * We only allow one thread activating pages at a time;
		 * only wait if the caller can't make progress without
		 * the activation though.
		 *
		 * We do not need to consider whether the waiter is privileged
		 * for the wait however, because activation isn't affected
		 * by TH_OPT_VMPRIV.
		 */

		if (vm_page_free_taggable_count > vm_page_free_reserved) {
			return false;
		}
		if (vm_page_free_taggable_count > 0 &&
		    (thread_self->options & TH_OPT_VMPRIV)) {
			return false;
		}

		vm_mte_activator_waiters = true;
		lck_mtx_sleep_with_inheritor(&vm_page_queue_free_lock,
		    spin_mode ? LCK_SLEEP_SPIN : LCK_SLEEP_DEFAULT,
		    &vm_mte_activator_waiters, vm_mte_activator,
		    THREAD_UNINT, TIMEOUT_WAIT_FOREVER);

		return true;
	}

	/* Become the single activator, do the work, then step down. */
	vm_mte_activator = thread_self;
	list = mteinfo_tag_storage_select_activating(target, first_bucket);
	mteinfo_tag_storage_activate_locked(list, spin_mode);
	vm_mte_activator = THREAD_NULL;

	return true;
}
1838
1839
1840 #pragma mark Deactivate
1841
1842 /*!
1843 * @abstract
1844 * Returns whether the active(0.0) bucket should be drained to make inactive
1845 * pages.
1846 *
1847 * @param for_wakeup Whether the question is to wakeup the refill thread
1848 * (true) or decide whether the refill thread should keep
1849 * going (false).
1850 */
1851 static bool
mteinfo_tag_storage_should_drain(bool for_wakeup)1852 mteinfo_tag_storage_should_drain(bool for_wakeup)
1853 {
1854 mte_cell_list_t active_0 = &mte_info_lists[MTE_LIST_ACTIVE_0_IDX];
1855 uint32_t threshold = VMP_FREE_BATCH_SIZE * (for_wakeup ? 2 : 1);
1856
1857 if (!vm_mte_enable_tag_storage_grab) {
1858 return false;
1859 }
1860
1861 if (mte_claimable_queue.vmpfq_count >= vm_free_magazine_refill_limit) {
1862 return false;
1863 }
1864
1865 if (active_0->count <= vm_page_tag_storage_reserved) {
1866 return false;
1867 }
1868
1869 return cell_queue_count(&active_0->buckets[0]) >= threshold;
1870 }
1871
1872 /*
1873 * @function mteinfo_tag_storage_deactivate_barrier()
1874 *
1875 * @abstract
1876 * Wait until all possible untagging operations that could make deactivation
1877 * invalid have finished.
1878 *
1879 * @discussion
1880 * Before we can do any deactivation we must make sure
1881 * that no CPU has untagging activity in flight.
1882 *
1883 * See mteinfo_free_queue_grab() and mteinfo_page_list_fix_tagging().
1884 */
static void
mteinfo_tag_storage_deactivate_barrier(void)
{
	mte_pcpu_t this_cpu = PERCPU_GET(mte_pcpu);

	/* Preemption must be disabled to walk per-cpu state safely. */
	assert(get_preemption_level() > 0);

	percpu_foreach(it, mte_pcpu) {
		if (it == this_cpu) {
			/*
			 * A thread is allowed to both have pending untagging
			 * going on and a page to deactivate.
			 *
			 * As a result, ignore the current core's suspension
			 * state as it is harmless as long as the core commits
			 * to untagging before it does its deactivations.
			 *
			 * If a thread fails to do that, this will reliably
			 * panic in SPTM, so the risk of silent bugs is rather
			 * unlikely.
			 */
			continue;
		}

		/* Spin until the remote core's suspension ends. */
		if (os_atomic_load(&it->deactivate_suspend, relaxed)) {
			hw_wait_while_equals32(&it->deactivate_suspend, 1);
		}
	}
	/* Order the per-cpu checks before the deactivation that follows. */
	os_atomic_thread_fence(seq_cst);
}
1915
1916 /*!
1917 * @abstract
1918 * Flush a list of deactivating page storage.
1919 *
1920 * @discussion
1921 * The page free queue lock must be held, but will be dropped while this
1922 * function operates.
1923 *
1924 * @param list The list of pages in @c MTE_STATE_DEACTIVATING state.
1925 */
static void
mteinfo_tag_storage_drain_flush(vm_page_list_t list)
{
	vm_page_t tag_page = VM_PAGE_NULL;

	/*
	 * Wait for any in-flight untagging on other cores that could
	 * make deactivating these pages invalid.
	 */
	mteinfo_tag_storage_deactivate_barrier();

	/* Drop the free page queue lock across the unwire/remove work. */
	vm_free_page_unlock();

	vm_object_lock(mte_tags_object);
	vm_page_lock_queues();

	vm_page_list_foreach(tag_page, list) {
		vm_page_t save_next = NEXT_PAGE(tag_page);


		/*
		 * The unwiring path expects the page linkage to be
		 * NULL, so transiently make it NULL. We'll restore
		 * the linkage after the unwire is done.
		 */

		NEXT_PAGE(tag_page) = VM_PAGE_NULL;
		vm_page_unwire(tag_page,
		    /* Don't put the page into aging queues. */ FALSE);
		vm_page_remove(tag_page,
		    /* Remove the page from the hash. */ TRUE);
		NEXT_PAGE(tag_page) = save_next;
	}

	vm_page_unlock_queues();
	vm_object_unlock(mte_tags_object);

	/* Hand the pages back to the pmap so they stop being tag storage. */
	vm_page_list_foreach(tag_page, list) {
		pmap_unmake_tag_storage_page(VM_PAGE_GET_PHYS_PAGE(tag_page));
	}

	/* Reacquire the free page queue lock before returning the pages. */
	vm_free_page_lock_spin();

	vm_page_tag_storage_deactivation_count += list.vmpl_count;

	vm_page_list_foreach_consume(tag_page, &list) {
		vm_page_free_queue_enter(VM_MEMORY_CLASS_TAG_STORAGE,
		    tag_page, VM_PAGE_GET_PHYS_PAGE(tag_page));
	}
}
1972
1973 /*!
1974 * @function mteinfo_tag_storage_drain()
1975 *
1976 * @abstract
1977 * Attempt to drain the active(0.0) bucket of pages since these are always
1978 * wasted.
1979 *
1980 * @discussion
1981 * This is one of the core routines of the fill thread.
1982 *
1983 * @returns
1984 * How many tag storage pages were deactivated.
1985 */
static uint32_t
mteinfo_tag_storage_drain(void)
{
	mte_cell_list_t active_0 = &mte_info_lists[MTE_LIST_ACTIVE_0_IDX];
	mte_cell_queue_t bucket_0 = &active_0->buckets[0];
	vm_page_t tag_page = VM_PAGE_NULL;
	cell_t *cell = NULL;
	uint32_t total = 0;
	vm_page_list_t list = { };  /* current batch of deactivating pages */

	LCK_MTX_ASSERT_OWNED_SPIN(&vm_page_queue_free_lock);

	while (mteinfo_tag_storage_should_drain(false)) {
		/* Always drain from the head of the active(0.0) bucket. */
		tag_page = vm_tag_storage_page_get(cell_queue_first_idx(bucket_0));
		cell = cell_queue_first(bucket_0);

		/* Pages in this bucket have no free covered pages left. */
		assert(cell->free_mask == 0);
		assert_cell_state(cell, /* [D.1] */ MTE_MASK_ACTIVE);
		CELL_UPDATE(cell, tag_page, false, {
			cell->state = MTE_STATE_DEACTIVATING;
		});

		vm_page_list_push(&list, tag_page);

		/*
		 * Flush in batches; note that the flush drops and
		 * reacquires the free page queue lock.
		 */
		if (list.vmpl_count >= VMP_FREE_BATCH_SIZE) {
			total += list.vmpl_count;
			mteinfo_tag_storage_drain_flush(list);
			list = (vm_page_list_t){ };
		}
	}

	/* Flush any partial final batch. */
	if (list.vmpl_count) {
		total += list.vmpl_count;
		mteinfo_tag_storage_drain_flush(list);
	}

	return total;
}
2024
2025
2026 #pragma mark Reclaim
2027
2028 /*!
2029 * @abstract
2030 * Attempt to steal a tag page from a per cpu claimed free queue.
2031 *
2032 * @discussion
2033 * The caller must have checked that the tag_page is on a local free queue,
2034 * even if this check is racy.
2035 *
2036 * @param tag_page A tag storage page appearing to sit on a per cpu queue.
2037 *
2038 * @returns Whether stealing was successful (true) or not (false).
2039 */
static bool
mteinfo_reclaim_tag_storage_page_try_pcpu(vm_page_t tag_page)
{
	mte_pcpu_t mte_pcpu;
	uint16_t cpu;

	/*
	 * Racy read: the page can leave that CPU's queue at any time
	 * until we hold the corresponding free_claimed_lock below.
	 */
	cpu = os_atomic_load(&tag_page->vmp_local_id, relaxed);
	mte_pcpu = PERCPU_GET_WITH_BASE(other_percpu_base(cpu), mte_pcpu);

	lck_ticket_lock(&mte_pcpu->free_claimed_lock, &vm_page_lck_grp_bucket);

	/*
	 * Re-validate under the lock: the page must still be on the
	 * local free queue of the CPU we resolved above.
	 */
	if (tag_page->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q &&
	    tag_page->vmp_local_id == cpu) {
		vm_page_queue_remove(&mte_pcpu->free_claimed_pages,
		    tag_page, vmp_pageq);
		tag_page->vmp_q_state = VM_PAGE_NOT_ON_Q;
		tag_page->vmp_local_id = 0;
		counter_dec_preemption_disabled(&vm_cpu_free_claimed_count);
	} else {
		/* Lost the race; use NULL as the failure sentinel. */
		tag_page = VM_PAGE_NULL;
	}

	lck_ticket_unlock(&mte_pcpu->free_claimed_lock);

	return tag_page != VM_PAGE_NULL;
}
2066
2067 /*!
2068 * @function mteinfo_reclaim_tag_storage_page()
2069 *
2070 * @abstract
2071 * Attempt to reclaim a claimed tag storage page.
2072 *
2073 * @discussion
2074 * This will try to reclaim a tag storage page by relocating its contents to a
2075 * different page, so that the tag storage page becomes (effectively) free.
2076 *
2077 * This expects a claimed tag storage page, and on success, will finish with
2078 * the page in the reclaimed state. On failure, no guarantees are made about
2079 * the state of the page (due to locking operations); the page could still be
2080 * claimed, or reclamation may have failed because the page became free in the
2081 * interim. However, if the page was not in a relocatable state, this function
2082 * will not force it out of the reclaiming state, so that the client can choose
2083 * when and why the page is returned to claimed.
2084 *
2085 * This function is called with the free page queue lock in spin mode and
2086 * returns with it held in spin mode.
2087 *
2088 * @param tag_page
2089 * The claimed tag storage page to try reclaiming.
2090 *
2091 * @returns
2092 * - KERN_SUCCESS success,
2093 *
2094 * - KERN_INVALID_OBJECT the page has no object set
2095 *
2096 * - KERN_NOT_WAITING the state of the cell/tag page changed
2097 * during evaluation.
2098 *
2099 * - KERN_ABORTED the tag page was wired. reclaiming it was
2100 * aborted and it was marked as MTE_STATE_PINNED.
2101 *
2102 * - KERN_RESOURCE_SHORTAGE from vm_page_relocate(): relocation failed due
2103 * to being out of replacement memory.
2104 *
2105 * - KERN_FAILURE from vm_page_relocate(): relocation failed due
2106 * to the page not being currently relocatable.
2107 */
static kern_return_t
mteinfo_reclaim_tag_storage_page(vm_page_t tag_page)
{
	cell_t *cell = cell_from_tag_storage_page(tag_page);
	kern_return_t kr = KERN_FAILURE;
	vm_object_t object;
	bool compressor_locked = false;
	bool vm_object_trylock_failed = false;

	/* We need to try and reclaim the tag storage page. */
	mteinfo_tag_storage_set_reclaiming(cell, tag_page);

	/*
	 * Fast path: if the page appears to be on a per-cpu claimed free
	 * queue, try to steal it without dropping the free page queue lock.
	 */
	if (tag_page->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q &&
	    mteinfo_reclaim_tag_storage_page_try_pcpu(tag_page)) {
		vm_page_tag_storage_reclaim_from_cpu_count++;
		vm_page_tag_storage_reclaim_success_count++;

		KDBG(VMDBG_CODE(DBG_VM_TAG_PAGE_CLAIMED) | DBG_FUNC_NONE,
		    VM_KERNEL_ADDRHIDE(tag_page),
		    mteinfo_tag_storage_free_pages_for_covered(tag_page));

		return KERN_SUCCESS;
	}

	vm_free_page_unlock();

	/*
	 * Snoop the vmp_q_state. If the page is currently used by the compressor
	 * (VM_PAGE_USED_BY_COMPRESSOR), we'll grab the global compressor lock
	 * for write (PAGE_REPLACEMENT_ALLOWED(TRUE)) and the compressor
	 * object lock.
	 *
	 * Typically, we can't know that the object will be stable
	 * without grabbing the object or page queues lock (see the comment on
	 * "relocation lock dance" below), but we know that the compressor object
	 * is stable. So, we do _not_ need to grab the page queues and object locks
	 * in the wrong order. This ensures that we will wait our turn in case
	 * someone else is using the compressor object lock, and there is no chance
	 * the reclaim will fail because we can't acquire the right locks.
	 *
	 * The contiguous memory allocator grabs this lock before the page queues
	 * and object lock, so we must do the same here.
	 */
	if (tag_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
		assert(vm_mte_tag_storage_for_compressor);
		PAGE_REPLACEMENT_ALLOWED(TRUE);
		vm_object_lock(compressor_object);
		compressor_locked = true;

		/*
		 * The page state transitions into and out of VM_PAGE_USED_BY_COMPRESSOR
		 * happen under the compressor object, so now the page state is stable.
		 */
		if (tag_page->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR) {
			/*
			 * The page was removed from the compressor pool. It could be
			 * in any state now, but it's probably free and unusable. Give up.
			 */
			vm_object_unlock(compressor_object);
			PAGE_REPLACEMENT_ALLOWED(FALSE);
			compressor_locked = false;
			vm_free_page_lock_spin();
			kr = KERN_FAILURE;
			goto locks_acquired;
		}
	}

	/*
	 * Do the relocation lock dance. This is a little odd; because we're
	 * starting with a page, and trying to look up the object, we need the
	 * queues lock to keep the object from being deallocated or changed.
	 *
	 * This means we need to get the object lock after the queues lock;
	 * this inverts the lock ordering, so we can only TRY the object lock.
	 */
	vm_page_lock_queues();

	object = VM_PAGE_OBJECT(tag_page);
	if (compressor_locked) {
		assert(object == compressor_object);
	}

	if (object == VM_OBJECT_NULL) {
		/* [PH] XXX: Can this even happen? */
		kr = KERN_INVALID_OBJECT;
		goto release_locks;
	} else if (!compressor_locked && !vm_object_lock_try_scan(object)) {
		/*
		 * hopefully the next time we drain reclaiming pages taking
		 * that object lock will work.
		 */
		vm_object_trylock_failed = true;
		kr = KERN_NOT_WAITING;
		goto release_locks;
	} else if (VM_PAGE_OBJECT(tag_page) != object) {
		/*
		 * vm_page_insert_internal() doesn't require the page queue lock
		 * to be held if the page is wired, so the object could change
		 * under us.
		 */
		vm_object_unlock(object);

		kr = KERN_NOT_WAITING;
		goto release_locks;
	}

	/*
	 * Now that all the locking is out of the way,
	 * see if the page is actually relocatable.
	 */
	if (VM_PAGE_WIRED(tag_page) ||
	    (tag_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR && tag_page->vmp_busy)) {
		/*
		 * TODO: Relocation fails when one of these conditions is met:
		 *
		 *     VM_PAGE_WIRED(tag_page)
		 *     tag_page->vmp_gobbled
		 *     tag_page->vmp_laundry
		 *     tag_page->vmp_wanted
		 *     tag_page->vmp_cleaning
		 *     tag_page->vmp_overwriting
		 *     tag_page->vmp_free_when_done
		 *     tag_page->vmp_busy
		 *     tag_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q
		 *
		 * We only handle VM_PAGE_WIRED() and when the tag page is being
		 * swapped out (from usage in the compressor pool) for now,
		 * because these are the most likely, but we should use vmp_ts_wanted
		 * for all cases.
		 *
		 * We would need to find all places in the kernel that alter
		 * this condition, to notice that a relocation was attempted
		 * (vmp_ts_wanted is set) and call mteinfo_tag_storage_wakeup().
		 */

		/*
		 * Take the page free lock before setting vmp_ts_wanted,
		 * before we drop the object lock, otherwise
		 * mteinfo_tag_storage_wakeup() might see vmp_ts_wanted
		 * before the transition to MTE_STATE_PINNED has happened.
		 *
		 * Note that we should do nothing if the cell is no longer in
		 * the MTE_STATE_RECLAIMING state, which could hypothetically
		 * happen since we dropped the free queue lock above.
		 */
		vm_free_page_lock_spin();

		if (cell->state == MTE_STATE_RECLAIMING) {
			assert(tag_page->vmp_ts_wanted == false);
			tag_page->vmp_ts_wanted = true;
			kr = KERN_ABORTED;
		} else {
			kr = KERN_NOT_WAITING;
		}

		vm_object_unlock(object);
		vm_page_unlock_queues();
		if (compressor_locked) {
			PAGE_REPLACEMENT_ALLOWED(FALSE);
			compressor_locked = false;
		}

		/* KERN_ABORTED: pin the page rather than keep retrying it. */
		if (kr == KERN_ABORTED) {
			assert_cell_state(cell, /* [B.1] */ MTE_MASK_RECLAIMING);
			CELL_UPDATE(cell, tag_page, false, {
				cell->state = MTE_STATE_PINNED;
			});
			if (tag_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
				vm_page_tag_storage_reclaim_compressor_failure_count++;
			} else {
				vm_page_tag_storage_reclaim_wired_failure_count++;
			}
		}

		goto locks_acquired;
	} else if ((*vm_mte_tag_storage_for_vm_tags) &&
	    !vm_page_is_relocatable(tag_page, VM_RELOCATE_REASON_TAG_STORAGE_RECLAIM)) {
		/*
		 * If we're allowing tag storage pages to be used for specific VM tags,
		 * those pages could be unrelocatable for reasons we haven't
		 * expected. We're also assuming that if a tag storage page were to
		 * be unrelocatable for whatever reason, it's (at the very least) not
		 * because the page is wired or involved in an IO that could take a
		 * long time, so hopefully it won't be unavailable for too long, and
		 * the fill thread won't churn over the same set of unavailable claimed
		 * pages.
		 *
		 * We'll just skip over this page and move it back to claiming at the
		 * bottom of this function.
		 */
		kr = KERN_NOT_WAITING;
		vm_object_unlock(object);
	} else {
		/* Evacuate the page's contents to a replacement page. */
		kr = vm_page_relocate(tag_page, NULL,
		    VM_RELOCATE_REASON_TAG_STORAGE_RECLAIM, NULL);
		vm_object_unlock(object);

		assert(kr != KERN_ABORTED);
	}

release_locks:
	if (compressor_locked) {
		PAGE_REPLACEMENT_ALLOWED(FALSE);
	}
	vm_page_unlock_queues();
	if (vm_object_trylock_failed && vm_object_lock_avoid(object)) {
		/*
		 * We failed to lock the VM object, and pageout_scan
		 * wants this object. Back off for a little bit.
		 *
		 * Note that the VM object may no longer be valid after releasing
		 * the VM object lock, but `vm_object_lock_avoid` only compares
		 * pointers and doesn't dereference them, so it's fine.
		 */
		mutex_pause(2);
	}
	vm_free_page_lock_spin();


locks_acquired:
	/*
	 * Assert that all codepaths leading up to this point have the lock
	 * held in spin mode (and therefore, preemption disabled).
	 */
	LCK_MTX_ASSERT_OWNED_SPIN(&vm_page_queue_free_lock);

	if (kr == KERN_SUCCESS) {
		vm_page_tag_storage_reclaim_success_count++;

		/* We relocated the page. Now we can use it. */
		if (cell->state != MTE_STATE_RECLAIMING) {
			/*
			 * The page was manipulated while we were relocating
			 * it. This likely means it was freed and reallocated
			 * between us dropping the free page lock and getting
			 * the queues lock.
			 *
			 * This should be ludicrously rare, and should still
			 * mean that the page is claimed (otherwise relocate
			 * would have failed). Set to reclaiming for client
			 * consistency.
			 *
			 * In the state diagram this corresponds to other
			 * threads having performed [F.2 inline] followed
			 * by [C.1 inline], possibly multiple times.
			 */
			mteinfo_tag_storage_set_reclaiming(cell, tag_page);
		}

		KDBG(VMDBG_CODE(DBG_VM_TAG_PAGE_CLAIMED) | DBG_FUNC_NONE,
		    VM_KERNEL_ADDRHIDE(tag_page),
		    mteinfo_tag_storage_free_pages_for_covered(tag_page));

		assert(tag_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
	} else {
		vm_page_tag_storage_reclaim_failure_count++;

		if (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_NOT_WAITING) {
			/*
			 * If there was no available page to relocate the tag
			 * storage page to, or that some race happened that
			 * changed the page state under our feet, just put the
			 * page back as claimed if it's still reclaiming.
			 *
			 * It will as a result get reconsidered more quickly...
			 * it WAS our best candidate, after all.
			 */
			if (cell->state == MTE_STATE_RECLAIMING) {
				mteinfo_tag_storage_set_claimed(tag_page);
			}
		}
	}

	return kr;
}
2383
2384
2385 #pragma mark Refill Thread
2386
2387 /*!
2388 * @abstract
2389 * Returns whether the refill thread should keep refilling the active pool.
2390 *
2391 * @discussion
2392 * If we're below the free target, and there are no tagged waiters of any kind,
2393 * avoid activating any pages if the untagged pool is not extremely healthy.
2394 */
2395 static inline bool
mteinfo_tag_storage_active_should_refill(void)2396 mteinfo_tag_storage_active_should_refill(void)
2397 {
2398 if (vm_page_free_taggable_count >= vm_page_free_target) {
2399 return false;
2400 }
2401
2402 if (vm_page_free_taggable_count <= vm_page_free_reserved) {
2403 return true;
2404 }
2405
2406 if (vm_page_free_wanted_tagged_privileged || vm_page_free_wanted_tagged) {
2407 return true;
2408 }
2409
2410 /*
2411 * 16/15 is ~1.07: we define "healthy" as at least 7% excess pages
2412 * over the target.
2413 *
2414 * We want some slop because a system under pressure will sometimes go
2415 * above @c vm_page_free_target and we want to avoid thrashing.
2416 */
2417 return vm_page_free_count * 15ull >= vm_page_free_target * 16ull;
2418 }
2419
2420 /*!
2421 * @function mteinfo_tag_storage_active_refill()
2422 *
2423 * @abstract
2424 * Attempt to fill the global free tagged covered page queue.
2425 *
2426 * @discussion
2427 * This is one of the core routines of the fill thread. It will attempt to get
2428 * the global free tagged covered page queue to or above a target value. It
2429 * will also wake threads waiting for more of these pages as appropriate.
2430 *
2431 * This function is called with the free page queue lock held in spin mode
2432 * and returns with it held in spin mode.
2433 *
2434 * @param taggablep How many free taggable pages have been added.
2435 * @returns The number of tag storage pages this function activated.
2436 */
static uint32_t
mteinfo_tag_storage_active_refill(uint32_t *taggablep)
{
	mte_cell_list_t claimed_list = &mte_info_lists[MTE_LIST_CLAIMED_IDX];
	mte_cell_list_t inactive_list = &mte_info_lists[MTE_LIST_INACTIVE_IDX];
	uint32_t taggable = 0;
	uint32_t activated = 0;

	LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);

	while (mteinfo_tag_storage_active_should_refill()) {
		mte_cell_bucket_t i_bucket = 0;
		mte_cell_bucket_t c_bucket = 0;
		vm_page_list_t list = { };
		kern_return_t kr = KERN_SUCCESS;

		/*
		 * Step 1: try to activate or reclaim pages.
		 *
		 * Pick the pool between inactive and claimed that will
		 * make us progress the fastest (picking inactive over
		 * claimed for equivalent buckets, given that reclaiming
		 * is more expensive).
		 *
		 * In particular always pick active buckets over reclaiming
		 * pages if they have more than 50% of the pages free.
		 */

		/* Highest non-empty bucket of each pool (0 if pool is empty). */
		if (inactive_list->mask) {
			i_bucket = fls(inactive_list->mask) - 1;
		} else {
			i_bucket = 0;
		}
		if (claimed_list->mask) {
			c_bucket = fls(claimed_list->mask) - 1;
		} else {
			c_bucket = 0;
		}

		if (i_bucket && i_bucket >= MIN(MTE_BUCKET_17_24, c_bucket)) {
			list = mteinfo_tag_storage_select_activating(VMP_FREE_BATCH_SIZE,
			    MIN(i_bucket, MTE_BUCKET_17_24));
		} else if (c_bucket > MTE_BUCKET_0) {
			mte_cell_queue_t queue = &claimed_list->buckets[c_bucket];
			cell_idx_t idx = cell_queue_first_idx(queue);
			vm_page_t page = vm_tag_storage_page_get(idx);

			kr = mteinfo_reclaim_tag_storage_page(page);
			if (kr == KERN_SUCCESS) {
				list = vm_page_list_for_page(page);
			}
		} else {
			/*
			 * There is no progress we can do here because we do not
			 * have good candidates to activate or reclaim.
			 *
			 * As a result, even if the system has free untaggable
			 * pages, they can't be converted to taggable either
			 * because they're permanently untaggable, or because
			 * their associated tag storage can't be reclaimed.
			 *
			 * Waiting in VM_PAGE_WAIT() below sounds appealing
			 * but will result in busy loops, so we should just
			 * go park and wait until some page free is saving us
			 * via the "wakeup_refill_thread" cases in
			 * @c vm_page_free_queue_handle_wakeups_and_unlock().
			 */
			break;
		}

		if (kr == KERN_SUCCESS) {
			activated += list.vmpl_count;
			taggable += mteinfo_tag_storage_activate_locked(list,
			    /* spin-mode */ true);
			continue;
		}

		/*
		 * Step 2: wait if needed
		 *
		 * KERN_RESOURCE_SHORTAGE means that we were out of pages
		 * to relocate or tag storage candidates.
		 *
		 * Other errors are relocation failures and we can just
		 * retry immediately.
		 */

		if (kr == KERN_RESOURCE_SHORTAGE) {
			/*
			 * There was no good candidate tag storage page. Wait
			 * on the VM to make new pages available.
			 *
			 * TODO: This isn't a great solution; the VM doesn't
			 * understand what we are actually waiting on. This
			 * should converge eventually due to VM activity... but
			 * the bigger picture fix is to make all free pages
			 * eligible for MTE. Then our only significant concern
			 * around tag storage pages will be tag storage pages
			 * with ECC errors, which should be a small number.
			 */
			vm_free_page_unlock();
			current_thread()->page_wait_class = VM_MEMORY_CLASS_REGULAR;
			VM_PAGE_WAIT();
			vm_free_page_lock_spin();

			/*
			 * We waited above, the system conditions changed,
			 * flush our reclaiming queue.
			 */
			mteinfo_tag_storage_flush_reclaiming();
		}
	}

	/* Leave no pages parked in the reclaiming state on exit. */
	mteinfo_tag_storage_flush_reclaiming();

	*taggablep += taggable;
	return activated;
}
2555
2556 /*!
2557 * @function mteinfo_fill_continue()
2558 *
2559 * @abstract
2560 * Continuation for the MTE fill thread.
2561 *
2562 * @discussion
2563 * The MTE fill thread manages the global free queue of covered tagged pages,
2564 * and moving tag storage pages between the active and inactive states.
2565 *
2566 * @param param
2567 * Unused.
2568 *
2569 * @param wr
2570 * Unused.
2571 */
__dead2
static void
mteinfo_fill_continue(void *param __unused, wait_result_t wr __unused)
{
#if CONFIG_THREAD_GROUPS
	static bool _fill_thread_self_inited;

	/* One-time setup: join the VM thread group on first run. */
	if (!_fill_thread_self_inited) {
		thread_group_vm_add();
		_fill_thread_self_inited = true;
	}
#endif /* CONFIG_THREAD_GROUPS */

	(void)sched_cond_ack(&fill_thread_cond);
	vm_mte_refill_thread_wakeups++;

	for (;;) {
		uint32_t added = 0;
		uint32_t activated = 0;
		uint32_t deactivated = 0;

		VM_DEBUG_CONSTANT_EVENT(, DBG_VM_REFILL_MTE, DBG_FUNC_START,
		    0, 0, 0, 0);

		/*
		 * NB: We take the free queue lock in spin mode here because there are
		 * a number of operations that occur during active_refill and drain
		 * that requires preemption to be disabled. For example:
		 * - in active_refill: if the fill thread tries to reclaim a tag
		 *   storage page, it first tries to steal a free tag storage page
		 *   from the local free queue.
		 * - in drain: when flushing the queue of deactivating tag storage
		 *   pages, the fill thread waits for all cores to finish any untagging
		 *   before proceeding. See mteinfo_tag_storage_deactivate_barrier().
		 *
		 * Coupling enabling/disabling preemption with acquiring/releasing the
		 * free queue lock is easier than managing preemption by hand, so all
		 * instances of free queue lock acquisition must be done in spin mode.
		 */
		vm_free_page_lock_spin();

		activated += mteinfo_tag_storage_active_refill(&added);
		deactivated += mteinfo_tag_storage_drain();

		vm_free_page_unlock();

		VM_DEBUG_CONSTANT_EVENT(, DBG_VM_REFILL_MTE, DBG_FUNC_END,
		    added, activated, deactivated, 0);

		/* Park until the next wakeup; resumes at mteinfo_fill_continue. */
		sched_cond_wait_parameter(&fill_thread_cond, THREAD_UNINT,
		    mteinfo_fill_continue, NULL);
	}
}
2625
2626 void
mteinfo_wake_fill_thread(void)2627 mteinfo_wake_fill_thread(void)
2628 {
2629 if (is_mte_enabled) {
2630 sched_cond_signal(&fill_thread_cond, vm_mte_fill_thread);
2631 }
2632 }
2633
2634
2635 #pragma mark Alloc
2636
2637 /*!
2638 * @abstract
2639 * Returns whether @c mteinfo_free_queue_grab() should refill the per-cpu
2640 * claimable queue.
2641 *
2642 * @discussion
2643 * The policy is to refill if the queue is empty and that the claimable
2644 * queue has a full batch of @c VMP_FREE_BATCH_SIZE free pages.
2645 *
2646 * This is chosen so that the taking of the spinlock it implies is amortized
2647 * well and reduce thrashing.
2648 *
2649 * The function must be called with preemption disabled.
2650 *
2651 * @param mte_pcpu The current CPU's mte_pcpu_t data structure.
2652 */
2653 static bool
mteinfo_tag_storage_claimable_should_refill(mte_pcpu_t mte_pcpu)2654 mteinfo_tag_storage_claimable_should_refill(mte_pcpu_t mte_pcpu)
2655 {
2656 if (__improbable(!vm_mte_enable_tag_storage_grab)) {
2657 return false;
2658 }
2659
2660 if (!vm_page_queue_empty(&mte_pcpu->free_claimed_pages)) {
2661 return false;
2662 }
2663
2664 return mte_claimable_queue.vmpfq_count >= VMP_FREE_BATCH_SIZE;
2665 }
2666
2667 /*!
2668 * @abstract
2669 * Refill the current CPU's claimed free queue.
2670 *
2671 * @discussion
2672 * This is done opportunistically by @c mteinfo_free_queue_grab()
2673 * When it notices that it should refill the claimable queue
2674 * (see @mteinfo_tag_storage_claimable_should_refill()).
2675 *
2676 * The function must be called with preemption disabled.
2677 *
2678 * @param mte_pcpu The current CPU's mte_pcpu_t data structure.
2679 * @param target The number of tag storage pages to grab.
2680 * @param colorp A pointer to the current color selector.
2681 */
2682 static void
mteinfo_tag_storage_claimable_refill(mte_pcpu_t mte_pcpu,uint32_t target,uint32_t * colorp)2683 mteinfo_tag_storage_claimable_refill(
2684 mte_pcpu_t mte_pcpu,
2685 uint32_t target,
2686 uint32_t *colorp)
2687 {
2688 const int cpu = cpu_number();
2689 vm_page_queue_t queue;
2690 ppnum_t pnum;
2691 vm_page_t mem;
2692
2693 lck_ticket_lock_nopreempt(&mte_pcpu->free_claimed_lock,
2694 &vm_page_lck_grp_bucket);
2695
2696 for (uint32_t i = target; i-- > 0;) {
2697 queue = &mte_claimable_queue.vmpfq_queues[*colorp].qhead;
2698 while (vm_page_queue_empty(queue)) {
2699 *colorp = (*colorp + 1) & vm_color_mask;
2700 queue = &mte_claimable_queue.vmpfq_queues[*colorp].qhead;
2701 }
2702
2703 mem = (vm_page_t)vm_page_queue_first(queue);
2704 pnum = VM_PAGE_GET_PHYS_PAGE(mem);
2705
2706 assert(mem->vmp_q_state == VM_PAGE_ON_FREE_Q);
2707 mteinfo_tag_storage_set_claimed(mem);
2708 mem->vmp_q_state = VM_PAGE_ON_FREE_LOCAL_Q;
2709 mem->vmp_local_id = (uint16_t)cpu;
2710 vm_page_queue_enter(&mte_pcpu->free_claimed_pages, mem, vmp_pageq);
2711 }
2712
2713 lck_ticket_unlock_nopreempt(&mte_pcpu->free_claimed_lock);
2714
2715 counter_add_preemption_disabled(&vm_cpu_free_claimed_count,
2716 target);
2717 }
2718
2719 vm_page_list_t
mteinfo_free_queue_grab(vm_grab_options_t options,vm_memory_class_t class,unsigned int num_pages,vm_page_q_state_t q_state)2720 mteinfo_free_queue_grab(
2721 vm_grab_options_t options,
2722 vm_memory_class_t class,
2723 unsigned int num_pages,
2724 vm_page_q_state_t q_state)
2725 {
2726 mte_pcpu_t mte_pcpu = PERCPU_GET(mte_pcpu);
2727 unsigned int *colorp;
2728 unsigned int color;
2729 vm_page_list_t list = { };
2730 mte_free_queue_idx_t idx;
2731
2732 assert(!mte_pcpu->deactivate_suspend && get_preemption_level() > 0);
2733
2734 if (class == VM_MEMORY_CLASS_REGULAR) {
2735 /*
2736 * VM_MEMORY_CLASS_DEAD_TAG_STORAGE is not part of
2737 * vm_page_free_count, which means the caller didn't take them
2738 * into account when making this allocation ask.
2739 *
2740 * As a result do not respect num_pages. However these are
2741 * different than the regular claimable pool because we can
2742 * always safely wire them.
2743 */
2744 if (vm_page_queue_free.vmpfq_count) {
2745 list = vm_page_free_queue_grab(options,
2746 VM_MEMORY_CLASS_DEAD_TAG_STORAGE,
2747 MIN(vm_free_magazine_refill_limit / 2,
2748 vm_page_queue_free.vmpfq_count), q_state);
2749 }
2750
2751 assert(num_pages <= vm_page_free_count);
2752 } else {
2753 assert(num_pages <= vm_page_free_taggable_count);
2754 }
2755
2756 colorp = PERCPU_GET(start_color);
2757 color = *colorp;
2758
2759 if (mteinfo_tag_storage_claimable_should_refill(mte_pcpu)) {
2760 mteinfo_tag_storage_claimable_refill(mte_pcpu,
2761 VMP_FREE_BATCH_SIZE, &color);
2762 }
2763
2764 while (list.vmpl_count < num_pages) {
2765 vm_page_queue_t queue;
2766 cell_count_t bit;
2767 vm_page_t tag_page;
2768 vm_page_t mem;
2769 uint32_t count;
2770 ppnum_t first_pnum;
2771 cell_t orig;
2772 cell_t *cell;
2773
2774 /*
2775 * Select which queue we dequeue from
2776 *
2777 * Regular allocations can allocate from any bucket.
2778 * Tagged allocations must draw from an MTE_FREE_ACTIVE_* one.
2779 */
2780
2781 if (class == VM_MEMORY_CLASS_REGULAR) {
2782 idx = ffs(mte_free_queue_mask) - 1;
2783 } else {
2784 uint32_t mask = mte_free_queue_mask;
2785
2786 mask &= BIT(MTE_FREE_ACTIVE_0) |
2787 BIT(MTE_FREE_ACTIVE_1) |
2788 BIT(MTE_FREE_ACTIVE_2) |
2789 BIT(MTE_FREE_ACTIVE_3);
2790
2791 assert(mask);
2792 idx = fls(mask) - 1;
2793 }
2794
2795 queue = mteinfo_free_queue_head(idx, color);
2796 while (vm_page_queue_empty(queue)) {
2797 color = (color + 1) & vm_color_mask;
2798 queue = mteinfo_free_queue_head(idx, color);
2799 }
2800
2801 /*
2802 * Dequeue the linkage, find the page of the right color.
2803 */
2804
2805 vm_page_queue_remove_first(queue, mem, vmp_pageq);
2806
2807 VM_COUNTER_DEC(&mte_free_queues[idx].vmpfq_count);
2808 if (mte_free_queues[idx].vmpfq_count == 0) {
2809 bit_clear(mte_free_queue_mask, idx);
2810 }
2811
2812 first_pnum = VM_PAGE_GET_PHYS_PAGE(mem) & -MTE_PAGES_PER_TAG_PAGE;
2813 cell = cell_from_covered_ppnum(first_pnum, &tag_page);
2814 orig = *cell;
2815 bit = orig.enqueue_pos;
2816 count = 0;
2817 assert((orig.enqueue_pos & vm_color_mask) ==
2818 color % MTE_PAGES_PER_TAG_PAGE);
2819
2820 /*
2821 * Dequeue a span of covered pages from that tag storage
2822 *
2823 * If we have a contiguous run of free pages and we need more,
2824 * we know this tag storage page is going to be the one we pick
2825 * next.
2826 */
2827
2828 for (;;) {
2829 assert(bit_test(orig.free_mask, bit));
2830 bit_clear(cell->free_mask, bit);
2831
2832 mem->vmp_q_state = q_state;
2833 vm_page_list_push(&list, mem);
2834
2835 count += 1;
2836 bit += 1;
2837
2838 if (!bit_test(cell->free_mask, bit) ||
2839 list.vmpl_count >= num_pages) {
2840 break;
2841 }
2842
2843 mem = vm_page_find_canonical(first_pnum + bit);
2844 }
2845
2846 color = (color + count) & vm_color_mask;
2847
2848 /*
2849 * Update counters (see mteinfo_covered_page_set_used())
2850 */
2851
2852 VM_COUNTER_SUB(&vm_page_free_count, count);
2853 if (idx >= MTE_FREE_ACTIVE_0 && idx <= MTE_FREE_ACTIVE_3) {
2854 VM_COUNTER_SUB(&vm_page_free_taggable_count, count);
2855 }
2856 if (class != VM_MEMORY_CLASS_REGULAR) {
2857 VM_COUNTER_ADD(&vm_page_tagged_count, count);
2858 cell->mte_page_count += count;
2859 }
2860
2861 /*
2862 * Requeue the tag storage (tail end of CELL_UPDATE())
2863 */
2864
2865 if (cell_list_idx(orig) != cell_list_idx(*cell) ||
2866 cell_list_bucket(orig) != cell_list_bucket(*cell)) {
2867 cell_list_requeue(cell, tag_page,
2868 cell_list_idx(orig), cell_list_bucket(orig),
2869 cell_list_idx(*cell), cell_list_bucket(*cell),
2870 (int)cell_on_claimable_queue(*cell) -
2871 (int)cell_on_claimable_queue(orig));
2872 }
2873
2874 mteinfo_free_queue_requeue(cell, orig, MTE_FREE_NOT_QUEUED,
2875 mteinfo_free_queue_idx(*cell));
2876 }
2877
2878 *colorp = color;
2879
2880 /*
2881 * Some existing driver/IOKit code deals badly with getting physically
2882 * contiguous memory... which this alloc code is rather likely to
2883 * provide by accident immediately after boot.
2884 *
2885 * To avoid hitting issues related to this, we'll invert the order of
2886 * the list we return. This code should be removed once we've tracked
2887 * down the various driver issues.
2888 */
2889 vm_page_list_reverse(&list);
2890
2891 if (class == VM_MEMORY_CLASS_REGULAR && list.vmpl_has_tagged) {
2892 /*
2893 * We are pulling pages from the taggable free queue
2894 * to use them as untagged.
2895 *
2896 * This breaks the invariant that pages with vmp_using_mte
2897 * set are either free pages on the free queue that were left
2898 * tagged after being freed (covered by the cell "free_mask"),
2899 * or used tagged pages (covered by the cell "mte_page_count"
2900 * counter).
2901 *
2902 * The caller has allocated these pages from the free queue
2903 * (clearing the proper "free_mask" bit) but didn't increment
2904 * the "mte_page_count". It will then proceed with untagging
2905 * these pages without holding any locks, and doesn't want to
2906 * re-take the free page queue lock for book-keeping.
2907 *
2908 * As a result, invariants are broken for a little while,
2909 * and we need to suspend the deactivation path that someone
2910 * has currently broken this invariant on this core until
2911 * the untagging is finished, otherwise, the deactivating
2912 * thread would not consider these pages as tagged, and would
2913 * retype the page to XNU_DEFAULT causing an SPTM panic.
2914 *
2915 * mteinfo_page_list_fix_tagging() will resume deactivations
2916 * when it is called on the same core.
2917 *
2918 * mteinfo_tag_storage_deactivate_barrier() is called by any
2919 * path performing a deactivation to synchronize with this.
2920 */
2921 os_atomic_store(&mte_pcpu->deactivate_suspend, 1,
2922 compiler_acquire);
2923 }
2924
2925 /*
2926 * If pulling untagged pages tapped above the active(0) pool,
2927 * and there are "active(0)" pages around, then we wake up
2928 * the refill thread to drain this pool in order to make some
2929 * claimable pages available.
2930 */
2931 if (vm_mte_enable_tag_storage_grab &&
2932 class == VM_MEMORY_CLASS_REGULAR &&
2933 idx >= MTE_FREE_ACTIVE_0 &&
2934 mteinfo_tag_storage_should_drain(true)) {
2935 mteinfo_wake_fill_thread();
2936 }
2937
2938 return list;
2939 }
2940
/*
 * Fix up the MTE tagging state of a freshly allocated page list so that
 * every page's tagged/untagged state matches the requested memory class.
 *
 * Must run on the same core (preemption disabled) as the allocation that
 * produced @c list: the per-cpu "deactivate_suspend" flag cleared below is
 * set by the allocator on this core when it hands out tagged pages for
 * untagged use (see the matching store in the allocation path).
 */
void
mteinfo_page_list_fix_tagging(vm_memory_class_t class, vm_page_list_t *list)
{
	/*
	 * Batch descriptor handed to the pmap layer so it can walk the
	 * vm_page list produced by the allocator in a single call.
	 */
	const unified_page_list_t pmap_batch_list = {
		.page_slist = list->vmpl_head,
		.type = UNIFIED_PAGE_LIST_TYPE_VM_PAGE_LIST,
	};
	mte_pcpu_t mte_pcpu = PERCPU_GET(mte_pcpu);
	vm_page_t mem;

	/* Per-cpu state below is only coherent with preemption disabled. */
	assert(get_preemption_level() > 0);

	if (class == VM_MEMORY_CLASS_REGULAR && list->vmpl_has_tagged) {
		/* Untag pages that were pulled from the taggable free queue. */
		pmap_unmake_tagged_pages(&pmap_batch_list);
		vm_page_list_foreach(mem, *list) {
			mem->vmp_using_mte = false;
		}

		/*
		 * Invariants related to tagged pages are resolved,
		 * we can allow deactivations again.
		 */
		os_atomic_store(&mte_pcpu->deactivate_suspend, 0, release);
	}

	if (class == VM_MEMORY_CLASS_TAGGED && list->vmpl_has_untagged) {
		/* Conversely, tag untagged pages destined for tagged use. */
		pmap_make_tagged_pages(&pmap_batch_list);
		vm_page_list_foreach(mem, *list) {
			mem->vmp_using_mte = true;
		}
	}

	/* This core must no longer be holding deactivations suspended. */
	assert(!mte_pcpu->deactivate_suspend);
}
2975
2976 #endif /* VM_MTE_FF_VERIFY */
2977 #pragma mark Bootstrap
2978
2979 static mte_cell_queue_t
cell_list_init(mte_cell_queue_t qhp,mte_cell_state_t state,mte_cell_list_idx_t lidx)2980 cell_list_init(
2981 mte_cell_queue_t qhp,
2982 mte_cell_state_t state,
2983 mte_cell_list_idx_t lidx)
2984 {
2985 mte_cell_bucket_t buckets = cell_list_idx_buckets(lidx);
2986
2987 mte_info_lists[lidx].buckets = qhp;
2988
2989 for (mte_cell_bucket_t i = 0; i < buckets; i++, qhp++) {
2990 qhp->head = (cell_t){
2991 .prev = cell_idx(qhp),
2992 .next = cell_idx(qhp),
2993 .state = state,
2994 .enqueue_pos = -1,
2995 };
2996 }
2997
2998 return qhp;
2999 }
3000
/*
 * Bootstrap the MTE tag-storage tracking structures: steal one contiguous
 * allocation holding all cell-list queue heads followed by one cell per
 * tag storage page, initialize every list, and thread all cells onto the
 * disabled list's first bucket as one circular queue.
 */
__startup_func
void
mteinfo_init(uint32_t num_tag_pages)
{
	/* Cell indices (including queue heads) must fit in the index field. */
	assert(2 * num_tag_pages < (1UL << MTE_FF_CELL_INDEX_BITS));
	assert(atop(mte_tag_storage_end - mte_tag_storage_start) == num_tag_pages);
	assert(num_tag_pages == mte_tag_storage_count);

	/* Queue heads first, then one cell_t per tag storage page. */
	vm_size_t size = sizeof(cell_t) * (MTE_QUEUES_COUNT + num_tag_pages);
	mte_cell_queue_t queue;
	mte_cell_list_t list;

	queue = pmap_steal_memory(size, 8);
	mte_info_cells = &(queue + MTE_QUEUES_COUNT)->head;

	/*
	 * Carve the per-list bucket arrays out of the allocation, in list
	 * index order; each call returns the next unused queue head.
	 */
	queue = cell_list_init(queue, MTE_STATE_DISABLED, MTE_LIST_DISABLED_IDX);
	queue = cell_list_init(queue, MTE_STATE_PINNED, MTE_LIST_PINNED_IDX);
	queue = cell_list_init(queue, MTE_STATE_DEACTIVATING, MTE_LIST_DEACTIVATING_IDX);
	queue = cell_list_init(queue, MTE_STATE_CLAIMED, MTE_LIST_CLAIMED_IDX);
	queue = cell_list_init(queue, MTE_STATE_INACTIVE, MTE_LIST_INACTIVE_IDX);
	queue = cell_list_init(queue, MTE_STATE_RECLAIMING, MTE_LIST_RECLAIMING_IDX);
	queue = cell_list_init(queue, MTE_STATE_ACTIVATING, MTE_LIST_ACTIVATING_IDX);
	queue = cell_list_init(queue, MTE_STATE_ACTIVE, MTE_LIST_ACTIVE_0_IDX);
	queue = cell_list_init(queue, MTE_STATE_ACTIVE, MTE_LIST_ACTIVE_IDX);

	/* All MTE_QUEUES_COUNT heads must have been consumed exactly. */
	assert(&queue->head == mte_info_cells);

	/*
	 * Quickly create a list of all possible cells and place it into the
	 * disabled queue.
	 */

	for (cell_idx_t i = 0; i < num_tag_pages; i++) {
		/* prev/next chain neighbors; endpoints are fixed up below */
		*cell_from_idx(i) = (cell_t){
			.prev = i - 1,
			.next = i + 1,
			.enqueue_pos = -1,
			.mte_page_count = 0,
			.state = MTE_STATE_DISABLED,
		};
	}

	/* Close the chain into bucket 0 of the disabled list. */
	list = &mte_info_lists[MTE_LIST_DISABLED_IDX];
	queue = &list->buckets[0];
	queue->head.next = 0;
	queue->head.prev = num_tag_pages - 1;
	queue->head.cell_count = num_tag_pages;
	cell_from_idx(0)->prev = cell_idx(queue);
	cell_from_idx(num_tag_pages - 1)->next = cell_idx(queue);
	bit_set(list->mask, 0);
	list->count = num_tag_pages;

	/* Initialize every per-color free queue head as empty. */
	for (mte_free_queue_idx_t idx = MTE_FREE_UNTAGGABLE_0;
	    idx < MTE_FREE_NOT_QUEUED; idx++) {
		for (uint32_t i = 0; i < MAX_COLORS; i++) {
			vm_page_queue_init(mteinfo_free_queue_head(idx, i));
		}
	}

#ifndef VM_MTE_FF_VERIFY
	vm_page_free_queue_init(&mte_claimable_queue);
#endif /* VM_MTE_FF_VERIFY */
}
3064
3065 #if HIBERNATION
3066
3067 void
3068 mteinfo_free_queue_foreach(void (^block)(vm_page_t))
3069 {
3070 for (cell_idx_t cidx = 0; cidx < mte_tag_storage_count; cidx++) {
3071 cell_t *cell = cell_from_idx(cidx);
3072 ppnum_t pnum = cell_first_covered_pnum(cell);
3073 uint32_t mask = cell->free_mask;
3074
3075 while (mask) {
3076 block(vm_page_find_canonical(pnum + ffs(mask) - 1));
3077 mask &= mask - 1;
3078 }
3079
3080 if (cell->state == MTE_STATE_INACTIVE) {
3081 block(vm_tag_storage_page_get(cidx));
3082 }
3083 }
3084 }
3085
3086 #endif /* HIBERNATION */
3087 #ifndef VM_MTE_FF_VERIFY
3088
3089 /* List that tracks tag storage pages until mte_tags_object is initialized. */
3090 __startup_data
3091 static vm_page_list_t mte_tag_storage_startup_list;
3092
/*
 * Classify one tag storage page as it is released during startup and
 * decide whether to keep it active, deactivate it into the regular free
 * queues, or mark it dead/retired.
 *
 * Pages kept active are accumulated on mte_tag_storage_startup_list and
 * wired later by mteinfo_tag_storage_startup_list_flush().
 */
void
mteinfo_tag_storage_release_startup(vm_page_t tag_page)
{
	cell_t *cell = cell_from_tag_storage_page(tag_page);
	ppnum_t tag_pnum = VM_PAGE_GET_PHYS_PAGE(tag_page);
	ppnum_t first_pnum = cell_first_covered_pnum(cell);
	vm_memory_class_t class = VM_MEMORY_CLASS_TAG_STORAGE;
	bool deactivate = true;      /* default: return page to free queues */
	uint32_t mte_count = 0;      /* covered pages currently tagged */

	/*
	 * If this is a tag storage page we won't even classify as tag
	 * storage. Just give it to the normal free queues.
	 *
	 * Otherwise, keep about a 1/8 of the tag storage page around,
	 * it should be vastly sufficient to boot. The refill thread
	 * and various passive policies will let it rebalance later.
	 *
	 * Note that this code implicitly relies on the fact that
	 * the tag storage is toward the end of the vm pages array:
	 * we only keep tag storage around that have 32 pages free,
	 * but pages that haven't been created yet appear as "used".
	 */

	assert(pmap_is_tag_storage_page(tag_pnum));

	if (pmap_tag_storage_is_discarded(tag_pnum)) {
		/* discarded by the pmap layer: retire it outright */
		mteinfo_tag_storage_set_retired(tag_page);
		return;
	} else if (pmap_tag_storage_is_recursive(tag_pnum)) {
		VM_COUNTER_INC(&vm_page_recursive_tag_storage_count);
		class = VM_MEMORY_CLASS_DEAD_TAG_STORAGE;
	} else if (pmap_tag_storage_is_unmanaged(tag_pnum)) {
		VM_COUNTER_INC(&vm_page_unmanaged_tag_storage_count);
		class = VM_MEMORY_CLASS_DEAD_TAG_STORAGE;
	} else {
		/* Count how many covered pages are already tagged. */
		for (uint32_t i = 0; i < MTE_PAGES_PER_TAG_PAGE; i++) {
			mte_count += pmap_is_tagged_page(first_pnum + i);
		}

		if (cell_free_page_count(*cell) == MTE_PAGES_PER_TAG_PAGE &&
		    mteinfo_tag_storage_active(true) < mte_tag_storage_count / 8) {
			/* fully free cell, and we're under the 1/8 budget */
			deactivate = false;
		} else if (mte_count) {
			/* tagged covered pages pin this tag page active */
			deactivate = false;
		}
	}

	if (deactivate) {
		pmap_unmake_tag_storage_page(tag_pnum);
		if (class == VM_MEMORY_CLASS_DEAD_TAG_STORAGE) {
			vm_page_free_queue_enter(class, tag_page, tag_pnum);
		} else {
			tag_page->vmp_q_state = VM_PAGE_ON_FREE_Q;
			mteinfo_tag_storage_set_inactive(tag_page, true);
		}
		return;
	}

	mteinfo_tag_storage_set_active(tag_page, mte_count, true);
	vm_page_list_push(&mte_tag_storage_startup_list, tag_page);
}
3155
3156 /*!
3157 * @function mteinfo_tag_storage_startup_list_flush()
3158 *
3159 * @abstract
3160 * Adds active tag storage pages to the mte_tags_object.
3161 *
3162 * @discussion
3163 * Adds the list of active tag storage pages updated by @see
3164 * mteinfo_tag_storage_release_startup to the mte_tags_object. This must be
3165 * called at some point after the last @see mteinfo_tag_storage_release_startup
3166 * call.
3167 */
3168 __startup_func
3169 static void
mteinfo_tag_storage_startup_list_flush(void)3170 mteinfo_tag_storage_startup_list_flush(void)
3171 {
3172 vm_page_t page;
3173
3174 vm_object_lock(mte_tags_object);
3175 vm_page_lock_queues();
3176
3177 vm_page_list_foreach_consume(page, &mte_tag_storage_startup_list) {
3178 mteinfo_tag_storage_wire_locked(page);
3179 }
3180
3181 vm_page_unlock_queues();
3182 vm_object_unlock(mte_tags_object);
3183 }
3184 STARTUP(KMEM, STARTUP_RANK_FIRST, mteinfo_tag_storage_startup_list_flush);
3185
3186 /*!
3187 * @abstract
3188 * Initializes the percpu mte queues and locks.
3189 */
3190 __startup_func
3191 static void
mteinfo_tag_storage_lock_init(void)3192 mteinfo_tag_storage_lock_init(void)
3193 {
3194 percpu_foreach(mte_pcpu, mte_pcpu) {
3195 lck_ticket_init(&mte_pcpu->free_claimed_lock,
3196 &vm_page_lck_grp_bucket);
3197 vm_page_queue_init(&mte_pcpu->free_claimed_pages);
3198 }
3199 }
3200 STARTUP(PERCPU, STARTUP_RANK_MIDDLE, mteinfo_tag_storage_lock_init);
3201
3202 /*!
3203 * @function mteinfo_init_fill_thread
3204 *
3205 * @abstract
3206 * Creates the MTE fill thread.
3207 */
3208 __startup_func
3209 static void
mteinfo_init_fill_thread(void)3210 mteinfo_init_fill_thread(void)
3211 {
3212 kern_return_t result;
3213
3214 if (!is_mte_enabled) {
3215 return;
3216 }
3217
3218 result = kernel_thread_start_priority(mteinfo_fill_continue, NULL, BASEPRI_VM,
3219 &vm_mte_fill_thread);
3220
3221 if (result != KERN_SUCCESS) {
3222 panic("Failed to create MTE fill thread.");
3223 }
3224
3225 thread_set_thread_name(vm_mte_fill_thread, "VM_mte_fill");
3226 thread_deallocate(vm_mte_fill_thread);
3227 }
3228 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, mteinfo_init_fill_thread);
3229
3230 static ppnum_t
mteinfo_tag_storage_mark_unmanaged_range(cell_idx_t idx,ppnum_t pnum)3231 mteinfo_tag_storage_mark_unmanaged_range(cell_idx_t idx, ppnum_t pnum)
3232 {
3233 cell_t *end_cell = cell_from_covered_ppnum(pnum);
3234 cell_idx_t end_idx = cell_idx(end_cell);
3235 bool locked = false;
3236
3237 for (; idx < end_idx; idx++) {
3238 cell_t *cell = cell_from_idx(idx);
3239 vm_page_t tag_page = vm_tag_storage_page_get(idx);
3240
3241 if (!locked) {
3242 vm_free_page_lock_spin();
3243 locked = true;
3244 }
3245
3246 if (pmap_tag_storage_is_discarded(VM_PAGE_GET_PHYS_PAGE(tag_page))) {
3247 mteinfo_tag_storage_set_retired(tag_page);
3248 continue;
3249 }
3250
3251 if (cell->mte_page_count != 0) {
3252 /*
3253 * This can happen if some tagged pmap steal
3254 * has not ml_static_mfree()d these pages back
3255 */
3256 continue;
3257 }
3258
3259 if (cell->state == MTE_STATE_DISABLED) {
3260 /*
3261 * Probably an ECC retired page.
3262 */
3263 continue;
3264 }
3265
3266 mteinfo_tag_storage_set_unmanaged(cell,
3267 vm_tag_storage_page_get(idx));
3268 }
3269
3270 if (locked) {
3271 vm_free_page_unlock();
3272 }
3273
3274 return end_idx + 1;
3275 }
3276
3277 static void
mteinfo_tag_storage_unmanaged_discover(void)3278 mteinfo_tag_storage_unmanaged_discover(void)
3279 {
3280 uint32_t count = vm_page_unmanaged_tag_storage_count;
3281 cell_idx_t cur_idx = 0;
3282 ppnum_t pnum;
3283
3284 if (!is_mte_enabled) {
3285 return;
3286 }
3287
3288 vm_pages_radix_for_each_pnum(pnum) {
3289 cur_idx = mteinfo_tag_storage_mark_unmanaged_range(cur_idx, pnum);
3290 }
3291 mteinfo_tag_storage_mark_unmanaged_range(cur_idx,
3292 vm_pages_first_pnum);
3293
3294 printf("MTE: discovered %d tag storage pages for unmanaged memory\n",
3295 vm_page_unmanaged_tag_storage_count - count);
3296 }
3297 STARTUP(LOCKDOWN, STARTUP_RANK_LAST, mteinfo_tag_storage_unmanaged_discover);
3298
3299 extern boolean_t get_range_bounds(char *c, int64_t *lower, int64_t *upper);
3300 static void
mteinfo_tag_storage_process_vm_tags(void)3301 mteinfo_tag_storage_process_vm_tags(void)
3302 {
3303 char *vm_tags_str;
3304
3305 if (!vm_mte_enable_tag_storage_grab) {
3306 return;
3307 }
3308
3309 vm_tags_str = vm_mte_tag_storage_for_vm_tags;
3310 while (*vm_tags_str) {
3311 uint64_t loop_end;
3312 boolean_t ret;
3313 int64_t start = 1, end = VM_MEMORY_COUNT;
3314
3315 ret = get_range_bounds(vm_tags_str, &start, &end);
3316 loop_end = (ret) ? end : start;
3317 for (int64_t i = start; i <= loop_end; i++) {
3318 bitmap_set(vm_mte_tag_storage_for_vm_tags_mask, (uint)i);
3319 }
3320
3321 /* Skip to the next ',' */
3322 while (*vm_tags_str != ',') {
3323 if (*vm_tags_str == '\0') {
3324 break;
3325 }
3326 vm_tags_str++;
3327 }
3328
3329 if (*vm_tags_str == ',') {
3330 vm_tags_str++;
3331 } else {
3332 assert(*vm_tags_str == '\0');
3333 break;
3334 }
3335 }
3336 }
3337 STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, mteinfo_tag_storage_process_vm_tags);
3338
3339 #pragma mark Counter methods
3340
3341 uint32_t
mteinfo_tag_storage_fragmentation(bool actual)3342 mteinfo_tag_storage_fragmentation(bool actual)
3343 {
3344 uint32_t ts_active;
3345 uint32_t value;
3346
3347 vm_free_page_lock_spin();
3348 ts_active = mte_info_lists[MTE_LIST_ACTIVE_IDX].count;
3349 if (actual) {
3350 ts_active += mte_info_lists[MTE_LIST_ACTIVE_0_IDX].count;
3351 }
3352 if (ts_active) {
3353 value = 1000 * vm_page_tagged_count;
3354 value /= (ts_active * MTE_PAGES_PER_TAG_PAGE);
3355 } else {
3356 value = 1000;
3357 }
3358 vm_free_page_unlock();
3359
3360 return 1000 - value;
3361 }
3362
3363 uint32_t
mteinfo_tag_storage_active(bool fq_locked)3364 mteinfo_tag_storage_active(bool fq_locked)
3365 {
3366 uint32_t active;
3367
3368 if (!fq_locked) {
3369 vm_free_page_lock_spin();
3370 }
3371
3372 active = mte_info_lists[MTE_LIST_ACTIVE_0_IDX].count +
3373 mte_info_lists[MTE_LIST_ACTIVE_IDX].count;
3374
3375 if (!fq_locked) {
3376 vm_free_page_unlock();
3377 }
3378
3379 return active;
3380 }
3381
3382 uint32_t
mteinfo_tag_storage_free_pages_for_covered(const struct vm_page * page)3383 mteinfo_tag_storage_free_pages_for_covered(const struct vm_page *page)
3384 {
3385 ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(page);
3386
3387 return cell_free_page_count(*cell_from_covered_ppnum(pnum));
3388 }
3389
3390 void
mteinfo_increment_wire_count(vm_page_t tag_page)3391 mteinfo_increment_wire_count(vm_page_t tag_page)
3392 {
3393 if (vm_page_in_tag_storage_array(tag_page) &&
3394 vm_page_is_tag_storage(tag_page)) {
3395 VM_COUNTER_ATOMIC_INC(&vm_page_wired_tag_storage_count);
3396
3397 DTRACE_VM1(vm_tag_storage_wired, vm_page_t, tag_page);
3398 }
3399 }
3400
3401 void
mteinfo_decrement_wire_count(vm_page_t tag_page,bool pqs_locked)3402 mteinfo_decrement_wire_count(vm_page_t tag_page, bool pqs_locked)
3403 {
3404 LCK_MTX_ASSERT(&vm_page_queue_lock,
3405 pqs_locked ? LCK_MTX_ASSERT_OWNED : LCK_MTX_ASSERT_NOTOWNED);
3406 LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_NOTOWNED);
3407
3408 if (vm_page_in_tag_storage_array(tag_page) &&
3409 VM_PAGE_OBJECT(tag_page) != mte_tags_object &&
3410 vm_page_is_tag_storage(tag_page)) {
3411 VM_COUNTER_ATOMIC_DEC(&vm_page_wired_tag_storage_count);
3412
3413 DTRACE_VM1(vm_tag_storage_unwired, vm_page_t, tag_page);
3414
3415 if (tag_page->vmp_ts_wanted) {
3416 /*
3417 * Many callers have the page queue lock held in spin
3418 * when calling this, and mteinfo_tag_storage_wakeup()
3419 * needs to acquire a mutex.
3420 */
3421 if (pqs_locked) {
3422 vm_page_lockconvert_queues();
3423 }
3424 mteinfo_tag_storage_wakeup(tag_page, false);
3425 }
3426 }
3427 }
3428
3429 bool
mteinfo_vm_tag_can_use_tag_storage(vm_tag_t vm_tag)3430 mteinfo_vm_tag_can_use_tag_storage(vm_tag_t vm_tag)
3431 {
3432 return bitmap_test(vm_mte_tag_storage_for_vm_tags_mask, (uint)vm_tag);
3433 }
3434
3435
3436 void
kdp_mteinfo_snapshot(struct mte_info_cell * __counted_by (count)cells,size_t count)3437 kdp_mteinfo_snapshot(struct mte_info_cell * __counted_by(count) cells, size_t count)
3438 {
3439 release_assert(count == mte_tag_storage_count);
3440
3441 if (not_in_kdp) {
3442 panic("panic: kdp_mteinfo_fill called outside of kernel debugger");
3443 }
3444
3445 for (cell_idx_t cidx = 0; cidx < mte_tag_storage_count; cidx++) {
3446 cell_t *cell = cell_from_idx(cidx);
3447 ppnum_t pnum = cell_first_covered_pnum(cell);
3448 vm_page_t mem;
3449 uint8_t wired_count = 0, wired_tagged_count = 0, kernel_wired_tagged_count = 0;
3450
3451 for (ppnum_t i = 0; i < MTE_PAGES_PER_TAG_PAGE; i++) {
3452 mem = vm_page_find_canonical(pnum + i);
3453 if (mem && VM_PAGE_WIRED(mem)) {
3454 wired_count++;
3455 if (mem->vmp_using_mte) {
3456 if (VM_PAGE_OBJECT(mem) == kernel_object_tagged) {
3457 kernel_wired_tagged_count++;
3458 } else {
3459 wired_tagged_count++;
3460 }
3461 }
3462 }
3463 }
3464
3465 cells[cidx] = (struct mte_info_cell) {
3466 .mic_state = cell->state,
3467 .mic_tagged_count = cell->mte_page_count,
3468 .mic_free_count = (uint8_t)cell_free_page_count(*cell),
3469 .mic_wired_count = wired_count,
3470 .mic_wired_tagged_count = wired_tagged_count,
3471 .mic_kernel_wired_tagged_count = kernel_wired_tagged_count
3472 };
3473 }
3474 }
3475 #endif /* VM_MTE_FF_VERIFY */
3476
3477 #endif /* HAS_MTE */
3478