1 /*
2  * Copyright (c) 2000-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /* Guard header includes, so that the userspace test can include this file. */
30 #include <os/atomic_private.h>
31 #ifndef VM_MTE_FF_VERIFY
32 #include <debug.h>
33 #include <mach_assert.h>
34 
35 #include <kern/bits.h>
36 #include <kern/kcdata.h>
37 #include <kern/queue.h>
38 
39 #include <mach/sdt.h>
40 
41 #include <vm/pmap.h>
42 #include <vm/vm_compressor_internal.h>
43 #include <vm/vm_kern.h>
44 #include <vm/vm_object_internal.h>
45 #include <vm/vm_page_internal.h>
46 #include <vm/vm_pageout.h>
47 #include <vm/vm_mteinfo_internal.h>
48 
49 extern lck_grp_t vm_page_lck_grp_bucket;
50 
51 #endif /* VM_MTE_FF_VERIFY */
52 #pragma mark Documentation
53 #if HAS_MTE
54 
55 /*
56  * VM MTE Info
57  * ===========
58  *
59  * The top level goal of this code is to implement the policies managing the
60  * selection of tag storage pages on the system, in order to:
61  * - Minimize the number of live tag storage pages at any given time;
62  * - Maximize occupancy (the number of covered pages using MTE compared to tag
63  *   storage pages actually being used for tag storage).
64  *
65  *
66  * Physical Memory Layout
67  * ----------------------
68  *
69  * The diagram below describes the general layout of the physical memory. iBoot
70  * will determine the placement of the tag storage region, at the end of the
71  * managed address space.
72  *
73  * As a result, the tag storage space is always part of the vm_pages array.
74  * However, several things should be noted:
75  *
76  * - The last tag storage pages cover unmanaged DRAM at the end of physical
77  *   memory, as well as the tag storage space itself, and will never be used as
78  *   tag storage memory by the system (the unmanaged space will not be MTE'd,
79  *   and the tag storage space will never itself use MTE).
80  *
81  * - The first tag storage pages also cover unmanaged DRAM space at the
82  *   beginning of physical memory, but might be used for tagging due to early
83  *   boot code.  However, these first tag storage pages will not be used for
84  *   tag storage space dynamically by the system.
85  *
86  * - The beginning of the tag region space is always aligned to a 32 page
87  *   boundary; however the start of the vm_pages array is not. As a result,
88  *   there is a cluster of 32 pages that possibly crosses this boundary. This
89  *   is relevant because dynamic tag storage management only functions for
90  *   taggable pages inside the vm_pages array.
91  *
92  *
93  *                            ┌────────────┐─╮
94  *                            │    P_n+31  │ │
95  *                            ├────────────┤ │
96  *                            ╎     ...    ╎ │
97  *                            ├────────────┤ │
98  *                            │     P_n    │ │
99  *                            ├────────────┤─╯
100  *                            │            │
101  *                            ╎            ╎
102  *                            ╎     ...    ╎
103  *                            ╎            ╎
104  *                            │            │
105  *   mte_tag_storage_end ─ ─ ─├────────────┤ ─ ─ ─ vm_pages_end
106  *              ┬             │TTTTTTTTTTTT│ Tag storage for pages [n:n+31]
107  *              │             ├────────────┤
108  *              │             │            │
109  *              │             ╎     ...    ╎
110  *              │             │            │
111  *              │             ├────────────┤
112  *       1/32   │             │TTTTTTTTTTTT│ Tag storage for pages [i:i+31]
113  *      of DRAM │             ├────────────┤
114  *              │             │            │
115  *              │             ╎     ...    ╎
116  *              │             │            │
117  *              │             ├────────────┤
118  *              │             │TTTTTTTTTTTT│ Tag storage for pages [32:63]
119  *              │             ├────────────┤
120  *              ┴             │TTTTTTTTTTTT│ Tag storage for pages [0:31]
121  * mte_tag_storage_start ─ ─ ─├────────────┤─╮
122  *                            │    P_i+31  │ │
123  *                            ├────────────┤ │
124  *                            ╎     ...    ╎ │
125  *                            ├────────────┤ │
126  *                            │     P_i    │ │
127  *                            ├────────────┤─╯
128  *                            │            │
129  *                            ╎            ╎
130  *                            ╎     ...    ╎
131  *                            ╎            ╎
132  *                            │            │
133  *                            ├────────────┤─╮
134  *                            │            │ │
135  *                            ╎     ...    ╎ │
136  *                            ├────────────┤ │ ─ ─ vm_pages
137  *                            ╎     ...    ╎ │
138  *                            │            │ │
139  *                            ├────────────┤─╯
140  *                            │            │
141  *                            ╎            ╎
142  *                            ╎     ...    ╎
143  *                            ╎            ╎
144  *                            │            │
145  *                            ├────────────┤─╮
146  *                            │    P_31    │ │
147  *                            ├────────────┤ │
148  *                            ╎     ...    ╎ │
149  *                            ├────────────┤ │
150  *                            │    P_0     │ │
151  *  pmap_first_pnum        ─ ─└────────────┘─╯ ─ ─ gDramBase
152  *                           Physical Memory
153  *
154  *
155  * Tag storage and cells
156  * ~~~~~~~~~~~~~~~~~~~~~
157  *
158  * Tag storage pages require metadata to track their state machine, in order to
159  * not grow the vm_page_t data structure for all pages on the system when only
160  * 1/32 of them are tag storage.
161  *
162  * The metadata is stored in a data structure called the MTE cell
163  * (@see cell_t) which is queued into the so-called MTE Info data structure
164  * (@see @c mte_info_lists).
165  *
166  * The documentation of this file happily calls a cell a tag storage page and
167  * vice versa as a result, since the mapping is 1:1.
168  *
169  *
170  * Tag storage state machine
171  * ~~~~~~~~~~~~~~~~~~~~~~~~~
172  *
173  * Disabled is a special state: this is the state cells start in,
174  * and never transition back to unless there is an ECC error.
175  *
176  * The state diagram involving "Disabled" looks like this:
177  *
178  *     ╭──────────────╮          ╭───╴K.3╶──╮          ╔══════════════╗
179  *     │  RECLAIMING  ┼───╮      │          v     ╭───>║    ACTIVE    ║
180  *     ╰──────────────╯  K.1   ╔═╪════════════╗  I.1   ╚══════════════╝
181  *                        ├───>║   DISABLED   ╫───┤
182  *      ╔═════════════╗  K.2   ╚══════════════╝  I.2   ╔══════════════╗
183  *      ║   CLAIMED   ╫───╯      ^          ^     ╰───>║   INACTIVE   ║
184  *      ╚═══════════╪═╝          │          │          ╚═╪════════════╝
185  *                  ╰────╴U.1╶───╯          ╰───╴U.2╶────╯
186  *
187  *   ╔═╗ Double bar square boxes         ╭─╮ Single bar round boxes
188  *   ╚═╝ denote stable states.           ╰─╯ denote transitionary states.
189  *
190  *
191  * Initialization (I.1, I.2)
192  *
193  *   This is performed by mteinfo_tag_storage_release_startup().
194  *   This function might decide to leave pages as disabled.
195  *
196  * Unmanaged discovery (U.1, U.2)
197  *
198  *   This is performed at lockdown by mteinfo_tag_storage_unmanaged_discover()
199  *   to discover tag storage that covers pages that will never have a canonical
200  *   vm_page_t made for them, which are effectively unmanaged.
201  *
202  * Retirement (K.1, K.2, K.3)
203  *
204  *   This is performed by mteinfo_tag_storage_set_retired(),
205  *   itself called by vm_page_retire() which can only happen
206  *   for pages that were never created (the cell will be DISABLED),
207  *   or on the tag storage claimed page free path (the cell
208  *   will either be RECLAIMING or CLAIMED).
209  *
210  *
211  * The rest of the tag storage state machine looks like this:
212  *
213  *                            ╭──────────────╮
214  *               ╭────╴D.2╶───┼ DEACTIVATING │<───╴D.1╶────╮
215  *               │      a     ╰──────────────╯      a      │
216  *               v                                         │
217  *  ╔══════════════╗          ╭──────────────╮           ╔═╪════════════╗
218  *  ║   INACTIVE   ╫──╴A.1╶──>│  ACTIVATING  ┼───╴A.2╶──>║    ACTIVE    ║<─╮
219  *  ╚════════════╪═╝   i/a    ╰──────────────╯    i/a    ╚══════════════╝  │
220  *    ^          │                                                         │
221  *    │          │                                                         │
222  *    │          │                          ╔════════════╗                 │
223  *    │          │              ╭───╴B.2╶───╫   PINNED   ║<───╴B.1╶───╮    │
224  *    │          │              │     i     ╚════════════╝      a     │   R.2
225  *    │          │              │                                     │    a
226  *    │          │              │          ╭─────╴R.x╶─────╮          │    │
227  *    │          │              v          v       a       │          │    │
228  *    │          │            ╔═════════════╗            ╭─┼──────────┼─╮  │
229  *    │          ╰────╴C.1╶──>║   CLAIMED   ╫────╴R.1╶──>│  RECLAIMING  ┼──╯
230  *    │                 i     ╚═╪═══════════╝      a     ╰─┼────────────╯
231  *    │                         │                          │
232  *    ╰──────────╴F.1╶──────────╯<─────────╴F.2╶───────────╯
233  *                 i                         i
234  *
235  *   ╔═╗ Double bar square boxes         ╭─╮ Single bar round boxes
236  *   ╚═╝ denote stable states.           ╰─╯ denote transitionary states.
237  *
238  *    a  the transition can be done by the refill thread (async)
239  *    i  the transition can be done inline by any thread.
240  *
241  *
242  * Activation (A.1, A.2)
243  *
244  *   [A.1 inline] is performed by mteinfo_tag_storage_try_activate(), called
245  *   from vm_page_grab_slow(), if the current grab would deplete the taggable
246  *   space too much and there seems to be an ample reserve of free
247  *   pages.
248  *
249  *   This path however will limit itself to pages that are really worth
250  *   activating (17+ free associated pages, which coincide with the first 3
251  *   mteinfo buckets for MTE_STATE_INACTIVE).
252  *
253  *
254  *   [A.1 async] is performed by mteinfo_tag_storage_active_refill() when it
255  *   decides that activating pages is the best strategy to get more taggable
256  *   pages.  It will only do so if [R.1 async] isn't more profitable.
257  *
258  *
259  *   [A.2 inline/async] is performed by mteinfo_tag_storage_activate_locked()
260  *   on the results of [A.1 inline/async]. The most notable thing to mention
261  *   is that until the tag pages are fully activated, no tagged page can be
262  *   allocated, and if the thread doing this operation inline is a low-priority
263  *   thread, this could cause starvation due to priority inversions.
264  *
265  *   To prevent this issue, turnstiles are used for the inline case so that
266  *   there's a single activator at a time with priority inversion avoidance.
267  *   The async path doesn't use this as it is a very high priority thread,
268  *   and is meant to run in case of emergencies.
269  *
270  *
271  * Deactivation (D.1, D.2)
272  *
273  *   [D.1 async] is performed by mteinfo_tag_storage_drain(). The refill
274  *   thread will invoke this function after it is done with activations.
275  *
276  *   This phase will only drain active(0.0) pages, meaning pages that are active
277  *   but have no free pages and no MTE pages associated with them. Having such pages
278  *   on the system is a sign of untagged memory pressure, and it's probably
279  *   a good idea to free that tag storage page so it can be used for untagged
280  *   purposes (i.e., become claimed).
281  *
282  *   It will drain pages until the @c mte_claimable_queue has a healthy level.
283  *
284  *   This transition is triggered lazily from the @c mteinfo_free_queue_grab()
285  *   path when untagged page allocations have tapped into the taggable
286  *   space and system conditions permit
287  *   (see @c mteinfo_tag_storage_should_drain()).
288  *
289  *   [D.2 async] is performed by mteinfo_tag_storage_drain_flush(),
290  *   which is called by mteinfo_tag_storage_drain() on the results
291  *   of [D.1 async]
292  *
293  *
294  * Allocation/Claiming (C.1)
295  *
296  *   [C.1 inline] is performed by @c mteinfo_tag_storage_claimable_refill()
297  *   from the context of any @c mteinfo_free_queue_grab() (tagged or regular).
298  *   The path will opportunistically check that there are enough pages
299  *   on the @c mte_claimable_queue to amortize the cost of taking
300  *   the spinlock protecting the per-cpu queue.
301  *
302  *   It is done unconditionally otherwise, as the reclaim thread can steal
303  *   from these queues. The @c vm_page_grab_options() fastpath knows how
304  *   to draw from this directly.
305  *
306  *
307  * Freeing (F.1, F.2)
308  *
309  *   [F.1 inline] is performed by page free paths that eventually call into
310  *   @c vm_page_free_queue_enter(VM_MEMORY_CLASS_TAG_STORAGE).
311  *
312  *   [F.2 inline] is the exact same transition but for the case when the refill
313  *   thread was attempting to reclaim this page (it had performed [R.1 async]).
314  *   It is worth noting that, on paper, the [C.1 inline] transition could happen
315  *   again before the refill thread notices.
316  *
317  *
318  * Reclaiming (R.1, R.2, R.x, B.1, B.2)
319  *
320  *   [R.1 async] is performed by mteinfo_tag_storage_active_refill() when it
321  *   decides that reclaiming (stealing) pages is the best strategy to get more
322  *   taggable pages. It will only do so if [A.1 async] isn't more profitable.
323  *
324  *   Once pages have been marked as reclaiming, it will attempt to either steal
325  *   the page from the cpu free queue, or attempt a relocation.
326  *
327  *   [R.2 async] is exactly the same as [A.2 async], being performed by
328  *   mteinfo_tag_storage_activate_locked() on the results of [R.1 async].
329  *   The major difference however is that it is done one page at a time.
330  *
331  *   [B.1 async] is performed by @c mteinfo_reclaim_tag_storage_page() when
332  *   relocating a claimed page failed due to the page being pinned,
333  *   in which case the tag storage page is marked with the @c vmp_ts_wanted bit.
334  *
335  *   [B.2 inline] is performed by @c mteinfo_tag_storage_wakeup() when threads
336  *   notice that @c vmp_ts_wanted is set and that the condition causing it to be
337  *   set has cleared.
338  *
339  *   [R.x async] is performed when stealing the page was otherwise not
340  *   successful (in @c mteinfo_reclaim_tag_storage_page() or
341  *   @c mteinfo_tag_storage_flush_reclaiming()).
342  */
343 
344 
345 #pragma mark Types
346 
347 /*!
348  * @typedef cell_state_mask_t
349  *
350  * @abstract
351  * Mask/bit-field version of the @c mte_cell_state_t values in order to do assertions.
352  */
353 __options_decl(cell_state_mask_t, uint32_t, {
354 	MTE_MASK_DISABLED       = BIT(MTE_STATE_DISABLED),
355 	MTE_MASK_PINNED         = BIT(MTE_STATE_PINNED),
356 	MTE_MASK_DEACTIVATING   = BIT(MTE_STATE_DEACTIVATING),
357 	MTE_MASK_CLAIMED        = BIT(MTE_STATE_CLAIMED),
358 	MTE_MASK_INACTIVE       = BIT(MTE_STATE_INACTIVE),
359 	MTE_MASK_RECLAIMING     = BIT(MTE_STATE_RECLAIMING),
360 	MTE_MASK_ACTIVATING     = BIT(MTE_STATE_ACTIVATING),
361 	MTE_MASK_ACTIVE         = BIT(MTE_STATE_ACTIVE),
362 });
363 
364 #define MTE_FF_CELL_INDEX_BITS          24 /* Number of bits for a cell index */
365 #define MTE_FF_CELL_PAGE_COUNT_BITS     6  /* Number of bits for a page count */
366 #define MTE_FF_CELL_STATE_BITS          3
367 
368 /*!
369  * @typedef cell_idx_t
370  *
371  * @abstract
372  * Represents the index of a cell in the cell array (when positive), or a queue
373  * head (when negative).
374  *
375  * @discussion
376  * This type only has @c MTE_FF_CELL_INDEX_BITS worth of significant bits.
377  * Given that one bit is used to denote queues, it means we can support systems
378  * with up to:
379  * - 2^(MTE_FF_CELL_INDEX_BITS - 1) tag storage pages,
380  * - 2^(MTE_FF_CELL_INDEX_BITS + 4) pages,
381  * - 2^(MTE_FF_CELL_INDEX_BITS + 4 + PAGE_SHIFT) bytes.
382  *
383  * On a 16KB system (PAGE_SHIFT == 14) and with MTE_FF_CELL_INDEX_BITS == 24,
384  * this covers 2^42 == 4TB of physical memory.
385  */
386 typedef int32_t cell_idx_t;
387 
388 typedef uint32_t cell_count_t;
389 
390 /*!
391  * @typedef cell_t
392  *
393  * @abstract
394  * This data structure contains the metadata associated with a tag storage page,
395  * and its covered pages in the mteinfo tracking data structure.
396  *
397  * @discussion
398  * Here are some important invariants for this data structure:
399  * - mte_page_count + popcount(free_mask) <= MTE_PAGES_PER_TAG_PAGE
400  * - mte_page_count must be 0 unless state is DISABLED or ACTIVE.
401  *
402  * @field prev
403  * Linkage to the prev cell (as an index in the cell array).
404  *
405  * @field next
406  * Linkage to the next cell (as an index in the cell array).
407  *
408  * @field enqueue_pos
409  * If @c free_mask isn't 0, this contains the index of the free covered page
410  * which represents this cell in the mte free queues (@see @c mte_free_queues[]).
411  *
412  * @field mte_page_count
413  * The number of pages covered with this tag storage page, that are currently
414  * used and tagged.
415  *
416  * @field state
417  * The current state of the tag storage page this cell represents.
418  * @see mte_cell_state_t.
419  *
420  * @field free_mask
421  * A bitmask where each bit set corresponds to an associated covered page that
422  * is free (tagged or not).
423  *
424  * @field cell_count
425  * When the cell is a queue head, the number of cells enqueued on this bucket.
426  */
427 #pragma pack(4)
428 typedef struct {
429 	cell_idx_t              prev : MTE_FF_CELL_INDEX_BITS;
430 	cell_idx_t              next : MTE_FF_CELL_INDEX_BITS;
431 	cell_count_t            enqueue_pos : MTE_FF_CELL_PAGE_COUNT_BITS;
432 	cell_count_t            mte_page_count : MTE_FF_CELL_PAGE_COUNT_BITS;
433 	mte_cell_state_t        state : MTE_FF_CELL_STATE_BITS;
434 	uint8_t                 __unused_bits : 1;
435 	union {
436 		uint32_t        free_mask;
437 		uint32_t        cell_count;
438 	};
439 } cell_t;
440 #pragma pack()
441 
442 static_assert(sizeof(cell_t) == 12);
443 static_assert(MTE_STATE_ACTIVE < (1u << MTE_FF_CELL_STATE_BITS));
444 static_assert(MTE_PAGES_PER_TAG_PAGE <= (1 << MTE_FF_CELL_PAGE_COUNT_BITS));
445 
446 /*!
447  * @typedef mte_cell_queue_t
448  *
449  * @abstract
450  * This data structure represents a particular queue/bucket of cells.
451  */
452 typedef struct mte_cell_queue_head {
453 	cell_t          head;
454 } *mte_cell_queue_t;
455 
456 /*!
457  * @typedef mte_cell_bucket_t
458  *
459  * @abstract
460  * Represents the index of a bucket inside of a list.
461  */
462 __enum_decl(mte_cell_bucket_t, uint32_t, {
463 	MTE_BUCKET_0,
464 	MTE_BUCKET_1_8,
465 	MTE_BUCKET_9_16,
466 	MTE_BUCKET_17_24,
467 	MTE_BUCKET_25_32,
468 
469 	_MTE_BUCKET_COUNT,
470 });
471 
472 static_assert(_MTE_BUCKET_COUNT == MTE_BUCKETS_COUNT_MAX);
473 
474 #define MTE_QUEUES_COUNT \
475 	(1 /* disabled */ + \
476 	 1 /* pinned */ + \
477 	 MTE_BUCKETS_COUNT_MAX /* claimed */ + \
478 	 MTE_BUCKETS_COUNT_MAX /* inactive */ + \
479 	 1 /* deactivating */ + \
480 	 1 /* reclaiming */ + \
481 	 1 /* activating */ + \
482 	 MTE_BUCKETS_COUNT_MAX /* active_0 */ + \
483 	 1 /* active */ )
484 
485 
486 #pragma mark Behavioral boot-args
487 
488 /*
489  * Boot-arg to enable/disable the interface for grabbing tag storage pages.
490  * This exists in case tunables or settings for tag storage management expose
491  * us to page shortages or system hangs due to wired tag storage pages.  This
492  * boot-arg should allow us to bypass any such issues.
493  */
494 static TUNABLE(bool, vm_mte_enable_tag_storage_grab, "mte_ts_grab", true);
495 
496 /*
497  * Boot-args controlling the draining down of tag storage space
498  *
499  * @var vm_page_tag_storage_reserved
500  * How many tag storage pages the inactive_0 queue needs to preserve
501  * at all times.
502  */
503 TUNABLE(uint32_t, vm_page_tag_storage_reserved, "mte_ts_grab_rsv", 100);
504 
505 /*
506  * Boot-arg to enable/disable grabbing tag storage pages for the compressor
507  * pool.
508  */
509 TUNABLE(bool, vm_mte_tag_storage_for_compressor, "mte_ts_compressor", true);
510 
511 #ifndef VM_MTE_FF_VERIFY
512 /*
513  * Boot-arg to enable/disable grabbing tag storage pages for specific VM tags.
514  * Note that the string length was somewhat arbitrarily chosen, so if the use
515  * case arises, we may need to bump that up...
516  *
517  * Currently, we allow allocations with VM tags of VM_MEMORY_MALLOC_SMALL (2),
518  * VM_MEMORY_MALLOC_TINY (7), and VM_MEMORY_MALLOC_NANO (11) to use tag storage
519  * pages. See vm_statistics.h for other potential candidates.
520  * In particular, VM_MEMORY_STACK (30) is promising.
521  */
522 static TUNABLE_STR(vm_mte_tag_storage_for_vm_tags, 256, "mte_ts_vmtag", "2,7,11");
523 #endif /* VM_MTE_FF_VERIFY */
524 
525 #pragma mark Counters and Globals
526 
527 struct mte_cell_list mte_info_lists[MTE_LISTS_COUNT];
528 
529 static SECURITY_READ_ONLY_LATE(cell_t *) mte_info_cells;
530 
531 #ifndef VM_MTE_FF_VERIFY
532 /*
533  * Fill thread state.  The wake state of the thread is tracked to minimize
534  * scheduler interactions.  Guarded by the free page lock.
535  */
536 static sched_cond_atomic_t fill_thread_cond = SCHED_COND_INIT;
537 static SECURITY_READ_ONLY_LATE(thread_t) vm_mte_fill_thread = THREAD_NULL;
538 static thread_t vm_mte_activator = THREAD_NULL;
539 static bool vm_mte_activator_waiters = false;
540 
541 struct mte_pcpu PERCPU_DATA(mte_pcpu);
542 SCALABLE_COUNTER_DEFINE(vm_cpu_free_tagged_count);
543 SCALABLE_COUNTER_DEFINE(vm_cpu_free_claimed_count);
544 #endif
545 
546 /*
547  * Free taggable pages queue, per-cpu queues, and its counters.
548  *
549  * guarded by the free page lock
550  */
551 uint32_t vm_page_free_taggable_count;
552 uint32_t vm_page_free_unmanaged_tag_storage_count;
553 uint32_t vm_page_tagged_count; /* Total tagged covered pages. */
554 uint32_t vm_page_free_wanted_tagged = 0;
555 uint32_t vm_page_free_wanted_tagged_privileged = 0;
556 
557 /*
558  * Counters for tag storage pages we will just give to the system permanently
559  * for use as regular memory.  These could technically be a subset of the
560  * claimed tag storage, but counting them separately is useful because they
561  * will have a different page lifecycle than the claimed tag storage pages...
562  * as when freed, these pages will go to the regular free queues.
563  *
564  * These shouldn't be mutated after bootstrap... so they have no lock.
565  */
566 uint32_t vm_page_recursive_tag_storage_count;
567 uint32_t vm_page_retired_tag_storage_count;
568 uint32_t vm_page_unmanaged_tag_storage_count;
569 
570 /*
571  * The wired tag storage page count is guarded by the page queues lock.  This
572  * counter is diagnostic; it exists to inform investigations about reclaim
573  * efficiency.
574  */
575 uint32_t vm_page_wired_tag_storage_count;
576 
577 /*
578  * Diagnostic counters for reclamation; describes how many times reclamation
579  * attempts have succeeded or failed (as well as a breakout for failures due to
580  * the page being wired).  Guarded by the free page lock.
581  */
582 uint64_t vm_mte_refill_thread_wakeups;
583 uint64_t vm_page_tag_storage_activation_count;
584 uint64_t vm_page_tag_storage_deactivation_count;
585 uint64_t vm_page_tag_storage_reclaim_from_cpu_count;
586 uint64_t vm_page_tag_storage_reclaim_success_count;
587 uint64_t vm_page_tag_storage_reclaim_failure_count;
588 uint64_t vm_page_tag_storage_reclaim_wired_failure_count;
589 uint64_t vm_page_tag_storage_wire_relocation_count;
590 uint64_t vm_page_tag_storage_reclaim_compressor_failure_count;
591 uint64_t vm_page_tag_storage_compressor_relocation_count;
592 
593 #ifndef VM_MTE_FF_VERIFY
594 /*
595  * Diagnostic counter for reclamation describing the number of tag storage
596  * pages that have ever been allocated as claimed. Note that this value
597  * only increases.
598  */
599 SCALABLE_COUNTER_DEFINE(vm_cpu_claimed_count);
600 #endif /* VM_MTE_FF_VERIFY */
601 
602 /*
603  * Array of 4 64-bit masks for which VM tags can use tag storage.
604  * There are a total of 256 VM tags.
605  * This shouldn't be mutated after bootstrap... so it has no lock.
606  */
607 bitmap_t vm_mte_tag_storage_for_vm_tags_mask[BITMAP_LEN(VM_MEMORY_COUNT)];
608 
609 #pragma mark cell_idx_t
610 
611 __pure2
612 static bool
613 cell_idx_is_queue(cell_idx_t idx)
614 {
615 	return idx < 0;
616 }
617 
618 __pure2
619 static cell_t *
620 cell_from_idx(cell_idx_t idx)
621 {
622 	return &mte_info_cells[idx];
623 }
624 
625 __pure2
626 __attribute__((overloadable))
627 static cell_idx_t
628 cell_idx(const cell_t *cell)
629 {
630 	return (cell_idx_t)(cell - mte_info_cells);
631 }
632 
633 __pure2
634 __attribute__((overloadable))
635 static cell_idx_t
636 cell_idx(mte_cell_queue_t queue)
637 {
638 	return cell_idx(&queue->head);
639 }
640 
641 __pure2
642 static cell_count_t
643 cell_free_page_count(cell_t cell)
644 {
645 	return __builtin_popcountll(cell.free_mask);
646 }
647 
648 __pure2
649 static ppnum_t
650 cell_first_covered_pnum(const cell_t *cell)
651 {
652 	return pmap_first_pnum + cell_idx(cell) * MTE_PAGES_PER_TAG_PAGE;
653 }
654 
655 
656 #pragma mark mte_cell_queue_t
657 
658 /*
659  * Based on the existing queue code in XNU.  Look at <kern/queue.h> for the
660  * original code; done here due to the custom linkages.
661  */
662 
663 static cell_idx_t
664 cell_queue_first_idx(mte_cell_queue_t queue)
665 {
666 	return queue->head.next;
667 }
668 
669 static cell_idx_t
670 cell_queue_last_idx(mte_cell_queue_t queue)
671 {
672 	return queue->head.prev;
673 }
674 
675 static cell_t *
676 cell_queue_first(mte_cell_queue_t queue)
677 {
678 	return cell_from_idx(cell_queue_first_idx(queue));
679 }
680 
681 static uint32_t
682 cell_queue_count(mte_cell_queue_t queue)
683 {
684 	return queue->head.cell_count;
685 }
686 
687 
688 static bool
689 cell_queue_insert_tail(mte_cell_queue_t queue, cell_t *cell)
690 {
691 	cell_idx_t qidx = cell_idx(queue);
692 	cell_idx_t tidx = cell_queue_last_idx(queue);
693 	cell_t    *tail = cell_from_idx(tidx);
694 
695 	if (tail->next != qidx) {
696 		__queue_element_linkage_invalid(tail);
697 	}
698 
699 	cell->next = qidx;
700 	cell->prev = tidx;
701 	queue->head.prev = tail->next = cell_idx(cell);
702 
703 	/* If the original tail was the queue, then it was empty. */
704 	return cell_idx_is_queue(tidx);
705 }
706 
707 static bool
708 cell_queue_remove(cell_t *cell)
709 {
710 	cell_idx_t pidx = cell->prev;
711 	cell_idx_t nidx = cell->next;
712 	cell_idx_t cidx = cell_idx(cell);
713 	cell_t    *prev = cell_from_idx(pidx);
714 	cell_t    *next = cell_from_idx(nidx);
715 
716 	if (prev->next != cidx || next->prev != cidx) {
717 		__queue_element_linkage_invalid(cell);
718 	}
719 
720 	next->prev = pidx;
721 	prev->next = nidx;
722 	/* No linkage cleanup because cells are never dequeued at rest. */
723 
724 	/*
725 	 * If the prev and next indices are the same, then this is the head
726 	 * index, and the queue became empty
727 	 */
728 
729 	return pidx == nidx;
730 }
731 
732 #define cell_queue_foreach(it, q) \
733 	for (cell_t *it = cell_queue_first(q); \
734 	     it != &(q)->head; \
735 	     it = cell_from_idx(it->next))
736 
737 #define cell_queue_foreach_safe(it, q) \
738 	for (cell_t *__next_it, *it = cell_queue_first(q); \
739 	     it != &(q)->head && (__next_it = cell_from_idx(it->next), 1); \
740 	     it = __next_it)
741 
742 
743 #pragma mark MTE free queue
744 
745 /*
746  * The MTE free queue is a multi-dimensioned queue that replaces the
747  * vm_page_free_queue for covered pages on MTE targets.
748  *
749  * It is an array of colored free queues indexed by @c mte_free_queue_idx_t.
750  *
751  *
752  * A queue of tag storage pages
753  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
754  *
755  * When a tag storage page has no associated free covered pages, no page is
756  * enqueued on the mte free queue. However, when a tag storage page has one or
757  * more free covered pages associated, then one and only one of these
758  * pages is enqueued on the mte free queues.
759  *
760  * The chosen representative for the cell is remembered in the
761  * associated tag storage cell's @c cell_t::enqueue_pos value.
762  *
763  *
764  * Enqueue / dequeue algorithm
765  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
766  *
767  * This chosen representative makes the cluster available for its page color,
768  * and only this color, despite other colors possibly being available for this
769  * tag storage page.
770  *
771  * When removing a free page from the MTE queue, if the page being grabbed
772  * was the enqueued candidate, then the next enqueued candidate is chosen
773  * as the next free page in bitmask "circular" order
774  * (@see mteinfo_free_queue_next_bit()).
775  *
776  * As a result, by "pushing" the page forward this way, the tag storage page
777  * will be made available through all colors that it can provide.
778  *
779  *
780  * Allocation stability and bucket selection
781  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
782  *
783  * The free queues are in that order:
784  *
785  *   {claimed/disabled} -> {inactive_0, inactive_1} ->
786  *   {active_0, active_1, active_2, active_3} -> {activating}
787  *
788  * This is selected carefully to have the following crucial properties:
789  *
790  * - allocating untagged pages chooses buckets "left to right"
791  *   (in increasing free queue index order).
792  *
793  * - allocating tagged pages chooses active buckets "right to left"
794  *   (in decreasing free queue index from the active_* queues).
795  *
796  * - when allocating untagged pages, the impact on the tag storage page will
797  *   be that it stays in the same free queue or moves "down" in the free queue
798  *   indices order.
799  *
800  * - when allocating tagged pages, the impact on the tag storage page will
801  *   be that it stays in the same free queue or moves "up" in the free queue
802  *   indices order.
803  *
804  * This is important and allows for a nice optimization: if a tag storage page
805  * was found to be a good candidate for a given grab operation, it always will
806  * stay a "best" candidate until it has no free pages left, which allows for
807  * allocations of contiguous spans of pages at once
808  * (@see mteinfo_free_queue_grab()).
809  *
810  * Lastly, in order to find the first free bucket quickly,
811  * @c mte_free_queue_mask is a bitmask where a bit being set means that the
812  * corresponding bucket has at least one non-empty queue.
813  *
814  *
815  * Tag Storage Free queue
816  * ~~~~~~~~~~~~~~~~~~~~~~
817  *
818  * Tag storage pages can only be claimed if they are inactive with the [C.1]
819  * transition. Getting pages to inactive is done via the Deactivation [D.*].
820  *
821  * However, as we mentioned, the MTE free queue is only about covered pages
822  * proper, and does not contain the tag storage pages. Another point is that
823  * we do not want to claim pages too aggressively as it could get in the way
824  * of the Activation [A.*] transition when tagged pages are required.
825  *
826  * To solve this tension, the @c mte_claimable_queue holds inactive tag storage
827  * pages that have 8 or fewer free pages at any given time. These are unlikely
828  * to be profitable activation candidates, but also demonstrate that there is
829  * enough untagged memory pressure on the system that we have clusters of
830  * covered pages in use.
831  *
832  * The @c mteinfo_free_queue_grab() code will promote these to a per-cpu
833  * free queue that in turn the @c vm_page_grab_options() fastpath can tap into
834  * as another opportunistic source of pages.
835  */
836 struct vm_page_free_queue mte_free_queues[MTE_FREE_NOT_QUEUED];
837 struct vm_page_free_queue mte_claimable_queue;
838 static uint32_t mte_free_queue_mask;
839 
840 /*!
841  * @abstract
842  * Computes the proper mte free queue index for a given cell.
843  */
844 __pure2
845 static mte_free_queue_idx_t
846 mteinfo_free_queue_idx(cell_t cell)
847 {
848 	uint32_t free   = cell_free_page_count(cell);
849 	uint32_t tagged = cell.mte_page_count;
850 	uint32_t used   = MTE_PAGES_PER_TAG_PAGE - free - tagged;
851 	uint32_t n;
852 
853 	if (cell.free_mask == 0) {
854 		return MTE_FREE_NOT_QUEUED;
855 	}
856 
857 	switch (cell.state) {
858 	case MTE_STATE_DISABLED:
859 	case MTE_STATE_PINNED:
860 	case MTE_STATE_DEACTIVATING:
861 		return MTE_FREE_UNTAGGABLE_0;
862 
863 	case MTE_STATE_CLAIMED:
864 	case MTE_STATE_INACTIVE:
865 		/*
866 		 * This is "clever" code to map:
867 		 * MTE_FREE_UNTAGGABLE_0: Claimed[0-16]
868 		 * MTE_FREE_UNTAGGABLE_1: Claimed[17-32], Inactive[0-16]
869 		 * MTE_FREE_UNTAGGABLE_2: Inactive[17-32]
870 		 */
871 		n = MTE_FREE_UNTAGGABLE_0 + cell.state - MTE_STATE_CLAIMED;
872 		static_assert(MTE_STATE_INACTIVE == MTE_STATE_CLAIMED + 1);
873 		return n + (free > MTE_PAGES_PER_TAG_PAGE / 2);
874 
875 	case MTE_STATE_RECLAIMING:
876 	case MTE_STATE_ACTIVATING:
877 		return MTE_FREE_UNTAGGABLE_ACTIVATING;
878 
879 	case MTE_STATE_ACTIVE:
880 		break;
881 	}
882 
883 	/*
884 	 * Empirically this seems to give decent fragmentation results
885 	 * with alternating MTE/non-MTE workloads.
886 	 *
887 	 * This tries to find a balance between favoring buckets with mte pages
888 	 * allocated and to penalize buckets with untagged pages allocated,
889 	 * allocated and penalizing buckets with untagged pages allocated,
890 	 *
891 	 * The distribution it generates can be printed by running the
892 	 * "active_buckets" subtest of tests/vm/vm_mteinfo.c
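	 *
	 * As an informal worked example (assuming MTE_PAGES_PER_TAG_PAGE == 32):
	 * a cell with tagged == 8 and free == 10 (hence used == 14) gives
	 * n = 8 + 10/5 = 10, then n -= MIN(10, 14)/3 == 3 leaving n == 7, so the
	 * cell lands in bucket MTE_FREE_ACTIVE_0 + fls(7/4) == MTE_FREE_ACTIVE_0 + 1.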
893 	 */
894 
895 	n  = tagged + free / 5;
896 	n -= MIN(n, used) / 3;
897 	return MTE_FREE_ACTIVE_0 + fls(n / 4);
898 }
899 
900 static vm_page_queue_t
901 mteinfo_free_queue_head(mte_free_queue_idx_t idx, uint32_t color)
902 {
903 	return &mte_free_queues[idx].vmpfq_queues[color].qhead;
904 }
905 
906 /*!
907  * @abstract
908  * Computes the next bit in "circular" mask order
909  *
910  * @discussion
911  * This computes the next bit set in @c mask that is larger than or equal
912  * to @c bit, or if none exist, then the smallest bit set in @c mask.
913  *
914  * This means that for a mask with positions mask={1, 5, 6, 10} set,
915  * the "next" bit for:
916  * - 4 is 5,
917  * - 10 is 10,
918  * - 12 is 1.
919  *
920  * @param mask        The mask to scan. The mask must be non-zero.
921  * @param bit         The bit to scan from.
922  * @returns           The next bit set in "circular" order.
923  */
924 static cell_count_t
925 mteinfo_free_queue_next_bit(uint32_t mask, cell_count_t bit)
926 {
927 	cell_count_t cur = bit % MTE_PAGES_PER_TAG_PAGE;
928 
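	/*
	 * Rotate the mask right by `cur` so that bit `cur` lands at position 0;
	 * ffs() then yields the distance to the first set bit at or after `cur`
	 * in circular order.
	 */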
929 	mask = (mask >> cur) | (mask << (32 - cur));
930 	bit += ffs(mask) - 1;
931 
932 	return bit % MTE_PAGES_PER_TAG_PAGE;
933 }
934 
935 /*!
936  * @abstract
937  * Backend for CELL_UPDATE() to manage update/requeues to the mte free queue.
938  *
939  * @param cell        The new state of the cell.
940  * @param orig        The original state of the cell.
941  * @param oidx        The original free queue index for the cell.
942  * @param nidx        The new free queue index for the cell.
943  */
944 __attribute__((noinline))
945 static void
946 mteinfo_free_queue_requeue(
947 	cell_t                 *cell,
948 	const cell_t            orig,
949 	mte_free_queue_idx_t    oidx,
950 	mte_free_queue_idx_t    nidx)
951 {
952 	ppnum_t         first_pnum = cell_first_covered_pnum(cell);
953 	vm_page_queue_t queue;
954 	cell_count_t    bit = orig.enqueue_pos;
955 	vm_page_t       mem;
956 
957 	if (oidx == MTE_FREE_NOT_QUEUED && nidx == MTE_FREE_NOT_QUEUED) {
958 		cell->enqueue_pos = -1;
959 		return;
960 	}
961 
962 	if (oidx != MTE_FREE_NOT_QUEUED) {
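		/* Dequeue the cell's current representative page from its old free queue. */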
963 		mem   = vm_page_find_canonical(first_pnum + bit);
964 		queue = mteinfo_free_queue_head(oidx,
965 		    (first_pnum + bit) & vm_color_mask);
966 		assert(bit_test(orig.free_mask, bit));
967 
968 		vm_page_queue_remove(queue, mem, vmp_pageq);
969 		VM_COUNTER_DEC(&mte_free_queues[oidx].vmpfq_count);
970 		if (mte_free_queues[oidx].vmpfq_count == 0) {
971 			bit_clear(mte_free_queue_mask, oidx);
972 		}
973 	}
974 
975 	if (nidx == MTE_FREE_NOT_QUEUED) {
976 		cell->enqueue_pos = -1;
977 	} else {
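		/*
		 * Pick the next representative page in circular free_mask order and
		 * enqueue it on the new free queue for its color.
		 */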
978 		bit   = mteinfo_free_queue_next_bit(cell->free_mask, bit);
979 		mem   = vm_page_find_canonical(first_pnum + bit);
980 		queue = mteinfo_free_queue_head(nidx,
981 		    (first_pnum + bit) & vm_color_mask);
982 		assert(bit_test(cell->free_mask, bit));
983 
984 		cell->enqueue_pos = bit;
985 		vm_page_queue_enter_first(queue, mem, vmp_pageq);
986 		if (mte_free_queues[nidx].vmpfq_count == 0) {
987 			bit_set(mte_free_queue_mask, nidx);
988 		}
989 		VM_COUNTER_INC(&mte_free_queues[nidx].vmpfq_count);
990 	}
991 }
992 
993 
994 #pragma mark mte_cell_list_t
995 
996 __pure2
997 static mte_cell_bucket_t
998 cell_list_idx_buckets(mte_cell_list_idx_t idx)
999 {
1000 	switch (idx) {
1001 	case MTE_LIST_INACTIVE_IDX:
1002 	case MTE_LIST_CLAIMED_IDX:
1003 	case MTE_LIST_ACTIVE_0_IDX:
1004 		return MTE_BUCKETS_COUNT_MAX;
1005 	default:
1006 		return 1;
1007 	}
1008 }
1009 
1010 __pure2
1011 static mte_cell_list_idx_t
1012 cell_list_idx(const cell_t cell)
1013 {
1014 	if (cell.state != MTE_STATE_ACTIVE || cell.mte_page_count == 0) {
1015 		return (mte_cell_list_idx_t)cell.state;
1016 	}
1017 
1018 	return MTE_LIST_ACTIVE_IDX;
1019 }
1020 
1021 __pure2
1022 static mte_cell_bucket_t
1023 cell_list_bucket(const cell_t cell)
1024 {
1025 	if (cell_list_idx_buckets(cell_list_idx(cell)) > 1) {
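		/*
		 * Ceiling division by 8 maps the free page count to a bucket:
		 * 0 -> MTE_BUCKET_0, 1-8 -> MTE_BUCKET_1_8, ...,
		 * 25-32 -> MTE_BUCKET_25_32.
		 */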
1026 		return (cell_free_page_count(cell) + 7) / 8;
1027 	}
1028 	return 0;
1029 }
1030 
1031 __pure2
1032 static inline bool
1033 cell_on_claimable_queue(const cell_t cell)
1034 {
1035 	if (cell.state == MTE_STATE_INACTIVE) {
1036 		return cell_list_bucket(cell) <= MTE_BUCKET_1_8;
1037 	}
1038 	return false;
1039 }
1040 
1041 __attribute__((noinline))
1042 static void
1043 cell_list_requeue(
1044 	cell_t                 *cell,
1045 	vm_page_t               tag_page,
1046 	mte_cell_list_idx_t     oidx,
1047 	mte_cell_bucket_t       obucket,
1048 	mte_cell_list_idx_t     nidx,
1049 	mte_cell_bucket_t       nbucket,
1050 	int                     claim_requeue)
1051 {
1052 	mte_cell_list_t olist = &mte_info_lists[oidx];
1053 	mte_cell_list_t nlist = &mte_info_lists[nidx];
1054 
1055 	if (cell_queue_remove(cell)) {
1056 		bit_clear(olist->mask, obucket);
1057 	}
1058 
1059 	if (cell_queue_insert_tail(&nlist->buckets[nbucket], cell)) {
1060 		bit_set(nlist->mask, nbucket);
1061 	}
1062 
1063 	olist->buckets[obucket].head.cell_count--;
1064 	nlist->buckets[nbucket].head.cell_count++;
1065 
1066 	if (olist != nlist) {
1067 		olist->count--;
1068 		nlist->count++;
1069 	}
1070 
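	/*
	 * claim_requeue is +1 when the cell just became claimable and -1 when it
	 * just stopped being claimable (see CELL_UPDATE()).
	 */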
1071 	if (claim_requeue) {
1072 #ifndef VM_MTE_FF_VERIFY
1073 		uint32_t        color = VM_PAGE_GET_COLOR(tag_page);
1074 		vm_page_queue_t queue;
1075 
1076 		queue = &mte_claimable_queue.vmpfq_queues[color].qhead;
1077 		if (claim_requeue > 0) {
1078 			vm_page_queue_enter(queue, tag_page, vmp_pageq);
1079 		} else {
1080 			vm_page_queue_remove(queue, tag_page, vmp_pageq);
1081 		}
1082 		VM_COUNTER_DELTA(&mte_claimable_queue.vmpfq_count, claim_requeue);
1083 #endif /* VM_MTE_FF_VERIFY */
1084 	}
1085 }
1086 
1087 /*!
1088  * @abstract
1089  * Find a page in the last (highest) non-empty bucket whose index is at
1090  * least the specified minimum bucket index.
1091  *
1092  * @param lidx          The list index to scan.
1093  * @param min_bucket    The minimum bucket index to consider.
1094  * @param tag_page      The tag page associated with the returned cell.
1095  * @returns             The cell that was found or NULL.
1096  */
1097 static cell_t *
1098 cell_list_find_last_page(
1099 	mte_cell_list_idx_t     lidx,
1100 	mte_cell_bucket_t       min_bucket,
1101 	vm_page_t              *tag_page)
1102 {
1103 	mte_cell_list_t  list = &mte_info_lists[lidx];
1104 	uint32_t         mask = list->mask & ~mask(min_bucket);
1105 	mte_cell_queue_t queue;
1106 
1107 	if (__improbable(mask == 0)) {
1108 		*tag_page = VM_PAGE_NULL;
1109 		return NULL;
1110 	}
1111 
1112 	queue = &list->buckets[fls(mask) - 1];
1113 	*tag_page = vm_tag_storage_page_get(cell_queue_first_idx(queue));
1114 	return cell_queue_first(queue);
1115 }
1116 
1117 
1118 #pragma mark Tag storage space state machine
1119 
1120 /*!
1121  * Assert that a cell is in one of the states specified by the mask.
1122  */
1123 #define assert_cell_state(cell, mask) \
1124 	release_assert(((mask) & (1 << (cell)->state)) != 0)
1125 
1126 /*!
1127  * Perform an arbitrary update on a cell, and update the MTE info queues
1128  * accordingly.
1129  *
1130  * This should be used this way:
1131  *
1132  * <code>
1133  *   // Preflights and asserts here
1134  *   assert_cell_state(cell_var, ...);
1135  *
1136  *   CELL_UPDATE(cell_var, tag_page, cleared_bit, {
1137  *       // Mutations of cell_var here
1138  *       cell_var->state = ...;
1139  *   });
1140  * </code>
1141  *
1142  * @param cell          The cell to update.
1143  * @param tag_page      The tag page corresponding to @c cell.
1144  * @param cleared_bit   Whether a bit was cleared in @c free_mask.
1145  * @param mut           Code that mutates its argument, and performs the
1146  *                      required update.
1147  */
1148 #define CELL_UPDATE(cell, tag_page, cleared_bit, ...)  ({                       \
1149 	mte_cell_list_idx_t  __ol, __nl;                                        \
1150 	mte_cell_bucket_t    __ob, __nb;                                        \
1151 	mte_free_queue_idx_t __oi, __ni;                                        \
1152 	int                  __ocq, __ncq;                                      \
1153 	cell_t              *__cell = (cell);                                   \
1154 	cell_t               __orig = *__cell;                                  \
1155                                                                                 \
1156 	__ol  = cell_list_idx(__orig);                                          \
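	/* Snapshot the cell's old list/bucket/free-queue classification. */            \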
1157 	__ob  = cell_list_bucket(__orig);                                       \
1158 	__ocq = cell_on_claimable_queue(__orig);                                \
1159 	__oi  = mteinfo_free_queue_idx(__orig);                                 \
1160                                                                                 \
1161 	__VA_ARGS__;                                                            \
1162                                                                                 \
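	/* Recompute the classification after the mutation, requeue if changed. */     \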
1163 	__nl  = cell_list_idx(*__cell);                                         \
1164 	__nb  = cell_list_bucket(*__cell);                                      \
1165 	__ncq = cell_on_claimable_queue(*__cell);                               \
1166 	__ni  = mteinfo_free_queue_idx(*__cell);                                \
1167                                                                                 \
1168 	if (__ol != __nl || __ob != __nb) {                                     \
1169 	        cell_list_requeue(__cell, tag_page, __ol, __ob, __nl, __nb,     \
1170 	            __ncq - __ocq);                                             \
1171 	}                                                                       \
1172 	if (__oi != __ni || (cleared_bit)) {                                    \
1173 	        mteinfo_free_queue_requeue(__cell, __orig, __oi, __ni);         \
1174 	}                                                                       \
1175 })
1176 
1177 __pure2
1178 static cell_t *
1179 cell_from_tag_storage_page(const struct vm_page *page)
1180 {
1181 	cell_idx_t pidx;
1182 
1183 	pidx = (cell_idx_t)(page - vm_pages_tag_storage_array_internal());
1184 	return cell_from_idx(pidx);
1185 }
1186 
1187 __pure2
1188 __attribute__((overloadable))
1189 static cell_t *
1190 cell_from_covered_ppnum(ppnum_t pnum)
1191 {
1192 	cell_idx_t cidx = (pnum - pmap_first_pnum) / MTE_PAGES_PER_TAG_PAGE;
1193 
1194 	return cell_from_idx(cidx);
1195 }
1196 
1197 __pure2
1198 __attribute__((overloadable))
1199 static cell_t *
1200 cell_from_covered_ppnum(ppnum_t pnum, vm_page_t *tag_page)
1201 {
1202 	cell_idx_t cidx = (pnum - pmap_first_pnum) / MTE_PAGES_PER_TAG_PAGE;
1203 
1204 	*tag_page = vm_tag_storage_page_get(cidx);
1205 	return cell_from_idx(cidx);
1206 }
1207 
1208 /*!
1209  * @function mteinfo_tag_storage_set_active()
1210  *
1211  * @abstract
1212  * Mark a tag storage page as active.
1213  *
1214  * @discussion
1215  * The page should be disabled (initial activation), or activating/reclaiming.
1216  *
1217  * @param tag_page      The pointer to a page inside the tag storage space.
1218  * @param mte_count     How many covered pages are used and tagged for @c tag_page.
1219  * @param init          Whether this is the initial transition.
1220  * @returns             The number of covered pages this made taggable.
1221  */
1222 static uint32_t
1223 mteinfo_tag_storage_set_active(vm_page_t tag_page, uint32_t mte_count, bool init)
1224 {
1225 	cell_t      *cell = cell_from_tag_storage_page(tag_page);
1226 	cell_count_t free_page_count = cell_free_page_count(*cell);
1227 
1228 	assert(mte_count + free_page_count <= MTE_PAGES_PER_TAG_PAGE);
1229 	if (init) {
1230 		assert_cell_state(cell,
1231 		    /* [I.1] */ MTE_MASK_DISABLED);
1232 	} else {
1233 		assert_cell_state(cell,
1234 		    /* [R.2] */ MTE_MASK_RECLAIMING |
1235 		    /* [A.2] */ MTE_MASK_ACTIVATING);
1236 	}
1237 
1238 	VM_COUNTER_ADD(&vm_page_free_taggable_count, free_page_count);
1239 	vm_page_tag_storage_activation_count++;
1240 
1241 	CELL_UPDATE(cell, tag_page, false, {
1242 		cell->state = MTE_STATE_ACTIVE;
1243 		cell->mte_page_count = mte_count;
1244 	});
1245 
1246 	return free_page_count;
1247 }
1248 
1249 bool
1250 mteinfo_tag_storage_disabled(const struct vm_page *tag_page)
1251 {
1252 	return cell_from_tag_storage_page(tag_page)->state == MTE_STATE_DISABLED;
1253 }
1254 
1255 bool
1256 mteinfo_tag_storage_is_active(const struct vm_page *tag_page)
1257 {
1258 	return cell_from_tag_storage_page(tag_page)->state == MTE_STATE_ACTIVE;
1259 }
1260 
1261 void
1262 mteinfo_tag_storage_set_retired(vm_page_t tag_page)
1263 {
1264 	cell_t *cell = cell_from_tag_storage_page(tag_page);
1265 
1266 	assert(cell->mte_page_count == 0);
1267 	assert_cell_state(cell,
1268 	    /* [K.3] */ MTE_MASK_DISABLED |
1269 	    /* [K.2] */ MTE_MASK_CLAIMED |
1270 	    /* [K.1] */ MTE_MASK_RECLAIMING);
1271 
1272 	VM_COUNTER_INC(&vm_page_retired_tag_storage_count);
1273 
1274 	CELL_UPDATE(cell, tag_page, false, {
1275 		cell->state = MTE_STATE_DISABLED;
1276 	});
1277 }
1278 
1279 #ifndef VM_MTE_FF_VERIFY
1280 /*!
1281  * @function mteinfo_tag_storage_set_unmanaged()
1282  *
1283  * @abstract
1284  * Mark a tag storage page as actually being disabled-unmanaged
1285  *
1286  * @discussion
1287  * The tag storage page must be claimed or inactive.
1288  *
1289  * @param cell          The cell to mark as disabled.
1290  * @param tag_page      The tag page corresponding to @c cell.
1291  */
1292 static void
1293 mteinfo_tag_storage_set_unmanaged(cell_t *cell, vm_page_t tag_page)
1294 {
1295 	bool queue = cell->state == MTE_STATE_INACTIVE;
1296 
1297 	assert(cell->mte_page_count == 0);
1298 	assert(cell->free_mask == 0);
1299 
1300 	assert_cell_state(cell,
1301 	    /* [U.1] */ MTE_MASK_CLAIMED |
1302 	    /* [U.2] */ MTE_MASK_INACTIVE);
1303 
1304 	VM_COUNTER_INC(&vm_page_unmanaged_tag_storage_count);
1305 
1306 	CELL_UPDATE(cell, tag_page, false, {
1307 		cell->state = MTE_STATE_DISABLED;
1308 	});
1309 
1310 	if (queue) {
1311 		vm_page_free_queue_enter(VM_MEMORY_CLASS_DEAD_TAG_STORAGE,
1312 		    tag_page, VM_PAGE_GET_PHYS_PAGE(tag_page));
1313 	}
1314 }
1315 #endif /* VM_MTE_FF_VERIFY */
1316 
1317 void
1318 mteinfo_tag_storage_set_inactive(vm_page_t tag_page, bool init)
1319 {
1320 	cell_t *cell = cell_from_tag_storage_page(tag_page);
1321 
1322 	assert(cell->mte_page_count == 0);
1323 	if (init) {
1324 		assert_cell_state(cell,
1325 		    /* [I.2] */ MTE_MASK_DISABLED);
1326 	} else {
1327 		assert_cell_state(cell,
1328 		    /* [D.2] */ MTE_MASK_DEACTIVATING |
1329 		    /* [F.1] */ MTE_MASK_CLAIMED |
1330 		    /* [F.2] */ MTE_MASK_RECLAIMING);
1331 	}
1332 
1333 #ifndef VM_MTE_FF_VERIFY
1334 	if (cell->state == MTE_STATE_CLAIMED) {
1335 		/*
1336 		 * This is to account for [F.1].
1337 		 * For [F.2], we already decremented due to [R.1]
1338 		 */
1339 		counter_dec(&vm_cpu_claimed_count);
1340 	}
1341 #endif /* VM_MTE_FF_VERIFY */
1342 
1343 	CELL_UPDATE(cell, tag_page, false, {
1344 		cell->state = MTE_STATE_INACTIVE;
1345 	});
1346 }
1347 
1348 void
1349 mteinfo_tag_storage_set_claimed(vm_page_t tag_page)
1350 {
1351 	cell_t *cell = cell_from_tag_storage_page(tag_page);
1352 
1353 	assert(cell->mte_page_count == 0);
1354 	assert_cell_state(cell,
1355 	    /* [C.1] */ MTE_MASK_INACTIVE |
1356 	    /* [R.x] */ MTE_MASK_RECLAIMING);
1357 
1358 #ifndef VM_MTE_FF_VERIFY
1359 	if (cell->state == MTE_STATE_RECLAIMING) {
1360 		counter_inc(&vm_cpu_claimed_count);
1361 	}
1362 #endif /* VM_MTE_FF_VERIFY */
1363 
1364 	CELL_UPDATE(cell, tag_page, false, {
1365 		cell->state = MTE_STATE_CLAIMED;
1366 	});
1367 }
1368 
1369 /*!
1370  * @function mteinfo_tag_storage_set_reclaiming()
1371  *
1372  * @abstract
1373  * Mark a tag storage page as being reclaimed.
1374  *
1375  * @discussion
1376  * The tag storage page must be claimed.
1377  *
1378  * @param cell          The cell to mark as reclaiming
1379  * @param tag_page      The tag page corresponding to @c cell.
1380  */
1381 static void
1382 mteinfo_tag_storage_set_reclaiming(cell_t *cell, vm_page_t tag_page)
1383 {
1384 	assert(cell->mte_page_count == 0);
1385 	assert_cell_state(cell, /* [R.1] */ MTE_MASK_CLAIMED);
1386 
1387 	CELL_UPDATE(cell, tag_page, false, {
1388 		cell->state = MTE_STATE_RECLAIMING;
1389 	});
1390 
1391 #ifndef VM_MTE_FF_VERIFY
1392 	counter_dec(&vm_cpu_claimed_count);
1393 #endif /* VM_MTE_FF_VERIFY */
1394 }
1395 
1396 /*!
1397  * @function mteinfo_tag_storage_flush_reclaiming()
1398  *
1399  * @abstract
1400  * Empties the reclaiming queue, moving all pages on it back to claimed.
1401  */
1402 static void
1403 mteinfo_tag_storage_flush_reclaiming(void)
1404 {
1405 	mte_cell_list_t  list  = &mte_info_lists[MTE_LIST_RECLAIMING_IDX];
1406 	mte_cell_queue_t queue = &list->buckets[0];
1407 	uint32_t         batch = VMP_FREE_BATCH_SIZE;
1408 
1409 	while (cell_queue_count(queue) > 0) {
1410 		cell_idx_t idx      = cell_queue_first_idx(queue);
1411 		vm_page_t  tag_page = vm_tag_storage_page_get(idx);
1412 		cell_t    *cell     = cell_from_idx(idx);
1413 
1414 		assert_cell_state(cell, /* [R.x] */ MTE_MASK_RECLAIMING);
1415 		CELL_UPDATE(cell, tag_page, false, {
1416 			cell->state = MTE_STATE_CLAIMED;
1417 		});
1418 
1419 #ifndef VM_MTE_FF_VERIFY
1420 		counter_inc(&vm_cpu_claimed_count);
1421 #endif /* VM_MTE_FF_VERIFY */
1422 
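		/*
		 * Periodically drop and retake the free page lock so it is not
		 * held across the whole flush.
		 */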
1423 		if (--batch == 0 && cell_queue_count(queue)) {
1424 #ifndef VM_MTE_FF_VERIFY
1425 			vm_free_page_unlock();
1426 			vm_free_page_lock_spin();
1427 #endif /* VM_MTE_FF_VERIFY */
1428 			batch = VMP_FREE_BATCH_SIZE;
1429 		}
1430 	}
1431 }
1432 
1433 #ifndef VM_MTE_FF_VERIFY
1434 
1435 void
1436 mteinfo_tag_storage_wakeup(vm_page_t tag_page, bool fq_locked)
1437 {
1438 	cell_t *cell = cell_from_tag_storage_page(tag_page);
1439 
1440 	if (!fq_locked) {
1441 		vm_free_page_lock_spin();
1442 	}
1443 
1444 	assert(tag_page->vmp_ts_wanted);
1445 	tag_page->vmp_ts_wanted = false;
1446 
1447 	assert_cell_state(cell, /* [B.2] */ MTE_MASK_PINNED);
1448 	CELL_UPDATE(cell, tag_page, false, {
1449 		cell->state = MTE_STATE_CLAIMED;
1450 	});
1451 
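	/*
	 * The tag storage page is no longer pinned; if its covered span still has
	 * free pages and allocators are waiting for tagged pages, wake the fill
	 * thread so it can retry the reclaim.
	 */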
1452 	if (cell->free_mask != 0 &&
1453 	    (vm_page_free_wanted_tagged_privileged || vm_page_free_wanted_tagged)) {
1454 		mteinfo_wake_fill_thread();
1455 	}
1456 
1457 	if (!fq_locked) {
1458 		vm_free_page_unlock();
1459 	}
1460 
1461 	counter_inc(&vm_cpu_claimed_count);
1462 }
1463 
1464 #endif /* VM_MTE_FF_VERIFY */
1465 #pragma mark Covered pages state machine
1466 
1467 bool
1468 mteinfo_covered_page_taggable(ppnum_t pnum)
1469 {
1470 	return cell_from_covered_ppnum(pnum)->state == MTE_STATE_ACTIVE;
1471 }
1472 
1473 void
1474 mteinfo_covered_page_set_free(ppnum_t pnum, bool tagged)
1475 {
1476 	vm_page_t tag_page;
1477 	cell_t   *cell = cell_from_covered_ppnum(pnum, &tag_page);
1478 	int       bit  = pnum % MTE_PAGES_PER_TAG_PAGE;
1479 
1480 	assert(cell->mte_page_count >= tagged);
1481 	assert(!bit_test(cell->free_mask, bit));
1482 
1483 	VM_COUNTER_INC(&vm_page_free_count);
1484 	if (cell->state == MTE_STATE_ACTIVE) {
1485 		VM_COUNTER_INC(&vm_page_free_taggable_count);
1486 	}
1487 	if (tagged) {
1488 		VM_COUNTER_DEC(&vm_page_tagged_count);
1489 	}
1490 
1491 	CELL_UPDATE(cell, tag_page, false, {
1492 		cell->mte_page_count -= tagged;
1493 		bit_set(cell->free_mask, bit);
1494 	});
1495 }
1496 
1497 void
1498 mteinfo_covered_page_set_used(ppnum_t pnum, bool tagged)
1499 {
1500 	vm_page_t tag_page;
1501 	cell_t   *cell = cell_from_covered_ppnum(pnum, &tag_page);
1502 	int       bit  = pnum % MTE_PAGES_PER_TAG_PAGE;
1503 
1504 	assert(cell->mte_page_count + tagged <= MTE_PAGES_PER_TAG_PAGE);
1505 	assert(bit_test(cell->free_mask, bit));
1506 
1507 	VM_COUNTER_DEC(&vm_page_free_count);
1508 	if (cell->state == MTE_STATE_ACTIVE) {
1509 		VM_COUNTER_DEC(&vm_page_free_taggable_count);
1510 	}
1511 	if (tagged) {
1512 		VM_COUNTER_INC(&vm_page_tagged_count);
1513 	}
1514 
1515 	CELL_UPDATE(cell, tag_page, true, {
1516 		bit_clear(cell->free_mask, bit);
1517 		cell->mte_page_count += tagged;
1518 	});
1519 }
1520 
1521 __startup_func
1522 void
1523 mteinfo_covered_page_set_stolen_tagged(ppnum_t pnum)
1524 {
1525 	vm_page_t tag_page;
1526 	cell_t   *cell = cell_from_covered_ppnum(pnum, &tag_page);
1527 
1528 	assert(cell->mte_page_count < MTE_PAGES_PER_TAG_PAGE);
1529 	assert(!bit_test(cell->free_mask, pnum % MTE_PAGES_PER_TAG_PAGE));
1530 
1531 	CELL_UPDATE(cell, tag_page, false, {
1532 		cell->mte_page_count++;
1533 	});
1534 }
1535 
1536 void
1537 mteinfo_covered_page_clear_tagged(ppnum_t pnum)
1538 {
1539 	vm_page_t tag_page;
1540 	cell_t   *cell = cell_from_covered_ppnum(pnum, &tag_page);
1541 
1542 	assert(cell->mte_page_count > 0);
1543 	assert(!bit_test(cell->free_mask, pnum % MTE_PAGES_PER_TAG_PAGE));
1544 
1545 	CELL_UPDATE(cell, tag_page, false, {
1546 		cell->mte_page_count--;
1547 	});
1548 }
1549 
1550 #if DEBUG || DEVELOPMENT
1551 vm_page_t
1552 mteinfo_tag_page_from_covered_page(ppnum_t pnum, vm_offset_t * offset_to_tag_data)
1553 {
1554 	cell_idx_t cidx;
1555 	cell_t *cell;
1556 
1557 	if (!mteinfo_covered_page_taggable(pnum)) {
1558 		return NULL;
1559 	}
1560 
1561 	cidx = (pnum - pmap_first_pnum) / MTE_PAGES_PER_TAG_PAGE;
1562 	cell = cell_from_idx(cidx);
1563 
1564 	vm_page_t tag_page = vm_tag_storage_page_get(cidx);
1565 	assert(vm_page_in_tag_storage_array(tag_page));
1566 
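	/*
	 * Illustration (assuming 16K pages and MTE_PAGES_PER_TAG_PAGE == 32):
	 * each covered page's tags occupy 16384 / 32 = 512 bytes, so the
	 * covered page at index 3 within its cell has its tag data at offset
	 * 3 * 512 = 1536 bytes into the tag storage page.
	 */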
1567 	*offset_to_tag_data =
1568 	    (PAGE_SIZE / MTE_PAGES_PER_TAG_PAGE) *                      /* size of tag data */
1569 	    ((pnum - pmap_first_pnum) % MTE_PAGES_PER_TAG_PAGE);        /* index within cell */
1570 
1571 	return tag_page;
1572 }
1573 #endif /* DEBUG || DEVELOPMENT */
1574 
1575 #pragma mark Activate
1576 #ifndef VM_MTE_FF_VERIFY
1577 
1578 /*!
1579  * @function mteinfo_tag_storage_wire_locked()
1580  *
1581  * @abstract
1582  * Wire the given tag storage page.
1583  *
1584  * @discussion
1585  * The page will be wired as part of mte_tags_object.
1586  *
1587  * This must be called with the object lock and the page queues lock held.
1588  *
1589  * @param tag_page
1590  * A tag storage page.
1591  */
1592 static void
1593 mteinfo_tag_storage_wire_locked(vm_page_t tag_page)
1594 {
1595 	vm_object_offset_t page_addr = ptoa(VM_PAGE_GET_PHYS_PAGE(tag_page));
1596 
1597 	assert(tag_page->vmp_wire_count == 0);
1598 	vm_page_wire(tag_page, VM_KERN_MEMORY_MTAG,
1599 	    /* Don't check memory status. */ FALSE);
1600 
1601 	vm_page_insert_internal(tag_page, mte_tags_object, page_addr,
1602 	    VM_KERN_MEMORY_MTAG,
1603 	    /* We already hold the queue locks. */ TRUE,
1604 	    /* Add this page to the hash. */ TRUE,
1605 	    /* Don't bother batching pmap operations. */ FALSE,
1606 	    /* Don't bother batching accounting. */ FALSE,
1607 	    /* Don't bother with delayed ledger updates. */ NULL);
1608 }
1609 
1610 /*!
1611  * @function mteinfo_tag_storage_select_activating()
1612  *
1613  * @abstract
1614  * Select tag storage pages to activate, in order to make a target number of
1615  * free covered pages taggable.
1616  *
1617  * @discussion
1618  * The caller must make sure there's at least one page to activate for the
1619  * selected buckets.
1620  *
1621  * @param target        how many covered taggable free pages to try to generate
1622  *                      as a result of this activation.
1623  * @param bucket        which inactive bucket to start drawing from
1624  *
1625  * @returns             the list of tag storage pages to activate
1626  *                      with mteinfo_tag_storage_activate_locked().
1627  */
1628 static vm_page_list_t
1629 mteinfo_tag_storage_select_activating(uint32_t target, mte_cell_bucket_t bucket)
1630 {
1631 	vm_page_list_t list      = { };
1632 	vm_page_t      tag_page  = VM_PAGE_NULL;
1633 	cell_t        *cell      = NULL;
1634 	uint32_t       total     = 0;
1635 	uint32_t       covered   = 0;
1636 
1637 	/*
1638 	 * Convert the lock hold into a mutex, to signal to waiters that the
1639 	 * lock may be held for longer.
1640 	 */
1641 	vm_free_page_lock_convert();
1642 
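	/*
	 * Pull cells from the tail of the requested inactive bucket until the
	 * accumulated number of free covered pages reaches the target, or the
	 * bucket runs dry.
	 */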
1643 	do {
1644 		cell = cell_list_find_last_page(MTE_LIST_INACTIVE_IDX,
1645 		    bucket, &tag_page);
1646 		if (tag_page == VM_PAGE_NULL) {
1647 			break;
1648 		}
1649 
1650 		assert_cell_state(cell, /* [A.1] */ MTE_MASK_INACTIVE);
1651 		CELL_UPDATE(cell, tag_page, false, {
1652 			cell->state = MTE_STATE_ACTIVATING;
1653 		});
1654 
1655 		covered = cell_free_page_count(*cell);
1656 		total  += covered;
1657 
1658 		KDBG(VMDBG_CODE(DBG_VM_TAG_PAGE_INACTIVE) | DBG_FUNC_NONE,
1659 		    VM_KERNEL_ADDRHIDE(tag_page), covered);
1660 
1661 		tag_page->vmp_q_state = VM_PAGE_NOT_ON_Q;
1662 		vm_page_list_push(&list, tag_page);
1663 	} while (total < target);
1664 
1665 	return list;
1666 }
1667 
1668 /*!
1669  * @function mteinfo_tag_storage_activate_locked()
1670  *
1671  * @abstract
1672  * Activate a list of tag storage pages in reclaiming or activating state.
1673  *
1674  * @discussion
1675  * The page free queue lock must be held, however it is dropped and retaken by
1676  * this function.
1677  *
1678  * @param list          the list of pages to activate.
1679  * @param spin_mode     whether to take the free page queue lock in spin mode.
1680  *
1681  * @returns             how many covered pages have been made taggable.
1682  */
1683 static uint32_t
1684 mteinfo_tag_storage_activate_locked(vm_page_list_t list, bool spin_mode)
1685 {
1686 	vm_page_t tag_page  = VM_PAGE_NULL;
1687 	uint32_t  result, total;
1688 
1689 	vm_free_page_unlock();
1690 
1691 	/*
1692 	 * First, retype the pages and add them to the MTE object.
1693 	 */
1694 
1695 	vm_page_list_foreach(tag_page, list) {
1696 		ppnum_t tag_pnum = VM_PAGE_GET_PHYS_PAGE(tag_page);
1697 
1698 		assert(vm_page_is_tag_storage_pnum(tag_page, tag_pnum));
1699 		pmap_make_tag_storage_page(tag_pnum);
1700 	}
1701 
1702 	vm_object_lock(mte_tags_object);
1703 	vm_page_lock_queues();
1704 	vm_page_list_foreach(tag_page, list) {
1705 		vm_page_t save_snext = NEXT_PAGE(tag_page);
1706 
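		/*
		 * Like the unwiring path in mteinfo_tag_storage_drain_flush(),
		 * the wiring path expects the page linkage to be NULL, so
		 * transiently clear it and restore it once the page has been
		 * wired into mte_tags_object.
		 */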
1707 		NEXT_PAGE(tag_page) = VM_PAGE_NULL;
1708 		mteinfo_tag_storage_wire_locked(tag_page);
1709 		NEXT_PAGE(tag_page) = save_snext;
1710 	}
1711 	vm_page_unlock_queues();
1712 	vm_object_unlock(mte_tags_object);
1713 
1714 	if (spin_mode) {
1715 		vm_free_page_lock_spin();
1716 	} else {
1717 		vm_free_page_lock();
1718 	}
1719 
1720 	/*
1721 	 * Second, mark all the pages as active now, which makes the
1722 	 * covered pages available for taggable allocation.
1723 	 *
1724 	 * And recompute how many taggable pages we really freed,
1725 	 * as allocations/free of untagged pages could have made
1726 	 * progress while we dropped the free page queue lock.
1727 	 */
1728 
1729 	total = 0;
1730 	vm_page_list_foreach_consume(tag_page, &list) {
1731 		total += mteinfo_tag_storage_set_active(tag_page, 0, false);
1732 	}
1733 	result = total;
1734 
1735 
1736 	/*
1737 	 * Last, perform wakeups.
1738 	 *
1739 	 * 1. wake up other activators
1740 	 * 2. wake up privileged waiters
1741 	 * 3. wake up regular waiters
1742 	 *
1743 	 * We do not need to consider secluded pools or other waiters because
1744 	 * we never prevent them from allocating the pages associated with
1745 	 * the tag storage we are activating during this process, which is why
1746 	 * we don't use vm_page_free_queue_handle_wakeups_and_unlock() and
1747 	 * instead have this simplified implementation.
1748 	 */
1749 
1750 	if (vm_mte_activator_waiters) {
1751 		vm_mte_activator_waiters = false;
1752 		wakeup_all_with_inheritor(&vm_mte_activator_waiters,
1753 		    THREAD_AWAKENED);
1754 	}
1755 
1756 	if (vm_page_free_wanted_tagged_privileged && total) {
1757 		if (total < vm_page_free_wanted_tagged_privileged) {
1758 			vm_page_free_wanted_tagged_privileged -= total;
1759 			total = 0;
1760 		} else {
1761 			total -= vm_page_free_wanted_tagged_privileged;
1762 			vm_page_free_wanted_tagged_privileged = 0;
1763 		}
1764 		vm_page_free_wakeup(&vm_page_free_wanted_tagged_privileged,
1765 		    UINT32_MAX);
1766 	}
1767 
1768 	if (vm_page_free_wanted_tagged && total) {
1769 		uint32_t wakeup = 0;
1770 
1771 		if (total < vm_page_free_wanted_tagged) {
1772 			wakeup = total;
1773 			vm_page_free_wanted_tagged -= total;
1774 			total  = 0;
1775 		} else {
1776 			total -= vm_page_free_wanted_tagged;
1777 			vm_page_free_wanted_tagged = 0;
1778 			wakeup = UINT32_MAX;
1779 		}
1780 		vm_page_free_wakeup(&vm_page_free_wanted_tagged, wakeup);
1781 	}
1782 
1783 	return result;
1784 }
1785 
1786 bool
1787 mteinfo_tag_storage_try_activate(uint32_t target, bool spin_mode)
1788 {
1789 	mte_cell_bucket_t first_bucket = MTE_BUCKET_17_24;
1790 	thread_t          thread_self  = current_thread();
1791 	vm_page_list_t    list         = { };
1792 
1793 	/*
1794 	 * We only draw from buckets where more than half of the covered pages
1795 	 * are free.  We do not want to draw from less full buckets, as this is
1796 	 * too slow for the inline path; we rely on the refill thread for those instead.
1797 	 */
1798 
1799 	if (mte_info_lists[MTE_LIST_INACTIVE_IDX].mask < BIT(first_bucket)) {
1800 		return false;
1801 	}
1802 
1803 	if (vm_mte_activator) {
1804 		/*
1805 		 * We only allow one thread to activate pages at a time;
1806 		 * however, we only wait if the caller can't make progress
1807 		 * without the activation.
1808 		 *
1809 		 * We do not need to consider whether the waiter is privileged
1810 		 * for the wait, however, because activation isn't affected
1811 		 * by TH_OPT_VMPRIV.
1812 		 */
1813 
1814 		if (vm_page_free_taggable_count > vm_page_free_reserved) {
1815 			return false;
1816 		}
1817 		if (vm_page_free_taggable_count > 0 &&
1818 		    (thread_self->options & TH_OPT_VMPRIV)) {
1819 			return false;
1820 		}
1821 
1822 		vm_mte_activator_waiters = true;
1823 		lck_mtx_sleep_with_inheritor(&vm_page_queue_free_lock,
1824 		    spin_mode ? LCK_SLEEP_SPIN : LCK_SLEEP_DEFAULT,
1825 		    &vm_mte_activator_waiters, vm_mte_activator,
1826 		    THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
1827 
1828 		return true;
1829 	}
1830 
1831 	vm_mte_activator = thread_self;
1832 	list = mteinfo_tag_storage_select_activating(target, first_bucket);
1833 	mteinfo_tag_storage_activate_locked(list, spin_mode);
1834 	vm_mte_activator = THREAD_NULL;
1835 
1836 	return true;
1837 }
1838 
1839 
1840 #pragma mark Deactivate
1841 
1842 /*!
1843  * @abstract
1844  * Returns whether the active(0.0) bucket should be drained to make inactive
1845  * pages.
1846  *
1847  * @param for_wakeup    Whether the question is to wakeup the refill thread
1848  *                      (true) or decide whether the refill thread should keep
1849  *                      going (false).
1850  */
1851 static bool
1852 mteinfo_tag_storage_should_drain(bool for_wakeup)
1853 {
1854 	mte_cell_list_t active_0  = &mte_info_lists[MTE_LIST_ACTIVE_0_IDX];
1855 	uint32_t        threshold = VMP_FREE_BATCH_SIZE * (for_wakeup ? 2 : 1);
1856 
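	/*
	 * Wakeups use a higher threshold (two batches) than the keep-going
	 * check (one batch), so the refill thread isn't woken up for marginal
	 * amounts of work.
	 */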
1857 	if (!vm_mte_enable_tag_storage_grab) {
1858 		return false;
1859 	}
1860 
1861 	if (mte_claimable_queue.vmpfq_count >= vm_free_magazine_refill_limit) {
1862 		return false;
1863 	}
1864 
1865 	if (active_0->count <= vm_page_tag_storage_reserved) {
1866 		return false;
1867 	}
1868 
1869 	return cell_queue_count(&active_0->buckets[0]) >= threshold;
1870 }
1871 
1872 /*!
1873  * @function mteinfo_tag_storage_deactivate_barrier()
1874  *
1875  * @abstract
1876  * Wait until all possible untagging operations that could make deactivation
1877  * invalid have finished.
1878  *
1879  * @discussion
1880  * Before we can do any deactivation we must make sure
1881  * that no CPU has untagging activity in flight.
1882  *
1883  * See mteinfo_free_queue_grab() and mteinfo_page_list_fix_tagging().
1884  */
1885 static void
1886 mteinfo_tag_storage_deactivate_barrier(void)
1887 {
1888 	mte_pcpu_t this_cpu = PERCPU_GET(mte_pcpu);
1889 
1890 	assert(get_preemption_level() > 0);
1891 
1892 	percpu_foreach(it, mte_pcpu) {
1893 		if (it == this_cpu) {
1894 			/*
1895 			 * A thread is allowed to both have pending untagging
1896 			 * going on and a page to deactivate.
1897 			 *
1898 			 * As a result, ignore the current core's suspension
1899 			 * state as it is harmless as long as the core commits
1900 			 * to untagging before it does its deactivations.
1901 			 *
1902 			 * If a thread fails to do that, this will reliably
1903 			 * panic in SPTM, so the risk of silent bugs is rather
1904 			 * unlikely.
1905 			 */
1906 			continue;
1907 		}
1908 
1909 		if (os_atomic_load(&it->deactivate_suspend, relaxed)) {
1910 			hw_wait_while_equals32(&it->deactivate_suspend, 1);
1911 		}
1912 	}
1913 	os_atomic_thread_fence(seq_cst);
1914 }
1915 
1916 /*!
1917  * @abstract
1918  * Flush a list of deactivating tag storage pages.
1919  *
1920  * @discussion
1921  * The page free queue lock must be held, but will be dropped while this
1922  * function operates.
1923  *
1924  * @param list          The list of pages in @c MTE_STATE_DEACTIVATING state.
1925  */
1926 static void
1927 mteinfo_tag_storage_drain_flush(vm_page_list_t list)
1928 {
1929 	vm_page_t tag_page = VM_PAGE_NULL;
1930 
1931 	mteinfo_tag_storage_deactivate_barrier();
1932 
1933 	vm_free_page_unlock();
1934 
1935 	vm_object_lock(mte_tags_object);
1936 	vm_page_lock_queues();
1937 
1938 	vm_page_list_foreach(tag_page, list) {
1939 		vm_page_t save_next = NEXT_PAGE(tag_page);
1940 
1941 
1942 		/*
1943 		 * The unwiring path expects the page linkage to be
1944 		 * NULL, so transiently make it NULL.  We'll restore
1945 		 * the linkage after the unwire is done.
1946 		 */
1947 
1948 		NEXT_PAGE(tag_page) = VM_PAGE_NULL;
1949 		vm_page_unwire(tag_page,
1950 		    /* Don't put the page into aging queues. */ FALSE);
1951 		vm_page_remove(tag_page,
1952 		    /* Remove the page from the hash. */ TRUE);
1953 		NEXT_PAGE(tag_page) = save_next;
1954 	}
1955 
1956 	vm_page_unlock_queues();
1957 	vm_object_unlock(mte_tags_object);
1958 
1959 	vm_page_list_foreach(tag_page, list) {
1960 		pmap_unmake_tag_storage_page(VM_PAGE_GET_PHYS_PAGE(tag_page));
1961 	}
1962 
1963 	vm_free_page_lock_spin();
1964 
1965 	vm_page_tag_storage_deactivation_count += list.vmpl_count;
1966 
1967 	vm_page_list_foreach_consume(tag_page, &list) {
1968 		vm_page_free_queue_enter(VM_MEMORY_CLASS_TAG_STORAGE,
1969 		    tag_page, VM_PAGE_GET_PHYS_PAGE(tag_page));
1970 	}
1971 }
1972 
1973 /*!
1974  * @function mteinfo_tag_storage_drain()
1975  *
1976  * @abstract
1977  * Attempt to drain the active(0.0) bucket of pages since these are always
1978  * wasted.
1979  *
1980  * @discussion
1981  * This is one of the core routines of the fill thread.
1982  *
1983  * @returns
1984  * How many tag storage pages were deactivated.
1985  */
1986 static uint32_t
1987 mteinfo_tag_storage_drain(void)
1988 {
1989 	mte_cell_list_t  active_0 = &mte_info_lists[MTE_LIST_ACTIVE_0_IDX];
1990 	mte_cell_queue_t bucket_0 = &active_0->buckets[0];
1991 	vm_page_t        tag_page = VM_PAGE_NULL;
1992 	cell_t          *cell     = NULL;
1993 	uint32_t         total    = 0;
1994 	vm_page_list_t   list     = { };
1995 
1996 	LCK_MTX_ASSERT_OWNED_SPIN(&vm_page_queue_free_lock);
1997 
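	/*
	 * Deactivate fully-free active(0.0) cells in batches; each call to
	 * mteinfo_tag_storage_drain_flush() drops and retakes the free page
	 * queue lock while it unwires and retypes the tag storage pages.
	 */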
1998 	while (mteinfo_tag_storage_should_drain(false)) {
1999 		tag_page   = vm_tag_storage_page_get(cell_queue_first_idx(bucket_0));
2000 		cell       = cell_queue_first(bucket_0);
2001 
2002 		assert(cell->free_mask == 0);
2003 		assert_cell_state(cell, /* [D.1] */ MTE_MASK_ACTIVE);
2004 		CELL_UPDATE(cell, tag_page, false, {
2005 			cell->state = MTE_STATE_DEACTIVATING;
2006 		});
2007 
2008 		vm_page_list_push(&list, tag_page);
2009 
2010 		if (list.vmpl_count >= VMP_FREE_BATCH_SIZE) {
2011 			total += list.vmpl_count;
2012 			mteinfo_tag_storage_drain_flush(list);
2013 			list   = (vm_page_list_t){ };
2014 		}
2015 	}
2016 
2017 	if (list.vmpl_count) {
2018 		total += list.vmpl_count;
2019 		mteinfo_tag_storage_drain_flush(list);
2020 	}
2021 
2022 	return total;
2023 }
2024 
2025 
2026 #pragma mark Reclaim
2027 
2028 /*!
2029  * @abstract
2030  * Attempt to steal a tag page from a per cpu claimed free queue.
2031  *
2032  * @discussion
2033  * The caller must have checked that the tag_page is on a local free queue,
2034  * even if this check is racy.
2035  *
2036  * @param tag_page      A tag storage page appearing to sit on a per cpu queue.
2037  *
2038  * @returns             Whether stealing was successful (true) or not (false).
2039  */
2040 static bool
2041 mteinfo_reclaim_tag_storage_page_try_pcpu(vm_page_t tag_page)
2042 {
2043 	mte_pcpu_t mte_pcpu;
2044 	uint16_t   cpu;
2045 
2046 	cpu      = os_atomic_load(&tag_page->vmp_local_id, relaxed);
2047 	mte_pcpu = PERCPU_GET_WITH_BASE(other_percpu_base(cpu), mte_pcpu);
2048 
2049 	lck_ticket_lock(&mte_pcpu->free_claimed_lock, &vm_page_lck_grp_bucket);
2050 
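	/*
	 * The caller's observation was racy: re-check, under the per-cpu
	 * queue lock, that the page is still on this CPU's claimed free
	 * queue before stealing it.
	 */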
2051 	if (tag_page->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q &&
2052 	    tag_page->vmp_local_id == cpu) {
2053 		vm_page_queue_remove(&mte_pcpu->free_claimed_pages,
2054 		    tag_page, vmp_pageq);
2055 		tag_page->vmp_q_state  = VM_PAGE_NOT_ON_Q;
2056 		tag_page->vmp_local_id = 0;
2057 		counter_dec_preemption_disabled(&vm_cpu_free_claimed_count);
2058 	} else {
2059 		tag_page = VM_PAGE_NULL;
2060 	}
2061 
2062 	lck_ticket_unlock(&mte_pcpu->free_claimed_lock);
2063 
2064 	return tag_page != VM_PAGE_NULL;
2065 }
2066 
2067 /*!
2068  * @function mteinfo_reclaim_tag_storage_page()
2069  *
2070  * @abstract
2071  * Attempt to reclaim a claimed tag storage page.
2072  *
2073  * @discussion
2074  * This will try to reclaim a tag storage page by relocating its contents to a
2075  * different page, so that the tag storage page becomes (effectively) free.
2076  *
2077  * This expects a claimed tag storage page, and on success, will finish with
2078  * the page in the reclaimed state.  On failure, no guarantees are made about
2079  * the state of the page (due to locking operations); the page could still be
2080  * claimed, or reclamation may have failed because the page became free in the
2081  * interim.  However, if the page was not in a relocatable state, this function
2082  * will not force it out of the reclaiming state, so that the client can choose
2083  * when and why the page is returned to claimed.
2084  *
2085  * This function is called with the free page queue lock in spin mode and
2086  * returns with it held in spin mode.
2087  *
2088  * @param tag_page
2089  * The claimed tag storage page to try reclaiming.
2090  *
2091  * @returns
2092  * - KERN_SUCCESS               success,
2093  *
2094  * - KERN_INVALID_OBJECT        the page has no object set
2095  *
2096  * - KERN_NOT_WAITING           the state of the cell/tag page changed
2097  *                              during evaluation.
2098  *
2099  * - KERN_ABORTED               the tag page was wired. reclaiming it was
2100  *                              aborted and it was marked as MTE_STATE_PINNED.
2101  *
2102  * - KERN_RESOURCE_SHORTAGE     from vm_page_relocate(): relocation failed due
2103  *                              to being out of replacement memory.
2104  *
2105  * - KERN_FAILURE               from vm_page_relocate(): relocation failed due
2106  *                              to the page not being currently relocatable.
2107  */
2108 static kern_return_t
2109 mteinfo_reclaim_tag_storage_page(vm_page_t tag_page)
2110 {
2111 	cell_t *cell = cell_from_tag_storage_page(tag_page);
2112 	kern_return_t kr = KERN_FAILURE;
2113 	vm_object_t object;
2114 	bool compressor_locked = false;
2115 	bool vm_object_trylock_failed = false;
2116 
2117 	/* We need to try and reclaim the tag storage page. */
2118 	mteinfo_tag_storage_set_reclaiming(cell, tag_page);
2119 
2120 	if (tag_page->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q &&
2121 	    mteinfo_reclaim_tag_storage_page_try_pcpu(tag_page)) {
2122 		vm_page_tag_storage_reclaim_from_cpu_count++;
2123 		vm_page_tag_storage_reclaim_success_count++;
2124 
2125 		KDBG(VMDBG_CODE(DBG_VM_TAG_PAGE_CLAIMED) | DBG_FUNC_NONE,
2126 		    VM_KERNEL_ADDRHIDE(tag_page),
2127 		    mteinfo_tag_storage_free_pages_for_covered(tag_page));
2128 
2129 		return KERN_SUCCESS;
2130 	}
2131 
2132 	vm_free_page_unlock();
2133 
2134 	/*
2135 	 * Snoop the vmp_q_state. If the page is currently used by the compressor
2136 	 * (VM_PAGE_USED_BY_COMPRESSOR), we'll grab the global compressor lock
2137 	 * for write (PAGE_REPLACEMENT_ALLOWED(TRUE)) and the compressor
2138 	 * object lock.
2139 	 *
2140 	 * Typically, we can't know that the object will be stable
2141 	 * without grabbing the object or page queues lock (see the comment on
2142 	 * "relocation lock dance" below), but we know that the compressor object
2143 	 * is stable. So, we do _not_ need to grab the page queues and object locks
2144 	 * in the wrong order. This ensures that we will wait our turn in case
2145 	 * someone else is using the compressor object lock, and there is no chance
2146 	 * the reclaim will fail because we can't acquire the right locks.
2147 	 *
2148 	 * The contiguous memory allocator grabs this lock before the page queues
2149 	 * and object lock, so we must do the same here.
2150 	 */
2151 	if (tag_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
2152 		assert(vm_mte_tag_storage_for_compressor);
2153 		PAGE_REPLACEMENT_ALLOWED(TRUE);
2154 		vm_object_lock(compressor_object);
2155 		compressor_locked = true;
2156 
2157 		/*
2158 		 * The page state transitions into and out of VM_PAGE_USED_BY_COMPRESSOR
2159 		 * happen under the compressor object, so now the page state is stable.
2160 		 */
2161 		if (tag_page->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR) {
2162 			/*
2163 			 * The page was removed from the compressor pool. It could be
2164 			 * in any state now, but it's probably free and unusable. Give up.
2165 			 */
2166 			vm_object_unlock(compressor_object);
2167 			PAGE_REPLACEMENT_ALLOWED(FALSE);
2168 			compressor_locked = false;
2169 			vm_free_page_lock_spin();
2170 			kr = KERN_FAILURE;
2171 			goto locks_acquired;
2172 		}
2173 	}
2174 
2175 	/*
2176 	 * Do the relocation lock dance.  This is a little odd; because we're
2177 	 * starting with a page, and trying to look up the object, we need the
2178 	 * queues lock to keep the object from being deallocated or changed.
2179 	 *
2180 	 * This means we need to get the object lock after the queues lock;
2181 	 * this inverts the lock ordering, so we can only TRY the object lock.
2182 	 */
2183 	vm_page_lock_queues();
2184 
2185 	object = VM_PAGE_OBJECT(tag_page);
2186 	if (compressor_locked) {
2187 		assert(object == compressor_object);
2188 	}
2189 
2190 	if (object == VM_OBJECT_NULL) {
2191 		/* [PH] XXX: Can this even happen? */
2192 		kr = KERN_INVALID_OBJECT;
2193 		goto release_locks;
2194 	} else if (!compressor_locked && !vm_object_lock_try_scan(object)) {
2195 		/*
2196 		 * Hopefully, the next time we drain reclaiming pages, taking
2197 		 * that object lock will work.
2198 		 */
2199 		vm_object_trylock_failed = true;
2200 		kr = KERN_NOT_WAITING;
2201 		goto release_locks;
2202 	} else if (VM_PAGE_OBJECT(tag_page) != object) {
2203 		/*
2204 		 * vm_page_insert_internal() doesn't require the page queue lock
2205 		 * to be held if the page is wired, so the object could change
2206 		 * under us.
2207 		 */
2208 		vm_object_unlock(object);
2209 
2210 		kr = KERN_NOT_WAITING;
2211 		goto release_locks;
2212 	}
2213 
2214 	/*
2215 	 * Now that all the locking is out of the way,
2216 	 * see if the page is actually relocatable.
2217 	 */
2218 	if (VM_PAGE_WIRED(tag_page) ||
2219 	    (tag_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR && tag_page->vmp_busy)) {
2220 		/*
2221 		 * TODO: Relocation fails when one of these conditions is met:
2222 		 *
2223 		 *     VM_PAGE_WIRED(tag_page)
2224 		 *     tag_page->vmp_gobbled
2225 		 *     tag_page->vmp_laundry
2226 		 *     tag_page->vmp_wanted
2227 		 *     tag_page->vmp_cleaning
2228 		 *     tag_page->vmp_overwriting
2229 		 *     tag_page->vmp_free_when_done
2230 		 *     tag_page->vmp_busy
2231 		 *     tag_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q
2232 		 *
2233 		 * We only handle VM_PAGE_WIRED() and when the tag page is being
2234 		 * swapped out (from usage in the compressor pool) for now,
2235 		 * because these are the most likely, but we should use vmp_ts_wanted
2236 		 * for all cases.
2237 		 *
2238 		 * We would need to find all places in the kernel that alter
2239 		 * this condition, to notice that a relocation was attempted
2240 		 * (vmp_ts_wanted is set) and call mteinfo_tag_storage_wakeup().
2241 		 */
2242 
2243 		/*
2244 		 * Take the page free lock before setting vmp_ts_wanted,
2245 		 * before we drop the object lock, otherwise
2246 		 * mteinfo_tag_storage_wakeup() might see vmp_ts_wanted
2247 		 * before the transition to MTE_STATE_PINNED has happened.
2248 		 *
2249 		 * Note that we should do nothing if the cell is no longer in
2250 		 * the MTE_STATE_RECLAIMING state, which could hypothetically
2251 		 * happen since we dropped the free queue lock above.
2252 		 */
2253 		vm_free_page_lock_spin();
2254 
2255 		if (cell->state == MTE_STATE_RECLAIMING) {
2256 			assert(tag_page->vmp_ts_wanted == false);
2257 			tag_page->vmp_ts_wanted = true;
2258 			kr = KERN_ABORTED;
2259 		} else {
2260 			kr = KERN_NOT_WAITING;
2261 		}
2262 
2263 		vm_object_unlock(object);
2264 		vm_page_unlock_queues();
2265 		if (compressor_locked) {
2266 			PAGE_REPLACEMENT_ALLOWED(FALSE);
2267 			compressor_locked = false;
2268 		}
2269 
2270 		if (kr == KERN_ABORTED) {
2271 			assert_cell_state(cell, /* [B.1] */ MTE_MASK_RECLAIMING);
2272 			CELL_UPDATE(cell, tag_page, false, {
2273 				cell->state = MTE_STATE_PINNED;
2274 			});
2275 			if (tag_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
2276 				vm_page_tag_storage_reclaim_compressor_failure_count++;
2277 			} else {
2278 				vm_page_tag_storage_reclaim_wired_failure_count++;
2279 			}
2280 		}
2281 
2282 		goto locks_acquired;
2283 	} else if ((*vm_mte_tag_storage_for_vm_tags) &&
2284 	    !vm_page_is_relocatable(tag_page, VM_RELOCATE_REASON_TAG_STORAGE_RECLAIM)) {
2285 		/*
2286 		 * If we're allowing tag storage pages to be used for specific VM tags,
2287 		 * those pages could be unrelocatable for reasons we haven't
2288 		 * expected. We're also assuming that if a tag storage page were to
2289 		 * be unrelocatable for whatever reason, it's (at the very least) not
2290 		 * because the page is wired or involved in an IO that could take a
2291 		 * long time, so hopefully it won't be unavailable for too long, and
2292 		 * the fill thread won't churn over the same set of unavailable claimed
2293 		 * pages.
2294 		 *
2295 		 * We'll just skip over this page and move it back to claimed at the
2296 		 * bottom of this function.
2297 		 */
2298 		kr = KERN_NOT_WAITING;
2299 		vm_object_unlock(object);
2300 	} else {
2301 		kr = vm_page_relocate(tag_page, NULL,
2302 		    VM_RELOCATE_REASON_TAG_STORAGE_RECLAIM, NULL);
2303 		vm_object_unlock(object);
2304 
2305 		assert(kr != KERN_ABORTED);
2306 	}
2307 
2308 release_locks:
2309 	if (compressor_locked) {
2310 		PAGE_REPLACEMENT_ALLOWED(FALSE);
2311 	}
2312 	vm_page_unlock_queues();
2313 	if (vm_object_trylock_failed && vm_object_lock_avoid(object)) {
2314 		/*
2315 		 * We failed to lock the VM object, and pageout_scan
2316 		 * wants this object. Back off for a little bit.
2317 		 *
2318 		 * Note that the VM object may no longer be valid after releasing
2319 		 * the VM object lock, but `vm_object_lock_avoid` only compares
2320 		 * pointers and doesn't dereference them, so it's fine.
2321 		 */
2322 		mutex_pause(2);
2323 	}
2324 	vm_free_page_lock_spin();
2325 
2326 
2327 locks_acquired:
2328 	/*
2329 	 * Assert that all codepaths leading up to this point have the lock
2330 	 * held in spin mode (and therefore, preemption disabled).
2331 	 */
2332 	LCK_MTX_ASSERT_OWNED_SPIN(&vm_page_queue_free_lock);
2333 
2334 	if (kr == KERN_SUCCESS) {
2335 		vm_page_tag_storage_reclaim_success_count++;
2336 
2337 		/* We relocated the page.  Now we can use it. */
2338 		if (cell->state != MTE_STATE_RECLAIMING) {
2339 			/*
2340 			 * The page was manipulated while we were relocating
2341 			 * it.  This likely means it was freed and reallocated
2342 			 * between us dropping the free page lock and getting
2343 			 * the queues lock.
2344 			 *
2345 			 * This should be ludicrously rare, and should still
2346 			 * mean that the page is claimed (otherwise relocate
2347 			 * would have failed).  Set to reclaiming for client
2348 			 * consistency.
2349 			 *
2350 			 * In the state diagram this corresponds to other
2351 			 * threads having performed [F.2 inline] followed
2352 			 * by [C.1 inline], possibly multiple times.
2353 			 */
2354 			mteinfo_tag_storage_set_reclaiming(cell, tag_page);
2355 		}
2356 
2357 		KDBG(VMDBG_CODE(DBG_VM_TAG_PAGE_CLAIMED) | DBG_FUNC_NONE,
2358 		    VM_KERNEL_ADDRHIDE(tag_page),
2359 		    mteinfo_tag_storage_free_pages_for_covered(tag_page));
2360 
2361 		assert(tag_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
2362 	} else {
2363 		vm_page_tag_storage_reclaim_failure_count++;
2364 
2365 		if (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_NOT_WAITING) {
2366 			/*
2367 			 * storage page to, or if some race happened that
2368 			 * storage page to, or that some race happened that
2369 			 * changed the page state under our feet, just put the
2370 			 * page back as claimed if it's still reclaiming.
2371 			 *
2372 			 * It will as a result get reconsidered more quickly...
2373 			 * it WAS our best candidate, after all.
2374 			 */
2375 			if (cell->state == MTE_STATE_RECLAIMING) {
2376 				mteinfo_tag_storage_set_claimed(tag_page);
2377 			}
2378 		}
2379 	}
2380 
2381 	return kr;
2382 }
2383 
2384 
2385 #pragma mark Refill Thread
2386 
2387 /*!
2388  * @abstract
2389  * Returns whether the refill thread should keep refilling the active pool.
2390  *
2391  * @discussion
2392  * If we're below the free target, and there are no tagged waiters of any kind,
2393  * avoid activating any pages if the untagged pool is not extremely healthy.
2394  */
2395 static inline bool
2396 mteinfo_tag_storage_active_should_refill(void)
2397 {
2398 	if (vm_page_free_taggable_count >= vm_page_free_target) {
2399 		return false;
2400 	}
2401 
2402 	if (vm_page_free_taggable_count <= vm_page_free_reserved) {
2403 		return true;
2404 	}
2405 
2406 	if (vm_page_free_wanted_tagged_privileged || vm_page_free_wanted_tagged) {
2407 		return true;
2408 	}
2409 
2410 	/*
2411 	 * 16/15 is ~1.07: we define "healthy" as at least 7% excess pages
2412 	 * over the target.
2413 	 *
2414 	 * We want some slop because a system under pressure will sometimes go
2415 	 * above @c vm_page_free_target and we want to avoid thrashing.
2416 	 */
2417 	return vm_page_free_count * 15ull >= vm_page_free_target * 16ull;
2418 }
2419 
2420 /*!
2421  * @function mteinfo_tag_storage_active_refill()
2422  *
2423  * @abstract
2424  * Attempt to fill the global free tagged covered page queue.
2425  *
2426  * @discussion
2427  * This is one of the core routines of the fill thread.  It will attempt to get
2428  * the global free tagged covered page queue to or above a target value.  It
2429  * will also wake threads waiting for more of these pages as appropriate.
2430  *
2431  * This function is called with the free page queue lock held in spin mode
2432  * and returns with it held in spin mode.
2433  *
2434  * @param taggablep     How many free taggable pages have been added.
2435  * @returns             The number of tag storage pages this function activated.
2436  */
2437 static uint32_t
2438 mteinfo_tag_storage_active_refill(uint32_t *taggablep)
2439 {
2440 	mte_cell_list_t  claimed_list  = &mte_info_lists[MTE_LIST_CLAIMED_IDX];
2441 	mte_cell_list_t  inactive_list = &mte_info_lists[MTE_LIST_INACTIVE_IDX];
2442 	uint32_t         taggable      = 0;
2443 	uint32_t         activated     = 0;
2444 
2445 	LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);
2446 
2447 	while (mteinfo_tag_storage_active_should_refill()) {
2448 		mte_cell_bucket_t i_bucket = 0;
2449 		mte_cell_bucket_t c_bucket = 0;
2450 		vm_page_list_t    list     = { };
2451 		kern_return_t     kr       = KERN_SUCCESS;
2452 
2453 		/*
2454 		 *	Step 1: try to activate or reclaim pages.
2455 		 *
2456 		 *	Pick the pool between inactive and claimed that will
2457 		 *	make us progress the fastest (picking inactive over
2458 		 *	claimed for equivalent buckets, given that reclaiming
2459 		 *	is more expensive).
2460 		 *
2461 		 *	In particular, always pick inactive buckets over reclaiming
2462 		 *	pages when they have more than 50% of their pages free.
2463 		 */
2464 
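		/*
		 * fls() is 1-based, so this picks the highest populated bucket
		 * of each list (the cells with the most free covered pages);
		 * zero means there is nothing worth drawing from.
		 */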
2465 		if (inactive_list->mask) {
2466 			i_bucket = fls(inactive_list->mask) - 1;
2467 		} else {
2468 			i_bucket = 0;
2469 		}
2470 		if (claimed_list->mask) {
2471 			c_bucket = fls(claimed_list->mask) - 1;
2472 		} else {
2473 			c_bucket = 0;
2474 		}
2475 
2476 		if (i_bucket && i_bucket >= MIN(MTE_BUCKET_17_24, c_bucket)) {
2477 			list = mteinfo_tag_storage_select_activating(VMP_FREE_BATCH_SIZE,
2478 			    MIN(i_bucket, MTE_BUCKET_17_24));
2479 		} else if (c_bucket > MTE_BUCKET_0) {
2480 			mte_cell_queue_t queue = &claimed_list->buckets[c_bucket];
2481 			cell_idx_t       idx   = cell_queue_first_idx(queue);
2482 			vm_page_t        page  = vm_tag_storage_page_get(idx);
2483 
2484 			kr = mteinfo_reclaim_tag_storage_page(page);
2485 			if (kr == KERN_SUCCESS) {
2486 				list = vm_page_list_for_page(page);
2487 			}
2488 		} else {
2489 			/*
2490 			 * There is no progress we can do here because we do not
2491 			 * have good candidates to activate or reclaim.
2492 			 *
2493 			 * As a result, even if the system has free untaggable
2494 			 * pages, they can't be converted to taggable either
2495 			 * because they're permanently untaggable, or because
2496 			 * their associated tag storage can't be reclaimed.
2497 			 *
2498 			 * Waiting in VM_PAGE_WAIT() below sounds appealing
2499 			 * but will result in busy loops, so we should just
2500 			 * go park and wait until some page free is saving us
2501 			 * via the "wakeup_refill_thread" cases in
2502 			 * @c vm_page_free_queue_handle_wakeups_and_unlock().
2503 			 */
2504 			break;
2505 		}
2506 
2507 		if (kr == KERN_SUCCESS) {
2508 			activated += list.vmpl_count;
2509 			taggable += mteinfo_tag_storage_activate_locked(list,
2510 			    /* spin-mode */ true);
2511 			continue;
2512 		}
2513 
2514 		/*
2515 		 *	Step 2: wait if needed
2516 		 *
2517 		 *	KERN_RESOURCE_SHORTAGE means that we were out of pages
2518 		 *	to relocate or tag storage candidates.
2519 		 *
2520 		 *	Other errors are relocation failures and we can just
2521 		 *	retry immediately.
2522 		 */
2523 
2524 		if (kr == KERN_RESOURCE_SHORTAGE) {
2525 			/*
2526 			 * There was no good candidate tag storage page.  Wait
2527 			 * on the VM to make new pages available.
2528 			 *
2529 			 * TODO: This isn't a great solution; the VM doesn't
2530 			 * understand what we are actually waiting on.  This
2531 			 * should converge eventually due to VM activity... but
2532 			 * the bigger picture fix is to make all free pages
2533 			 * eligible for MTE.  Then our only significant concern
2534 			 * around tag storage pages will be tag storage pages
2535 			 * with ECC errors, which should be a small number.
2536 			 */
2537 			vm_free_page_unlock();
2538 			current_thread()->page_wait_class = VM_MEMORY_CLASS_REGULAR;
2539 			VM_PAGE_WAIT();
2540 			vm_free_page_lock_spin();
2541 
2542 			/*
2543 			 * We waited above, the system conditions changed,
2544 			 * flush our reclaiming queue.
2545 			 */
2546 			mteinfo_tag_storage_flush_reclaiming();
2547 		}
2548 	}
2549 
2550 	mteinfo_tag_storage_flush_reclaiming();
2551 
2552 	*taggablep += taggable;
2553 	return activated;
2554 }
2555 
2556 /*!
2557  * @function mteinfo_fill_continue()
2558  *
2559  * @abstract
2560  * Continuation for the MTE fill thread.
2561  *
2562  * @discussion
2563  * The MTE fill thread manages the global free queue of covered tagged pages,
2564  * and moving tag storage pages between the active and inactive states.
2565  *
2566  * @param param
2567  * Unused.
2568  *
2569  * @param wr
2570  * Unused.
2571  */
2572 __dead2
2573 static void
2574 mteinfo_fill_continue(void *param __unused, wait_result_t wr __unused)
2575 {
2576 #if CONFIG_THREAD_GROUPS
2577 	static bool _fill_thread_self_inited;
2578 
2579 	if (!_fill_thread_self_inited) {
2580 		thread_group_vm_add();
2581 		_fill_thread_self_inited = true;
2582 	}
2583 #endif /* CONFIG_THREAD_GROUPS */
2584 
2585 	(void)sched_cond_ack(&fill_thread_cond);
2586 	vm_mte_refill_thread_wakeups++;
2587 
2588 	for (;;) {
2589 		uint32_t added = 0;
2590 		uint32_t activated = 0;
2591 		uint32_t deactivated = 0;
2592 
2593 		VM_DEBUG_CONSTANT_EVENT(, DBG_VM_REFILL_MTE, DBG_FUNC_START,
2594 		    0, 0, 0, 0);
2595 
2596 		/*
2597 		 * NB: We take the free queue lock in spin mode here because there are
2598 		 * a number of operations that occur during active_refill and drain
2599 		 * that require preemption to be disabled. For example:
2600 		 *  - in active_refill: if the fill thread tries to reclaim a tag
2601 		 *    storage page, it first tries to steal a free tag storage page
2602 		 *    from the local free queue.
2603 		 *  - in drain: when flushing the queue of deactivating tag storage
2604 		 *    pages, the fill thread waits for all cores to finish any untagging
2605 		 *    before proceeding. See mteinfo_tag_storage_deactivate_barrier().
2606 		 *
2607 		 * Coupling enabling/disabling preemption with acquiring/releasing the
2608 		 * free queue lock is easier than managing preemption by hand, so all
2609 		 * instances of free queue lock acquisition must be done in spin mode.
2610 		 */
2611 		vm_free_page_lock_spin();
2612 
2613 		activated   += mteinfo_tag_storage_active_refill(&added);
2614 		deactivated += mteinfo_tag_storage_drain();
2615 
2616 		vm_free_page_unlock();
2617 
2618 		VM_DEBUG_CONSTANT_EVENT(, DBG_VM_REFILL_MTE, DBG_FUNC_END,
2619 		    added, activated, deactivated, 0);
2620 
2621 		sched_cond_wait_parameter(&fill_thread_cond, THREAD_UNINT,
2622 		    mteinfo_fill_continue, NULL);
2623 	}
2624 }
2625 
2626 void
2627 mteinfo_wake_fill_thread(void)
2628 {
2629 	if (is_mte_enabled) {
2630 		sched_cond_signal(&fill_thread_cond, vm_mte_fill_thread);
2631 	}
2632 }
2633 
2634 
2635 #pragma mark Alloc
2636 
2637 /*!
2638  * @abstract
2639  * Returns whether @c mteinfo_free_queue_grab() should refill the per-cpu
2640  * claimable queue.
2641  *
2642  * @discussion
2643  * The policy is to refill if the queue is empty and the claimable
2644  * queue has a full batch of @c VMP_FREE_BATCH_SIZE free pages.
2645  *
2646  * This is chosen so that the spinlock acquisition it implies is well
2647  * amortized and thrashing is reduced.
2648  *
2649  * The function must be called with preemption disabled.
2650  *
2651  * @param mte_pcpu      The current CPU's mte_pcpu_t data structure.
2652  */
2653 static bool
2654 mteinfo_tag_storage_claimable_should_refill(mte_pcpu_t mte_pcpu)
2655 {
2656 	if (__improbable(!vm_mte_enable_tag_storage_grab)) {
2657 		return false;
2658 	}
2659 
2660 	if (!vm_page_queue_empty(&mte_pcpu->free_claimed_pages)) {
2661 		return false;
2662 	}
2663 
2664 	return mte_claimable_queue.vmpfq_count >= VMP_FREE_BATCH_SIZE;
2665 }
2666 
2667 /*!
2668  * @abstract
2669  * Refill the current CPU's claimed free queue.
2670  *
2671  * @discussion
2672  * This is done opportunistically by @c mteinfo_free_queue_grab()
2673  * when it notices that it should refill the claimable queue
2674  * (see @c mteinfo_tag_storage_claimable_should_refill()).
2675  *
2676  * The function must be called with preemption disabled.
2677  *
2678  * @param mte_pcpu      The current CPU's mte_pcpu_t data structure.
2679  * @param target        The number of tag storage pages to grab.
2680  * @param colorp        A pointer to the current color selector.
2681  */
2682 static void
2683 mteinfo_tag_storage_claimable_refill(
2684 	mte_pcpu_t              mte_pcpu,
2685 	uint32_t                target,
2686 	uint32_t               *colorp)
2687 {
2688 	const int       cpu = cpu_number();
2689 	vm_page_queue_t queue;
2690 	ppnum_t         pnum;
2691 	vm_page_t       mem;
2692 
2693 	lck_ticket_lock_nopreempt(&mte_pcpu->free_claimed_lock,
2694 	    &vm_page_lck_grp_bucket);
2695 
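	/*
	 * Transfer pages from the global claimable queue to this CPU's claimed
	 * free queue, walking the colored queues and skipping empty colors so
	 * that page coloring is preserved.
	 */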
2696 	for (uint32_t i = target; i-- > 0;) {
2697 		queue = &mte_claimable_queue.vmpfq_queues[*colorp].qhead;
2698 		while (vm_page_queue_empty(queue)) {
2699 			*colorp = (*colorp + 1) & vm_color_mask;
2700 			queue = &mte_claimable_queue.vmpfq_queues[*colorp].qhead;
2701 		}
2702 
2703 		mem  = (vm_page_t)vm_page_queue_first(queue);
2704 		pnum = VM_PAGE_GET_PHYS_PAGE(mem);
2705 
2706 		assert(mem->vmp_q_state == VM_PAGE_ON_FREE_Q);
2707 		mteinfo_tag_storage_set_claimed(mem);
2708 		mem->vmp_q_state = VM_PAGE_ON_FREE_LOCAL_Q;
2709 		mem->vmp_local_id = (uint16_t)cpu;
2710 		vm_page_queue_enter(&mte_pcpu->free_claimed_pages, mem, vmp_pageq);
2711 	}
2712 
2713 	lck_ticket_unlock_nopreempt(&mte_pcpu->free_claimed_lock);
2714 
2715 	counter_add_preemption_disabled(&vm_cpu_free_claimed_count,
2716 	    target);
2717 }
2718 
2719 vm_page_list_t
2720 mteinfo_free_queue_grab(
2721 	vm_grab_options_t       options,
2722 	vm_memory_class_t       class,
2723 	unsigned int            num_pages,
2724 	vm_page_q_state_t       q_state)
2725 {
2726 	mte_pcpu_t           mte_pcpu = PERCPU_GET(mte_pcpu);
2727 	unsigned int        *colorp;
2728 	unsigned int         color;
2729 	vm_page_list_t       list = { };
2730 	mte_free_queue_idx_t idx;
2731 
2732 	assert(!mte_pcpu->deactivate_suspend && get_preemption_level() > 0);
2733 
2734 	if (class == VM_MEMORY_CLASS_REGULAR) {
2735 		/*
2736 		 * VM_MEMORY_CLASS_DEAD_TAG_STORAGE is not part of
2737 		 * vm_page_free_count, which means the caller didn't take those
2738 		 * pages into account when making this allocation request.
2739 		 * As a result, we do not respect num_pages here. However, these
2740 		 * are different from the regular claimable pool because we can
2741 		 * different than the regular claimable pool because we can
2742 		 * always safely wire them.
2743 		 */
2744 		if (vm_page_queue_free.vmpfq_count) {
2745 			list = vm_page_free_queue_grab(options,
2746 			    VM_MEMORY_CLASS_DEAD_TAG_STORAGE,
2747 			    MIN(vm_free_magazine_refill_limit / 2,
2748 			    vm_page_queue_free.vmpfq_count), q_state);
2749 		}
2750 
2751 		assert(num_pages <= vm_page_free_count);
2752 	} else {
2753 		assert(num_pages <= vm_page_free_taggable_count);
2754 	}
2755 
2756 	colorp = PERCPU_GET(start_color);
2757 	color  = *colorp;
2758 
2759 	if (mteinfo_tag_storage_claimable_should_refill(mte_pcpu)) {
2760 		mteinfo_tag_storage_claimable_refill(mte_pcpu,
2761 		    VMP_FREE_BATCH_SIZE, &color);
2762 	}
2763 
2764 	while (list.vmpl_count < num_pages) {
2765 		vm_page_queue_t queue;
2766 		cell_count_t bit;
2767 		vm_page_t tag_page;
2768 		vm_page_t mem;
2769 		uint32_t count;
2770 		ppnum_t first_pnum;
2771 		cell_t orig;
2772 		cell_t *cell;
2773 
2774 		/*
2775 		 * Select which queue we dequeue from
2776 		 *
2777 		 * Regular allocations can allocate from any bucket.
2778 		 * Tagged allocations must draw from an MTE_FREE_ACTIVE_* one.
2779 		 */
2780 
2781 		if (class == VM_MEMORY_CLASS_REGULAR) {
2782 			idx = ffs(mte_free_queue_mask) - 1;
2783 		} else {
2784 			uint32_t mask = mte_free_queue_mask;
2785 
2786 			mask &= BIT(MTE_FREE_ACTIVE_0) |
2787 			    BIT(MTE_FREE_ACTIVE_1) |
2788 			    BIT(MTE_FREE_ACTIVE_2) |
2789 			    BIT(MTE_FREE_ACTIVE_3);
2790 
2791 			assert(mask);
2792 			idx = fls(mask) - 1;
2793 		}
2794 
2795 		queue = mteinfo_free_queue_head(idx, color);
2796 		while (vm_page_queue_empty(queue)) {
2797 			color = (color + 1) & vm_color_mask;
2798 			queue = mteinfo_free_queue_head(idx, color);
2799 		}
2800 
2801 		/*
2802 		 * Dequeue the linkage, find the page of the right color.
2803 		 */
2804 
2805 		vm_page_queue_remove_first(queue, mem, vmp_pageq);
2806 
2807 		VM_COUNTER_DEC(&mte_free_queues[idx].vmpfq_count);
2808 		if (mte_free_queues[idx].vmpfq_count == 0) {
2809 			bit_clear(mte_free_queue_mask, idx);
2810 		}
2811 
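		/*
		 * Round the physical page number down to the first covered page
		 * of its cell (MTE_PAGES_PER_TAG_PAGE is a power of two, so
		 * masking with its negation clears the low bits).
		 */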
2812 		first_pnum = VM_PAGE_GET_PHYS_PAGE(mem) & -MTE_PAGES_PER_TAG_PAGE;
2813 		cell       = cell_from_covered_ppnum(first_pnum, &tag_page);
2814 		orig       = *cell;
2815 		bit        = orig.enqueue_pos;
2816 		count      = 0;
2817 		assert((orig.enqueue_pos & vm_color_mask) ==
2818 		    color % MTE_PAGES_PER_TAG_PAGE);
2819 
2820 		/*
2821 		 * Dequeue a span of covered pages from that tag storage
2822 		 *
2823 		 * If we have a contiguous run of free pages and we need more,
2824 		 * we know this tag storage page is going to be the one we pick
2825 		 * next.
2826 		 */
2827 
2828 		for (;;) {
2829 			assert(bit_test(orig.free_mask, bit));
2830 			bit_clear(cell->free_mask, bit);
2831 
2832 			mem->vmp_q_state = q_state;
2833 			vm_page_list_push(&list, mem);
2834 
2835 			count += 1;
2836 			bit   += 1;
2837 
2838 			if (!bit_test(cell->free_mask, bit) ||
2839 			    list.vmpl_count >= num_pages) {
2840 				break;
2841 			}
2842 
2843 			mem = vm_page_find_canonical(first_pnum + bit);
2844 		}
2845 
2846 		color = (color + count) & vm_color_mask;
2847 
2848 		/*
2849 		 * Update counters (see mteinfo_covered_page_set_used())
2850 		 */
2851 
2852 		VM_COUNTER_SUB(&vm_page_free_count, count);
2853 		if (idx >= MTE_FREE_ACTIVE_0 && idx <= MTE_FREE_ACTIVE_3) {
2854 			VM_COUNTER_SUB(&vm_page_free_taggable_count, count);
2855 		}
2856 		if (class != VM_MEMORY_CLASS_REGULAR) {
2857 			VM_COUNTER_ADD(&vm_page_tagged_count, count);
2858 			cell->mte_page_count += count;
2859 		}
2860 
2861 		/*
2862 		 * Requeue the tag storage (tail end of CELL_UPDATE())
2863 		 */
2864 
2865 		if (cell_list_idx(orig) != cell_list_idx(*cell) ||
2866 		    cell_list_bucket(orig) != cell_list_bucket(*cell)) {
2867 			cell_list_requeue(cell, tag_page,
2868 			    cell_list_idx(orig), cell_list_bucket(orig),
2869 			    cell_list_idx(*cell), cell_list_bucket(*cell),
2870 			    (int)cell_on_claimable_queue(*cell) -
2871 			    (int)cell_on_claimable_queue(orig));
2872 		}
2873 
2874 		mteinfo_free_queue_requeue(cell, orig, MTE_FREE_NOT_QUEUED,
2875 		    mteinfo_free_queue_idx(*cell));
2876 	}
2877 
2878 	*colorp = color;
2879 
2880 	/*
2881 	 * Some existing driver/IOKit code deals badly with getting physically
2882 	 * contiguous memory... which this alloc code is rather likely to
2883 	 * provide by accident immediately after boot.
2884 	 *
2885 	 * To avoid hitting issues related to this, we'll invert the order of
2886 	 * the list we return.  This code should be removed once we've tracked
2887 	 * down the various driver issues.
2888 	 */
2889 	vm_page_list_reverse(&list);
2890 
2891 	if (class == VM_MEMORY_CLASS_REGULAR && list.vmpl_has_tagged) {
2892 		/*
2893 		 * We are pulling pages from the taggable free queue
2894 		 * to use them as untagged.
2895 		 *
2896 		 * This breaks the invariant that pages with vmp_using_mte
2897 		 * set are either free pages on the free queue that were left
2898 		 * tagged after being freed (covered by the cell "free_mask"),
2899 		 * or used tagged pages (covered by the cell "mte_page_count"
2900 		 * counter).
2901 		 *
2902 		 * The caller has allocated these pages from the free queue
2903 		 * (clearing the proper "free_mask" bit) but didn't increment
2904 		 * the "mte_page_count". It will then proceed with untagging
2905 		 * these pages without holding any locks, and doesn't want to
2906 		 * re-take the free page queue lock for book-keeping.
2907 		 *
2908 		 * As a result, invariants are broken for a little while,
2909 		 * and we need to suspend the deactivation path on this core,
2910 		 * where the invariant is currently broken, until the untagging
2911 		 * is finished; otherwise, the deactivating thread would not
2912 		 * consider these pages as tagged, and would retype the page
2913 		 * to XNU_DEFAULT causing an SPTM panic.
2914 		 *
2915 		 * mteinfo_page_list_fix_tagging() will resume deactivations
2916 		 * when it is called on the same core.
2917 		 *
2918 		 * mteinfo_tag_storage_deactivate_barrier() is called by any
2919 		 * path performing a deactivation to synchronize with this.
2920 		 */
2921 		os_atomic_store(&mte_pcpu->deactivate_suspend, 1,
2922 		    compiler_acquire);
2923 	}
2924 
2925 	/*
2926 	 * If pulling untagged pages tapped the active(0) pool or above,
2927 	 * and there are "active(0)" pages around, then we wake up
2928 	 * the refill thread to drain this pool in order to make some
2929 	 * claimable pages available.
2930 	 */
2931 	if (vm_mte_enable_tag_storage_grab &&
2932 	    class == VM_MEMORY_CLASS_REGULAR &&
2933 	    idx >= MTE_FREE_ACTIVE_0 &&
2934 	    mteinfo_tag_storage_should_drain(true)) {
2935 		mteinfo_wake_fill_thread();
2936 	}
2937 
2938 	return list;
2939 }
2940 
2941 void
2942 mteinfo_page_list_fix_tagging(vm_memory_class_t class, vm_page_list_t *list)
2943 {
2944 	const unified_page_list_t pmap_batch_list = {
2945 		.page_slist = list->vmpl_head,
2946 		.type = UNIFIED_PAGE_LIST_TYPE_VM_PAGE_LIST,
2947 	};
2948 	mte_pcpu_t mte_pcpu = PERCPU_GET(mte_pcpu);
2949 	vm_page_t mem;
2950 
2951 	assert(get_preemption_level() > 0);
2952 
2953 	if (class == VM_MEMORY_CLASS_REGULAR && list->vmpl_has_tagged) {
2954 		pmap_unmake_tagged_pages(&pmap_batch_list);
2955 		vm_page_list_foreach(mem, *list) {
2956 			mem->vmp_using_mte = false;
2957 		}
2958 
2959 		/*
2960 		 * Invariants related to tagged pages are resolved,
2961 		 * we can allow deactivations again.
2962 		 */
2963 		os_atomic_store(&mte_pcpu->deactivate_suspend, 0, release);
2964 	}
2965 
2966 	if (class == VM_MEMORY_CLASS_TAGGED && list->vmpl_has_untagged) {
2967 		pmap_make_tagged_pages(&pmap_batch_list);
2968 		vm_page_list_foreach(mem, *list) {
2969 			mem->vmp_using_mte = true;
2970 		}
2971 	}
2972 
2973 	assert(!mte_pcpu->deactivate_suspend);
2974 }
2975 
2976 #endif /* VM_MTE_FF_VERIFY */
2977 #pragma mark Bootstrap
2978 
2979 static mte_cell_queue_t
2980 cell_list_init(
2981 	mte_cell_queue_t        qhp,
2982 	mte_cell_state_t        state,
2983 	mte_cell_list_idx_t     lidx)
2984 {
2985 	mte_cell_bucket_t buckets = cell_list_idx_buckets(lidx);
2986 
2987 	mte_info_lists[lidx].buckets = qhp;
2988 
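	/*
	 * Each bucket head starts out as an empty circular list: its prev and
	 * next indices point back at the head itself.
	 */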
2989 	for (mte_cell_bucket_t i = 0; i < buckets; i++, qhp++) {
2990 		qhp->head = (cell_t){
2991 			.prev = cell_idx(qhp),
2992 			.next = cell_idx(qhp),
2993 			.state = state,
2994 			.enqueue_pos = -1,
2995 		};
2996 	}
2997 
2998 	return qhp;
2999 }
3000 
3001 __startup_func
3002 void
3003 mteinfo_init(uint32_t num_tag_pages)
3004 {
3005 	assert(2 * num_tag_pages < (1UL << MTE_FF_CELL_INDEX_BITS));
3006 	assert(atop(mte_tag_storage_end - mte_tag_storage_start) == num_tag_pages);
3007 	assert(num_tag_pages == mte_tag_storage_count);
3008 
3009 	vm_size_t size = sizeof(cell_t) * (MTE_QUEUES_COUNT + num_tag_pages);
3010 	mte_cell_queue_t queue;
3011 	mte_cell_list_t list;
3012 
3013 	queue = pmap_steal_memory(size, 8);
3014 	mte_info_cells = &(queue + MTE_QUEUES_COUNT)->head;
3015 
3016 	queue = cell_list_init(queue, MTE_STATE_DISABLED, MTE_LIST_DISABLED_IDX);
3017 	queue = cell_list_init(queue, MTE_STATE_PINNED, MTE_LIST_PINNED_IDX);
3018 	queue = cell_list_init(queue, MTE_STATE_DEACTIVATING, MTE_LIST_DEACTIVATING_IDX);
3019 	queue = cell_list_init(queue, MTE_STATE_CLAIMED, MTE_LIST_CLAIMED_IDX);
3020 	queue = cell_list_init(queue, MTE_STATE_INACTIVE, MTE_LIST_INACTIVE_IDX);
3021 	queue = cell_list_init(queue, MTE_STATE_RECLAIMING, MTE_LIST_RECLAIMING_IDX);
3022 	queue = cell_list_init(queue, MTE_STATE_ACTIVATING, MTE_LIST_ACTIVATING_IDX);
3023 	queue = cell_list_init(queue, MTE_STATE_ACTIVE, MTE_LIST_ACTIVE_0_IDX);
3024 	queue = cell_list_init(queue, MTE_STATE_ACTIVE, MTE_LIST_ACTIVE_IDX);
3025 
3026 	assert(&queue->head == mte_info_cells);
3027 
3028 	/*
3029 	 * Quickly create a list of all possible cells and place it into the
3030 	 * disabled queue.
3031 	 */
3032 
3033 	for (cell_idx_t i = 0; i < num_tag_pages; i++) {
3034 		*cell_from_idx(i) = (cell_t){
3035 			.prev = i - 1,
3036 			.next = i + 1,
3037 			.enqueue_pos = -1,
3038 			.mte_page_count = 0,
3039 			.state = MTE_STATE_DISABLED,
3040 		};
3041 	}
3042 
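	/*
	 * Stitch the head of the disabled list's bucket 0 to the first and
	 * last cells to close the circular list, and account for every cell
	 * in that bucket.
	 */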
3043 	list = &mte_info_lists[MTE_LIST_DISABLED_IDX];
3044 	queue = &list->buckets[0];
3045 	queue->head.next = 0;
3046 	queue->head.prev = num_tag_pages - 1;
3047 	queue->head.cell_count = num_tag_pages;
3048 	cell_from_idx(0)->prev = cell_idx(queue);
3049 	cell_from_idx(num_tag_pages - 1)->next = cell_idx(queue);
3050 	bit_set(list->mask, 0);
3051 	list->count = num_tag_pages;
3052 
3053 	for (mte_free_queue_idx_t idx = MTE_FREE_UNTAGGABLE_0;
3054 	    idx < MTE_FREE_NOT_QUEUED; idx++) {
3055 		for (uint32_t i = 0; i < MAX_COLORS; i++) {
3056 			vm_page_queue_init(mteinfo_free_queue_head(idx, i));
3057 		}
3058 	}
3059 
3060 #ifndef VM_MTE_FF_VERIFY
3061 	vm_page_free_queue_init(&mte_claimable_queue);
3062 #endif /* VM_MTE_FF_VERIFY */
3063 }
3064 
3065 #if HIBERNATION
3066 
3067 void
3068 mteinfo_free_queue_foreach(void (^block)(vm_page_t))
3069 {
3070 	for (cell_idx_t cidx = 0; cidx < mte_tag_storage_count; cidx++) {
3071 		cell_t  *cell = cell_from_idx(cidx);
3072 		ppnum_t  pnum = cell_first_covered_pnum(cell);
3073 		uint32_t mask = cell->free_mask;
3074 
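		/*
		 * Visit every free covered page of this cell: ffs() finds the
		 * lowest set bit of the free mask and "mask &= mask - 1" clears it.
		 */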
3075 		while (mask) {
3076 			block(vm_page_find_canonical(pnum + ffs(mask) - 1));
3077 			mask &= mask - 1;
3078 		}
3079 
3080 		if (cell->state == MTE_STATE_INACTIVE) {
3081 			block(vm_tag_storage_page_get(cidx));
3082 		}
3083 	}
3084 }
3085 
3086 #endif /* HIBERNATION */
3087 #ifndef VM_MTE_FF_VERIFY
3088 
3089 /* List that tracks tag storage pages until mte_tags_object is initialized. */
3090 __startup_data
3091 static vm_page_list_t mte_tag_storage_startup_list;
3092 
3093 void
3094 mteinfo_tag_storage_release_startup(vm_page_t tag_page)
3095 {
3096 	cell_t           *cell       = cell_from_tag_storage_page(tag_page);
3097 	ppnum_t           tag_pnum   = VM_PAGE_GET_PHYS_PAGE(tag_page);
3098 	ppnum_t           first_pnum = cell_first_covered_pnum(cell);
3099 	vm_memory_class_t class      = VM_MEMORY_CLASS_TAG_STORAGE;
3100 	bool              deactivate = true;
3101 	uint32_t          mte_count  = 0;
3102 
3103 	/*
3104 	 * If this is a tag storage page that we won't even classify as tag
3105 	 * storage, just give it to the normal free queues.
3106 	 *
3107 	 * Otherwise, keep about a 1/8 of the tag storage page around,
3108 	 * it should be vastly sufficient to boot. The refill thread
3109 	 * and various passive policies will let it rebalance later.
3110 	 *
3111 	 * Note that this code implicitly relies on the fact that
3112 	 * the tag storage is toward the end of the vm pages array:
3113 	 * we only keep tag storage around that have 32 pages free,
3114 	 * but pages that haven't been created yet appear as "used".
3115 	 */
3116 
3117 	assert(pmap_is_tag_storage_page(tag_pnum));
3118 
3119 	if (pmap_tag_storage_is_discarded(tag_pnum)) {
3120 		mteinfo_tag_storage_set_retired(tag_page);
3121 		return;
3122 	} else if (pmap_tag_storage_is_recursive(tag_pnum)) {
3123 		VM_COUNTER_INC(&vm_page_recursive_tag_storage_count);
3124 		class = VM_MEMORY_CLASS_DEAD_TAG_STORAGE;
3125 	} else if (pmap_tag_storage_is_unmanaged(tag_pnum)) {
3126 		VM_COUNTER_INC(&vm_page_unmanaged_tag_storage_count);
3127 		class = VM_MEMORY_CLASS_DEAD_TAG_STORAGE;
3128 	} else {
3129 		for (uint32_t i = 0; i < MTE_PAGES_PER_TAG_PAGE; i++) {
3130 			mte_count += pmap_is_tagged_page(first_pnum + i);
3131 		}
3132 
3133 		if (cell_free_page_count(*cell) == MTE_PAGES_PER_TAG_PAGE &&
3134 		    mteinfo_tag_storage_active(true) < mte_tag_storage_count / 8) {
3135 			deactivate = false;
3136 		} else if (mte_count) {
3137 			deactivate = false;
3138 		}
3139 	}
3140 
3141 	if (deactivate) {
3142 		pmap_unmake_tag_storage_page(tag_pnum);
3143 		if (class == VM_MEMORY_CLASS_DEAD_TAG_STORAGE) {
3144 			vm_page_free_queue_enter(class, tag_page, tag_pnum);
3145 		} else {
3146 			tag_page->vmp_q_state = VM_PAGE_ON_FREE_Q;
3147 			mteinfo_tag_storage_set_inactive(tag_page, true);
3148 		}
3149 		return;
3150 	}
3151 
3152 	mteinfo_tag_storage_set_active(tag_page, mte_count, true);
3153 	vm_page_list_push(&mte_tag_storage_startup_list, tag_page);
3154 }
3155 
3156 /*!
3157  * @function mteinfo_tag_storage_startup_list_flush()
3158  *
3159  * @abstract
3160  * Adds active tag storage pages to the mte_tags_object.
3161  *
3162  * @discussion
3163  * Adds the list of active tag storage pages updated by @see
3164  * mteinfo_tag_storage_release_startup to the mte_tags_object.  This must be
3165  * called at some point after the last @see mteinfo_tag_storage_release_startup
3166  * call.
3167  */
3168 __startup_func
3169 static void
3170 mteinfo_tag_storage_startup_list_flush(void)
3171 {
3172 	vm_page_t page;
3173 
3174 	vm_object_lock(mte_tags_object);
3175 	vm_page_lock_queues();
3176 
3177 	vm_page_list_foreach_consume(page, &mte_tag_storage_startup_list) {
3178 		mteinfo_tag_storage_wire_locked(page);
3179 	}
3180 
3181 	vm_page_unlock_queues();
3182 	vm_object_unlock(mte_tags_object);
3183 }
3184 STARTUP(KMEM, STARTUP_RANK_FIRST, mteinfo_tag_storage_startup_list_flush);
3185 
3186 /*!
3187  * @abstract
3188  * Initializes the percpu mte queues and locks.
3189  */
3190 __startup_func
3191 static void
3192 mteinfo_tag_storage_lock_init(void)
3193 {
3194 	percpu_foreach(mte_pcpu, mte_pcpu) {
3195 		lck_ticket_init(&mte_pcpu->free_claimed_lock,
3196 		    &vm_page_lck_grp_bucket);
3197 		vm_page_queue_init(&mte_pcpu->free_claimed_pages);
3198 	}
3199 }
3200 STARTUP(PERCPU, STARTUP_RANK_MIDDLE, mteinfo_tag_storage_lock_init);
3201 
3202 /*!
3203  * @function mteinfo_init_fill_thread
3204  *
3205  * @abstract
3206  * Creates the MTE fill thread.
3207  */
3208 __startup_func
3209 static void
3210 mteinfo_init_fill_thread(void)
3211 {
3212 	kern_return_t result;
3213 
3214 	if (!is_mte_enabled) {
3215 		return;
3216 	}
3217 
3218 	result = kernel_thread_start_priority(mteinfo_fill_continue, NULL, BASEPRI_VM,
3219 	    &vm_mte_fill_thread);
3220 
3221 	if (result != KERN_SUCCESS) {
3222 		panic("Failed to create MTE fill thread.");
3223 	}
3224 
3225 	thread_set_thread_name(vm_mte_fill_thread, "VM_mte_fill");
3226 	thread_deallocate(vm_mte_fill_thread);
3227 }
3228 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, mteinfo_init_fill_thread);
3229 
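/*!
 * @function mteinfo_tag_storage_mark_unmanaged_range()
 *
 * @abstract
 * Marks as unmanaged the tag storage cells from @c idx up to (but not
 * including) the cell covering @c pnum, retiring any discarded tag storage
 * pages and skipping cells that are disabled or still have tagged pages.
 *
 * @discussion
 * Returns the index of the cell following the one covering @c pnum, so the
 * caller can resume scanning from there.
 */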
3230 static ppnum_t
3231 mteinfo_tag_storage_mark_unmanaged_range(cell_idx_t idx, ppnum_t pnum)
3232 {
3233 	cell_t    *end_cell = cell_from_covered_ppnum(pnum);
3234 	cell_idx_t end_idx  = cell_idx(end_cell);
3235 	bool       locked   = false;
3236 
3237 	for (; idx < end_idx; idx++) {
3238 		cell_t *cell = cell_from_idx(idx);
3239 		vm_page_t tag_page = vm_tag_storage_page_get(idx);
3240 
3241 		if (!locked) {
3242 			vm_free_page_lock_spin();
3243 			locked = true;
3244 		}
3245 
3246 		if (pmap_tag_storage_is_discarded(VM_PAGE_GET_PHYS_PAGE(tag_page))) {
3247 			mteinfo_tag_storage_set_retired(tag_page);
3248 			continue;
3249 		}
3250 
3251 		if (cell->mte_page_count != 0) {
3252 			/*
3253 			 * This can happen if some tagged pmap-stolen memory
3254 			 * has not been ml_static_mfree()'d back yet.
3255 			 */
3256 			continue;
3257 		}
3258 
3259 		if (cell->state == MTE_STATE_DISABLED) {
3260 			/*
3261 			 * Probably an ECC retired page.
3262 			 */
3263 			continue;
3264 		}
3265 
3266 		mteinfo_tag_storage_set_unmanaged(cell,
3267 		    vm_tag_storage_page_get(idx));
3268 	}
3269 
3270 	if (locked) {
3271 		vm_free_page_unlock();
3272 	}
3273 
3274 	return end_idx + 1;
3275 }
3276 
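/*!
 * @function mteinfo_tag_storage_unmanaged_discover()
 *
 * @abstract
 * Scans the tag storage cells against the ranges of pages known to the VM
 * and marks cells covering unmanaged memory accordingly, logging how many
 * such tag storage pages were discovered.  No-op when MTE is disabled.
 */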
3277 static void
3278 mteinfo_tag_storage_unmanaged_discover(void)
3279 {
3280 	uint32_t   count   = vm_page_unmanaged_tag_storage_count;
3281 	cell_idx_t cur_idx = 0;
3282 	ppnum_t    pnum;
3283 
3284 	if (!is_mte_enabled) {
3285 		return;
3286 	}
3287 
3288 	vm_pages_radix_for_each_pnum(pnum) {
3289 		cur_idx = mteinfo_tag_storage_mark_unmanaged_range(cur_idx, pnum);
3290 	}
3291 	mteinfo_tag_storage_mark_unmanaged_range(cur_idx,
3292 	    vm_pages_first_pnum);
3293 
3294 	printf("MTE: discovered %d tag storage pages for unmanaged memory\n",
3295 	    vm_page_unmanaged_tag_storage_count - count);
3296 }
3297 STARTUP(LOCKDOWN, STARTUP_RANK_LAST, mteinfo_tag_storage_unmanaged_discover);
3298 
3299 extern boolean_t get_range_bounds(char *c, int64_t *lower, int64_t *upper);
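/*!
 * @function mteinfo_tag_storage_process_vm_tags()
 *
 * @abstract
 * Parses @c vm_mte_tag_storage_for_vm_tags into a bitmap of VM tags allowed
 * to use tag storage.
 *
 * @discussion
 * The string is a comma-separated list of VM tag values and ranges, for
 * example "1,5-8,12" (format inferred from the parsing loop below); each
 * listed tag is set in @c vm_mte_tag_storage_for_vm_tags_mask.
 */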
3300 static void
3301 mteinfo_tag_storage_process_vm_tags(void)
3302 {
3303 	char *vm_tags_str;
3304 
3305 	if (!vm_mte_enable_tag_storage_grab) {
3306 		return;
3307 	}
3308 
3309 	vm_tags_str = vm_mte_tag_storage_for_vm_tags;
3310 	while (*vm_tags_str) {
3311 		uint64_t loop_end;
3312 		boolean_t ret;
3313 		int64_t start = 1, end = VM_MEMORY_COUNT;
3314 
3315 		ret = get_range_bounds(vm_tags_str, &start, &end);
3316 		loop_end = (ret) ? end : start;
3317 		for (int64_t i = start; i <= loop_end; i++) {
3318 			bitmap_set(vm_mte_tag_storage_for_vm_tags_mask, (uint)i);
3319 		}
3320 
3321 		/* Skip to the next ',' */
3322 		while (*vm_tags_str != ',') {
3323 			if (*vm_tags_str == '\0') {
3324 				break;
3325 			}
3326 			vm_tags_str++;
3327 		}
3328 
3329 		if (*vm_tags_str == ',') {
3330 			vm_tags_str++;
3331 		} else {
3332 			assert(*vm_tags_str == '\0');
3333 			break;
3334 		}
3335 	}
3336 }
3337 STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, mteinfo_tag_storage_process_vm_tags);
3338 
3339 #pragma mark Counter methods
3340 
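/*!
 * @function mteinfo_tag_storage_fragmentation()
 *
 * @abstract
 * Returns tag storage fragmentation in per-mille:
 * 1000 - (1000 * tagged pages) / (active tag storage pages * MTE_PAGES_PER_TAG_PAGE).
 *
 * @discussion
 * For example, assuming MTE_PAGES_PER_TAG_PAGE is 32, 10 active tag storage
 * pages covering 256 tagged pages yield 1000 - (1000 * 256) / 320 = 200,
 * i.e. 20% of the covered capacity is unused.  When @c actual is true, pages
 * on the ACTIVE_0 list are counted as active as well.
 */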
3341 uint32_t
3342 mteinfo_tag_storage_fragmentation(bool actual)
3343 {
3344 	uint32_t ts_active;
3345 	uint32_t value;
3346 
3347 	vm_free_page_lock_spin();
3348 	ts_active = mte_info_lists[MTE_LIST_ACTIVE_IDX].count;
3349 	if (actual) {
3350 		ts_active += mte_info_lists[MTE_LIST_ACTIVE_0_IDX].count;
3351 	}
3352 	if (ts_active) {
3353 		value  = 1000 * vm_page_tagged_count;
3354 		value /= (ts_active * MTE_PAGES_PER_TAG_PAGE);
3355 	} else {
3356 		value  = 1000;
3357 	}
3358 	vm_free_page_unlock();
3359 
3360 	return 1000 - value;
3361 }
3362 
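/*!
 * @function mteinfo_tag_storage_active()
 *
 * @abstract
 * Returns the number of active tag storage pages (the ACTIVE and ACTIVE_0
 * lists combined), taking the free page lock unless @c fq_locked is true.
 */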
3363 uint32_t
3364 mteinfo_tag_storage_active(bool fq_locked)
3365 {
3366 	uint32_t active;
3367 
3368 	if (!fq_locked) {
3369 		vm_free_page_lock_spin();
3370 	}
3371 
3372 	active = mte_info_lists[MTE_LIST_ACTIVE_0_IDX].count +
3373 	    mte_info_lists[MTE_LIST_ACTIVE_IDX].count;
3374 
3375 	if (!fq_locked) {
3376 		vm_free_page_unlock();
3377 	}
3378 
3379 	return active;
3380 }
3381 
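/*!
 * @function mteinfo_tag_storage_free_pages_for_covered()
 *
 * @abstract
 * Returns how many of the pages covered by the same tag storage page as
 * @c page are currently free.
 */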
3382 uint32_t
3383 mteinfo_tag_storage_free_pages_for_covered(const struct vm_page *page)
3384 {
3385 	ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(page);
3386 
3387 	return cell_free_page_count(*cell_from_covered_ppnum(pnum));
3388 }
3389 
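/*!
 * @function mteinfo_increment_wire_count()
 *
 * @abstract
 * Accounts for a tag storage page becoming wired, bumping
 * @c vm_page_wired_tag_storage_count and firing the corresponding DTrace
 * probe.
 */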
3390 void
3391 mteinfo_increment_wire_count(vm_page_t tag_page)
3392 {
3393 	if (vm_page_in_tag_storage_array(tag_page) &&
3394 	    vm_page_is_tag_storage(tag_page)) {
3395 		VM_COUNTER_ATOMIC_INC(&vm_page_wired_tag_storage_count);
3396 
3397 		DTRACE_VM1(vm_tag_storage_wired, vm_page_t, tag_page);
3398 	}
3399 }
3400 
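/*!
 * @function mteinfo_decrement_wire_count()
 *
 * @abstract
 * Accounts for a tag storage page being unwired and wakes up any waiters.
 *
 * @discussion
 * Pages owned by @c mte_tags_object are excluded from the accounting.  If a
 * thread is waiting for this tag storage page (@c vmp_ts_wanted), the page
 * queues lock is converted out of spin mode before calling
 * @see mteinfo_tag_storage_wakeup, which needs to take a mutex.
 */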
3401 void
3402 mteinfo_decrement_wire_count(vm_page_t tag_page, bool pqs_locked)
3403 {
3404 	LCK_MTX_ASSERT(&vm_page_queue_lock,
3405 	    pqs_locked ? LCK_MTX_ASSERT_OWNED : LCK_MTX_ASSERT_NOTOWNED);
3406 	LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_NOTOWNED);
3407 
3408 	if (vm_page_in_tag_storage_array(tag_page) &&
3409 	    VM_PAGE_OBJECT(tag_page) != mte_tags_object &&
3410 	    vm_page_is_tag_storage(tag_page)) {
3411 		VM_COUNTER_ATOMIC_DEC(&vm_page_wired_tag_storage_count);
3412 
3413 		DTRACE_VM1(vm_tag_storage_unwired, vm_page_t, tag_page);
3414 
3415 		if (tag_page->vmp_ts_wanted) {
3416 			/*
3417 			 * Many callers have the page queue lock held in spin
3418 			 * when calling this, and mteinfo_tag_storage_wakeup()
3419 			 * needs to acquire a mutex.
3420 			 */
3421 			if (pqs_locked) {
3422 				vm_page_lockconvert_queues();
3423 			}
3424 			mteinfo_tag_storage_wakeup(tag_page, false);
3425 		}
3426 	}
3427 }
3428 
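/*!
 * @function mteinfo_vm_tag_can_use_tag_storage()
 *
 * @abstract
 * Returns whether @c vm_tag is allowed to use tag storage, as configured via
 * @c vm_mte_tag_storage_for_vm_tags (see
 * @see mteinfo_tag_storage_process_vm_tags).
 */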
3429 bool
3430 mteinfo_vm_tag_can_use_tag_storage(vm_tag_t vm_tag)
3431 {
3432 	return bitmap_test(vm_mte_tag_storage_for_vm_tags_mask, (uint)vm_tag);
3433 }
3434 
3435 
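/*!
 * @function kdp_mteinfo_snapshot()
 *
 * @abstract
 * Fills @c cells with a per-cell snapshot of MTE state (state, tagged, free
 * and wired page counts) for the kernel debugger.
 *
 * @discussion
 * Must only be called from the debugger context; @c count must match
 * @c mte_tag_storage_count.
 */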
3436 void
3437 kdp_mteinfo_snapshot(struct mte_info_cell * __counted_by(count) cells, size_t count)
3438 {
3439 	release_assert(count == mte_tag_storage_count);
3440 
3441 	if (not_in_kdp) {
3442 		panic("kdp_mteinfo_snapshot called outside of kernel debugger");
3443 	}
3444 
3445 	for (cell_idx_t cidx = 0; cidx < mte_tag_storage_count; cidx++) {
3446 		cell_t  *cell = cell_from_idx(cidx);
3447 		ppnum_t  pnum = cell_first_covered_pnum(cell);
3448 		vm_page_t mem;
3449 		uint8_t wired_count = 0, wired_tagged_count = 0, kernel_wired_tagged_count = 0;
3450 
3451 		for (ppnum_t i = 0; i < MTE_PAGES_PER_TAG_PAGE; i++) {
3452 			mem = vm_page_find_canonical(pnum + i);
3453 			if (mem && VM_PAGE_WIRED(mem)) {
3454 				wired_count++;
3455 				if (mem->vmp_using_mte) {
3456 					if (VM_PAGE_OBJECT(mem) == kernel_object_tagged) {
3457 						kernel_wired_tagged_count++;
3458 					} else {
3459 						wired_tagged_count++;
3460 					}
3461 				}
3462 			}
3463 		}
3464 
3465 		cells[cidx] = (struct mte_info_cell) {
3466 			.mic_state = cell->state,
3467 			.mic_tagged_count = cell->mte_page_count,
3468 			.mic_free_count = (uint8_t)cell_free_page_count(*cell),
3469 			.mic_wired_count = wired_count,
3470 			.mic_wired_tagged_count = wired_tagged_count,
3471 			.mic_kernel_wired_tagged_count = kernel_wired_tagged_count
3472 		};
3473 	}
3474 }
3475 #endif /* VM_MTE_FF_VERIFY */
3476 
3477 #endif /* HAS_MTE */
3478