xref: /xnu-12377.61.12/osfmk/vm/vm_mteinfo.c (revision 4d495c6e23c53686cf65f45067f79024cf5dcee8)
1 /*
2  * Copyright (c) 2000-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /* Guard header includes, so that the userspace test can include this file. */
30 #include <os/atomic_private.h>
31 #ifndef VM_MTE_FF_VERIFY
32 #include <debug.h>
33 #include <mach_assert.h>
34 
35 #include <kern/bits.h>
36 #include <kern/kcdata.h>
37 #include <kern/queue.h>
38 
39 #include <mach/sdt.h>
40 
41 #include <vm/pmap.h>
42 #include <vm/vm_compressor_internal.h>
43 #include <vm/vm_kern.h>
44 #include <vm/vm_object_internal.h>
45 #include <vm/vm_page_internal.h>
46 #include <vm/vm_pageout.h>
47 #include <vm/vm_mteinfo_internal.h>
48 
49 extern lck_grp_t vm_page_lck_grp_bucket;
50 
51 #endif /* VM_MTE_FF_VERIFY */
52 #pragma mark Documentation
53 #if HAS_MTE
54 
55 /*
56  * VM MTE Info
57  * ===========
58  *
59  * The top-level goal of this code is to implement the policies managing the
60  * selection of tag storage pages on the system, in order to:
61  * - Minimize the number of live tag storage pages at any given time;
62  * - Maximize occupancy (the number of covered pages using MTE compared to tag
63  *   storage pages actually being used for tag storage).
64  *
65  *
66  * Physical Memory Layout
67  * ----------------------
68  *
69  * The diagram below describes the general layout of the physical memory. iBoot
70  * will determine the placement of the tag storage region, at the end of the
71  * managed address space.
72  *
73  * As a result, the tag storage space is always part of the vm_pages array.
74  * However, several things should be noted:
75  *
76  * - The last tag storage pages cover unmanaged DRAM at the end of physical
77  *   memory, as well as the tag storage space itself, and will never be used as
78  *   tag storage memory by the system (the unmanaged space will not be MTE'd,
79  *   and the tag storage space will never itself use MTE).
80  *
81  * - The first tag storage pages also cover unmanaged DRAM space at the
82  *   beginning of physical memory, but might be used for tagging due to early
83  *   boot code.  However, these first tag storage pages will never be used
84  *   as dynamic tag storage by the system.
85  *
86  * - The beginning of the tag region space is always aligned to a 32 page
87  *   boundary; however the start of the vm_pages array is not. As a result,
88  *   there is a cluster of 32 pages that possibly crosses this boundary. This
89  *   is relevant because dynamic tag storage management only functions for
90  *   taggable pages inside the vm_pages array.
91  *
92  *
93  *                            ┌────────────┐─╮
94  *                            │    P_n+31  │ │
95  *                            ├────────────┤ │
96  *                            ╎     ...    ╎ │
97  *                            ├────────────┤ │
98  *                            │     P_n    │ │
99  *                            ├────────────┤─╯
100  *                            │            │
101  *                            ╎            ╎
102  *                            ╎     ...    ╎
103  *                            ╎            ╎
104  *                            │            │
105  *   mte_tag_storage_end ─ ─ ─├────────────┤ ─ ─ ─ vm_pages_end
106  *              ┬             │TTTTTTTTTTTT│ Tag storage for pages [n:n+31]
107  *              │             ├────────────┤
108  *              │             │            │
109  *              │             ╎     ...    ╎
110  *              │             │            │
111  *              │             ├────────────┤
112  *       1/32   │             │TTTTTTTTTTTT│ Tag storage for pages [i:i+31]
113  *      of DRAM │             ├────────────┤
114  *              │             │            │
115  *              │             ╎     ...    ╎
116  *              │             │            │
117  *              │             ├────────────┤
118  *              │             │TTTTTTTTTTTT│ Tag storage for pages [32:63]
119  *              │             ├────────────┤
120  *              ┴             │TTTTTTTTTTTT│ Tag storage for pages [0:31]
121  * mte_tag_storage_start ─ ─ ─├────────────┤─╮
122  *                            │    P_i+31  │ │
123  *                            ├────────────┤ │
124  *                            ╎     ...    ╎ │
125  *                            ├────────────┤ │
126  *                            │     P_i    │ │
127  *                            ├────────────┤─╯
128  *                            │            │
129  *                            ╎            ╎
130  *                            ╎     ...    ╎
131  *                            ╎            ╎
132  *                            │            │
133  *                            ├────────────┤─╮
134  *                            │            │ │
135  *                            ╎     ...    ╎ │
136  *                            ├────────────┤ │ ─ ─ vm_pages
137  *                            ╎     ...    ╎ │
138  *                            │            │ │
139  *                            ├────────────┤─╯
140  *                            │            │
141  *                            ╎            ╎
142  *                            ╎     ...    ╎
143  *                            ╎            ╎
144  *                            │            │
145  *                            ├────────────┤─╮
146  *                            │    P_31    │ │
147  *                            ├────────────┤ │
148  *                            ╎     ...    ╎ │
149  *                            ├────────────┤ │
150  *                            │    P_0     │ │
151  *  pmap_first_pnum        ─ ─└────────────┘─╯ ─ ─ gDramBase
152  *                           Physical Memory
153  *
154  *
155  * Tag storage and cells
156  * ~~~~~~~~~~~~~~~~~~~~~
157  *
158  * Tag storage pages require metadata to track their state machine, in order to
159  * not grow the vm_page_t data structure for all pages on the system when only
160  * 1/32 of them are tag storage.
161  *
162  * The metadata is stored into a data structure called the MTE cell
163  * (@see cell_t) which is queued into the so called MTE Info data structure
164  * (@see @c mte_info_lists).
165  *
166  * The documentation of this file happily calls a cell a tag storage page and
167  * vice versa as a result, since the mapping is 1:1.
168  *
169  *
170  * Tag storage state machine
171  * ~~~~~~~~~~~~~~~~~~~~~~~~~
172  *
173  * Disabled is a special state: this is the state cells start in, and one
174  * they never transition back to unless there is an ECC error.
175  *
176  * The state diagram involving "Disabled" looks like this:
177  *
178  *     ╭──────────────╮          ╭───╴K.3╶──╮          ╔══════════════╗
179  *     │  RECLAIMING  ┼───╮      │          v     ╭───>║    ACTIVE    ║
180  *     ╰──────────────╯  K.1   ╔═╪════════════╗  I.1   ╚══════════════╝
181  *                        ├───>║   DISABLED   ╫───┤
182  *      ╔═════════════╗  K.2   ╚══════════════╝  I.2   ╔══════════════╗
183  *      ║   CLAIMED   ╫───╯      ^          ^     ╰───>║   INACTIVE   ║
184  *      ╚═══════════╪═╝          │          │          ╚═╪════════════╝
185  *                  ╰────╴U.1╶───╯          ╰───╴U.2╶────╯
186  *
187  *   ╔═╗ Double bar square boxes         ╭─╮ Single bar round boxes
188  *   ╚═╝ denote stable states.           ╰─╯ denote transitionary states.
189  *
190  *
191  * Initialization (I.1, I.2)
192  *
193  *   This is performed by mteinfo_tag_storage_release_startup().
194  *   This function might decide to leave pages as disabled.
195  *
196  * Unmanaged discovery (U.1, U.2)
197  *
198  *   This is performed at lockdown by mteinfo_tag_storage_unmanaged_discover()
199  *   to discover tag storage that covers pages that will never have a canonical
200  *   vm_page_t made for them, which are effectively unmanaged.
201  *
202  * Retirement (K.1, K.2, K.3)
203  *
204  *   This is performed by mteinfo_tag_storage_set_retired(),
205  *   itself called by vm_page_retire() which can only happen
206  *   for pages that were never created (the cell will be DISABLED),
207  *   or on the tag storage claimed page free path (the cell
208  *   will either be RECLAIMING or CLAIMED).
209  *
210  *
211  * The rest of the tag storage state machine looks like this:
212  *
213  *                            ╭──────────────╮
214  *               ╭────╴D.2╶───┼ DEACTIVATING │<───╴D.1╶────╮
215  *               │      a     ╰──────────────╯      a      │
216  *               v                                         │
217  *  ╔══════════════╗          ╭──────────────╮           ╔═╪════════════╗
218  *  ║   INACTIVE   ╫──╴A.1╶──>│  ACTIVATING  ┼───╴A.2╶──>║    ACTIVE    ║<─╮
219  *  ╚════════════╪═╝   i/a    ╰──────────────╯    i/a    ╚══════════════╝  │
220  *    ^          │                                                         │
221  *    │          │                                                         │
222  *    │          │                          ╔════════════╗                 │
223  *    │          │              ╭───╴B.2╶───╫   PINNED   ║<───╴B.1╶───╮    │
224  *    │          │              │     i     ╚════════════╝      a     │   R.2
225  *    │          │              │                                     │    a
226  *    │          │              │          ╭─────╴R.x╶─────╮          │    │
227  *    │          │              v          v       a       │          │    │
228  *    │          │            ╔═════════════╗            ╭─┼──────────┼─╮  │
229  *    │          ╰────╴C.1╶──>║   CLAIMED   ╫────╴R.1╶──>│  RECLAIMING  ┼──╯
230  *    │                 i     ╚═╪═══════════╝      a     ╰─┼────────────╯
231  *    │                         │                          │
232  *    ╰──────────╴F.1╶──────────╯<─────────╴F.2╶───────────╯
233  *                 i                         i
234  *
235  *   ╔═╗ Double bar square boxes         ╭─╮ Single bar round boxes
236  *   ╚═╝ denote stable states.           ╰─╯ denote transitionary states.
237  *
238  *    a  the transition can be done by the refill thread (async)
239  *    i  the transition can be done inline by any thread.
240  *
241  *
242  * Activation (A.1, A.2)
243  *
244  *   [A.1 inline] is performed by mteinfo_tag_storage_try_activate(), called
245  *   by vm_page_grab_slow(), if the current grab would deplete the taggable
246  *   space too much and there seems to be an ample reserve of free
247  *   pages.
248  *
249  *   This path however will limit itself to pages that are really worth
250  *   activating (17+ associated free pages, i.e. those above the first 3
251  *   mteinfo buckets for MTE_STATE_INACTIVE).
252  *
253  *
254  *   [A.1 async] is performed by mteinfo_tag_storage_active_refill() when it
255  *   decides that activating pages is the best strategy to get more taggable
256  *   pages.  It will only do so if [R.1 async] isn't more profitable.
257  *
258  *
259  *   [A.2 inline/async] is performed by mteinfo_tag_storage_activate_locked()
260  *   on the results of [A.1 inline/async]. The most notable thing to mention
261  *   is that until the tag pages are fully activated, no tagged page can be
262  *   allocated, and if the thread doing this operation inline is a low-priority
263  *   thread, this could cause starvation due to priority inversions.
264  *
265  *   To prevent this issue, turnstiles are used for the inline case so that
266  *   there's a single activator at a time with priority inversion avoidance.
267  *   The async path doesn't use this, as it is a very high-priority thread,
268  *   and is meant to run in case of emergencies.
269  *
270  *
271  * Deactivation (D.1, D.2)
272  *
273  *   [D.1 async] is performed by mteinfo_tag_storage_drain(). The refill
274  *   thread will invoke this function after it is done with activations.
275  *
276  *   This phase will only drain active(0.0) pages, meaning pages that are active
277  *   but have neither associated free pages nor MTE pages. Having such pages
278  *   on the system is a sign of untagged memory pressure, and it's probably
279  *   a good idea to free that tag storage page so it can be used for untagged
280  *   purposes (i.e., become claimed).
281  *
282  *   It will drain pages until the @c mte_claimable_queue has a healthy level.
283  *
284  *   This transition is triggered lazily from the @c mteinfo_free_queue_grab()
285  *   path when untagged pages have been allocated and tapped into the taggable
286  *   space, and system conditions permit
287  *   (see @c mteinfo_tag_storage_should_drain()).
288  *
289  *   [D.2 async] is performed by mteinfo_tag_storage_drain_flush(),
290  *   which is called by mteinfo_tag_storage_drain() on the results
291  *   of [D.1 async].
292  *
293  *
294  * Allocation/Claiming (C.1)
295  *
296  *   [C.1 inline] is performed by @c mteinfo_tag_storage_claimable_refill()
297  *   from the context of any @c mteinfo_free_queue_grab() (tagged or regular).
298  *   The path will opportunistically determine whether there are enough pages
299  *   on the @c mte_claimable_queue to make amortizing the cost of taking
300  *   the spinlock protecting the per-cpu queue worth it.
301  *
302  *   Otherwise, it is done unconditionally, as the reclaim thread can steal
303  *   from these queues. The @c vm_page_grab_options() fastpath knows how
304  *   to draw from this directly.
305  *
306  *
307  * Freeing (F.1, F.2)
308  *
309  *   [F.1 inline] is performed by page free paths that eventually call into
310  *   @c vm_page_free_queue_enter(VM_MEMORY_CLASS_TAG_STORAGE).
311  *
312  *   [F.2 inline] is the exact same transition but for the case when the refill
313  *   thread was attempting to reclaim this page (it had performed [R.1 async]).
314  *   It is worth noting that, on paper, the [C.1 inline] transition could happen
315  *   again before the refill thread notices.
316  *
317  *
318  * Reclaiming (R.1, R.2, R.x, B.1, B.2)
319  *
320  *   [R.1 async] is performed by mteinfo_tag_storage_active_refill() when it
321  *   decides that reclaiming (stealing) pages is the best strategy to get more
322  *   taggable pages. It will only do so if [A.1 async] isn't more profitable.
323  *
324  *   Once pages have been marked as reclaiming, it will attempt to either steal
325  *   the page from the cpu free queue, or attempt a relocation.
326  *
327  *   [R.2 async] is exactly the same as [A.2 async], being performed by
328  *   mteinfo_tag_storage_activate_locked() on the results of [R.1 async].
329  *   The major difference however is that it is done one page at a time.
330  *
331  *   [B.1 async] is performed by @c mteinfo_reclaim_tag_storage_page() when
332  *   relocating a claimed page failed due to the page being pinned.
333  *   In that case, the tag storage page is marked with the @c vmp_ts_wanted bit.
334  *
335  *   [B.2 inline] is performed by @c mteinfo_tag_storage_wakeup() when threads
336  *   notice that @c vmp_ts_wanted is set and that the condition causing it to be
337  *   set has cleared.
338  *
339  *   [R.x async] is performed when stealing the page was otherwise not
340  *   successful (in @c mteinfo_reclaim_tag_storage_page() or
341  *   @c mteinfo_tag_storage_flush_reclaiming()).
342  */
343 
344 
345 #pragma mark Types
346 
347 /*!
348  * @typedef cell_state_mask_t
349  *
350  * @abstract
351  * Mask/bit-field version of the @c mte_cell_state_t bit in order to do assertions.
352  */
353 __options_decl(cell_state_mask_t, uint32_t, {
354 	MTE_MASK_DISABLED       = BIT(MTE_STATE_DISABLED),
355 	MTE_MASK_PINNED         = BIT(MTE_STATE_PINNED),
356 	MTE_MASK_DEACTIVATING   = BIT(MTE_STATE_DEACTIVATING),
357 	MTE_MASK_CLAIMED        = BIT(MTE_STATE_CLAIMED),
358 	MTE_MASK_INACTIVE       = BIT(MTE_STATE_INACTIVE),
359 	MTE_MASK_RECLAIMING     = BIT(MTE_STATE_RECLAIMING),
360 	MTE_MASK_ACTIVATING     = BIT(MTE_STATE_ACTIVATING),
361 	MTE_MASK_ACTIVE         = BIT(MTE_STATE_ACTIVE),
362 });
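
/*
 * For instance (illustrative only, mirroring the state diagrams above), the
 * Allocation/Claiming transition [C.1] from INACTIVE to CLAIMED is asserted
 * and performed using these masks:
 *
 * <code>
 *   assert_cell_state(cell, MTE_MASK_INACTIVE);  // [C.1] precondition
 *   CELL_UPDATE(cell, tag_page, false, {
 *           cell->state = MTE_STATE_CLAIMED;
 *   });
 * </code>
 *
 * This is the pattern mteinfo_tag_storage_set_claimed() below follows (it
 * additionally allows MTE_MASK_RECLAIMING to cover the [R.x] transition).
 */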
363 
364 #define MTE_FF_CELL_INDEX_BITS          24 /* Number of bits for a cell index */
365 #define MTE_FF_CELL_PAGE_COUNT_BITS     6  /* Number of bits for a page count */
366 #define MTE_FF_CELL_STATE_BITS          3
367 
368 /*!
369  * @typedef cell_idx_t
370  *
371  * @abstract
372  * Represents the index of a cell in the cell array (when positive), or a queue
373  * head (when negative).
374  *
375  * @discussion
376  * This type only has @c MTE_FF_CELL_INDEX_BITS worth of significant bits.
377  * Given that one bit is used to denote queues, it means we can support systems
378  * with up to:
379  * - 2^(MTE_FF_CELL_INDEX_BITS - 1) tag storage pages,
380  * - 2^(MTE_FF_CELL_INDEX_BITS + 4) pages,
381  * - 2^(MTE_FF_CELL_INDEX_BITS + 4 + PAGE_SHIFT) bytes.
382  *
383  * On a 16KB system (PAGE_SHIFT == 14) and with MTE_FF_CELL_INDEX_BITS == 24,
384  * this covers 2^42 == 4TB of physical memory.
385  */
386 typedef int32_t cell_idx_t;
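
/*
 * The capacity math above, spelled out (a sketch; these static_asserts are
 * illustrative, not part of this file), assuming MTE_FF_CELL_INDEX_BITS == 24,
 * MTE_PAGES_PER_TAG_PAGE == 32 and PAGE_SHIFT == 14:
 *
 * <code>
 *   static_assert((1ull << (MTE_FF_CELL_INDEX_BITS - 1)) == 8u << 20);
 *   // 2^23 tag storage pages x 32 covered pages x 16KB per page
 *   //     = 2^(23 + 5 + 14) bytes = 2^42 bytes = 4TB
 * </code>
 */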
387 
388 typedef uint32_t cell_count_t;
389 
390 /*!
391  * @typedef cell_t
392  *
393  * @abstract
394  * This data structure contains the metadata associated with a tag storage page,
395  * and its covered pages in the mteinfo tracking data structure.
396  *
397  * @discussion
398  * Here are some important invariants for this data structure:
399  * - mte_page_count + popcount(free_mask) <= MTE_PAGES_PER_TAG_PAGE
400  * - mte_page_count must be 0 unless state is DISABLED or ACTIVE.
401  *
402  * @field prev
403  * Linkage to the prev cell (as an index in the cell array).
404  *
405  * @field next
406  * Linkage to the next cell (as an index in the cell array).
407  *
408  * @field enqueue_pos
409  * If @c free_mask isn't 0, this contains the index of the free covered page
410  * which represents this cell in the mte free queues (@see @c mte_free_queues[]).
411  *
412  * @field mte_page_count
413  * The number of pages covered with this tag storage page, that are currently
414  * used and tagged.
415  *
416  * @field state
417  * The current state of the tag storage page this cell represents.
418  * @see mte_cell_state_t.
419  *
420  * @field free_mask
421  * A bitmask where each bit set corresponds to an associated covered page that
422  * is free (tagged or not).
423  *
424  * @field cell_count
425  * When the cell is a queue head, the number of cells enqueued on this bucket.
426  */
427 #pragma pack(4)
428 typedef struct {
429 	cell_idx_t              prev : MTE_FF_CELL_INDEX_BITS;
430 	cell_idx_t              next : MTE_FF_CELL_INDEX_BITS;
431 	cell_count_t            enqueue_pos : MTE_FF_CELL_PAGE_COUNT_BITS;
432 	cell_count_t            mte_page_count : MTE_FF_CELL_PAGE_COUNT_BITS;
433 	mte_cell_state_t        state : MTE_FF_CELL_STATE_BITS;
434 	uint8_t                 __unused_bits : 1;
435 	union {
436 		uint32_t        free_mask;
437 		uint32_t        cell_count;
438 	};
439 } cell_t;
440 #pragma pack()
441 
442 static_assert(sizeof(cell_t) == 12);
443 static_assert(MTE_STATE_ACTIVE < (1u << MTE_FF_CELL_STATE_BITS));
444 static_assert(MTE_PAGES_PER_TAG_PAGE <= (1 << MTE_FF_CELL_PAGE_COUNT_BITS));
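
/*
 * A minimal sketch (hypothetical helper, not used by this file) of the
 * invariants documented above:
 *
 * <code>
 *   static void
 *   cell_check_invariants(cell_t cell)
 *   {
 *           // free + tagged pages can never exceed the covered cluster size
 *           assert(cell.mte_page_count + __builtin_popcount(cell.free_mask) <=
 *               MTE_PAGES_PER_TAG_PAGE);
 *           // only DISABLED and ACTIVE cells may have tagged pages
 *           if (cell.state != MTE_STATE_DISABLED &&
 *               cell.state != MTE_STATE_ACTIVE) {
 *                   assert(cell.mte_page_count == 0);
 *           }
 *   }
 * </code>
 */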
445 
446 /*!
447  * @typedef mte_cell_queue_t
448  *
449  * @abstract
450  * This data structure represents a particular queue/bucket of cells.
451  */
452 typedef struct mte_cell_queue_head {
453 	cell_t          head;
454 } *mte_cell_queue_t;
455 
456 /*!
457  * @typedef mte_cell_bucket_t
458  *
459  * @abstract
460  * Represents the index of a bucket inside of a list.
461  */
462 __enum_decl(mte_cell_bucket_t, uint32_t, {
463 	MTE_BUCKET_0,
464 	MTE_BUCKET_1_8,
465 	MTE_BUCKET_9_16,
466 	MTE_BUCKET_17_24,
467 	MTE_BUCKET_25_32,
468 
469 	_MTE_BUCKET_COUNT,
470 });
471 
472 static_assert(_MTE_BUCKET_COUNT == MTE_BUCKETS_COUNT_MAX);
473 
474 #define MTE_QUEUES_COUNT \
475 	(1 /* disabled */ + \
476 	 1 /* pinned */ + \
477 	 MTE_BUCKETS_COUNT_MAX /* claimed */ + \
478 	 MTE_BUCKETS_COUNT_MAX /* inactive */ + \
479 	 1 /* deactivating */ + \
480 	 1 /* reclaiming */ + \
481 	 1 /* activating */ + \
482 	 MTE_BUCKETS_COUNT_MAX /* active_0 */ + \
483 	 1 /* active */ )
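
/*
 * With MTE_BUCKETS_COUNT_MAX == 5 (per the static_assert above), this works
 * out to 1 + 1 + 5 + 5 + 1 + 1 + 1 + 5 + 1 == 21 queues.
 */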
484 
485 
486 #pragma mark Behavioral boot-args
487 
488 /*
489  * Boot-arg to enable/disable the interface for grabbing tag storage pages.
490  * This exists in case tunables or settings for tag storage management expose
491  * us to page shortages or system hangs due to wired tag storage pages.  This
492  * boot-arg should allow us to bypass any such issues.
493  */
494 static TUNABLE(bool, vm_mte_enable_tag_storage_grab, "mte_ts_grab", true);
495 
496 /*
497  * Boot-args controlling the draining down of tag storage space
498  *
499  * @var vm_page_tag_storage_reserved
500  * How many tag storage pages the inactive_0 queue needs to preserve
501  * at all times.
502  */
503 TUNABLE(uint32_t, vm_page_tag_storage_reserved, "mte_ts_grab_rsv", 100);
504 
505 /*
506  * Boot-arg to enable/disable grabbing tag storage pages for the compressor
507  * pool.
508  */
509 TUNABLE(bool, vm_mte_tag_storage_for_compressor, "mte_ts_compressor", true);
510 
511 #ifndef VM_MTE_FF_VERIFY
512 /*
513  * Boot-arg to enable/disable grabbing tag storage pages for specific VM tags.
514  * Note that the string length was somewhat arbitrarily chosen, so if the use
515  * case arises, we may need to bump that up...
516  *
517  * Currently, we allow allocations with VM tags of VM_MEMORY_MALLOC_SMALL (2),
518  * VM_MEMORY_MALLOC_TINY (7), and VM_MEMORY_MALLOC_NANO (11) to use tag storage
519  * pages. See vm_statistics.h for other potential candidates.
520  * In particular, VM_MEMORY_STACK (30) is promising.
521  */
522 static TUNABLE_STR(vm_mte_tag_storage_for_vm_tags, 256, "mte_ts_vmtag", "2,7,11");
523 #endif /* VM_MTE_FF_VERIFY */
524 
525 #pragma mark Counters and Globals
526 
527 struct mte_cell_list mte_info_lists[MTE_LISTS_COUNT];
528 
529 static SECURITY_READ_ONLY_LATE(cell_t *) mte_info_cells;
530 
531 #ifndef VM_MTE_FF_VERIFY
532 /*
533  * Fill thread state.  The wake state of the thread is tracked to minimize
534  * scheduler interactions.  Guarded by the free page lock.
535  */
536 static sched_cond_atomic_t fill_thread_cond = SCHED_COND_INIT;
537 static SECURITY_READ_ONLY_LATE(thread_t) vm_mte_fill_thread = THREAD_NULL;
538 static thread_t vm_mte_activator = THREAD_NULL;
539 static bool vm_mte_activator_waiters = false;
540 
541 struct mte_pcpu PERCPU_DATA(mte_pcpu);
542 SCALABLE_COUNTER_DEFINE(vm_cpu_free_tagged_count);
543 SCALABLE_COUNTER_DEFINE(vm_cpu_free_claimed_count);
544 #endif /* VM_MTE_FF_VERIFY */
545 
546 /*
547  * Free taggable pages queue, per-cpu queues, and its counters.
548  *
549  * guarded by the free page lock
550  */
551 uint32_t vm_page_free_taggable_count;
552 uint32_t vm_page_free_unmanaged_tag_storage_count;
553 uint32_t vm_page_tagged_count; /* Total tagged covered pages. */
554 uint32_t vm_page_free_wanted_tagged = 0;
555 uint32_t vm_page_free_wanted_tagged_privileged = 0;
556 
557 /*
558  * Counters for tag storage pages we will just give to the system permanently
559  * for use as regular memory.  These could technically be a subset of the
560  * claimed tag storage, but counting them separately is useful because they
561  * will have a different page lifecycle than the claimed tag storage pages...
562  * as when freed, these pages will go to the regular free queues.
563  *
564  * These shouldn't be mutated after bootstrap... so they have no lock.
565  */
566 uint32_t vm_page_recursive_tag_storage_count;
567 uint32_t vm_page_retired_tag_storage_count;
568 uint32_t vm_page_unmanaged_tag_storage_count;
569 
570 /*
571  * The wired tag storage page count is guarded by the page queues lock.  This
572  * counter is diagnostic; it exists to inform investigations about reclaim
573  * efficiency.
574  */
575 uint32_t vm_page_wired_tag_storage_count;
576 
577 /*
578  * Diagnostic counters for reclamation; they describe how many times reclamation
579  * attempts have succeeded or failed (as well as a breakout for failures due to
580  * the page being wired).  Guarded by the free page lock.
581  */
582 uint64_t vm_mte_refill_thread_wakeups;
583 uint64_t vm_page_tag_storage_activation_count;
584 uint64_t vm_page_tag_storage_deactivation_count;
585 uint64_t vm_page_tag_storage_reclaim_from_cpu_count;
586 uint64_t vm_page_tag_storage_reclaim_success_count;
587 uint64_t vm_page_tag_storage_reclaim_failure_count;
588 uint64_t vm_page_tag_storage_reclaim_wired_failure_count;
589 uint64_t vm_page_tag_storage_wire_relocation_count;
590 uint64_t vm_page_tag_storage_reclaim_compressor_failure_count;
591 uint64_t vm_page_tag_storage_compressor_relocation_count;
592 
593 #ifndef VM_MTE_FF_VERIFY
594 /*
595  * Diagnostic counter for reclamation describing the number of tag storage
596  * pages that have ever been allocated as claimed. Note that this value
597  * only increases.
598  */
599 SCALABLE_COUNTER_DEFINE(vm_cpu_claimed_count);
600 #endif /* VM_MTE_FF_VERIFY */
601 
602 /*
603  * Array of 4 64-bit masks for which VM tags can use tag storage.
604  * There are a total of 256 VM tags.
605  * This shouldn't be mutated after bootstrap... so it has no lock.
606  */
607 bitmap_t vm_mte_tag_storage_for_vm_tags_mask[BITMAP_LEN(VM_MEMORY_COUNT)];
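
/*
 * Illustrative only: with the default "2,7,11" boot-arg above, bits 2, 7
 * and 11 of this bitmap end up set, and an eligibility check would look
 * like this hypothetical helper:
 *
 * <code>
 *   static bool
 *   vm_tag_may_use_tag_storage(vm_tag_t tag)
 *   {
 *           return bitmap_test(vm_mte_tag_storage_for_vm_tags_mask, tag);
 *   }
 * </code>
 */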
608 
609 #pragma mark cell_idx_t
610 
611 __pure2
612 static bool
613 cell_idx_is_queue(cell_idx_t idx)
614 {
615 	return idx < 0;
616 }
617 
618 __pure2
619 static cell_t *
620 cell_from_idx(cell_idx_t idx)
621 {
622 	return &mte_info_cells[idx];
623 }
624 
625 __pure2
626 __attribute__((overloadable))
627 static cell_idx_t
628 cell_idx(const cell_t *cell)
629 {
630 	return (cell_idx_t)(cell - mte_info_cells);
631 }
632 
633 __pure2
634 __attribute__((overloadable))
635 static cell_idx_t
636 cell_idx(mte_cell_queue_t queue)
637 {
638 	return cell_idx(&queue->head);
639 }
640 
641 __pure2
642 static cell_count_t
643 cell_free_page_count(cell_t cell)
644 {
645 	return __builtin_popcountll(cell.free_mask);
646 }
647 
648 __pure2
649 static ppnum_t
650 cell_first_covered_pnum(const cell_t *cell)
651 {
652 	return pmap_first_pnum + cell_idx(cell) * MTE_PAGES_PER_TAG_PAGE;
653 }
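
/*
 * Worked example (illustrative numbers): with MTE_PAGES_PER_TAG_PAGE == 32
 * and pmap_first_pnum == 0x1000 (32-page aligned), the cell at index 3
 * covers pnums [0x1060, 0x107f]:
 *
 * <code>
 *   cell_first_covered_pnum(cell_from_idx(3)) == 0x1000 + 3 * 32 == 0x1060
 *   // conversely, covered page 0x1065 maps back to cell 3, bit 5 of its
 *   // free_mask, since 0x1065 % 32 == 5 (see cell_from_covered_ppnum()).
 * </code>
 */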
654 
655 
656 #pragma mark mte_cell_queue_t
657 
658 /*
659  * Based on the existing queue code in XNU.  Look at <kern/queue.h> for the
660  * original code; done here due to the custom linkages.
661  */
662 
663 static cell_idx_t
664 cell_queue_first_idx(mte_cell_queue_t queue)
665 {
666 	return queue->head.next;
667 }
668 
669 static cell_idx_t
670 cell_queue_last_idx(mte_cell_queue_t queue)
671 {
672 	return queue->head.prev;
673 }
674 
675 static cell_t *
676 cell_queue_first(mte_cell_queue_t queue)
677 {
678 	return cell_from_idx(cell_queue_first_idx(queue));
679 }
680 
681 static uint32_t
682 cell_queue_count(mte_cell_queue_t queue)
683 {
684 	return queue->head.cell_count;
685 }
686 
687 
688 static bool
689 cell_queue_insert_tail(mte_cell_queue_t queue, cell_t *cell)
690 {
691 	cell_idx_t qidx = cell_idx(queue);
692 	cell_idx_t tidx = cell_queue_last_idx(queue);
693 	cell_t    *tail = cell_from_idx(tidx);
694 
695 	if (tail->next != qidx) {
696 		__queue_element_linkage_invalid(tail);
697 	}
698 
699 	cell->next = qidx;
700 	cell->prev = tidx;
701 	queue->head.prev = tail->next = cell_idx(cell);
702 
703 	/* If the original tail was the queue, then it was empty. */
704 	return cell_idx_is_queue(tidx);
705 }
706 
707 static bool
708 cell_queue_remove(cell_t *cell)
709 {
710 	cell_idx_t pidx = cell->prev;
711 	cell_idx_t nidx = cell->next;
712 	cell_idx_t cidx = cell_idx(cell);
713 	cell_t    *prev = cell_from_idx(pidx);
714 	cell_t    *next = cell_from_idx(nidx);
715 
716 	if (prev->next != cidx || next->prev != cidx) {
717 		__queue_element_linkage_invalid(cell);
718 	}
719 
720 	next->prev = pidx;
721 	prev->next = nidx;
722 	/* No linkage cleanup because cells are never dequeued at rest. */
723 
724 	/*
725 	 * If the prev and next indices are the same, then this is the head
726 	 * index, and the queue became empty.
727 	 */
728 
729 	return pidx == nidx;
730 }
731 
732 #define cell_queue_foreach(it, q) \
733 	for (cell_t *it = cell_queue_first(q); \
734 	     it != &(q)->head; \
735 	     it = cell_from_idx(it->next))
736 
737 #define cell_queue_foreach_safe(it, q) \
738 	for (cell_t *__next_it, *it = cell_queue_first(q); \
739 	     it != &(q)->head && (__next_it = cell_from_idx(it->next), 1); \
740 	     it = __next_it)
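
/*
 * Usage sketch (hypothetical; any cell bucket works): walking a bucket and
 * cross-checking its cached count:
 *
 * <code>
 *   mte_cell_queue_t q = &mte_info_lists[MTE_LIST_ACTIVE_IDX].buckets[0];
 *   uint32_t n = 0;
 *
 *   cell_queue_foreach(it, q) {
 *           n++;
 *   }
 *   assert(n == cell_queue_count(q));
 * </code>
 */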
741 
742 
743 #pragma mark MTE free queue
744 
745 /*
746  * The MTE free queue is a multi-dimensioned queue that replaces the
747  * vm_page_free_queue for covered pages on MTE targets.
748  *
749  * It is an array of colored free queues indexed by @c mte_free_queue_idx_t.
750  *
751  *
752  * A queue of tag storage pages
753  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
754  *
755  * When a tag storage page has no associated free covered pages, no page is
756  * enqueued on the mte free queue. However, when a tag storage page has one or
757  * more associated free covered pages, then one and only one of these
758  * pages is enqueued on the mte free queues.
759  *
760  * The chosen representative for the cell is remembered in the associated
761  * tag storage cell's @c cell_t::enqueue_pos value.
762  *
763  *
764  * Enqueue / dequeue algorithm
765  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
766  *
767  * This chosen representative makes the cluster available for its page color,
768  * and only this color, even though other colors may be available for this
769  * tag storage page.
770  *
771  * When removing a free page from the MTE queue, if the page being grabbed
772  * was the enqueued candidate, then the next enqueued candidate is chosen
773  * as the next free page in bitmask "circular" order
774  * (@see mteinfo_free_queue_next_bit()).
775  *
776  * As a result, by "pushing" the page forward this way, the tag storage page
777  * will be made available through all colors that it can provide.
778  *
779  *
780  * Allocation stability and bucket selection
781  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
782  *
783  * The free queues are in that order:
784  *
785  *   {claimed/disabled} -> {inactive_0, inactive_1} ->
786  *   {active_0, active_1, active_2, active_3} -> {activating}
787  *
788  * This is selected carefully to have the following crucial properties:
789  *
790  * - allocating untagged pages chooses buckets "left to right"
791  *   (in increasing free queue index order).
792  *
793  * - allocating tagged pages chooses active buckets "right to left"
794  *   (in decreasing free queue index from the active_* queues).
795  *
796  * - when allocating untagged pages, the impact on the tag storage page will
797  *   be that it stays in the same free queue or moves "down" in the free queue
798  *   indices order.
799  *
800  * - when allocating tagged pages, the impact on the tag storage page will
801  *   be that it stays in the same free queue or moves "up" in the free queue
802  *   indices order.
803  *
804  * This is important and allows for a nice optimization: if a tag storage page
805  * was found to be a good candidate for a given grab operation, it always will
806  * stay a "best" candidate until it has no free pages left, which allows for
807  * allocations of contiguous spans of pages at once
808  * (@see mteinfo_free_queue_grab()).
809  *
810  * Lastly, in order to find the first free bucket quickly,
811  * @c mte_free_queue_mask is a bitmask where a bit being set means that the
812  * corresponding bucket has at least one non-empty queue.
813  *
814  *
815  * Tag Storage Free queue
816  * ~~~~~~~~~~~~~~~~~~~~~~
817  *
818  * Tag storage pages can only be claimed if they are inactive with the [C.1]
819  * transition. Getting pages to inactive is done via the Deactivation [D.*].
820  *
821  * However, as we mentioned the MTE free queue is only about covered pages
822  * proper, and do not contain the tag storage pages. Another point is that
823  * we do not want to claim pages too aggressively as it could get in the way
824  * of the Activation [A.*] transition when tagged pages are required.
825  *
826  * To solve this tension, the @c mte_claimable_queue holds inactive tag storage
827  * pages that have 8 or fewer free pages at any given time. These are unlikely
828  * to be profitable activation candidates, but also demonstrate that there is
829  * enough untagged memory pressure on the system that we have clusters of
830  * covered pages in use.
831  *
832  * The @c mteinfo_free_queue_grab() code will promote these to a per-cpu
833  * free queue that in turn the @c vm_page_grab_options() fastpath can tap into
834  * as another opportunistic source of pages.
835  */
836 struct vm_page_free_queue mte_free_queues[MTE_FREE_NOT_QUEUED];
837 struct vm_page_free_queue mte_claimable_queue;
838 static uint32_t mte_free_queue_mask;
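
/*
 * A minimal sketch (assumed helpers; the real logic lives in
 * mteinfo_free_queue_grab()) of the bucket selection order described above:
 *
 * <code>
 *   // untagged grabs scan "left to right": lowest non-empty bucket first
 *   idx = (mte_free_queue_idx_t)(ffs(mte_free_queue_mask) - 1);
 *
 *   // tagged grabs scan active buckets "right to left": highest first
 *   idx = (mte_free_queue_idx_t)(fls(mte_free_queue_mask & active_bits) - 1);
 * </code>
 *
 * where active_bits stands for a hypothetical mask of the MTE_FREE_ACTIVE_*
 * indices.
 */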
839 
840 /*!
841  * @abstract
842  * Computes the proper mte free queue index for a given cell.
843  */
844 __pure2
845 static mte_free_queue_idx_t
846 mteinfo_free_queue_idx(cell_t cell)
847 {
848 	uint32_t free   = cell_free_page_count(cell);
849 	uint32_t tagged = cell.mte_page_count;
850 	uint32_t used   = MTE_PAGES_PER_TAG_PAGE - free - tagged;
851 	uint32_t n;
852 
853 	if (cell.free_mask == 0) {
854 		return MTE_FREE_NOT_QUEUED;
855 	}
856 
857 	switch (cell.state) {
858 	case MTE_STATE_DISABLED:
859 	case MTE_STATE_PINNED:
860 	case MTE_STATE_DEACTIVATING:
861 		return MTE_FREE_UNTAGGABLE_0;
862 
863 	case MTE_STATE_CLAIMED:
864 	case MTE_STATE_INACTIVE:
865 		/*
866 		 * This is "clever" code to map:
867 		 * MTE_FREE_UNTAGGABLE_0: Claimed[0-16]
868 		 * MTE_FREE_UNTAGGABLE_1: Claimed[16-32], Inactive[0-16]
869 		 * MTE_FREE_UNTAGGABLE_2: Inactive[16-32]
870 		 */
871 		n = MTE_FREE_UNTAGGABLE_0 + cell.state - MTE_STATE_CLAIMED;
872 		static_assert(MTE_STATE_INACTIVE == MTE_STATE_CLAIMED + 1);
873 		return n + (free > MTE_PAGES_PER_TAG_PAGE / 2);
874 
875 	case MTE_STATE_RECLAIMING:
876 	case MTE_STATE_ACTIVATING:
877 		return MTE_FREE_UNTAGGABLE_ACTIVATING;
878 
879 	case MTE_STATE_ACTIVE:
880 		break;
881 	}
882 
883 	/*
884 	 * Empirically this seems to give decent fragmentation results
885 	 * with alternating MTE/non-MTE workloads.
886 	 *
887 	 * This tries to find a balance between favoring buckets with mte pages
888 	 * allocated and to penalize buckets with untagged pages allocated,
889 	 * while keeping buckets with the most free pages on the fence.
890 	 *
891 	 * The distribution it generates can be printed by running the
892 	 * "active_buckets" subtest of tests/vm/vm_mteinfo.c
893 	 */
894 
895 	n  = tagged + free / 5;
896 	n -= MIN(n, used) / 3;
897 	return MTE_FREE_ACTIVE_0 + fls(n / 4);
898 }
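
/*
 * Worked examples for the active formula above (illustrative numbers,
 * with MTE_PAGES_PER_TAG_PAGE == 32):
 *
 * - tagged = 4, free = 20, used = 8:
 *   n = 4 + 20/5 = 8; n -= MIN(8, 8)/3 = 2, leaving n = 6;
 *   fls(6/4) = fls(1) = 1, yielding MTE_FREE_ACTIVE_0 + 1.
 *
 * - tagged = 20, free = 5, used = 7:
 *   n = 20 + 5/5 = 21; n -= MIN(21, 7)/3 = 2, leaving n = 19;
 *   fls(19/4) = fls(4) = 3, yielding MTE_FREE_ACTIVE_0 + 3.
 *
 * Heavily tagged cells thus land in high buckets, which the "right to left"
 * tagged allocation order described above picks first.
 */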
899 
900 static vm_page_queue_t
901 mteinfo_free_queue_head(mte_free_queue_idx_t idx, uint32_t color)
902 {
903 	return &mte_free_queues[idx].vmpfq_queues[color].qhead;
904 }
905 
906 /*!
907  * @abstract
908  * Computes the next bit in "circular" mask order
909  *
910  * @discussion
911  * This computes the next bit set in @c mask that is larger or equal
912  * to @c bit, or if none exist, then the smallest bit set in @c mask.
913  *
914  * This means that for a mask with positions mask={1, 5, 6, 10} set,
915  * the "next" bit for:
916  * - 4 is 5,
917  * - 10 is 10,
918  * - 12 is 1.
919  *
920  * @param mask        The mask to scan. The mask must be non-zero.
921  * @param bit         The bit to scan from.
922  * @returns           The next bit set in "circular" order.
923  */
924 static cell_count_t
925 mteinfo_free_queue_next_bit(uint32_t mask, cell_count_t bit)
926 {
927 	cell_count_t cur = bit % MTE_PAGES_PER_TAG_PAGE;
928 
929 	mask = (mask >> cur) | (mask << (32 - cur));
930 	bit += ffs(mask) - 1;
931 
932 	return bit % MTE_PAGES_PER_TAG_PAGE;
933 }
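
/*
 * Worked example, matching the documentation above: for
 * mask = BIT(1) | BIT(5) | BIT(6) | BIT(10) and bit = 12, cur is 12 and the
 * rotation moves the set bits {1, 5, 6, 10} to {21, 25, 26, 30}; ffs() then
 * returns 22, so bit becomes 12 + 21 = 33, which wraps around to 1, the
 * smallest bit set in the original mask.
 */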
934 
935 /*!
936  * @abstract
937  * Backend for CELL_UPDATE() to manage update/requeues to the mte free queue.
938  *
939  * @param cell        The new state of the cell.
940  * @param orig        The original state of the cell.
941  * @param oidx        The original free queue index for the cell.
942  * @param nidx        The new free queue index for the cell.
943  */
944 __attribute__((noinline))
945 static void
946 mteinfo_free_queue_requeue(
947 	cell_t                 *cell,
948 	const cell_t            orig,
949 	mte_free_queue_idx_t    oidx,
950 	mte_free_queue_idx_t    nidx)
951 {
952 	ppnum_t         first_pnum = cell_first_covered_pnum(cell);
953 	vm_page_queue_t queue;
954 	cell_count_t    bit = orig.enqueue_pos;
955 	vm_page_t       mem;
956 
957 	if (oidx == MTE_FREE_NOT_QUEUED && nidx == MTE_FREE_NOT_QUEUED) {
958 		cell->enqueue_pos = -1;
959 		return;
960 	}
961 
962 	if (oidx != MTE_FREE_NOT_QUEUED) {
963 		mem   = vm_page_find_canonical(first_pnum + bit);
964 		queue = mteinfo_free_queue_head(oidx,
965 		    (first_pnum + bit) & vm_color_mask);
966 		assert(bit_test(orig.free_mask, bit));
967 
968 		vm_page_queue_remove(queue, mem, vmp_pageq);
969 		VM_COUNTER_DEC(&mte_free_queues[oidx].vmpfq_count);
970 		if (mte_free_queues[oidx].vmpfq_count == 0) {
971 			bit_clear(mte_free_queue_mask, oidx);
972 		}
973 	}
974 
975 	if (nidx == MTE_FREE_NOT_QUEUED) {
976 		cell->enqueue_pos = -1;
977 	} else {
978 		bit   = mteinfo_free_queue_next_bit(cell->free_mask, bit);
979 		mem   = vm_page_find_canonical(first_pnum + bit);
980 		queue = mteinfo_free_queue_head(nidx,
981 		    (first_pnum + bit) & vm_color_mask);
982 		assert(bit_test(cell->free_mask, bit));
983 
984 		cell->enqueue_pos = bit;
985 		vm_page_queue_enter_first(queue, mem, vmp_pageq);
986 		if (mte_free_queues[nidx].vmpfq_count == 0) {
987 			bit_set(mte_free_queue_mask, nidx);
988 		}
989 		VM_COUNTER_INC(&mte_free_queues[nidx].vmpfq_count);
990 	}
991 }
992 
993 
994 #pragma mark mte_cell_list_t
995 
996 __pure2
997 static mte_cell_bucket_t
998 cell_list_idx_buckets(mte_cell_list_idx_t idx)
999 {
1000 	switch (idx) {
1001 	case MTE_LIST_INACTIVE_IDX:
1002 	case MTE_LIST_CLAIMED_IDX:
1003 	case MTE_LIST_ACTIVE_0_IDX:
1004 		return MTE_BUCKETS_COUNT_MAX;
1005 	default:
1006 		return 1;
1007 	}
1008 }
1009 
1010 __pure2
1011 static mte_cell_list_idx_t
1012 cell_list_idx(const cell_t cell)
1013 {
1014 	if (cell.state != MTE_STATE_ACTIVE || cell.mte_page_count == 0) {
1015 		return (mte_cell_list_idx_t)cell.state;
1016 	}
1017 
1018 	return MTE_LIST_ACTIVE_IDX;
1019 }
1020 
1021 __pure2
1022 static mte_cell_bucket_t
1023 cell_list_bucket(const cell_t cell)
1024 {
1025 	if (cell_list_idx_buckets(cell_list_idx(cell)) > 1) {
1026 		return (cell_free_page_count(cell) + 7) / 8;
1027 	}
1028 	return 0;
1029 }
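
/*
 * The (count + 7) / 8 computation above maps free page counts onto the
 * mte_cell_bucket_t values as follows:
 *
 *   free pages:  0 | 1-8 | 9-16 | 17-24 | 25-32
 *   bucket:      0 |  1  |   2  |    3  |     4
 */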
1030 
1031 __pure2
1032 static inline bool
1033 cell_on_claimable_queue(const cell_t cell)
1034 {
1035 	if (cell.state == MTE_STATE_INACTIVE) {
1036 		return cell_list_bucket(cell) <= MTE_BUCKET_1_8;
1037 	}
1038 	return false;
1039 }
1040 
1041 __attribute__((noinline))
1042 static void
1043 cell_list_requeue(
1044 	cell_t                 *cell,
1045 	vm_page_t               tag_page,
1046 	mte_cell_list_idx_t     oidx,
1047 	mte_cell_bucket_t       obucket,
1048 	mte_cell_list_idx_t     nidx,
1049 	mte_cell_bucket_t       nbucket,
1050 	int                     claim_requeue)
1051 {
1052 	mte_cell_list_t olist = &mte_info_lists[oidx];
1053 	mte_cell_list_t nlist = &mte_info_lists[nidx];
1054 
1055 	if (cell_queue_remove(cell)) {
1056 		bit_clear(olist->mask, obucket);
1057 	}
1058 
1059 	if (cell_queue_insert_tail(&nlist->buckets[nbucket], cell)) {
1060 		bit_set(nlist->mask, nbucket);
1061 	}
1062 
1063 	olist->buckets[obucket].head.cell_count--;
1064 	nlist->buckets[nbucket].head.cell_count++;
1065 
1066 	if (olist != nlist) {
1067 		olist->count--;
1068 		nlist->count++;
1069 	}
1070 
1071 	if (claim_requeue) {
1072 #ifndef VM_MTE_FF_VERIFY
1073 		uint32_t        color = VM_PAGE_GET_COLOR(tag_page);
1074 		vm_page_queue_t queue;
1075 
1076 		queue = &mte_claimable_queue.vmpfq_queues[color].qhead;
1077 		if (claim_requeue > 0) {
1078 			vm_page_queue_enter(queue, tag_page, vmp_pageq);
1079 		} else {
1080 			vm_page_queue_remove(queue, tag_page, vmp_pageq);
1081 		}
1082 		VM_COUNTER_DELTA(&mte_claimable_queue.vmpfq_count, claim_requeue);
1083 #endif /* VM_MTE_FF_VERIFY */
1084 	}
1085 }
1086 
1087 /*!
1088  * @abstract
1089  * Find a page in the last non-empty bucket at or above the
1090  * specified minimum bucket index.
1091  *
1092  * @param lidx          The list index to scan.
1093  * @param min_bucket    The minimum bucket index to consider.
1094  * @param tag_page      The tag page associated with the returned cell.
1095  * @returns             The cell that was found or NULL.
1096  */
1097 static cell_t *
1098 cell_list_find_last_page(
1099 	mte_cell_list_idx_t     lidx,
1100 	mte_cell_bucket_t       min_bucket,
1101 	vm_page_t              *tag_page)
1102 {
1103 	mte_cell_list_t  list = &mte_info_lists[lidx];
1104 	uint32_t         mask = list->mask & ~mask(min_bucket);
1105 	mte_cell_queue_t queue;
1106 
1107 	if (__improbable(mask == 0)) {
1108 		*tag_page = VM_PAGE_NULL;
1109 		return NULL;
1110 	}
1111 
1112 	queue = &list->buckets[fls(mask) - 1];
1113 	*tag_page = vm_tag_storage_page_get(cell_queue_first_idx(queue));
1114 	return cell_queue_first(queue);
1115 }
1116 
1117 
1118 #pragma mark Tag storage space state machine
1119 
1120 /*!
1121  * Assert that a cell is in one of the states specified by the mask.
1122  */
1123 #define assert_cell_state(cell, mask) \
1124 	release_assert(((mask) & (1 << (cell)->state)) != 0)
1125 
1126 /*!
1127  * Perform an arbitrary update on a cell, and update the MTE info queues
1128  * accordingly.
1129  *
1130  * This should be used this way:
1131  *
1132  * <code>
1133  *   // Preflights and asserts here
1134  *   assert_cell_state(cell_var, ...);
1135  *
1136  *   CELL_UPDATE(cell_var, tag_page, cleared_bit, {
1137  *       // Mutations of cell_var here
1138  *       cell_var->state = ...;
1139  *   });
1140  * </code>
1141  *
1142  * @param cell          The cell to update.
1143  * @param tag_page      The tag page corresponding to @c cell.
1144  * @param cleared_bit   The bit that was cleared or -1
1145  * @param mut           Code that mutates its argument, and performs the
1146  *                      required update.
1147  */
1148 #define CELL_UPDATE(cell, tag_page, cleared_bit, ...)  ({                       \
1149 	mte_cell_list_idx_t  __ol, __nl;                                        \
1150 	mte_cell_bucket_t    __ob, __nb;                                        \
1151 	mte_free_queue_idx_t __oi, __ni;                                        \
1152 	int                  __ocq, __ncq;                                      \
1153 	cell_t              *__cell = (cell);                                   \
1154 	cell_t               __orig = *__cell;                                  \
1155                                                                                 \
1156 	__ol  = cell_list_idx(__orig);                                          \
1157 	__ob  = cell_list_bucket(__orig);                                       \
1158 	__ocq = cell_on_claimable_queue(__orig);                                \
1159 	__oi  = mteinfo_free_queue_idx(__orig);                                 \
1160                                                                                 \
1161 	__VA_ARGS__;                                                            \
1162                                                                                 \
1163 	__nl  = cell_list_idx(*__cell);                                         \
1164 	__nb  = cell_list_bucket(*__cell);                                      \
1165 	__ncq = cell_on_claimable_queue(*__cell);                               \
1166 	__ni  = mteinfo_free_queue_idx(*__cell);                                \
1167                                                                                 \
1168 	if (__ol != __nl || __ob != __nb) {                                     \
1169 	        cell_list_requeue(__cell, tag_page, __ol, __ob, __nl, __nb,     \
1170 	            __ncq - __ocq);                                             \
1171 	}                                                                       \
1172 	if (__oi != __ni || (cleared_bit)) {                                    \
1173 	        mteinfo_free_queue_requeue(__cell, __orig, __oi, __ni);         \
1174 	}                                                                       \
1175 })
1176 
1177 __pure2
1178 static cell_t *
1179 cell_from_tag_storage_page(const struct vm_page *page)
1180 {
1181 	cell_idx_t pidx;
1182 
1183 	pidx = (cell_idx_t)(page - vm_pages_tag_storage_array_internal());
1184 	return cell_from_idx(pidx);
1185 }
1186 
1187 __pure2
1188 __attribute__((overloadable))
1189 static cell_t *
1190 cell_from_covered_ppnum(ppnum_t pnum)
1191 {
1192 	cell_idx_t cidx = (pnum - pmap_first_pnum) / MTE_PAGES_PER_TAG_PAGE;
1193 
1194 	return cell_from_idx(cidx);
1195 }
1196 
1197 __pure2
1198 __attribute__((overloadable))
1199 static cell_t *
1200 cell_from_covered_ppnum(ppnum_t pnum, vm_page_t *tag_page)
1201 {
1202 	cell_idx_t cidx = (pnum - pmap_first_pnum) / MTE_PAGES_PER_TAG_PAGE;
1203 
1204 	*tag_page = vm_tag_storage_page_get(cidx);
1205 	return cell_from_idx(cidx);
1206 }
1207 
1208 /*!
1209  * @function mteinfo_tag_storage_set_active()
1210  *
1211  * @abstract
1212  * Mark a tag storage page as active.
1213  *
1214  * @discussion
1215  * The page should be disabled (initial activation), activating, or reclaiming.
1216  *
1217  * @param tag_page      The pointer to a page inside the tag storage space.
1218  * @param mte_count     How many covered pages are used and tagged for @c tag_page.
1219  * @param init          Whether this is the initial transition.
1220  * @returns             The number of covered pages this made taggable.
1221  */
1222 static uint32_t
1223 mteinfo_tag_storage_set_active(vm_page_t tag_page, uint32_t mte_count, bool init)
1224 {
1225 	cell_t      *cell = cell_from_tag_storage_page(tag_page);
1226 	cell_count_t free_page_count = cell_free_page_count(*cell);
1227 
1228 	assert(mte_count + free_page_count <= MTE_PAGES_PER_TAG_PAGE);
1229 	if (init) {
1230 		assert_cell_state(cell,
1231 		    /* [I.1] */ MTE_MASK_DISABLED);
1232 	} else {
1233 		assert_cell_state(cell,
1234 		    /* [R.2] */ MTE_MASK_RECLAIMING |
1235 		    /* [A.2] */ MTE_MASK_ACTIVATING);
1236 	}
1237 
1238 	VM_COUNTER_ADD(&vm_page_free_taggable_count, free_page_count);
1239 	vm_page_tag_storage_activation_count++;
1240 
1241 	CELL_UPDATE(cell, tag_page, false, {
1242 		cell->state = MTE_STATE_ACTIVE;
1243 		cell->mte_page_count = mte_count;
1244 	});
1245 
1246 	return free_page_count;
1247 }
1248 
1249 bool
1250 mteinfo_tag_storage_disabled(const struct vm_page *tag_page)
1251 {
1252 	return cell_from_tag_storage_page(tag_page)->state == MTE_STATE_DISABLED;
1253 }
1254 
1255 void
1256 mteinfo_tag_storage_set_retired(vm_page_t tag_page)
1257 {
1258 	cell_t *cell = cell_from_tag_storage_page(tag_page);
1259 
1260 	assert(cell->mte_page_count == 0);
1261 	assert_cell_state(cell,
1262 	    /* [K.3] */ MTE_MASK_DISABLED |
1263 	    /* [K.2] */ MTE_MASK_CLAIMED |
1264 	    /* [K.1] */ MTE_MASK_RECLAIMING);
1265 
1266 	VM_COUNTER_INC(&vm_page_retired_tag_storage_count);
1267 
1268 	CELL_UPDATE(cell, tag_page, false, {
1269 		cell->state = MTE_STATE_DISABLED;
1270 	});
1271 }
1272 
1273 #ifndef VM_MTE_FF_VERIFY
1274 /*!
1275  * @function mteinfo_tag_storage_set_unmanaged()
1276  *
1277  * @abstract
1278  * Mark a tag storage page as actually being disabled-unmanaged.
1279  *
1280  * @discussion
1281  * The tag storage page must be claimed or inactive.
1282  *
1283  * @param cell          The cell to mark as disabled.
1284  * @param tag_page      The tag page corresponding to @c cell.
1285  */
1286 static void
1287 mteinfo_tag_storage_set_unmanaged(cell_t *cell, vm_page_t tag_page)
1288 {
1289 	bool queue = cell->state == MTE_STATE_INACTIVE;
1290 
1291 	assert(cell->mte_page_count == 0);
1292 	assert(cell->free_mask == 0);
1293 
1294 	assert_cell_state(cell,
1295 	    /* [U.1] */ MTE_MASK_CLAIMED |
1296 	    /* [U.2] */ MTE_MASK_INACTIVE);
1297 
1298 	VM_COUNTER_INC(&vm_page_unmanaged_tag_storage_count);
1299 
1300 	CELL_UPDATE(cell, tag_page, false, {
1301 		cell->state = MTE_STATE_DISABLED;
1302 	});
1303 
1304 	if (queue) {
1305 		vm_page_free_queue_enter(VM_MEMORY_CLASS_DEAD_TAG_STORAGE,
1306 		    tag_page, VM_PAGE_GET_PHYS_PAGE(tag_page));
1307 	}
1308 }
1309 #endif /* VM_MTE_FF_VERIFY */
1310 
1311 void
1312 mteinfo_tag_storage_set_inactive(vm_page_t tag_page, bool init)
1313 {
1314 	cell_t *cell = cell_from_tag_storage_page(tag_page);
1315 
1316 	assert(cell->mte_page_count == 0);
1317 	if (init) {
1318 		assert_cell_state(cell,
1319 		    /* [I.2] */ MTE_MASK_DISABLED);
1320 	} else {
1321 		assert_cell_state(cell,
1322 		    /* [D.2] */ MTE_MASK_DEACTIVATING |
1323 		    /* [F.1] */ MTE_MASK_CLAIMED |
1324 		    /* [F.2] */ MTE_MASK_RECLAIMING);
1325 	}
1326 
1327 #ifndef VM_MTE_FF_VERIFY
1328 	if (cell->state == MTE_STATE_CLAIMED) {
1329 		/*
1330 		 * This is to account for [F.1].
1331 		 * For [F.2], we already decremented due to [R.1]
1332 		 */
1333 		counter_dec(&vm_cpu_claimed_count);
1334 	}
1335 #endif /* VM_MTE_FF_VERIFY */
1336 
1337 	CELL_UPDATE(cell, tag_page, false, {
1338 		cell->state = MTE_STATE_INACTIVE;
1339 	});
1340 }
1341 
1342 void
1343 mteinfo_tag_storage_set_claimed(vm_page_t tag_page)
1344 {
1345 	cell_t *cell = cell_from_tag_storage_page(tag_page);
1346 
1347 	assert(cell->mte_page_count == 0);
1348 	assert_cell_state(cell,
1349 	    /* [C.1] */ MTE_MASK_INACTIVE |
1350 	    /* [R.x] */ MTE_MASK_RECLAIMING);
1351 
1352 #ifndef VM_MTE_FF_VERIFY
1353 	if (cell->state == MTE_STATE_RECLAIMING) {
1354 		counter_inc(&vm_cpu_claimed_count);
1355 	}
1356 #endif /* VM_MTE_FF_VERIFY */
1357 
1358 	CELL_UPDATE(cell, tag_page, false, {
1359 		cell->state = MTE_STATE_CLAIMED;
1360 	});
1361 }
1362 
1363 /*!
1364  * @function mteinfo_tag_storage_set_reclaiming()
1365  *
1366  * @abstract
1367  * Mark a tag storage page as being reclaimed.
1368  *
1369  * @discussion
1370  * The tag storage page must be claimed.
1371  *
1372  * @param cell          The cell to mark as reclaiming
1373  * @param tag_page      The tag page corresponding to @c cell.
1374  */
1375 static void
1376 mteinfo_tag_storage_set_reclaiming(cell_t *cell, vm_page_t tag_page)
1377 {
1378 	assert(cell->mte_page_count == 0);
1379 	assert_cell_state(cell, /* [R.1] */ MTE_MASK_CLAIMED);
1380 
1381 	CELL_UPDATE(cell, tag_page, false, {
1382 		cell->state = MTE_STATE_RECLAIMING;
1383 	});
1384 
1385 #ifndef VM_MTE_FF_VERIFY
1386 	counter_dec(&vm_cpu_claimed_count);
1387 #endif /* VM_MTE_FF_VERIFY */
1388 }
1389 
1390 /*!
1391  * @function mteinfo_tag_storage_flush_reclaiming()
1392  *
1393  * @abstract
1394  * Empties the reclaiming queue, moving all pages on it back to claimed.
1395  */
1396 static void
1397 mteinfo_tag_storage_flush_reclaiming(void)
1398 {
1399 	mte_cell_list_t  list  = &mte_info_lists[MTE_LIST_RECLAIMING_IDX];
1400 	mte_cell_queue_t queue = &list->buckets[0];
1401 	uint32_t         batch = VMP_FREE_BATCH_SIZE;
1402 
1403 	while (cell_queue_count(queue) > 0) {
1404 		cell_idx_t idx      = cell_queue_first_idx(queue);
1405 		vm_page_t  tag_page = vm_tag_storage_page_get(idx);
1406 		cell_t    *cell     = cell_from_idx(idx);
1407 
1408 		assert_cell_state(cell, /* [R.x] */ MTE_MASK_RECLAIMING);
1409 		CELL_UPDATE(cell, tag_page, false, {
1410 			cell->state = MTE_STATE_CLAIMED;
1411 		});
1412 
1413 #ifndef VM_MTE_FF_VERIFY
1414 		counter_inc(&vm_cpu_claimed_count);
1415 #endif /* VM_MTE_FF_VERIFY */
1416 
1417 		if (--batch == 0 && cell_queue_count(queue)) {
1418 #ifndef VM_MTE_FF_VERIFY
1419 			vm_free_page_unlock();
1420 			vm_free_page_lock_spin();
1421 #endif /* VM_MTE_FF_VERIFY */
1422 			batch = VMP_FREE_BATCH_SIZE;
1423 		}
1424 	}
1425 }
1426 
1427 #ifndef VM_MTE_FF_VERIFY
1428 
1429 void
1430 mteinfo_tag_storage_wakeup(vm_page_t tag_page, bool fq_locked)
1431 {
1432 	cell_t *cell = cell_from_tag_storage_page(tag_page);
1433 
1434 	if (!fq_locked) {
1435 		vm_free_page_lock_spin();
1436 	}
1437 
1438 	assert(tag_page->vmp_ts_wanted);
1439 	tag_page->vmp_ts_wanted = false;
1440 
1441 	assert_cell_state(cell, /* [B.2] */ MTE_MASK_PINNED);
1442 	CELL_UPDATE(cell, tag_page, false, {
1443 		cell->state = MTE_STATE_CLAIMED;
1444 	});
1445 
1446 	if (cell->free_mask != 0 &&
1447 	    (vm_page_free_wanted_tagged_privileged || vm_page_free_wanted_tagged)) {
1448 		mteinfo_wake_fill_thread();
1449 	}
1450 
1451 	if (!fq_locked) {
1452 		vm_free_page_unlock();
1453 	}
1454 
1455 	counter_inc(&vm_cpu_claimed_count);
1456 }
1457 
1458 #endif /* VM_MTE_FF_VERIFY */
1459 #pragma mark Covered pages state machine
1460 
1461 bool
1462 mteinfo_covered_page_taggable(ppnum_t pnum)
1463 {
1464 	return cell_from_covered_ppnum(pnum)->state == MTE_STATE_ACTIVE;
1465 }
1466 
1467 void
1468 mteinfo_covered_page_set_free(ppnum_t pnum, bool tagged)
1469 {
1470 	vm_page_t tag_page;
1471 	cell_t   *cell = cell_from_covered_ppnum(pnum, &tag_page);
1472 	int       bit  = pnum % MTE_PAGES_PER_TAG_PAGE;
1473 
1474 	assert(cell->mte_page_count >= tagged);
1475 	assert(!bit_test(cell->free_mask, bit));
1476 
1477 	VM_COUNTER_INC(&vm_page_free_count);
1478 	if (cell->state == MTE_STATE_ACTIVE) {
1479 		VM_COUNTER_INC(&vm_page_free_taggable_count);
1480 	}
1481 	if (tagged) {
1482 		VM_COUNTER_DEC(&vm_page_tagged_count);
1483 	}
1484 
1485 	CELL_UPDATE(cell, tag_page, false, {
1486 		cell->mte_page_count -= tagged;
1487 		bit_set(cell->free_mask, bit);
1488 	});
1489 }
1490 
1491 void
1492 mteinfo_covered_page_set_used(ppnum_t pnum, bool tagged)
1493 {
1494 	vm_page_t tag_page;
1495 	cell_t   *cell = cell_from_covered_ppnum(pnum, &tag_page);
1496 	int       bit  = pnum % MTE_PAGES_PER_TAG_PAGE;
1497 
1498 	assert(cell->mte_page_count + tagged <= MTE_PAGES_PER_TAG_PAGE);
1499 	assert(bit_test(cell->free_mask, bit));
1500 
1501 	VM_COUNTER_DEC(&vm_page_free_count);
1502 	if (cell->state == MTE_STATE_ACTIVE) {
1503 		VM_COUNTER_DEC(&vm_page_free_taggable_count);
1504 	}
1505 	if (tagged) {
1506 		VM_COUNTER_INC(&vm_page_tagged_count);
1507 	}
1508 
1509 	CELL_UPDATE(cell, tag_page, true, {
1510 		bit_clear(cell->free_mask, bit);
1511 		cell->mte_page_count += tagged;
1512 	});
1513 }
1514 
1515 __startup_func
1516 void
1517 mteinfo_covered_page_set_stolen_tagged(ppnum_t pnum)
1518 {
1519 	vm_page_t tag_page;
1520 	cell_t   *cell = cell_from_covered_ppnum(pnum, &tag_page);
1521 
1522 	assert(cell->mte_page_count < MTE_PAGES_PER_TAG_PAGE);
1523 	assert(!bit_test(cell->free_mask, pnum % MTE_PAGES_PER_TAG_PAGE));
1524 
1525 	CELL_UPDATE(cell, tag_page, false, {
1526 		cell->mte_page_count++;
1527 	});
1528 }
1529 
1530 void
1531 mteinfo_covered_page_clear_tagged(ppnum_t pnum)
1532 {
1533 	vm_page_t tag_page;
1534 	cell_t   *cell = cell_from_covered_ppnum(pnum, &tag_page);
1535 
1536 	assert(cell->mte_page_count > 0);
1537 	assert(!bit_test(cell->free_mask, pnum % MTE_PAGES_PER_TAG_PAGE));
1538 
1539 	CELL_UPDATE(cell, tag_page, false, {
1540 		cell->mte_page_count--;
1541 	});
1542 }
1543 
1544 
1545 #pragma mark Activate
1546 #ifndef VM_MTE_FF_VERIFY
1547 
1548 /*!
1549  * @function mteinfo_tag_storage_wire_locked()
1550  *
1551  * @abstract
1552  * Wire the given tag storage page.
1553  *
1554  * @discussion
1555  * The page will be wired as part of mte_tags_object.
1556  *
1557  * This must be called with the object lock and the page queues lock held.
1558  *
1559  * @param tag_page
1560  * A tag storage page.
1561  */
1562 static void
1563 mteinfo_tag_storage_wire_locked(vm_page_t tag_page)
1564 {
1565 	vm_object_offset_t page_addr = ptoa(VM_PAGE_GET_PHYS_PAGE(tag_page));
1566 
1567 	assert(tag_page->vmp_wire_count == 0);
1568 	vm_page_wire(tag_page, VM_KERN_MEMORY_MTAG,
1569 	    /* Don't check memory status. */ FALSE);
1570 
1571 	vm_page_insert_internal(tag_page, mte_tags_object, page_addr,
1572 	    VM_KERN_MEMORY_MTAG,
1573 	    /* We already hold the queue locks. */ TRUE,
1574 	    /* Add this page to the hash. */ TRUE,
1575 	    /* Don't bother batching pmap operations. */ FALSE,
1576 	    /* Don't bother batching accounting. */ FALSE,
1577 	    /* Don't bother with delayed ledger updates. */ NULL);
1578 }
1579 
1580 /*!
1581  * @function mteinfo_tag_storage_select_activating()
1582  *
1583  * @abstract
1584  * Select tag storage pages to activate, aiming to make a certain number of
1585  * free covered pages taggable.
1586  *
1587  * @discussion
1588  * The caller must make sure there's at least one page to activate for the
1589  * selected buckets.
1590  *
1591  * @param target        how many covered taggable free pages to try to generate
1592  *                      as a result of this activation.
1593  * @param bucket        which inactive bucket to start drawing from
1594  *
1595  * @returns             the list of tag storage pages to activate
1596  *                      with mteinfo_tag_storage_activate_locked().
1597  */
1598 static vm_page_list_t
1599 mteinfo_tag_storage_select_activating(uint32_t target, mte_cell_bucket_t bucket)
1600 {
1601 	vm_page_list_t list      = { };
1602 	vm_page_t      tag_page  = VM_PAGE_NULL;
1603 	cell_t        *cell      = NULL;
1604 	uint32_t       total     = 0;
1605 	uint32_t       covered   = 0;
1606 
1607 	/*
1608 	 * Convert the lock hold into a mutex, to signal to waiters that the
1609 	 * lock may be held for longer.
1610 	 */
1611 	vm_free_page_lock_convert();
1612 
1613 	do {
1614 		cell = cell_list_find_last_page(MTE_LIST_INACTIVE_IDX,
1615 		    bucket, &tag_page);
1616 		if (tag_page == VM_PAGE_NULL) {
1617 			break;
1618 		}
1619 
1620 		assert_cell_state(cell, /* [A.1] */ MTE_MASK_INACTIVE);
1621 		CELL_UPDATE(cell, tag_page, false, {
1622 			cell->state = MTE_STATE_ACTIVATING;
1623 		});
1624 
1625 		covered = cell_free_page_count(*cell);
1626 		total  += covered;
1627 
1628 		KDBG(VMDBG_CODE(DBG_VM_TAG_PAGE_INACTIVE) | DBG_FUNC_NONE,
1629 		    VM_KERNEL_ADDRHIDE(tag_page), covered);
1630 
1631 		tag_page->vmp_q_state = VM_PAGE_NOT_ON_Q;
1632 		vm_page_list_push(&list, tag_page);
1633 	} while (total < target);
1634 
1635 	return list;
1636 }
1637 
1638 /*!
1639  * @function mteinfo_tag_storage_activate_locked()
1640  *
1641  * @abstract
1642  * Activate a list of tag storage pages in reclaiming or activating state.
1643  *
1644  * @discussion
1645  * The page free queue lock must be held; however, it is dropped and retaken
1646  * by this function.
1647  *
1648  * @param list          the list of pages to activate.
1649  * @param spin_mode     whether to take the free page queue lock in spin mode.
1650  *
1651  * @returns             how many covered pages have been made taggable.
1652  */
1653 static uint32_t
1654 mteinfo_tag_storage_activate_locked(vm_page_list_t list, bool spin_mode)
1655 {
1656 	vm_page_t tag_page  = VM_PAGE_NULL;
1657 	uint32_t  result, total;
1658 
1659 	vm_free_page_unlock();
1660 
1661 	/*
1662 	 * First, retype the pages and add them to the MTE object.
1663 	 */
1664 
1665 	vm_page_list_foreach(tag_page, list) {
1666 		ppnum_t tag_pnum = VM_PAGE_GET_PHYS_PAGE(tag_page);
1667 
1668 		assert(vm_page_is_tag_storage_pnum(tag_page, tag_pnum));
1669 		pmap_make_tag_storage_page(tag_pnum);
1670 	}
1671 
1672 	vm_object_lock(mte_tags_object);
1673 	vm_page_lock_queues();
1674 	vm_page_list_foreach(tag_page, list) {
1675 		vm_page_t save_snext = NEXT_PAGE(tag_page);
1676 
1677 		NEXT_PAGE(tag_page) = VM_PAGE_NULL;
1678 		mteinfo_tag_storage_wire_locked(tag_page);
1679 		NEXT_PAGE(tag_page) = save_snext;
1680 	}
1681 	vm_page_unlock_queues();
1682 	vm_object_unlock(mte_tags_object);
1683 
1684 	if (spin_mode) {
1685 		vm_free_page_lock_spin();
1686 	} else {
1687 		vm_free_page_lock();
1688 	}
1689 
1690 	/*
1691 	 * Second, mark all the pages as active now, which makes the
1692 	 * covered pages available for taggable allocation.
1693 	 *
1694 	 * And recompute how many taggable pages we really freed,
1695 	 * as allocations/free of untagged pages could have made
1696 	 * progress while we dropped the free page queue lock.
1697 	 */
1698 
1699 	total = 0;
1700 	vm_page_list_foreach_consume(tag_page, &list) {
1701 		total += mteinfo_tag_storage_set_active(tag_page, 0, false);
1702 	}
1703 	result = total;
1704 
1705 
1706 	/*
1707 	 * Last, perform wakeups.
1708 	 *
1709 	 * 1. wake up other activators
1710 	 * 2. wake up privileged waiters
1711 	 * 3. wake up regular waiters
1712 	 *
1713 	 * We do not need to consider secluded pools or other waiters, because
1714 	 * we never prevent them from allocating the pages associated with
1715 	 * the tag storage we are activating during this process.  This is why
1716 	 * we don't use vm_page_free_queue_handle_wakeups_and_unlock() but
1717 	 * instead have this simplified implementation.
1718 	 */
1719 
1720 	if (vm_mte_activator_waiters) {
1721 		vm_mte_activator_waiters = false;
1722 		wakeup_all_with_inheritor(&vm_mte_activator_waiters,
1723 		    THREAD_AWAKENED);
1724 	}
1725 
1726 	if (vm_page_free_wanted_tagged_privileged && total) {
1727 		if (total < vm_page_free_wanted_tagged_privileged) {
1728 			vm_page_free_wanted_tagged_privileged -= total;
1729 			total = 0;
1730 		} else {
1731 			total -= vm_page_free_wanted_tagged_privileged;
1732 			vm_page_free_wanted_tagged_privileged = 0;
1733 		}
1734 		vm_page_free_wakeup(&vm_page_free_wanted_tagged_privileged,
1735 		    UINT32_MAX);
1736 	}
1737 
1738 	if (vm_page_free_wanted_tagged && total) {
1739 		uint32_t wakeup = 0;
1740 
1741 		if (total < vm_page_free_wanted_tagged) {
1742 			wakeup = total;
1743 			vm_page_free_wanted_tagged -= total;
1744 			total  = 0;
1745 		} else {
1746 			total -= vm_page_free_wanted_tagged;
1747 			vm_page_free_wanted_tagged = 0;
1748 			wakeup = UINT32_MAX;
1749 		}
1750 		vm_page_free_wakeup(&vm_page_free_wanted_tagged, wakeup);
1751 	}
1752 
1753 	return result;
1754 }
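
/*
 * Worked example of the wakeup budget above: say this activation made
 * total = 96 covered pages taggable, with 16 privileged waiters and 100
 * regular waiters recorded (illustrative values).  The privileged pass
 * consumes 16 (total becomes 80) and wakes all privileged waiters; the
 * regular pass then only wakes 80 waiters and leaves
 * vm_page_free_wanted_tagged at 20.
 */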
1755 
1756 bool
1757 mteinfo_tag_storage_try_activate(uint32_t target, bool spin_mode)
1758 {
1759 	mte_cell_bucket_t first_bucket = MTE_BUCKET_17_24;
1760 	thread_t          thread_self  = current_thread();
1761 	vm_page_list_t    list         = { };
1762 
1763 	/*
1764 	 * We only draw from buckets covering more than half of the pages free.
1765 	 * We do not want to draw from buckets that are less full, as this is too
1766 	 * slow for the inline path; those are left to the refill thread instead.
1767 	 */
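	/*
	 * For scale: each tag storage page covers MTE_PAGES_PER_TAG_PAGE
	 * covered pages (32, per the 32-bit free_mask), so MTE_BUCKET_17_24
	 * and above hold cells with 17 or more free covered pages, i.e.
	 * more than half free.
	 */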
1768 
1769 	if (mte_info_lists[MTE_LIST_INACTIVE_IDX].mask < BIT(first_bucket)) {
1770 		return false;
1771 	}
1772 
1773 	if (vm_mte_activator) {
1774 		/*
1775 		 * We only allow one thread activating pages at a time, and
1776 		 * we only wait if the caller can't make progress without
1777 		 * this activation.
1778 		 *
1779 		 * We do not need to consider whether the waiter is
1780 		 * privileged here, however, because activation isn't affected
1781 		 * by TH_OPT_VMPRIV.
1782 		 */
1783 
1784 		if (vm_page_free_taggable_count > vm_page_free_reserved) {
1785 			return false;
1786 		}
1787 		if (vm_page_free_taggable_count > 0 &&
1788 		    (thread_self->options & TH_OPT_VMPRIV)) {
1789 			return false;
1790 		}
1791 
1792 		vm_mte_activator_waiters = true;
1793 		lck_mtx_sleep_with_inheritor(&vm_page_queue_free_lock,
1794 		    spin_mode ? LCK_SLEEP_SPIN : LCK_SLEEP_DEFAULT,
1795 		    &vm_mte_activator_waiters, vm_mte_activator,
1796 		    THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
1797 
1798 		return true;
1799 	}
1800 
1801 	vm_mte_activator = thread_self;
1802 	list = mteinfo_tag_storage_select_activating(target, first_bucket);
1803 	mteinfo_tag_storage_activate_locked(list, spin_mode);
1804 	vm_mte_activator = THREAD_NULL;
1805 
1806 	return true;
1807 }
1808 
1809 
1810 #pragma mark Deactivate
1811 
1812 /*!
1813  * @abstract
1814  * Returns whether the active(0.0) bucket should be drained to make inactive
1815  * pages.
1816  *
1817  * @param for_wakeup    Whether the question is to wake up the refill thread
1818  *                      (true) or decide whether the refill thread should keep
1819  *                      going (false).
1820  */
1821 static bool
1822 mteinfo_tag_storage_should_drain(bool for_wakeup)
1823 {
1824 	mte_cell_list_t active_0  = &mte_info_lists[MTE_LIST_ACTIVE_0_IDX];
1825 	uint32_t        threshold = VMP_FREE_BATCH_SIZE * (for_wakeup ? 2 : 1);
1826 
1827 	if (!vm_mte_enable_tag_storage_grab) {
1828 		return false;
1829 	}
1830 
1831 	if (mte_claimable_queue.vmpfq_count >= vm_free_magazine_refill_limit) {
1832 		return false;
1833 	}
1834 
1835 	if (active_0->count <= vm_page_tag_storage_reserved) {
1836 		return false;
1837 	}
1838 
1839 	return cell_queue_count(&active_0->buckets[0]) >= threshold;
1840 }
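
/*
 * The doubled threshold above provides hysteresis: assuming for
 * illustration that VMP_FREE_BATCH_SIZE were 64, allocation paths would
 * only wake the refill thread once active(0.0) holds 128 cells, but once
 * the thread is running it keeps draining until fewer than 64 remain.
 */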
1841 
1842 /*
1843  * @function mteinfo_tag_storage_deactivate_barrier()
1844  *
1845  * @abstract
1846  * Wait until all possible untagging operations that could make deactivation
1847  * invalid have finished.
1848  *
1849  * @discussion
1850  * Before we can do any deactivation we must make sure
1851  * that no CPU has untagging activity in flight.
1852  *
1853  * See mteinfo_free_queue_grab() and mteinfo_page_list_fix_tagging().
1854  */
1855 static void
1856 mteinfo_tag_storage_deactivate_barrier(void)
1857 {
1858 	mte_pcpu_t this_cpu = PERCPU_GET(mte_pcpu);
1859 
1860 	assert(get_preemption_level() > 0);
1861 
1862 	percpu_foreach(it, mte_pcpu) {
1863 		if (it == this_cpu) {
1864 			/*
1865 			 * A thread is allowed to both have pending untagging
1866 			 * going on and a page to deactivate.
1867 			 *
1868 			 * As a result, ignore the current core's suspension
1869 			 * state as it is harmless as long as the core commits
1870 			 * to untagging before it does its deactivations.
1871 			 *
1872 			 * If a thread fails to do that, this will reliably
1873 			 * panic in SPTM, so the risk of silent bugs is
1874 			 * rather low.
1875 			 */
1876 			continue;
1877 		}
1878 
1879 		if (os_atomic_load(&it->deactivate_suspend, relaxed)) {
1880 			hw_wait_while_equals32(&it->deactivate_suspend, 1);
1881 		}
1882 	}
1883 	os_atomic_thread_fence(seq_cst);
1884 }
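
/*
 * Simplified sketch of the flag this barrier pairs with (see
 * mteinfo_free_queue_grab() and mteinfo_page_list_fix_tagging() for the
 * real code):
 *
 *	os_atomic_store(&mte_pcpu->deactivate_suspend, 1, compiler_acquire);
 *	... untag pages without holding the free page queue lock ...
 *	os_atomic_store(&mte_pcpu->deactivate_suspend, 0, release);
 *
 * The barrier spins until every other core has published 0 again.
 */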
1885 
1886 /*!
1887  * @abstract
1888  * Flush a list of deactivating tag storage pages.
1889  *
1890  * @discussion
1891  * The page free queue lock must be held, but will be dropped while this
1892  * function operates.
1893  *
1894  * @param list          The list of pages in @c MTE_STATE_DEACTIVATING state.
1895  */
1896 static void
1897 mteinfo_tag_storage_drain_flush(vm_page_list_t list)
1898 {
1899 	vm_page_t tag_page = VM_PAGE_NULL;
1900 
1901 	mteinfo_tag_storage_deactivate_barrier();
1902 
1903 	vm_free_page_unlock();
1904 
1905 	vm_object_lock(mte_tags_object);
1906 	vm_page_lock_queues();
1907 
1908 	vm_page_list_foreach(tag_page, list) {
1909 		vm_page_t save_next = NEXT_PAGE(tag_page);
1910 
1911 
1912 		/*
1913 		 * The unwiring path expects the page linkage to be
1914 		 * NULL, so transiently make it NULL.  We'll restore
1915 		 * the linkage after the unwire is done.
1916 		 */
1917 
1918 		NEXT_PAGE(tag_page) = VM_PAGE_NULL;
1919 		vm_page_unwire(tag_page,
1920 		    /* Don't put the page into aging queues. */ FALSE);
1921 		vm_page_remove(tag_page,
1922 		    /* Remove the page from the hash. */ TRUE);
1923 		NEXT_PAGE(tag_page) = save_next;
1924 	}
1925 
1926 	vm_page_unlock_queues();
1927 	vm_object_unlock(mte_tags_object);
1928 
1929 	vm_page_list_foreach(tag_page, list) {
1930 		pmap_unmake_tag_storage_page(VM_PAGE_GET_PHYS_PAGE(tag_page));
1931 	}
1932 
1933 	vm_free_page_lock_spin();
1934 
1935 	vm_page_tag_storage_deactivation_count += list.vmpl_count;
1936 
1937 	vm_page_list_foreach_consume(tag_page, &list) {
1938 		vm_page_free_queue_enter(VM_MEMORY_CLASS_TAG_STORAGE,
1939 		    tag_page, VM_PAGE_GET_PHYS_PAGE(tag_page));
1940 	}
1941 }
1942 
1943 /*!
1944  * @function mteinfo_tag_storage_drain()
1945  *
1946  * @abstract
1947  * Attempt to drain the active(0.0) bucket of pages since these are always
1948  * wasted.
1949  *
1950  * @discussion
1951  * This is one of the core routines of the fill thread.
1952  *
1953  * @returns
1954  * How many tag storage pages were deactivated.
1955  */
1956 static uint32_t
1957 mteinfo_tag_storage_drain(void)
1958 {
1959 	mte_cell_list_t  active_0 = &mte_info_lists[MTE_LIST_ACTIVE_0_IDX];
1960 	mte_cell_queue_t bucket_0 = &active_0->buckets[0];
1961 	vm_page_t        tag_page = VM_PAGE_NULL;
1962 	cell_t          *cell     = NULL;
1963 	uint32_t         total    = 0;
1964 	vm_page_list_t   list     = { };
1965 
1966 	LCK_MTX_ASSERT_OWNED_SPIN(&vm_page_queue_free_lock);
1967 
1968 	while (mteinfo_tag_storage_should_drain(false)) {
1969 		tag_page   = vm_tag_storage_page_get(cell_queue_first_idx(bucket_0));
1970 		cell       = cell_queue_first(bucket_0);
1971 
1972 		assert(cell->free_mask == 0);
1973 		assert_cell_state(cell, /* [D.1] */ MTE_MASK_ACTIVE);
1974 		CELL_UPDATE(cell, tag_page, false, {
1975 			cell->state = MTE_STATE_DEACTIVATING;
1976 		});
1977 
1978 		vm_page_list_push(&list, tag_page);
1979 
1980 		if (list.vmpl_count >= VMP_FREE_BATCH_SIZE) {
1981 			total += list.vmpl_count;
1982 			mteinfo_tag_storage_drain_flush(list);
1983 			list   = (vm_page_list_t){ };
1984 		}
1985 	}
1986 
1987 	if (list.vmpl_count) {
1988 		total += list.vmpl_count;
1989 		mteinfo_tag_storage_drain_flush(list);
1990 	}
1991 
1992 	return total;
1993 }
1994 
1995 
1996 #pragma mark Reclaim
1997 
1998 /*!
1999  * @abstract
2000  * Attempt to steal a tag page from a per cpu claimed free queue.
2001  *
2002  * @discussion
2003  * The caller must have checked that the tag_page is on a local free queue,
2004  * even if this check is racy.
2005  *
2006  * @param tag_page      A tag storage page appearing to sit on a per cpu queue.
2007  *
2008  * @returns             Whether stealing was successful (true) or not (false).
2009  */
2010 static bool
2011 mteinfo_reclaim_tag_storage_page_try_pcpu(vm_page_t tag_page)
2012 {
2013 	mte_pcpu_t mte_pcpu;
2014 	uint16_t   cpu;
2015 
2016 	cpu      = os_atomic_load(&tag_page->vmp_local_id, relaxed);
2017 	mte_pcpu = PERCPU_GET_WITH_BASE(other_percpu_base(cpu), mte_pcpu);
2018 
2019 	lck_ticket_lock(&mte_pcpu->free_claimed_lock, &vm_page_lck_grp_bucket);
2020 
2021 	if (tag_page->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q &&
2022 	    tag_page->vmp_local_id == cpu) {
2023 		vm_page_queue_remove(&mte_pcpu->free_claimed_pages,
2024 		    tag_page, vmp_pageq);
2025 		tag_page->vmp_q_state  = VM_PAGE_NOT_ON_Q;
2026 		tag_page->vmp_local_id = 0;
2027 		counter_dec_preemption_disabled(&vm_cpu_free_claimed_count);
2028 	} else {
2029 		tag_page = VM_PAGE_NULL;
2030 	}
2031 
2032 	lck_ticket_unlock(&mte_pcpu->free_claimed_lock);
2033 
2034 	return tag_page != VM_PAGE_NULL;
2035 }
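
/*
 * Note the double-checked pattern above: vmp_local_id is read racily to
 * pick which per-cpu lock to take, and both the queue state and the owner
 * are re-verified under that lock before the page is actually dequeued.
 */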
2036 
2037 /*!
2038  * @function mteinfo_reclaim_tag_storage_page()
2039  *
2040  * @abstract
2041  * Attempt to reclaim a claimed tag storage page.
2042  *
2043  * @discussion
2044  * This will try to reclaim a tag storage page by relocating its contents to a
2045  * different page, so that the tag storage page becomes (effectively) free.
2046  *
2047  * This expects a claimed tag storage page, and on success, will finish with
2048  * the page in the reclaimed state.  On failure, no guarantees are made about
2049  * the state of the page (due to locking operations); the page could still be
2050  * claimed, or reclamation may have failed because the page became free in the
2051  * interim.  However, if the page was not in a relocatable state, this function
2052  * will not force it out of the reclaiming state, so that the client can choose
2053  * when and why the page is returned to claimed.
2054  *
2055  * This function is called with the free page queue lock in spin mode and
2056  * returns with it held in spin mode.
2057  *
2058  * @param tag_page
2059  * The claimed tag storage page to try reclaiming.
2060  *
2061  * @returns
2062  * - KERN_SUCCESS               success,
2063  *
2064  * - KERN_INVALID_OBJECT        the page has no object set
2065  *
2066  * - KERN_NOT_WAITING           the state of the cell/tag page changed
2067  *                              during evaluation.
2068  *
2069  * - KERN_ABORTED               the tag page was wired; reclaiming it was
2070  *                              aborted and it was marked as MTE_STATE_PINNED.
2071  *
2072  * - KERN_RESOURCE_SHORTAGE     from vm_page_relocate(): relocation failed due
2073  *                              to being out of replacement memory.
2074  *
2075  * - KERN_FAILURE               from vm_page_relocate(): relocation failed due
2076  *                              to the page not being currently relocatable.
2077  */
2078 static kern_return_t
2079 mteinfo_reclaim_tag_storage_page(vm_page_t tag_page)
2080 {
2081 	cell_t *cell = cell_from_tag_storage_page(tag_page);
2082 	kern_return_t kr = KERN_FAILURE;
2083 	vm_object_t object;
2084 	bool compressor_locked = false;
2085 	bool vm_object_trylock_failed = false;
2086 
2087 	/* We need to try and reclaim the tag storage page. */
2088 	mteinfo_tag_storage_set_reclaiming(cell, tag_page);
2089 
2090 	if (tag_page->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q &&
2091 	    mteinfo_reclaim_tag_storage_page_try_pcpu(tag_page)) {
2092 		vm_page_tag_storage_reclaim_from_cpu_count++;
2093 		vm_page_tag_storage_reclaim_success_count++;
2094 
2095 		KDBG(VMDBG_CODE(DBG_VM_TAG_PAGE_CLAIMED) | DBG_FUNC_NONE,
2096 		    VM_KERNEL_ADDRHIDE(tag_page),
2097 		    mteinfo_tag_storage_free_pages_for_covered(tag_page));
2098 
2099 		return KERN_SUCCESS;
2100 	}
2101 
2102 	vm_free_page_unlock();
2103 
2104 	/*
2105 	 * Snoop the vmp_q_state. If the page is currently used by the compressor
2106 	 * (VM_PAGE_USED_BY_COMPRESSOR), we'll grab the global compressor lock
2107 	 * for write (PAGE_REPLACEMENT_ALLOWED(TRUE)) and the compressor
2108 	 * object lock.
2109 	 *
2110 	 * Typically, we can't know that the object will be stable
2111 	 * without grabbing the object or page queues lock (see the comment on
2112 	 * "relocation lock dance" below), but we know that the compressor object
2113 	 * is stable. So, we do _not_ need to grab the page queues and object locks
2114 	 * in the wrong order. This ensures that we will wait our turn in case
2115 	 * someone else is using the compressor object lock, and there is no chance
2116 	 * the reclaim will fail because we can't acquire the right locks.
2117 	 *
2118 	 * The contiguous memory allocator grabs this lock before the page queues
2119 	 * and object lock, so we must do the same here.
2120 	 */
2121 	if (tag_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
2122 		assert(vm_mte_tag_storage_for_compressor);
2123 		PAGE_REPLACEMENT_ALLOWED(TRUE);
2124 		vm_object_lock(compressor_object);
2125 		compressor_locked = true;
2126 
2127 		/*
2128 		 * The page state transitions into and out of VM_PAGE_USED_BY_COMPRESSOR
2129 		 * happen under the compressor object lock, so now the page state is stable.
2130 		 */
2131 		if (tag_page->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR) {
2132 			/*
2133 			 * The page was removed from the compressor pool. It could be
2134 			 * in any state now, but it's probably free and unusable. Give up.
2135 			 */
2136 			vm_object_unlock(compressor_object);
2137 			PAGE_REPLACEMENT_ALLOWED(FALSE);
2138 			compressor_locked = false;
2139 			vm_free_page_lock_spin();
2140 			kr = KERN_FAILURE;
2141 			goto locks_acquired;
2142 		}
2143 	}
2144 
2145 	/*
2146 	 * Do the relocation lock dance.  This is a little odd; because we're
2147 	 * starting with a page, and trying to look up the object, we need the
2148 	 * queues lock to keep the object from being deallocated or changed.
2149 	 *
2150 	 * This means we need to get the object lock after the queues lock;
2151 	 * this inverts the lock ordering, so we can only TRY the object lock.
2152 	 */
2153 	vm_page_lock_queues();
2154 
2155 	object = VM_PAGE_OBJECT(tag_page);
2156 	if (compressor_locked) {
2157 		assert(object == compressor_object);
2158 	}
2159 
2160 	if (object == VM_OBJECT_NULL) {
2161 		/* [PH] XXX: Can this even happen? */
2162 		kr = KERN_INVALID_OBJECT;
2163 		goto release_locks;
2164 	} else if (!compressor_locked && !vm_object_lock_try_scan(object)) {
2165 		/*
2166 		 * Hopefully, the next time we drain the reclaiming pages,
2167 		 * taking that object lock will succeed.
2168 		 */
2169 		vm_object_trylock_failed = true;
2170 		kr = KERN_NOT_WAITING;
2171 		goto release_locks;
2172 	} else if (VM_PAGE_OBJECT(tag_page) != object) {
2173 		/*
2174 		 * vm_page_insert_internal() doesn't require the page queue lock
2175 		 * to be held if the page is wired, so the object could change
2176 		 * under us.
2177 		 */
2178 		vm_object_unlock(object);
2179 
2180 		kr = KERN_NOT_WAITING;
2181 		goto release_locks;
2182 	}
2183 
2184 	/*
2185 	 * Now that all the locking is out of the way,
2186 	 * see if the page is actually relocatable.
2187 	 */
2188 	if (VM_PAGE_WIRED(tag_page) ||
2189 	    (tag_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR && tag_page->vmp_busy)) {
2190 		/*
2191 		 * TODO: Relocation fails when one of these conditions is met:
2192 		 *
2193 		 *     VM_PAGE_WIRED(tag_page)
2194 		 *     tag_page->vmp_gobbled
2195 		 *     tag_page->vmp_laundry
2196 		 *     tag_page->vmp_wanted
2197 		 *     tag_page->vmp_cleaning
2198 		 *     tag_page->vmp_overwriting
2199 		 *     tag_page->vmp_free_when_done
2200 		 *     tag_page->vmp_busy
2201 		 *     tag_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q
2202 		 *
2203 		 * We only handle VM_PAGE_WIRED() and when the tag page is being
2204 		 * swapped out (from usage in the compressor pool) for now,
2205 		 * because these are the most likely, but we should use vmp_ts_wanted
2206 		 * for all cases.
2207 		 *
2208 		 * We would need to find all places in the kernel that alter
2209 		 * this condition, to notice that a relocation was attempted
2210 		 * (vmp_ts_wanted is set) and call mteinfo_tag_storage_wakeup().
2211 		 */
2212 
2213 		/*
2214 		 * Take the page free lock before setting vmp_ts_wanted and
2215 		 * before we drop the object lock; otherwise
2216 		 * mteinfo_tag_storage_wakeup() might see vmp_ts_wanted
2217 		 * before the transition to MTE_STATE_PINNED has happened.
2218 		 *
2219 		 * Note that we should do nothing if the cell is no longer in
2220 		 * the MTE_STATE_RECLAIMING state, which could hypothetically
2221 		 * happen since we dropped the free queue lock above.
2222 		 */
2223 		vm_free_page_lock_spin();
2224 
2225 		if (cell->state == MTE_STATE_RECLAIMING) {
2226 			assert(tag_page->vmp_ts_wanted == false);
2227 			tag_page->vmp_ts_wanted = true;
2228 			kr = KERN_ABORTED;
2229 		} else {
2230 			kr = KERN_NOT_WAITING;
2231 		}
2232 
2233 		vm_object_unlock(object);
2234 		vm_page_unlock_queues();
2235 		if (compressor_locked) {
2236 			PAGE_REPLACEMENT_ALLOWED(FALSE);
2237 			compressor_locked = false;
2238 		}
2239 
2240 		if (kr == KERN_ABORTED) {
2241 			assert_cell_state(cell, /* [B.1] */ MTE_MASK_RECLAIMING);
2242 			CELL_UPDATE(cell, tag_page, false, {
2243 				cell->state = MTE_STATE_PINNED;
2244 			});
2245 			if (tag_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
2246 				vm_page_tag_storage_reclaim_compressor_failure_count++;
2247 			} else {
2248 				vm_page_tag_storage_reclaim_wired_failure_count++;
2249 			}
2250 		}
2251 
2252 		goto locks_acquired;
2253 	} else if ((*vm_mte_tag_storage_for_vm_tags) &&
2254 	    !vm_page_is_relocatable(tag_page, VM_RELOCATE_REASON_TAG_STORAGE_RECLAIM)) {
2255 		/*
2256 		 * If we're allowing tag storage pages to be used for specific VM tags,
2257 		 * those pages could be unrelocatable for reasons we haven't
2258 		 * expected. We're also assuming that if a tag storage page were to
2259 		 * be unrelocatable for whatever reason, it's (at the very least) not
2260 		 * because the page is wired or involved in an IO that could take a
2261 		 * long time, so hopefully it won't be unavailable for too long, and
2262 		 * the fill thread won't churn over the same set of unavailable claimed
2263 		 * pages.
2264 		 *
2265 		 * We'll just skip over this page and move it back to claiming at the
2266 		 * bottom of this function.
2267 		 */
2268 		kr = KERN_NOT_WAITING;
2269 		vm_object_unlock(object);
2270 	} else {
2271 		kr = vm_page_relocate(tag_page, NULL,
2272 		    VM_RELOCATE_REASON_TAG_STORAGE_RECLAIM, NULL);
2273 		vm_object_unlock(object);
2274 
2275 		assert(kr != KERN_ABORTED);
2276 	}
2277 
2278 release_locks:
2279 	if (compressor_locked) {
2280 		PAGE_REPLACEMENT_ALLOWED(FALSE);
2281 	}
2282 	vm_page_unlock_queues();
2283 	if (vm_object_trylock_failed && vm_object_lock_avoid(object)) {
2284 		/*
2285 		 * We failed to lock the VM object, and pageout_scan
2286 		 * wants this object. Back off for a little bit.
2287 		 *
2288 		 * Note that the VM object may no longer be valid after releasing
2289 		 * the VM object lock, but `vm_object_lock_avoid` only compares
2290 		 * pointers and doesn't dereference them, so it's fine.
2291 		 */
2292 		mutex_pause(2);
2293 	}
2294 	vm_free_page_lock_spin();
2295 
2296 
2297 locks_acquired:
2298 	/*
2299 	 * Assert that all codepaths leading up to this point have the lock
2300 	 * held in spin mode (and therefore, preemption disabled).
2301 	 */
2302 	LCK_MTX_ASSERT_OWNED_SPIN(&vm_page_queue_free_lock);
2303 
2304 	if (kr == KERN_SUCCESS) {
2305 		vm_page_tag_storage_reclaim_success_count++;
2306 
2307 		/* We relocated the page.  Now we can use it. */
2308 		if (cell->state != MTE_STATE_RECLAIMING) {
2309 			/*
2310 			 * The page was manipulated while we were relocating
2311 			 * it.  This likely means it was freed and reallocated
2312 			 * between us dropping the free page lock and getting
2313 			 * the queues lock.
2314 			 *
2315 			 * This should be ludicrously rare, and should still
2316 			 * mean that the page is claimed (otherwise relocate
2317 			 * would have failed).  Set to reclaiming for client
2318 			 * consistency.
2319 			 *
2320 			 * In the state diagram this corresponds to other
2321 			 * threads having performed [F.2 inline] followed
2322 			 * by [C.1 inline], possibly multiple times.
2323 			 */
2324 			mteinfo_tag_storage_set_reclaiming(cell, tag_page);
2325 		}
2326 
2327 		KDBG(VMDBG_CODE(DBG_VM_TAG_PAGE_CLAIMED) | DBG_FUNC_NONE,
2328 		    VM_KERNEL_ADDRHIDE(tag_page),
2329 		    mteinfo_tag_storage_free_pages_for_covered(tag_page));
2330 
2331 		assert(tag_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
2332 	} else {
2333 		vm_page_tag_storage_reclaim_failure_count++;
2334 
2335 		if (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_NOT_WAITING) {
2336 			/*
2337 			 * If there was no available page to relocate the tag
2338 			 * storage page to, or that some race happened that
2339 			 * changed the page state under our feet, just put the
2340 			 * page back as claimed if it's still reclaiming.
2341 			 *
2342 			 * It will as a result get reconsidered more quickly...
2343 			 * it WAS our best candidate, after all.
2344 			 */
2345 			if (cell->state == MTE_STATE_RECLAIMING) {
2346 				mteinfo_tag_storage_set_claimed(tag_page);
2347 			}
2348 		}
2349 	}
2350 
2351 	return kr;
2352 }
2353 
2354 
2355 #pragma mark Refill Thread
2356 
2357 /*!
2358  * @abstract
2359  * Returns whether the refill thread should keep refilling the active pool.
2360  *
2361  * @discussion
2362  * If we're below the free target, and there are no tagged waiters of any kind,
2363  * avoid activating any pages if the untagged pool is not extremely healthy.
2364  */
2365 static inline bool
2366 mteinfo_tag_storage_active_should_refill(void)
2367 {
2368 	if (vm_page_free_taggable_count >= vm_page_free_target) {
2369 		return false;
2370 	}
2371 
2372 	if (vm_page_free_taggable_count <= vm_page_free_reserved) {
2373 		return true;
2374 	}
2375 
2376 	if (vm_page_free_wanted_tagged_privileged || vm_page_free_wanted_tagged) {
2377 		return true;
2378 	}
2379 
2380 	/*
2381 	 * 16/15 is ~1.07: we define "healthy" as at least 7% excess pages
2382 	 * over the target.
2383 	 *
2384 	 * We want some slop because a system under pressure will sometimes go
2385 	 * above @c vm_page_free_target and we want to avoid thrashing.
2386 	 */
2387 	return vm_page_free_count * 15ull >= vm_page_free_target * 16ull;
2388 }
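
/*
 * Worked example of the check above: with vm_page_free_target at, say,
 * 30000 pages (an illustrative value), 30000 * 16 / 15 = 32000, so the
 * refill thread keeps activating only while vm_page_free_count is at
 * least 32000.
 */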
2389 
2390 /*!
2391  * @function mteinfo_tag_storage_active_refill()
2392  *
2393  * @abstract
2394  * Attempt to fill the global free tagged covered page queue.
2395  *
2396  * @discussion
2397  * This is one of the core routines of the fill thread.  It will attempt to get
2398  * the global free tagged covered page queue to or above a target value.  It
2399  * will also wake threads waiting for more of these pages as appropriate.
2400  *
2401  * This function is called with the free page queue lock held in spin mode
2402  * and returns with it held in spin mode.
2403  *
2404  * @param taggablep     Incremented by how many free taggable pages were added.
2405  * @returns             The number of tag storage pages this function activated.
2406  */
2407 static uint32_t
2408 mteinfo_tag_storage_active_refill(uint32_t *taggablep)
2409 {
2410 	mte_cell_list_t  claimed_list  = &mte_info_lists[MTE_LIST_CLAIMED_IDX];
2411 	mte_cell_list_t  inactive_list = &mte_info_lists[MTE_LIST_INACTIVE_IDX];
2412 	uint32_t         taggable      = 0;
2413 	uint32_t         activated     = 0;
2414 
2415 	LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);
2416 
2417 	while (mteinfo_tag_storage_active_should_refill()) {
2418 		mte_cell_bucket_t i_bucket = 0;
2419 		mte_cell_bucket_t c_bucket = 0;
2420 		vm_page_list_t    list     = { };
2421 		kern_return_t     kr       = KERN_SUCCESS;
2422 
2423 		/*
2424 		 *	Step 1: try to activate or reclaim pages.
2425 		 *
2426 		 *	Pick the pool between inactive and claimed that will
2427 		 *	make us progress the fastest (picking inactive over
2428 		 *	claimed for equivalent buckets, given that reclaiming
2429 		 *	is more expensive).
2430 		 *
2431 		 *	In particular, always pick inactive buckets over reclaiming
2432 		 *	pages when they have more than 50% of their pages free.
2433 		 */
2434 
2435 		if (inactive_list->mask) {
2436 			i_bucket = fls(inactive_list->mask) - 1;
2437 		} else {
2438 			i_bucket = 0;
2439 		}
2440 		if (claimed_list->mask) {
2441 			c_bucket = fls(claimed_list->mask) - 1;
2442 		} else {
2443 			c_bucket = 0;
2444 		}
2445 
2446 		if (i_bucket && i_bucket >= MIN(MTE_BUCKET_17_24, c_bucket)) {
2447 			list = mteinfo_tag_storage_select_activating(VMP_FREE_BATCH_SIZE,
2448 			    MIN(i_bucket, MTE_BUCKET_17_24));
2449 		} else if (c_bucket > MTE_BUCKET_0) {
2450 			mte_cell_queue_t queue = &claimed_list->buckets[c_bucket];
2451 			cell_idx_t       idx   = cell_queue_first_idx(queue);
2452 			vm_page_t        page  = vm_tag_storage_page_get(idx);
2453 
2454 			kr = mteinfo_reclaim_tag_storage_page(page);
2455 			if (kr == KERN_SUCCESS) {
2456 				list = vm_page_list_for_page(page);
2457 			}
2458 		} else {
2459 			/*
2460 			 * There is no progress we can make here because we do not
2461 			 * have good candidates to activate or reclaim.
2462 			 *
2463 			 * As a result, even if the system has free untaggable
2464 			 * pages, they can't be converted to taggable either
2465 			 * because they're permanently untaggable, or because
2466 			 * their associated tag storage can't be reclaimed.
2467 			 *
2468 			 * Waiting in VM_PAGE_WAIT() below sounds appealing
2469 			 * but will result in busy loops, so we should just
2470 			 * go park and wait until some page free is saving us
2471 			 * via the "wakeup_refill_thread" cases in
2472 			 * @c vm_page_free_queue_handle_wakeups_and_unlock().
2473 			 */
2474 			break;
2475 		}
2476 
2477 		if (kr == KERN_SUCCESS) {
2478 			activated += list.vmpl_count;
2479 			taggable += mteinfo_tag_storage_activate_locked(list,
2480 			    /* spin-mode */ true);
2481 			continue;
2482 		}
2483 
2484 		/*
2485 		 *	Step 2: wait if needed
2486 		 *
2487 		 *	KERN_RESOURCE_SHORTAGE means that we were out of pages
2488 		 *	to relocate or tag storage candidates.
2489 		 *
2490 		 *	Other errors are relocation failures and we can just
2491 		 *	retry immediately.
2492 		 */
2493 
2494 		if (kr == KERN_RESOURCE_SHORTAGE) {
2495 			/*
2496 			 * There was no good candidate tag storage page.  Wait
2497 			 * on the VM to make new pages available.
2498 			 *
2499 			 * TODO: This isn't a great solution; the VM doesn't
2500 			 * understand what we are actually waiting on.  This
2501 			 * should converge eventually due to VM activity... but
2502 			 * the bigger picture fix is to make all free pages
2503 			 * eligible for MTE.  Then our only significant concern
2504 			 * around tag storage pages will be tag storage pages
2505 			 * with ECC errors, which should be a small number.
2506 			 */
2507 			vm_free_page_unlock();
2508 			current_thread()->page_wait_class = VM_MEMORY_CLASS_REGULAR;
2509 			VM_PAGE_WAIT();
2510 			vm_free_page_lock_spin();
2511 
2512 			/*
2513 			 * We waited above, the system conditions changed,
2514 			 * flush our reclaiming queue.
2515 			 */
2516 			mteinfo_tag_storage_flush_reclaiming();
2517 		}
2518 	}
2519 
2520 	mteinfo_tag_storage_flush_reclaiming();
2521 
2522 	*taggablep += taggable;
2523 	return activated;
2524 }
2525 
2526 /*!
2527  * @function mteinfo_fill_continue()
2528  *
2529  * @abstract
2530  * Continuation for the MTE fill thread.
2531  *
2532  * @discussion
2533  * The MTE fill thread manages the global free queue of covered tagged pages,
2534  * and moving tag storage pages between the active and inactive states.
2535  *
2536  * @param param
2537  * Unused.
2538  *
2539  * @param wr
2540  * Unused.
2541  */
2542 __dead2
2543 static void
2544 mteinfo_fill_continue(void *param __unused, wait_result_t wr __unused)
2545 {
2546 #if CONFIG_THREAD_GROUPS
2547 	static bool _fill_thread_self_inited;
2548 
2549 	if (!_fill_thread_self_inited) {
2550 		thread_group_vm_add();
2551 		_fill_thread_self_inited = true;
2552 	}
2553 #endif /* CONFIG_THREAD_GROUPS */
2554 
2555 	(void)sched_cond_ack(&fill_thread_cond);
2556 	vm_mte_refill_thread_wakeups++;
2557 
2558 	for (;;) {
2559 		uint32_t added = 0;
2560 		uint32_t activated = 0;
2561 		uint32_t deactivated = 0;
2562 
2563 		VM_DEBUG_CONSTANT_EVENT(, DBG_VM_REFILL_MTE, DBG_FUNC_START,
2564 		    0, 0, 0, 0);
2565 
2566 		/*
2567 		 * NB: We take the free queue lock in spin mode here because there are
2568 		 * a number of operations that occur during active_refill and drain
2569 		 * that require preemption to be disabled. For example:
2570 		 *  - in active_refill: if the fill thread tries to reclaim a tag
2571 		 *    storage page, it first tries to steal a free tag storage page
2572 		 *    from the local free queue.
2573 		 *  - in drain: when flushing the queue of deactivating tag storage
2574 		 *    pages, the fill thread waits for all cores to finish any untagging
2575 		 *    before proceeding. See mteinfo_tag_storage_deactivate_barrier().
2576 		 *
2577 		 * Coupling enabling/disabling preemption with acquiring/releasing the
2578 		 * free queue lock is easier than managing preemption by hand, so all
2579 		 * instances of free queue lock acquisition must be done in spin mode.
2580 		 */
2581 		vm_free_page_lock_spin();
2582 
2583 		activated   += mteinfo_tag_storage_active_refill(&added);
2584 		deactivated += mteinfo_tag_storage_drain();
2585 
2586 		vm_free_page_unlock();
2587 
2588 		VM_DEBUG_CONSTANT_EVENT(, DBG_VM_REFILL_MTE, DBG_FUNC_END,
2589 		    added, activated, deactivated, 0);
2590 
2591 		sched_cond_wait_parameter(&fill_thread_cond, THREAD_UNINT,
2592 		    mteinfo_fill_continue, NULL);
2593 	}
2594 }
2595 
2596 void
2597 mteinfo_wake_fill_thread(void)
2598 {
2599 	if (is_mte_enabled) {
2600 		sched_cond_signal(&fill_thread_cond, vm_mte_fill_thread);
2601 	}
2602 }
2603 
2604 
2605 #pragma mark Alloc
2606 
2607 /*!
2608  * @abstract
2609  * Returns whether @c mteinfo_free_queue_grab() should refill the per-cpu
2610  * claimable queue.
2611  *
2612  * @discussion
2613  * The policy is to refill if the per-cpu queue is empty and the claimable
2614  * queue has a full batch of @c VMP_FREE_BATCH_SIZE free pages.
2615  *
2616  * This is chosen so that the spinlock acquisition it implies is well
2617  * amortized, which reduces thrashing.
2618  *
2619  * The function must be called with preemption disabled.
2620  *
2621  * @param mte_pcpu      The current CPU's mte_pcpu_t data structure.
2622  */
2623 static bool
2624 mteinfo_tag_storage_claimable_should_refill(mte_pcpu_t mte_pcpu)
2625 {
2626 	if (__improbable(!vm_mte_enable_tag_storage_grab)) {
2627 		return false;
2628 	}
2629 
2630 	if (!vm_page_queue_empty(&mte_pcpu->free_claimed_pages)) {
2631 		return false;
2632 	}
2633 
2634 	return mte_claimable_queue.vmpfq_count >= VMP_FREE_BATCH_SIZE;
2635 }
2636 
2637 /*!
2638  * @abstract
2639  * Refill the current CPU's claimed free queue.
2640  *
2641  * @discussion
2642  * This is done opportunistically by @c mteinfo_free_queue_grab()
2643  * when it notices that it should refill the claimable queue
2644  * (see @c mteinfo_tag_storage_claimable_should_refill()).
2645  *
2646  * The function must be called with preemption disabled.
2647  *
2648  * @param mte_pcpu      The current CPU's mte_pcpu_t data structure.
2649  * @param target        The number of tag storage pages to grab.
2650  * @param colorp        A pointer to the current color selector.
2651  */
2652 static void
2653 mteinfo_tag_storage_claimable_refill(
2654 	mte_pcpu_t              mte_pcpu,
2655 	uint32_t                target,
2656 	uint32_t               *colorp)
2657 {
2658 	const int       cpu = cpu_number();
2659 	vm_page_queue_t queue;
2660 	ppnum_t         pnum;
2661 	vm_page_t       mem;
2662 
2663 	lck_ticket_lock_nopreempt(&mte_pcpu->free_claimed_lock,
2664 	    &vm_page_lck_grp_bucket);
2665 
2666 	for (uint32_t i = target; i-- > 0;) {
2667 		queue = &mte_claimable_queue.vmpfq_queues[*colorp].qhead;
2668 		while (vm_page_queue_empty(queue)) {
2669 			*colorp = (*colorp + 1) & vm_color_mask;
2670 			queue = &mte_claimable_queue.vmpfq_queues[*colorp].qhead;
2671 		}
2672 
2673 		mem  = (vm_page_t)vm_page_queue_first(queue);
2674 		pnum = VM_PAGE_GET_PHYS_PAGE(mem);
2675 
2676 		assert(mem->vmp_q_state == VM_PAGE_ON_FREE_Q);
2677 		mteinfo_tag_storage_set_claimed(mem);
2678 		mem->vmp_q_state = VM_PAGE_ON_FREE_LOCAL_Q;
2679 		mem->vmp_local_id = (uint16_t)cpu;
2680 		vm_page_queue_enter(&mte_pcpu->free_claimed_pages, mem, vmp_pageq);
2681 	}
2682 
2683 	lck_ticket_unlock_nopreempt(&mte_pcpu->free_claimed_lock);
2684 
2685 	counter_add_preemption_disabled(&vm_cpu_free_claimed_count,
2686 	    target);
2687 }
2688 
2689 vm_page_list_t
2690 mteinfo_free_queue_grab(
2691 	vm_grab_options_t       options,
2692 	vm_memory_class_t       class,
2693 	unsigned int            num_pages,
2694 	vm_page_q_state_t       q_state)
2695 {
2696 	mte_pcpu_t           mte_pcpu = PERCPU_GET(mte_pcpu);
2697 	unsigned int        *colorp;
2698 	unsigned int         color;
2699 	vm_page_list_t       list = { };
2700 	mte_free_queue_idx_t idx;
2701 
2702 	assert(!mte_pcpu->deactivate_suspend && get_preemption_level() > 0);
2703 
2704 	if (class == VM_MEMORY_CLASS_REGULAR) {
2705 		/*
2706 		 * VM_MEMORY_CLASS_DEAD_TAG_STORAGE is not part of
2707 		 * vm_page_free_count, which means the caller didn't take them
2708 		 * into account when making this allocation ask.
2709 		 *
2710 		 * As a result do not respect num_pages. However these are
2711 		 * different than the regular claimable pool because we can
2712 		 * always safely wire them.
2713 		 */
2714 		if (vm_page_queue_free.vmpfq_count) {
2715 			list = vm_page_free_queue_grab(options,
2716 			    VM_MEMORY_CLASS_DEAD_TAG_STORAGE,
2717 			    MIN(vm_free_magazine_refill_limit / 2,
2718 			    vm_page_queue_free.vmpfq_count), q_state);
2719 		}
2720 
2721 		assert(num_pages <= vm_page_free_count);
2722 	} else {
2723 		assert(num_pages <= vm_page_free_taggable_count);
2724 	}
2725 
2726 	colorp = PERCPU_GET(start_color);
2727 	color  = *colorp;
2728 
2729 	if (mteinfo_tag_storage_claimable_should_refill(mte_pcpu)) {
2730 		mteinfo_tag_storage_claimable_refill(mte_pcpu,
2731 		    VMP_FREE_BATCH_SIZE, &color);
2732 	}
2733 
2734 	while (list.vmpl_count < num_pages) {
2735 		vm_page_queue_t queue;
2736 		cell_count_t bit;
2737 		vm_page_t tag_page;
2738 		vm_page_t mem;
2739 		uint32_t count;
2740 		ppnum_t first_pnum;
2741 		cell_t orig;
2742 		cell_t *cell;
2743 
2744 		/*
2745 		 * Select which queue we dequeue from
2746 		 *
2747 		 * Regular allocations can allocate from any bucket.
2748 		 * Tagged allocations must draw from an MTE_FREE_ACTIVE_* one.
2749 		 */
2750 
2751 		if (class == VM_MEMORY_CLASS_REGULAR) {
2752 			idx = ffs(mte_free_queue_mask) - 1;
2753 		} else {
2754 			uint32_t mask = mte_free_queue_mask;
2755 
2756 			mask &= BIT(MTE_FREE_ACTIVE_0) |
2757 			    BIT(MTE_FREE_ACTIVE_1) |
2758 			    BIT(MTE_FREE_ACTIVE_2) |
2759 			    BIT(MTE_FREE_ACTIVE_3);
2760 
2761 			assert(mask);
2762 			idx = fls(mask) - 1;
2763 		}
2764 
2765 		queue = mteinfo_free_queue_head(idx, color);
2766 		while (vm_page_queue_empty(queue)) {
2767 			color = (color + 1) & vm_color_mask;
2768 			queue = mteinfo_free_queue_head(idx, color);
2769 		}
2770 
2771 		/*
2772 		 * Dequeue the linkage, find the page of the right color.
2773 		 */
2774 
2775 		vm_page_queue_remove_first(queue, mem, vmp_pageq);
2776 
2777 		VM_COUNTER_DEC(&mte_free_queues[idx].vmpfq_count);
2778 		if (mte_free_queues[idx].vmpfq_count == 0) {
2779 			bit_clear(mte_free_queue_mask, idx);
2780 		}
2781 
2782 		first_pnum = VM_PAGE_GET_PHYS_PAGE(mem) & -MTE_PAGES_PER_TAG_PAGE;
2783 		cell       = cell_from_covered_ppnum(first_pnum, &tag_page);
2784 		orig       = *cell;
2785 		bit        = orig.enqueue_pos;
2786 		count      = 0;
2787 		assert((orig.enqueue_pos & vm_color_mask) ==
2788 		    color % MTE_PAGES_PER_TAG_PAGE);
2789 
2790 		/*
2791 		 * Dequeue a span of covered pages from that tag storage
2792 		 *
2793 		 * If we have a contiguous run of free pages and we need more,
2794 		 * we know this tag storage page is going to be the one we pick
2795 		 * next.
2796 		 */
2797 
2798 		for (;;) {
2799 			assert(bit_test(orig.free_mask, bit));
2800 			bit_clear(cell->free_mask, bit);
2801 
2802 			mem->vmp_q_state = q_state;
2803 			vm_page_list_push(&list, mem);
2804 
2805 			count += 1;
2806 			bit   += 1;
2807 
2808 			if (!bit_test(cell->free_mask, bit) ||
2809 			    list.vmpl_count >= num_pages) {
2810 				break;
2811 			}
2812 
2813 			mem = vm_page_find_canonical(first_pnum + bit);
2814 		}
2815 
2816 		color = (color + count) & vm_color_mask;
2817 
2818 		/*
2819 		 * Update counters (see mteinfo_covered_page_set_used())
2820 		 */
2821 
2822 		VM_COUNTER_SUB(&vm_page_free_count, count);
2823 		if (idx >= MTE_FREE_ACTIVE_0 && idx <= MTE_FREE_ACTIVE_3) {
2824 			VM_COUNTER_SUB(&vm_page_free_taggable_count, count);
2825 		}
2826 		if (class != VM_MEMORY_CLASS_REGULAR) {
2827 			VM_COUNTER_ADD(&vm_page_tagged_count, count);
2828 			cell->mte_page_count += count;
2829 		}
2830 
2831 		/*
2832 		 * Requeue the tag storage (tail end of CELL_UPDATE())
2833 		 */
2834 
2835 		if (cell_list_idx(orig) != cell_list_idx(*cell) ||
2836 		    cell_list_bucket(orig) != cell_list_bucket(*cell)) {
2837 			cell_list_requeue(cell, tag_page,
2838 			    cell_list_idx(orig), cell_list_bucket(orig),
2839 			    cell_list_idx(*cell), cell_list_bucket(*cell),
2840 			    (int)cell_on_claimable_queue(*cell) -
2841 			    (int)cell_on_claimable_queue(orig));
2842 		}
2843 
2844 		mteinfo_free_queue_requeue(cell, orig, MTE_FREE_NOT_QUEUED,
2845 		    mteinfo_free_queue_idx(*cell));
2846 	}
2847 
2848 	*colorp = color;
2849 
2850 	/*
2851 	 * Some existing driver/IOKit code deals badly with getting physically
2852 	 * contiguous memory... which this alloc code is rather likely to
2853 	 * provide by accident immediately after boot.
2854 	 *
2855 	 * To avoid hitting issues related to this, we'll invert the order of
2856 	 * the list we return.  This code should be removed once we've tracked
2857 	 * down the various driver issues.
2858 	 */
2859 	vm_page_list_reverse(&list);
2860 
2861 	if (class == VM_MEMORY_CLASS_REGULAR && list.vmpl_has_tagged) {
2862 		/*
2863 		 * We are pulling pages from the taggable free queue
2864 		 * to use them as untagged.
2865 		 *
2866 		 * This breaks the invariant that pages with vmp_using_mte
2867 		 * set are either free pages on the free queue that were left
2868 		 * tagged after being freed (covered by the cell "free_mask"),
2869 		 * or used tagged pages (covered by the cell "mte_page_count"
2870 		 * counter).
2871 		 *
2872 		 * The caller has allocated these pages from the free queue
2873 		 * (clearing the proper "free_mask" bit) but didn't increment
2874 		 * the "mte_page_count". It will then proceed with untagging
2875 		 * these pages without holding any locks, and doesn't want to
2876 		 * re-take the free page queue lock for book-keeping.
2877 		 *
2878 		 * As a result, invariants are broken for a little while.
2879 		 * We record that this core has currently broken the
2880 		 * invariant, suspending the deactivation path until the
2881 		 * untagging is finished; otherwise, a deactivating
2882 		 * thread would not consider these pages as tagged, and would
2883 		 * retype them to XNU_DEFAULT, causing an SPTM panic.
2884 		 *
2885 		 * mteinfo_page_list_fix_tagging() will resume deactivations
2886 		 * when it is called on the same core.
2887 		 *
2888 		 * mteinfo_tag_storage_deactivate_barrier() is called by any
2889 		 * path performing a deactivation to synchronize with this.
2890 		 */
2891 		os_atomic_store(&mte_pcpu->deactivate_suspend, 1,
2892 		    compiler_acquire);
2893 	}
2894 
2895 	/*
2896 	 * If pulling untagged pages drew from the active(0) pool or above,
2897 	 * and there are "active(0)" pages around, then we wake up
2898 	 * the refill thread to drain this pool in order to make some
2899 	 * claimable pages available.
2900 	 */
2901 	if (vm_mte_enable_tag_storage_grab &&
2902 	    class == VM_MEMORY_CLASS_REGULAR &&
2903 	    idx >= MTE_FREE_ACTIVE_0 &&
2904 	    mteinfo_tag_storage_should_drain(true)) {
2905 		mteinfo_wake_fill_thread();
2906 	}
2907 
2908 	return list;
2909 }
2910 
2911 void
2912 mteinfo_page_list_fix_tagging(vm_memory_class_t class, vm_page_list_t *list)
2913 {
2914 	const unified_page_list_t pmap_batch_list = {
2915 		.page_slist = list->vmpl_head,
2916 		.type = UNIFIED_PAGE_LIST_TYPE_VM_PAGE_LIST,
2917 	};
2918 	mte_pcpu_t mte_pcpu = PERCPU_GET(mte_pcpu);
2919 	vm_page_t mem;
2920 
2921 	assert(get_preemption_level() > 0);
2922 
2923 	if (class == VM_MEMORY_CLASS_REGULAR && list->vmpl_has_tagged) {
2924 		pmap_unmake_tagged_pages(&pmap_batch_list);
2925 		vm_page_list_foreach(mem, *list) {
2926 			mem->vmp_using_mte = false;
2927 		}
2928 
2929 		/*
2930 		 * Invariants related to tagged pages are resolved,
2931 		 * we can allow deactivations again.
2932 		 */
2933 		os_atomic_store(&mte_pcpu->deactivate_suspend, 0, release);
2934 	}
2935 
2936 	if (class == VM_MEMORY_CLASS_TAGGED && list->vmpl_has_untagged) {
2937 		pmap_make_tagged_pages(&pmap_batch_list);
2938 		vm_page_list_foreach(mem, *list) {
2939 			mem->vmp_using_mte = true;
2940 		}
2941 	}
2942 
2943 	assert(!mte_pcpu->deactivate_suspend);
2944 }
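
/*
 * Typical pairing sketch (the caller shape is hypothetical; argument
 * values are placeholders):
 *
 *	disable_preemption();
 *	list = mteinfo_free_queue_grab(options, class, num_pages, q_state);
 *	mteinfo_page_list_fix_tagging(class, &list);
 *	enable_preemption();
 *
 * The deactivate_suspend flag set by the grab path is only cleared here,
 * so both calls must happen on the same core with preemption disabled
 * throughout.
 */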
2945 
2946 #endif /* VM_MTE_FF_VERIFY */
2947 #pragma mark Bootstrap
2948 
2949 static mte_cell_queue_t
2950 cell_list_init(
2951 	mte_cell_queue_t        qhp,
2952 	mte_cell_state_t        state,
2953 	mte_cell_list_idx_t     lidx)
2954 {
2955 	mte_cell_bucket_t buckets = cell_list_idx_buckets(lidx);
2956 
2957 	mte_info_lists[lidx].buckets = qhp;
2958 
2959 	for (mte_cell_bucket_t i = 0; i < buckets; i++, qhp++) {
2960 		qhp->head = (cell_t){
2961 			.prev = cell_idx(qhp),
2962 			.next = cell_idx(qhp),
2963 			.state = state,
2964 			.enqueue_pos = -1,
2965 		};
2966 	}
2967 
2968 	return qhp;
2969 }
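
/*
 * After cell_list_init(), each bucket head is a self-linked sentinel
 * (head.prev == head.next == cell_idx(qhp)), i.e. an empty circular
 * queue.  A minimal emptiness check would therefore be the following
 * (sketch only; the real code goes through cell_queue_count()):
 *
 *	queue->head.next == cell_idx(queue)
 */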
2970 
2971 __startup_func
2972 void
2973 mteinfo_init(uint32_t num_tag_pages)
2974 {
2975 	assert(2 * num_tag_pages < (1UL << MTE_FF_CELL_INDEX_BITS));
2976 	assert(atop(mte_tag_storage_end - mte_tag_storage_start) == num_tag_pages);
2977 	assert(num_tag_pages == mte_tag_storage_count);
2978 
2979 	vm_size_t size = sizeof(cell_t) * (MTE_QUEUES_COUNT + num_tag_pages);
2980 	mte_cell_queue_t queue;
2981 	mte_cell_list_t list;
2982 
2983 	queue = pmap_steal_memory(size, 8);
2984 	mte_info_cells = &(queue + MTE_QUEUES_COUNT)->head;
2985 
2986 	queue = cell_list_init(queue, MTE_STATE_DISABLED, MTE_LIST_DISABLED_IDX);
2987 	queue = cell_list_init(queue, MTE_STATE_PINNED, MTE_LIST_PINNED_IDX);
2988 	queue = cell_list_init(queue, MTE_STATE_DEACTIVATING, MTE_LIST_DEACTIVATING_IDX);
2989 	queue = cell_list_init(queue, MTE_STATE_CLAIMED, MTE_LIST_CLAIMED_IDX);
2990 	queue = cell_list_init(queue, MTE_STATE_INACTIVE, MTE_LIST_INACTIVE_IDX);
2991 	queue = cell_list_init(queue, MTE_STATE_RECLAIMING, MTE_LIST_RECLAIMING_IDX);
2992 	queue = cell_list_init(queue, MTE_STATE_ACTIVATING, MTE_LIST_ACTIVATING_IDX);
2993 	queue = cell_list_init(queue, MTE_STATE_ACTIVE, MTE_LIST_ACTIVE_0_IDX);
2994 	queue = cell_list_init(queue, MTE_STATE_ACTIVE, MTE_LIST_ACTIVE_IDX);
2995 
2996 	assert(&queue->head == mte_info_cells);
2997 
2998 	/*
2999 	 * Quickly create a list of all possible cells and place it into the
3000 	 * disabled queue.
3001 	 */
3002 
3003 	for (cell_idx_t i = 0; i < num_tag_pages; i++) {
3004 		*cell_from_idx(i) = (cell_t){
3005 			.prev = i - 1,
3006 			.next = i + 1,
3007 			.enqueue_pos = -1,
3008 			.mte_page_count = 0,
3009 			.state = MTE_STATE_DISABLED,
3010 		};
3011 	}
3012 
3013 	list = &mte_info_lists[MTE_LIST_DISABLED_IDX];
3014 	queue = &list->buckets[0];
3015 	queue->head.next = 0;
3016 	queue->head.prev = num_tag_pages - 1;
3017 	queue->head.cell_count = num_tag_pages;
3018 	cell_from_idx(0)->prev = cell_idx(queue);
3019 	cell_from_idx(num_tag_pages - 1)->next = cell_idx(queue);
3020 	bit_set(list->mask, 0);
3021 	list->count = num_tag_pages;
3022 
3023 	for (mte_free_queue_idx_t idx = MTE_FREE_UNTAGGABLE_0;
3024 	    idx < MTE_FREE_NOT_QUEUED; idx++) {
3025 		for (uint32_t i = 0; i < MAX_COLORS; i++) {
3026 			vm_page_queue_init(mteinfo_free_queue_head(idx, i));
3027 		}
3028 	}
3029 
3030 #ifndef VM_MTE_FF_VERIFY
3031 	vm_page_free_queue_init(&mte_claimable_queue);
3032 #endif /* VM_MTE_FF_VERIFY */
3033 }
3034 
3035 #if HIBERNATION
3036 
3037 void
3038 mteinfo_free_queue_foreach(void (^block)(vm_page_t))
3039 {
3040 	for (cell_idx_t cidx = 0; cidx < mte_tag_storage_count; cidx++) {
3041 		cell_t  *cell = cell_from_idx(cidx);
3042 		ppnum_t  pnum = cell_first_covered_pnum(cell);
3043 		uint32_t mask = cell->free_mask;
3044 
3045 		while (mask) {
3046 			block(vm_page_find_canonical(pnum + ffs(mask) - 1));
3047 			mask &= mask - 1;
3048 		}
3049 
3050 		if (cell->state == MTE_STATE_INACTIVE) {
3051 			block(vm_tag_storage_page_get(cidx));
3052 		}
3053 	}
3054 }
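
/*
 * Usage sketch (hypothetical hibernation caller; the callback body is
 * illustrative only):
 *
 *	mteinfo_free_queue_foreach(^(vm_page_t mem) {
 *		hibernate_mark_page_free(mem);
 *	});
 */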
3055 
3056 #endif /* HIBERNATION */
3057 #ifndef VM_MTE_FF_VERIFY
3058 
3059 /* List that tracks tag storage pages until mte_tags_object is initialized. */
3060 __startup_data
3061 static vm_page_list_t mte_tag_storage_startup_list;
3062 
3063 void
3064 mteinfo_tag_storage_release_startup(vm_page_t tag_page)
3065 {
3066 	cell_t           *cell       = cell_from_tag_storage_page(tag_page);
3067 	ppnum_t           tag_pnum   = VM_PAGE_GET_PHYS_PAGE(tag_page);
3068 	ppnum_t           first_pnum = cell_first_covered_pnum(cell);
3069 	vm_memory_class_t class      = VM_MEMORY_CLASS_TAG_STORAGE;
3070 	bool              deactivate = true;
3071 	uint32_t          mte_count  = 0;
3072 
3073 	/*
3074 	 * If this is a tag storage page we won't even classify as tag
3075 	 * storage.  Just give it to the normal free queues.
3076 	 *
3077 	 * Otherwise, keep about 1/8 of the tag storage pages around;
3078 	 * that should be vastly sufficient to boot. The refill thread
3079 	 * and various passive policies will let it rebalance later.
3080 	 *
3081 	 * Note that this code implicitly relies on the fact that
3082 	 * the tag storage is toward the end of the vm pages array:
3083 	 * we only keep tag storage pages around that have 32 pages free,
3084 	 * but pages that haven't been created yet appear as "used".
3085 	 */
3086 
3087 	assert(pmap_is_tag_storage_page(tag_pnum));
3088 
3089 	if (pmap_tag_storage_is_discarded(tag_pnum)) {
3090 		mteinfo_tag_storage_set_retired(tag_page);
3091 		return;
3092 	} else if (pmap_tag_storage_is_recursive(tag_pnum)) {
3093 		VM_COUNTER_INC(&vm_page_recursive_tag_storage_count);
3094 		class = VM_MEMORY_CLASS_DEAD_TAG_STORAGE;
3095 	} else if (pmap_tag_storage_is_unmanaged(tag_pnum)) {
3096 		VM_COUNTER_INC(&vm_page_unmanaged_tag_storage_count);
3097 		class = VM_MEMORY_CLASS_DEAD_TAG_STORAGE;
3098 	} else {
3099 		for (uint32_t i = 0; i < MTE_PAGES_PER_TAG_PAGE; i++) {
3100 			mte_count += pmap_is_tagged_page(first_pnum + i);
3101 		}
3102 
3103 		if (cell_free_page_count(*cell) == MTE_PAGES_PER_TAG_PAGE &&
3104 		    mteinfo_tag_storage_active(true) < mte_tag_storage_count / 8) {
3105 			deactivate = false;
3106 		} else if (mte_count) {
3107 			deactivate = false;
3108 		}
3109 	}
3110 
3111 	if (deactivate) {
3112 		pmap_unmake_tag_storage_page(tag_pnum);
3113 		if (class == VM_MEMORY_CLASS_DEAD_TAG_STORAGE) {
3114 			vm_page_free_queue_enter(class, tag_page, tag_pnum);
3115 		} else {
3116 			tag_page->vmp_q_state = VM_PAGE_ON_FREE_Q;
3117 			mteinfo_tag_storage_set_inactive(tag_page, true);
3118 		}
3119 		return;
3120 	}
3121 
3122 	mteinfo_tag_storage_set_active(tag_page, mte_count, true);
3123 	vm_page_list_push(&mte_tag_storage_startup_list, tag_page);
3124 }
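
/*
 * Worked example of the retention policy above (illustrative numbers):
 * with mte_tag_storage_count == 1024, at most 1024 / 8 == 128 fully
 * free tag storage pages are kept active at startup; a page whose cell
 * still covers tagged pages (mte_count != 0) is also kept active; every
 * other page is deactivated and either freed as dead tag storage or
 * parked inactive for later reuse.
 */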
3125 
3126 /*!
3127  * @function mteinfo_tag_storage_startup_list_flush()
3128  *
3129  * @abstract
3130  * Adds active tag storage pages to the mte_tags_object.
3131  *
3132  * @discussion
3133  * Wires the list of active tag storage pages accumulated by @see
3134  * mteinfo_tag_storage_release_startup into the mte_tags_object.  This
3135  * must be called after the last @see
3136  * mteinfo_tag_storage_release_startup call.
3137  */
3138 __startup_func
3139 static void
3140 mteinfo_tag_storage_startup_list_flush(void)
3141 {
3142 	vm_page_t page;
3143 
3144 	vm_object_lock(mte_tags_object);
3145 	vm_page_lock_queues();
3146 
3147 	vm_page_list_foreach_consume(page, &mte_tag_storage_startup_list) {
3148 		mteinfo_tag_storage_wire_locked(page);
3149 	}
3150 
3151 	vm_page_unlock_queues();
3152 	vm_object_unlock(mte_tags_object);
3153 }
3154 STARTUP(KMEM, STARTUP_RANK_FIRST, mteinfo_tag_storage_startup_list_flush);
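
/*
 * Ordering note (a sketch of the reasoning, not new behavior): the lock
 * order in the flush above is object lock first, then page queues lock,
 * consistent with the usual vm_object -> vm_page_queue ordering in the
 * VM; the STARTUP(KMEM, STARTUP_RANK_FIRST, ...) registration is
 * presumably what arranges for this flush to run after the last
 * mteinfo_tag_storage_release_startup() call.
 */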
3155 
3156 /*!
3157  * @abstract
3158  * Initializes the percpu mte queues and locks.
3159  */
3160 __startup_func
3161 static void
3162 mteinfo_tag_storage_lock_init(void)
3163 {
3164 	percpu_foreach(mte_pcpu, mte_pcpu) {
3165 		lck_ticket_init(&mte_pcpu->free_claimed_lock,
3166 		    &vm_page_lck_grp_bucket);
3167 		vm_page_queue_init(&mte_pcpu->free_claimed_pages);
3168 	}
3169 }
3170 STARTUP(PERCPU, STARTUP_RANK_MIDDLE, mteinfo_tag_storage_lock_init);
3171 
3172 /*!
3173  * @function mteinfo_init_fill_thread
3174  *
3175  * @abstract
3176  * Creates the MTE fill thread.
3177  */
3178 __startup_func
3179 static void
3180 mteinfo_init_fill_thread(void)
3181 {
3182 	kern_return_t result;
3183 
3184 	if (!is_mte_enabled) {
3185 		return;
3186 	}
3187 
3188 	result = kernel_thread_start_priority(mteinfo_fill_continue, NULL, BASEPRI_VM,
3189 	    &vm_mte_fill_thread);
3190 
3191 	if (result != KERN_SUCCESS) {
3192 		panic("Failed to create MTE fill thread.");
3193 	}
3194 
3195 	thread_set_thread_name(vm_mte_fill_thread, "VM_mte_fill");
3196 	thread_deallocate(vm_mte_fill_thread);
3197 }
3198 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, mteinfo_init_fill_thread);
3199 
3200 static ppnum_t
3201 mteinfo_tag_storage_mark_unmanaged_range(cell_idx_t idx, ppnum_t pnum)
3202 {
3203 	cell_t    *end_cell = cell_from_covered_ppnum(pnum);
3204 	cell_idx_t end_idx  = cell_idx(end_cell);
3205 	bool       locked   = false;
3206 
3207 	for (; idx < end_idx; idx++) {
3208 		cell_t *cell = cell_from_idx(idx);
3209 		vm_page_t tag_page = vm_tag_storage_page_get(idx);
3210 
3211 		if (!locked) {
3212 			vm_free_page_lock_spin();
3213 			locked = true;
3214 		}
3215 
3216 		if (pmap_tag_storage_is_discarded(VM_PAGE_GET_PHYS_PAGE(tag_page))) {
3217 			mteinfo_tag_storage_set_retired(tag_page);
3218 			continue;
3219 		}
3220 
3221 		if (cell->mte_page_count != 0) {
3222 			/*
3223 			 * This can happen if some tagged pmap-stolen memory
3224 			 * has not been ml_static_mfree()d back yet.
3225 			 */
3226 			continue;
3227 		}
3228 
3229 		if (cell->state == MTE_STATE_DISABLED) {
3230 			/*
3231 			 * Probably an ECC retired page.
3232 			 * Probably an ECC-retired page.
3233 			continue;
3234 		}
3235 
3236 		mteinfo_tag_storage_set_unmanaged(cell,
3237 		    vm_tag_storage_page_get(idx));
3238 	}
3239 
3240 	if (locked) {
3241 		vm_free_page_unlock();
3242 	}
3243 
3244 	return end_idx + 1;
3245 }
3246 
3247 static void
3248 mteinfo_tag_storage_unmanaged_discover(void)
3249 {
3250 	uint32_t   count   = vm_page_unmanaged_tag_storage_count;
3251 	cell_idx_t cur_idx = 0;
3252 	ppnum_t    pnum;
3253 
3254 	if (!is_mte_enabled) {
3255 		return;
3256 	}
3257 
3258 	vm_pages_radix_for_each_pnum(pnum) {
3259 		cur_idx = mteinfo_tag_storage_mark_unmanaged_range(cur_idx, pnum);
3260 	}
3261 	mteinfo_tag_storage_mark_unmanaged_range(cur_idx,
3262 	    vm_pages_first_pnum);
3263 
3264 	printf("MTE: discovered %d tag storage pages for unmanaged memory\n",
3265 	    vm_page_unmanaged_tag_storage_count - count);
3266 }
3267 STARTUP(LOCKDOWN, STARTUP_RANK_LAST, mteinfo_tag_storage_unmanaged_discover);
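
/*
 * Sketch of the sweep above with hypothetical numbers: if cur_idx is 4
 * and pnum falls in cell 9, mteinfo_tag_storage_mark_unmanaged_range()
 * examines cells 4..8; a cell is marked unmanaged only when its tag
 * page is not discarded (discarded pages become retired instead), it
 * covers no tagged pages left behind by a pmap steal, and it is not
 * already DISABLED (probably ECC-retired).  The returned
 * end_idx + 1 == 10 deliberately skips the cell containing pnum itself,
 * since that cell covers at least some managed memory.
 */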
3268 
3269 extern boolean_t get_range_bounds(char *c, int64_t *lower, int64_t *upper);
3270 static void
3271 mteinfo_tag_storage_process_vm_tags(void)
3272 {
3273 	char *vm_tags_str;
3274 
3275 	if (!vm_mte_enable_tag_storage_grab) {
3276 		return;
3277 	}
3278 
3279 	vm_tags_str = vm_mte_tag_storage_for_vm_tags;
3280 	while (*vm_tags_str) {
3281 		uint64_t loop_end;
3282 		boolean_t ret;
3283 		int64_t start = 1, end = VM_MEMORY_COUNT;
3284 
3285 		ret = get_range_bounds(vm_tags_str, &start, &end);
3286 		loop_end = (ret) ? end : start;
3287 		for (int64_t i = start; i <= loop_end; i++) {
3288 			bitmap_set(vm_mte_tag_storage_for_vm_tags_mask, (uint)i);
3289 		}
3290 
3291 		/* Skip to the next ',' */
3292 		while (*vm_tags_str != ',') {
3293 			if (*vm_tags_str == '\0') {
3294 				break;
3295 			}
3296 			vm_tags_str++;
3297 		}
3298 
3299 		if (*vm_tags_str == ',') {
3300 			vm_tags_str++;
3301 		} else {
3302 			assert(*vm_tags_str == '\0');
3303 			break;
3304 		}
3305 	}
3306 }
3307 STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, mteinfo_tag_storage_process_vm_tags);
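
/*
 * Example (hypothetical boot-arg value; the exact range syntax is
 * whatever get_range_bounds() accepts): given
 * vm_mte_tag_storage_for_vm_tags = "1:5,12", the first entry parses as
 * a range (ret is TRUE), setting bits 1..5 of
 * vm_mte_tag_storage_for_vm_tags_mask, while the second parses as a
 * single tag (ret is FALSE), setting only bit 12.  Entries must be
 * comma-separated, which is what the skip-to-',' loop above relies on.
 */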
3308 
3309 #pragma mark Counter methods
3310 
3311 uint32_t
3312 mteinfo_tag_storage_fragmentation(bool actual)
3313 {
3314 	uint32_t ts_active;
3315 	uint32_t value;
3316 
3317 	vm_free_page_lock_spin();
3318 	ts_active = mte_info_lists[MTE_LIST_ACTIVE_IDX].count;
3319 	if (actual) {
3320 		ts_active += mte_info_lists[MTE_LIST_ACTIVE_0_IDX].count;
3321 	}
3322 	if (ts_active) {
3323 		value  = 1000 * vm_page_tagged_count;
3324 		value /= (ts_active * MTE_PAGES_PER_TAG_PAGE);
3325 	} else {
3326 		value  = 1000;
3327 	}
3328 	vm_free_page_unlock();
3329 
3330 	return 1000 - value;
3331 }
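
/*
 * Worked example (illustrative numbers, assuming MTE_PAGES_PER_TAG_PAGE
 * is 32, as the 32-bit free_mask suggests): with 10 active tag storage
 * pages covering 320 pages and vm_page_tagged_count == 240,
 * value = 1000 * 240 / 320 == 750, so the reported fragmentation is
 * 1000 - 750 == 250 permille.  With no active tag storage pages,
 * occupancy is treated as perfect and the result is 0.
 */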
3332 
3333 uint32_t
3334 mteinfo_tag_storage_active(bool fq_locked)
3335 {
3336 	uint32_t active;
3337 
3338 	if (!fq_locked) {
3339 		vm_free_page_lock_spin();
3340 	}
3341 
3342 	active = mte_info_lists[MTE_LIST_ACTIVE_0_IDX].count +
3343 	    mte_info_lists[MTE_LIST_ACTIVE_IDX].count;
3344 
3345 	if (!fq_locked) {
3346 		vm_free_page_unlock();
3347 	}
3348 
3349 	return active;
3350 }
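
/*
 * Usage sketch (illustrative): pass fq_locked == true when the free
 * page lock is already held or not yet needed, as in the startup
 * policy check:
 *
 *	mteinfo_tag_storage_active(true) < mte_tag_storage_count / 8
 *
 * Callers without the lock pass false and get a momentarily consistent
 * snapshot of the two active list counts.
 */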
3351 
3352 uint32_t
3353 mteinfo_tag_storage_free_pages_for_covered(const struct vm_page *page)
3354 {
3355 	ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(page);
3356 
3357 	return cell_free_page_count(*cell_from_covered_ppnum(pnum));
3358 }
3359 
3360 void
3361 mteinfo_increment_wire_count(vm_page_t tag_page)
3362 {
3363 	if (vm_page_in_tag_storage_array(tag_page) &&
3364 	    vm_page_is_tag_storage(tag_page)) {
3365 		VM_COUNTER_ATOMIC_INC(&vm_page_wired_tag_storage_count);
3366 
3367 		DTRACE_VM1(vm_tag_storage_wired, vm_page_t, tag_page);
3368 	}
3369 }
3370 
3371 void
3372 mteinfo_decrement_wire_count(vm_page_t tag_page, bool pqs_locked)
3373 {
3374 	LCK_MTX_ASSERT(&vm_page_queue_lock,
3375 	    pqs_locked ? LCK_MTX_ASSERT_OWNED : LCK_MTX_ASSERT_NOTOWNED);
3376 	LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_NOTOWNED);
3377 
3378 	if (vm_page_in_tag_storage_array(tag_page) &&
3379 	    VM_PAGE_OBJECT(tag_page) != mte_tags_object &&
3380 	    vm_page_is_tag_storage(tag_page)) {
3381 		VM_COUNTER_ATOMIC_DEC(&vm_page_wired_tag_storage_count);
3382 
3383 		DTRACE_VM1(vm_tag_storage_unwired, vm_page_t, tag_page);
3384 
3385 		if (tag_page->vmp_ts_wanted) {
3386 			/*
3387 			 * Many callers hold the page queue lock in spin mode
3388 			 * when calling this, and mteinfo_tag_storage_wakeup()
3389 			 * needs to acquire a mutex.
3390 			 */
3391 			if (pqs_locked) {
3392 				vm_page_lockconvert_queues();
3393 			}
3394 			mteinfo_tag_storage_wakeup(tag_page, false);
3395 		}
3396 	}
3397 }
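
/*
 * Caller pattern sketch (illustrative, not a new API): a typical unwire
 * path holds the page queues lock in spin mode and relies on the
 * lockconvert above to upgrade it before the wakeup path sleeps:
 *
 *	vm_page_lockspin_queues();
 *	// ... unwire the page ...
 *	mteinfo_decrement_wire_count(tag_page, true);
 *	vm_page_unlock_queues();
 */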
3398 
3399 bool
3400 mteinfo_vm_tag_can_use_tag_storage(vm_tag_t vm_tag)
3401 {
3402 	return bitmap_test(vm_mte_tag_storage_for_vm_tags_mask, (uint)vm_tag);
3403 }
3404 
3405 
3406 void
3407 kdp_mteinfo_snapshot(struct mte_info_cell * __counted_by(count) cells, size_t count)
3408 {
3409 	release_assert(count == mte_tag_storage_count);
3410 
3411 	if (not_in_kdp) {
3412 		panic("kdp_mteinfo_snapshot called outside of kernel debugger");
3413 	}
3414 
3415 	for (cell_idx_t cidx = 0; cidx < mte_tag_storage_count; cidx++) {
3416 		cell_t  *cell = cell_from_idx(cidx);
3417 		ppnum_t  pnum = cell_first_covered_pnum(cell);
3418 		vm_page_t mem;
3419 		uint8_t wired_count = 0, wired_tagged_count = 0, kernel_wired_tagged_count = 0;
3420 
3421 		for (ppnum_t i = 0; i < MTE_PAGES_PER_TAG_PAGE; i++) {
3422 			mem = vm_page_find_canonical(pnum + i);
3423 			if (mem && VM_PAGE_WIRED(mem)) {
3424 				wired_count++;
3425 				if (mem->vmp_using_mte) {
3426 					if (VM_PAGE_OBJECT(mem) == kernel_object_tagged) {
3427 						kernel_wired_tagged_count++;
3428 					} else {
3429 						wired_tagged_count++;
3430 					}
3431 				}
3432 			}
3433 		}
3434 
3435 		cells[cidx] = (struct mte_info_cell) {
3436 			.mic_state = cell->state,
3437 			.mic_tagged_count = cell->mte_page_count,
3438 			.mic_free_count = (uint8_t)cell_free_page_count(*cell),
3439 			.mic_wired_count = wired_count,
3440 			.mic_wired_tagged_count = wired_tagged_count,
3441 			.mic_kernel_wired_tagged_count = kernel_wired_tagged_count
3442 		};
3443 	}
3444 }
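
/*
 * Interpretation note (sketch): for each cell the snapshot records how
 * many covered pages are wired, how many of those are tagged, and how
 * many of the tagged wired pages belong to kernel_object_tagged.  A
 * cell with a high mic_wired_tagged_count is presumably a poor
 * candidate for tag storage reclaim, since its tag storage page cannot
 * be freed until the tagged covered pages are unwired.
 */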
3445 #endif /* VM_MTE_FF_VERIFY */
3446 
3447 #endif /* HAS_MTE */
3448