1 /*
2 * Copyright (c) 2000-2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /* Guard header includes, so that the userspace test can include this file. */
30 #include <os/atomic_private.h>
31 #ifndef VM_MTE_FF_VERIFY
32 #include <debug.h>
33 #include <mach_assert.h>
34
35 #include <kern/bits.h>
36 #include <kern/kcdata.h>
37 #include <kern/queue.h>
38
39 #include <mach/sdt.h>
40
41 #include <vm/pmap.h>
42 #include <vm/vm_compressor_internal.h>
43 #include <vm/vm_kern.h>
44 #include <vm/vm_object_internal.h>
45 #include <vm/vm_page_internal.h>
46 #include <vm/vm_pageout.h>
47 #include <vm/vm_mteinfo_internal.h>
48
49 extern lck_grp_t vm_page_lck_grp_bucket;
50
51 #endif /* VM_MTE_FF_VERIFY */
52 #pragma mark Documentation
53 #if HAS_MTE
54
55 /*
56 * VM MTE Info
57 * ===========
58 *
59 * The top level goal of this code is to implement the policies managing the
60 * selection of tag storage pages on the system, in order to:
61 * - Minimize the number of live tag storage pages at any given time;
62 * - Maximize occupancy (the number of covered pages using MTE compared to tag
63 * storage pages actually being used for tag storage).
64 *
65 *
66 * Physical Memory Layout
67 * ----------------------
68 *
69 * The diagram below describes the general layout of the physical memory. iBoot
70 * will determine the placement of the tag storage region, at the end of the
71 * managed address space.
72 *
73 * As a result, the tag storage space is always part of the vm_pages array.
74 * However, several things should be noted:
75 *
76 * - The last tag storage pages cover unmanaged DRAM at the end of physical
77 * memory, as well as the tag storage space itself, and will never be used as
78 * tag storage memory by the system (the unmanaged space will not be MTE'd,
79 * and the tag storage space will never itself use MTE).
80 *
81 * - The first tag storage pages also cover unmanaged DRAM space at the
82 * beginning of physical memory, but might be used for tagging due to early
83 * boot code. However, these first tag storage pages will not be used for
84 * tag storage space dynamically by the system.
85 *
86 * - The beginning of the tag region space is always aligned to a 32 page
87 * boundary; however the start of the vm_pages array is not. As a result,
88 * there is a cluster of 32 pages that possibly crosses this boundary. This
89 * is relevant because dynamic tag storage management only functions for
90 * taggable pages inside the vm_pages array.
91 *
92 *
93 * ┌────────────┐─╮
94 * │ P_n+31 │ │
95 * ├────────────┤ │
96 * ╎ ... ╎ │
97 * ├────────────┤ │
98 * │ P_n │ │
99 * ├────────────┤─╯
100 * │ │
101 * ╎ ╎
102 * ╎ ... ╎
103 * ╎ ╎
104 * │ │
105 * mte_tag_storage_end ─ ─ ─├────────────┤ ─ ─ ─ vm_pages_end
106 * ┬ │TTTTTTTTTTTT│ Tag storage for pages [n:n+31]
107 * │ ├────────────┤
108 * │ │ │
109 * │ ╎ ... ╎
110 * │ │ │
111 * │ ├────────────┤
112 * 1/32 │ │TTTTTTTTTTTT│ Tag storage for pages [i:i+31]
113 * of DRAM │ ├────────────┤
114 * │ │ │
115 * │ ╎ ... ╎
116 * │ │ │
117 * │ ├────────────┤
118 * │ │TTTTTTTTTTTT│ Tag storage for pages [32:63]
119 * │ ├────────────┤
120 * ┴ │TTTTTTTTTTTT│ Tag storage for pages [0:31]
121 * mte_tag_storage_start ─ ─ ─├────────────┤─╮
122 * │ P_i+31 │ │
123 * ├────────────┤ │
124 * ╎ ... ╎ │
125 * ├────────────┤ │
126 * │ P_i │ │
127 * ├────────────┤─╯
128 * │ │
129 * ╎ ╎
130 * ╎ ... ╎
131 * ╎ ╎
132 * │ │
133 * ├────────────┤─╮
134 * │ │ │
135 * ╎ ... ╎ │
136 * ├────────────┤ │ ─ ─ vm_pages
137 * ╎ ... ╎ │
138 * │ │ │
139 * │────────────┤─╯
140 * │ │
141 * ╎ ╎
142 * ╎ ... ╎
143 * ╎ ╎
144 * │ │
145 * ├────────────┤─╮
146 * │ P_31 │ │
147 * ├────────────┤ │
148 * ╎ ... ╎ │
149 * ├────────────┤ │
150 * │ P_0 │ │
151 * pmap_first_pnum ─ ─└────────────┘─╯ ─ ─ gDramBase
152 * Physical Memory
153 *
154 *
155 * Tag storage and cells
156 * ~~~~~~~~~~~~~~~~~~~~~
157 *
158 * Tag storage pages require metadata to track their state machine, in order to
159 * not grow the vm_page_t data structure for all pages on the system when only
160 * 1/32 of them are tag storage.
161 *
 * The metadata is stored in a data structure called the MTE cell
 * (@see cell_t), which is queued into the so-called MTE Info data structure
164 * (@see @c mte_info_lists).
165 *
 * Since the mapping is 1:1, the documentation of this file happily calls a
 * cell a tag storage page and vice versa.
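 *
 * As a sketch of that 1:1 mapping (mirroring @c cell_from_tag_storage_page()
 * below), the cell of a tag storage page is found from its offset in the
 * tag storage vm_page array:
 *
 * <code>
 *     cell_idx_t idx = (cell_idx_t)(tag_page - vm_pages_tag_storage_array_internal());
 *     cell_t *cell = cell_from_idx(idx);
 * </code>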
168 *
169 *
170 * Tag storage state machine
171 * ~~~~~~~~~~~~~~~~~~~~~~~~~
172 *
173 * Disabled is a special state: this is the state cells start in,
174 * and never transition back to unless there is an ECC error.
175 *
176 * The state diagram involving "Disabled" looks like this:
177 *
178 * ╭──────────────╮ ╭───╴K.3╶──╮ ╔══════════════╗
179 * │ RECLAIMING ┼───╮ │ v ╭───>║ ACTIVE ║
180 * ╰──────────────╯ K.1 ╔═╪════════════╗ I.1 ╚══════════════╝
181 * ├───>║ DISABLED ╫───┤
182 * ╔═════════════╗ K.2 ╚══════════════╝ I.2 ╔══════════════╗
183 * ║ CLAIMED ╫───╯ ^ ^ ╰───>║ INACTIVE ║
184 * ╚═══════════╪═╝ │ │ ╚═╪════════════╝
185 * ╰────╴U.1╶───╯ ╰───╴U.2╶────╯
186 *
187 * ╔═╗ Double bar square boxes ╭─╮ Single bar round boxes
188 * ╚═╝ denote stable states. ╰─╯ denote transitionary states.
189 *
190 *
191 * Initialization (I.1, I.2)
192 *
 * This is performed by mteinfo_tag_storage_release_startup().
 * This function might decide to leave some pages disabled.
195 *
196 * Unmanaged discovery (U.1, U.2)
197 *
198 * This is performed at lockdown by mteinfo_tag_storage_unmanaged_discover()
199 * to discover tag storage that covers pages that will never have a canonical
200 * vm_page_t made for them, which are effectively unmanaged.
201 *
202 * Retirement (K.1, K.2, K.3)
203 *
204 * This is performed by mteinfo_tag_storage_set_retired(),
205 * itself called by vm_page_retire() which can only happen
206 * for pages that were never created (the cell will be DISABLED),
207 * or on the tag storage claimed page free path (the cell
208 * will either be RECLAIMING or CLAIMED).
209 *
210 *
211 * The rest of the tag storage state machine looks like this:
212 *
213 * ╭──────────────╮
214 * ╭────╴D.2╶───┼ DEACTIVATING │<───╴D.1╶────╮
215 * │ a ╰──────────────╯ a │
216 * v │
217 * ╔══════════════╗ ╭──────────────╮ ╔═╪════════════╗
218 * ║ INACTIVE ╫──╴A.1╶──>│ ACTIVATING ┼───╴A.2╶──>║ ACTIVE ║<─╮
219 * ╚════════════╪═╝ i/a ╰──────────────╯ i/a ╚══════════════╝ │
220 * ^ │ │
221 * │ │ │
222 * │ │ ╔════════════╗ │
223 * │ │ ╭───╴B.2╶───╫ PINNED ║<───╴B.1╶───╮ │
224 * │ │ │ i ╚════════════╝ a │ R.2
225 * │ │ │ │ a
226 * │ │ │ ╭─────╴R.x╶─────╮ │ │
227 * │ │ v v a │ │ │
228 * │ │ ╔═════════════╗ ╭─┼──────────┼─╮ │
229 * │ ╰────╴C.1╶──>║ CLAIMED ╫────╴R.1╶──>│ RECLAIMING ┼──╯
230 * │ i ╚═╪═══════════╝ a ╰─┼────────────╯
231 * │ │ │
232 * ╰──────────╴F.1╶──────────╯<─────────╴F.2╶───────────╯
233 * i i
234 *
235 * ╔═╗ Double bar square boxes ╭─╮ Single bar round boxes
236 * ╚═╝ denote stable states. ╰─╯ denote transitionary states.
237 *
238 * a the transition can be done by the refill thread (async)
239 * i the transition can be done inline by any thread.
240 *
241 *
242 * Activation (A.1, A.2)
243 *
 * [A.1 inline] is performed by mteinfo_tag_storage_try_activate(), called
 * from vm_page_grab_slow() if the current grab would deplete the taggable
 * space too much and there seems to be an ample reserve of free
 * pages.
248 *
249 * This path however will limit itself to pages that are really worth
250 * activating (17+ free associated pages, which coincide with the first 3
251 * mteinfo buckets for MTE_STATE_INACTIVE).
252 *
253 *
254 * [A.1 async] is performed by mteinfo_tag_storage_active_refill() when it
255 * decides that activating pages is the best strategy to get more taggable
256 * pages. It will only do so if [R.1 async] isn't more profitable.
257 *
258 *
259 * [A.2 inline/async] is performed by mteinfo_tag_storage_activate_locked()
260 * on the results of [A.1 inline/async]. The most notable thing to mention
261 * is until the tag pages are fully activated, no tagged page can be
262 * allocated, and if the thread doing this operation inline is a low priority
263 * thread, this could cause starvation due to priority inversions.
264 *
265 * To prevent this issue, turnstiles are used for the inline case so that
266 * there's a single activator at a time with priority inversion avoidance.
267 * The async path doesn't use this as it is a very high priority thread,
268 * and is meant to run in case of emergencies.
269 *
270 *
271 * Deactivation (D.1, D.2)
272 *
273 * [D.1 async] is performed by mteinfo_tag_storage_drain(). The refill
274 * thread will invoke this function after it is done with activations.
275 *
 * This phase will only drain active(0.0) pages, meaning pages that are active
 * but have neither free pages nor MTE pages associated with them. Having such pages
278 * on the system is a sign of untagged memory pressure, and it's probably
279 * a good idea to free that tag storage page so it can be used for untagged
280 * purposes (i.e., become claimed).
281 *
282 * It will drain pages until the @c mte_claimable_queue has a healthy level.
283 *
284 * This transition is triggered lazily from the @c mteinfo_free_queue_grab()
285 * path when untagged pages have been allocated and tapped into the taggable
 * space, and system conditions permit
287 * (see @c mteinfo_tag_storage_should_drain()).
288 *
289 * [D.2 async] is performed by mteinfo_tag_storage_drain_flush(),
290 * which is called by mteinfo_tag_storage_drain() on the results
291 * of [D.1 async]
292 *
293 *
294 * Allocation/Claiming (C.1)
295 *
296 * [C.1 inline] is performed by @c mteinfo_tag_storage_claimable_refill()
297 * from the context of any @c mteinfo_free_queue_grab() (tagged or regular).
298 * The path will opportunistically determine that there are enough pages
299 * on the @c mte_claimable_queue that amortizing the cost of taking
300 * the spinlock protecting the per-cpu queue is worth it.
301 *
302 * It is done unconditionally otherwise, as the reclaim thread can steal
303 * from these queues. The @c vm_page_grab_options() fastpath knows how
304 * to draw from this directly.
305 *
306 *
307 * Freeing (F.1, F.2)
308 *
309 * [F.1 inline] is performed by page free paths who eventually call into
310 * @c vm_page_free_queue_enter(VM_MEMORY_CLASS_TAG_STORAGE).
311 *
312 * [F.2 inline] is the exact same transition but for the case when the refill
313 * thread was attempting to reclaim this page (it had performed [R.1 async]).
 * It is worth noting that, on paper, the [C.1 inline] transition could happen
315 * again before the refill thread notices.
316 *
317 *
318 * Reclaiming (R.1, R.2, R.x, B.1, B.2)
319 *
320 * [R.1 async] is performed by mteinfo_tag_storage_active_refill() when it
321 * decides that reclaiming (stealing) pages is the best strategy to get more
322 * taggable pages. It will only do so if [A.1 async] isn't more profitable.
323 *
324 * Once pages have been marked as reclaiming, it will attempt to either steal
325 * the page from the cpu free queue, or attempt a relocation.
326 *
327 * [R.2 async] is exactly the same as [A.2 async], being performed by
328 * mteinfo_tag_storage_activate_locked() on the results of [R.1 async].
329 * The major difference however is that it is done one page at a time.
330 *
331 * [B.1 async] is performed by @c mteinfo_reclaim_tag_storage_page() when
 * relocating a claimed page failed due to the page being pinned.
 * In that case, the tag storage page is marked with the @c vmp_ts_wanted bit.
334 *
335 * [B.2 inline] is performed by @c mteinfo_tag_storage_wakeup() when threads
336 * notice that @c vmp_ts_wanted is set and that the condition causing it to be
337 * set has cleared.
338 *
339 * [R.x async] is performed when stealing the page was otherwise not
340 * successful (in @c mteinfo_reclaim_tag_storage_page() or
341 * @c mteinfo_tag_storage_flush_reclaiming()).
342 */
343
344
345 #pragma mark Types
346
347 /*!
348 * @typedef cell_state_mask_t
349 *
350 * @abstract
 * Mask/bit-field version of @c mte_cell_state_t, in order to do assertions.
352 */
353 __options_decl(cell_state_mask_t, uint32_t, {
354 MTE_MASK_DISABLED = BIT(MTE_STATE_DISABLED),
355 MTE_MASK_PINNED = BIT(MTE_STATE_PINNED),
356 MTE_MASK_DEACTIVATING = BIT(MTE_STATE_DEACTIVATING),
357 MTE_MASK_CLAIMED = BIT(MTE_STATE_CLAIMED),
358 MTE_MASK_INACTIVE = BIT(MTE_STATE_INACTIVE),
359 MTE_MASK_RECLAIMING = BIT(MTE_STATE_RECLAIMING),
360 MTE_MASK_ACTIVATING = BIT(MTE_STATE_ACTIVATING),
361 MTE_MASK_ACTIVE = BIT(MTE_STATE_ACTIVE),
362 });
363
364 #define MTE_FF_CELL_INDEX_BITS 24 /* Number of bits for a cell index */
365 #define MTE_FF_CELL_PAGE_COUNT_BITS 6 /* Number of bits for a page count */
366 #define MTE_FF_CELL_STATE_BITS 3
367
368 /*!
369 * @typedef cell_idx_t
370 *
371 * @abstract
372 * Represents the index of a cell in the cell array (when positive), or a queue
373 * head (when negative).
374 *
375 * @discussion
376 * This type only has @c MTE_FF_CELL_INDEX_BITS worth of significant bits.
377 * Given that one bit is used to denote queues, it means we can support systems
378 * with up to:
379 * - 2^(MTE_FF_CELL_INDEX_BITS - 1) tag storage pages,
380 * - 2^(MTE_FF_CELL_INDEX_BITS + 4) pages,
381 * - 2^(MTE_FF_CELL_INDEX_BITS + 4 + PAGE_SHIFT) bytes.
382 *
383 * On a 16KB system (PAGE_SHIFT == 14) and with MTE_FF_CELL_INDEX_BITS == 24,
384 * this covers 2^42 == 4TB of physical memory.
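 *
 * Spelling that arithmetic out (assuming MTE_PAGES_PER_TAG_PAGE == 32 as
 * depicted in the layout above): 2^23 tag storage pages each cover 32 pages,
 * i.e. 2^28 covered pages, and 2^28 pages of 2^14 bytes give 2^42 bytes.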
385 */
386 typedef int32_t cell_idx_t;
387
388 typedef uint32_t cell_count_t;
389
390 /*!
391 * @typedef cell_t
392 *
393 * @abstract
394 * This data structure contains the metadata associated with a tag storage page,
395 * and its covered pages in the mteinfo tracking data structure.
396 *
397 * @discussion
398 * Here are some important invariants for this data structure:
399 * - mte_page_count + popcount(free_mask) <= MTE_PAGES_PER_TAG_PAGE
400 * - mte_page_count must be 0 unless state is DISABLED or ACTIVE.
401 *
402 * @field prev
403 * Linkage to the prev cell (as an index in the cell array).
404 *
405 * @field next
406 * Linkage to the next cell (as an index in the cell array).
407 *
408 * @field enqueue_pos
409 * If @c free_mask isn't 0, this contains the index of the free covered page
410 * which represents this cell in the mte free queues (@see @c mte_free_queues[]).
411 *
412 * @field mte_page_count
413 * The number of pages covered with this tag storage page, that are currently
414 * used and tagged.
415 *
416 * @field state
417 * The current state of the tag storage page this cell represents.
418 * @see mte_cell_state_t.
419 *
420 * @field free_mask
421 * A bitmask where each bit set corresponds to an associated covered page that
422 * is free (tagged or not).
423 *
424 * @field cell_count
425 * When the cell is a queue head, the number of cells enqueued on this bucket.
426 */
427 #pragma pack(4)
428 typedef struct {
429 cell_idx_t prev : MTE_FF_CELL_INDEX_BITS;
430 cell_idx_t next : MTE_FF_CELL_INDEX_BITS;
431 cell_count_t enqueue_pos : MTE_FF_CELL_PAGE_COUNT_BITS;
432 cell_count_t mte_page_count : MTE_FF_CELL_PAGE_COUNT_BITS;
433 mte_cell_state_t state : MTE_FF_CELL_STATE_BITS;
434 uint8_t __unused_bits : 1;
435 union {
436 uint32_t free_mask;
437 uint32_t cell_count;
438 };
439 } cell_t;
440 #pragma pack()
441
442 static_assert(sizeof(cell_t) == 12);
443 static_assert(MTE_STATE_ACTIVE < (1u << MTE_FF_CELL_STATE_BITS));
444 static_assert(MTE_PAGES_PER_TAG_PAGE <= (1 << MTE_FF_CELL_PAGE_COUNT_BITS));
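
/*
 * As an illustrative check only (not used by the code below), the first
 * invariant documented above can be written as:
 *
 * <code>
 *     assert(cell.mte_page_count + __builtin_popcountll(cell.free_mask) <=
 *         MTE_PAGES_PER_TAG_PAGE);
 * </code>
 */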
445
446 /*!
447 * @typedef mte_cell_queue_t
448 *
449 * @abstract
450 * This data structure represents a particular queue/bucket of cells.
451 */
452 typedef struct mte_cell_queue_head {
453 cell_t head;
454 } *mte_cell_queue_t;
455
456 /*!
457 * @typedef mte_cell_bucket_t
458 *
459 * @abstract
460 * Represents the index of a bucket inside of a list.
461 */
462 __enum_decl(mte_cell_bucket_t, uint32_t, {
463 MTE_BUCKET_0,
464 MTE_BUCKET_1_8,
465 MTE_BUCKET_9_16,
466 MTE_BUCKET_17_24,
467 MTE_BUCKET_25_32,
468
469 _MTE_BUCKET_COUNT,
470 });
471
472 static_assert(_MTE_BUCKET_COUNT == MTE_BUCKETS_COUNT_MAX);
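
/*
 * For reference, the bucket of a cell in a multi-bucket list is derived from
 * its free covered page count as (free + 7) / 8 (@see cell_list_bucket()):
 * 0 free pages map to MTE_BUCKET_0, 1-8 to MTE_BUCKET_1_8, 9-16 to
 * MTE_BUCKET_9_16, and so on.
 */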
473
474 #define MTE_QUEUES_COUNT \
475 (1 /* disabled */ + \
476 1 /* pinned */ + \
477 MTE_BUCKETS_COUNT_MAX /* claimed */ + \
478 MTE_BUCKETS_COUNT_MAX /* inactive */ + \
479 1 /* deactivating */ + \
480 1 /* reclaiming */ + \
481 1 /* activating */ + \
482 MTE_BUCKETS_COUNT_MAX /* active_0 */ + \
483 1 /* active */ )
484
485
486 #pragma mark Behavioral boot-args
487
488 /*
489 * Boot-arg to enable/disable the interface for grabbing tag storage pages.
490 * This exists in case tunables or settings for tag storage management expose
491 * us to page shortages or system hangs due to wired tag storage pages. This
492 * boot-arg should allow us to bypass any such issues.
493 */
494 static TUNABLE(bool, vm_mte_enable_tag_storage_grab, "mte_ts_grab", true);
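
/*
 * For example (illustrative only), booting with "mte_ts_grab=0" turns this
 * interface off entirely.
 */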
495
496 /*
497 * Boot-args controlling the draining down of tag storage space
498 *
499 * @var vm_page_tag_storage_reserved
500 * How many tag storage pages the inactive_0 queue needs to preserve
501 * at all times.
502 */
503 TUNABLE(uint32_t, vm_page_tag_storage_reserved, "mte_ts_grab_rsv", 100);
504
505 /*
506 * Boot-arg to enable/disable grabbing tag storage pages for the compressor
507 * pool.
508 */
509 TUNABLE(bool, vm_mte_tag_storage_for_compressor, "mte_ts_compressor", true);
510
511 #ifndef VM_MTE_FF_VERIFY
512 /*
513 * Boot-arg to enable/disable grabbing tag storage pages for specific VM tags.
514 * Note that the string length was somewhat arbitrarily chosen, so if the use
515 * case arises, we may need to bump that up...
516 *
517 * Currently, we allow allocations with VM tags of VM_MEMORY_MALLOC_SMALL (2),
518 * VM_MEMORY_MALLOC_TINY (7), and VM_MEMORY_MALLOC_NANO (11) to use tag storage
519 * pages. See vm_statistics.h for other potential candidates.
520 * In particular, VM_MEMORY_STACK (30) is promising.
521 */
522 static TUNABLE_STR(vm_mte_tag_storage_for_vm_tags, 256, "mte_ts_vmtag", "2,7,11");
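
/*
 * For example (illustrative only), also allowing stack allocations would be
 * done by booting with a boot-arg of the form: mte_ts_vmtag="2,7,11,30".
 */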
523 #endif /* VM_MTE_FF_VERIFY */
524
525 #pragma mark Counters and Globals
526
527 struct mte_cell_list mte_info_lists[MTE_LISTS_COUNT];
528
529 static SECURITY_READ_ONLY_LATE(cell_t *) mte_info_cells;
530
531 #ifndef VM_MTE_FF_VERIFY
532 /*
533 * Fill thread state. The wake state of the thread is tracked to minimize
534 * scheduler interactions. Guarded with the free page lock.
535 */
536 static sched_cond_atomic_t fill_thread_cond = SCHED_COND_INIT;
537 static SECURITY_READ_ONLY_LATE(thread_t) vm_mte_fill_thread = THREAD_NULL;
538 static thread_t vm_mte_activator = THREAD_NULL;
539 static bool vm_mte_activator_waiters = false;
540
541 struct mte_pcpu PERCPU_DATA(mte_pcpu);
542 SCALABLE_COUNTER_DEFINE(vm_cpu_free_tagged_count);
543 SCALABLE_COUNTER_DEFINE(vm_cpu_free_claimed_count);
544 #endif
545
546 /*
547 * Free taggable pages queue, per-cpu queues, and its counters.
548 *
549 * guarded by the free page lock
550 */
551 uint32_t vm_page_free_taggable_count;
552 uint32_t vm_page_free_unmanaged_tag_storage_count;
553 uint32_t vm_page_tagged_count; /* Total tagged covered pages. */
554 uint32_t vm_page_free_wanted_tagged = 0;
555 uint32_t vm_page_free_wanted_tagged_privileged = 0;
556
557 /*
558 * Counters for tag storage pages we will just give to the system permanently
559 * for use as regular memory. These could technically be a subset of the
560 * claimed tag storage, but counting them separately is useful because they
561 * will have a different page lifecycle than the claimed tag storage pages...
562 * as when freed, these pages will go to the regular free queues.
563 *
564 * These shouldn't be mutated after bootstrap... so they have no lock.
565 */
566 uint32_t vm_page_recursive_tag_storage_count;
567 uint32_t vm_page_retired_tag_storage_count;
568 uint32_t vm_page_unmanaged_tag_storage_count;
569
570 /*
571 * The wired tag storage page count is guarded by the page queues lock. This
572 * counter is diagnostic; it exists to inform investigations about reclaim
573 * efficiency.
574 */
575 uint32_t vm_page_wired_tag_storage_count;
576
577 /*
578 * Diagnostic counters for reclamation; describes how many times reclamation
579 * attempts have succeeded or failed (as well as a breakout for failures due to
580 * the page being wired). Guarded by the free page lock.
581 */
582 uint64_t vm_mte_refill_thread_wakeups;
583 uint64_t vm_page_tag_storage_activation_count;
584 uint64_t vm_page_tag_storage_deactivation_count;
585 uint64_t vm_page_tag_storage_reclaim_from_cpu_count;
586 uint64_t vm_page_tag_storage_reclaim_success_count;
587 uint64_t vm_page_tag_storage_reclaim_failure_count;
588 uint64_t vm_page_tag_storage_reclaim_wired_failure_count;
589 uint64_t vm_page_tag_storage_wire_relocation_count;
590 uint64_t vm_page_tag_storage_reclaim_compressor_failure_count;
591 uint64_t vm_page_tag_storage_compressor_relocation_count;
592
593 #ifndef VM_MTE_FF_VERIFY
594 /*
595 * Diagnostic counter for reclamation describing the number of tag storage
596 * pages that have ever been allocated as claimed. Note that this value
597 * only increases.
598 */
599 SCALABLE_COUNTER_DEFINE(vm_cpu_claimed_count);
600 #endif /* VM_MTE_FF_VERIFY */
601
602 /*
603 * Array of 4 64-bit masks for which VM tags can use tag storage.
604 * There are a total of 256 VM tags.
605 * This shouldn't be mutated after bootstrap... so it has no lock.
606 */
607 bitmap_t vm_mte_tag_storage_for_vm_tags_mask[BITMAP_LEN(VM_MEMORY_COUNT)];
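
/*
 * Membership would typically be tested with something like
 * bitmap_test(vm_mte_tag_storage_for_vm_tags_mask, tag); this is only a
 * sketch of how the mask is meant to be consumed.
 */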
608
609 #pragma mark cell_idx_t
610
611 __pure2
612 static bool
cell_idx_is_queue(cell_idx_t idx)
614 {
615 return idx < 0;
616 }
617
618 __pure2
619 static cell_t *
cell_from_idx(cell_idx_t idx)
621 {
622 return &mte_info_cells[idx];
623 }
624
625 __pure2
626 __attribute__((overloadable))
627 static cell_idx_t
cell_idx(const cell_t *cell)
629 {
630 return (cell_idx_t)(cell - mte_info_cells);
631 }
632
633 __pure2
634 __attribute__((overloadable))
635 static cell_idx_t
cell_idx(mte_cell_queue_t queue)
637 {
638 return cell_idx(&queue->head);
639 }
640
641 __pure2
642 static cell_count_t
cell_free_page_count(cell_t cell)
644 {
645 return __builtin_popcountll(cell.free_mask);
646 }
647
648 __pure2
649 static ppnum_t
cell_first_covered_pnum(const cell_t *cell)
651 {
652 return pmap_first_pnum + cell_idx(cell) * MTE_PAGES_PER_TAG_PAGE;
653 }
654
655
656 #pragma mark mte_cell_queue_t
657
658 /*
659 * Based on the existing queue code in XNU. Look at <kern/queue.h> for the
660 * original code; done here due to the custom linkages.
661 */
662
663 static cell_idx_t
cell_queue_first_idx(mte_cell_queue_t queue)
665 {
666 return queue->head.next;
667 }
668
669 static cell_idx_t
cell_queue_last_idx(mte_cell_queue_t queue)
671 {
672 return queue->head.prev;
673 }
674
675 static cell_t *
cell_queue_first(mte_cell_queue_t queue)
677 {
678 return cell_from_idx(cell_queue_first_idx(queue));
679 }
680
681 static uint32_t
cell_queue_count(mte_cell_queue_t queue)
683 {
684 return queue->head.cell_count;
685 }
686
687
688 static bool
cell_queue_insert_tail(mte_cell_queue_t queue, cell_t *cell)
690 {
691 cell_idx_t qidx = cell_idx(queue);
692 cell_idx_t tidx = cell_queue_last_idx(queue);
693 cell_t *tail = cell_from_idx(tidx);
694
695 if (tail->next != qidx) {
696 __queue_element_linkage_invalid(tail);
697 }
698
699 cell->next = qidx;
700 cell->prev = tidx;
701 queue->head.prev = tail->next = cell_idx(cell);
702
	/* If the original tail was the queue head, then the queue was empty. */
704 return cell_idx_is_queue(tidx);
705 }
706
707 static bool
cell_queue_remove(cell_t *cell)
709 {
710 cell_idx_t pidx = cell->prev;
711 cell_idx_t nidx = cell->next;
712 cell_idx_t cidx = cell_idx(cell);
713 cell_t *prev = cell_from_idx(pidx);
714 cell_t *next = cell_from_idx(nidx);
715
716 if (prev->next != cidx || next->prev != cidx) {
717 __queue_element_linkage_invalid(cell);
718 }
719
720 next->prev = pidx;
721 prev->next = nidx;
722 /* No linkage cleanup because cells are never dequeued at rest. */
723
724 /*
725 * If the prev and next indices are the same, then this is the head
726 * index, and the queue became empty
727 */
728
729 return pidx == nidx;
730 }
731
732 #define cell_queue_foreach(it, q) \
733 for (cell_t *it = cell_queue_first(q); \
734 it != &(q)->head; \
735 it = cell_from_idx(it->next))
736
737 #define cell_queue_foreach_safe(it, q) \
738 for (cell_t *__next_it, *it = cell_queue_first(q); \
739 it != &(q)->head && (__next_it = cell_from_idx(it->next), 1); \
740 it = __next_it)
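
/*
 * Typical usage (sketch): walking one bucket of a list, using the _safe
 * variant when the body may requeue the current cell:
 *
 * <code>
 *     cell_queue_foreach_safe(cell, &list->buckets[bucket]) {
 *         // `cell` may safely be requeued or removed here.
 *     }
 * </code>
 */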
741
742
743 #pragma mark MTE free queue
744
745 /*
746 * The MTE free queue is a multi-dimensioned queue that replaces the
747 * vm_page_free_queue for covered pages on MTE targets.
748 *
749 * It is an array of colored free queues indexed by @c mte_free_queue_idx_t.
750 *
751 *
752 * A queue of tag storage pages
753 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
754 *
755 * When a tag storage page has no associated free covered pages, no page is
756 * enqueued on the mte free queue. However when a tag storage page has one or
757 * more free covered pages associated then there is one and only one of these
758 * pages enqueued on the mte free queues.
759 *
 * The chosen representative for the cell is remembered in the
 * @c cell_t::enqueue_pos field of the associated tag storage page's cell.
762 *
763 *
764 * Enqueue / dequeue algorithm
765 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
766 *
767 * This chosen representative makes the cluster available for its page color,
768 * and only this color, despite other colors being possibly available for this
769 * tag storage page.
770 *
771 * When removing a free page from the MTE queue, if the page being grabbed
772 * was the enqueued candidate, then the next enqueued candidate is chosen
773 * as the next free page in bitmask "circular" order
774 * (@see mteinfo_free_queue_next_bit()).
775 *
776 * As a result, by "pushing" the page forward this way, the tag storage page
777 * will be made available through all colors that it can provide.
778 *
779 *
780 * Allocation stability and bucket selection
781 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
782 *
783 * The free queues are in that order:
784 *
785 * {claimed/disabled} -> {inactive_0, inactive_1} ->
786 * {active_0, active_1, active_2, active_3} -> {activating}
787 *
788 * This is selected carefully to have the following crucial properties:
789 *
790 * - allocating untagged pages chooses buckets "left to right"
791 * (in increasing free queue index order).
792 *
793 * - allocating tagged pages chooses active buckets "right to left"
794 * (in decreasing free queue index from the active_* queues).
795 *
796 * - when allocating untagged pages, the impact on the tag storage page will
797 * be that it stays in the same free queue or moves "down" in the free queue
798 * indices order.
799 *
800 * - when allocating tagged pages, the impact on the tag storage page will
801 * be that it stays in the same free queue or moves "up" in the free queue
802 * indices order.
803 *
804 * This is important and allows for a nice optimization: if a tag storage page
805 * was found to be a good candidate for a given grab operation, it always will
806 * stay a "best" candidate until it has no free pages left, which allows for
807 * allocations of contiguous spans of pages at once
808 * (@see mteinfo_free_queue_grab()).
809 *
810 * Lastly, in order to find the first free bucket quickly,
811 * @c mte_claimable_queue is a bitmask where a bit being set means that the
812 * corresponding bucket has at least one queue non empty.
813 *
814 *
815 * Tag Storage Free queue
816 * ~~~~~~~~~~~~~~~~~~~~~~
817 *
818 * Tag storage pages can only be claimed if they are inactive with the [C.1]
819 * transition. Getting pages to inactive is done via the Deactivation [D.*].
820 *
 * However, as mentioned above, the MTE free queue is only about covered pages
 * proper, and does not contain the tag storage pages. Another point is that
823 * we do not want to claim pages too aggressively as it could get in the way
824 * of the Activation [A.*] transition when tagged pages are required.
825 *
826 * To solve this tension, the @c mte_claimable_queue holds inactive tag storage
 * pages that have 8 free pages or fewer at any given time. These are unlikely
828 * to be profitable activation candidates, but also demonstrate that there is
829 * enough untagged memory pressure on the system that we have clusters of
830 * covered pages in use.
831 *
832 * The @c mteinfo_free_queue_grab() code will promote these to a per-cpu
833 * free queue that in turn the @c vm_page_grab_options() fastpath can tap into
834 * as another opportunistic source of pages.
835 */
836 struct vm_page_free_queue mte_free_queues[MTE_FREE_NOT_QUEUED];
837 struct vm_page_free_queue mte_claimable_queue;
838 static uint32_t mte_free_queue_mask;
839
840 /*!
841 * @abstract
842 * Computes the proper mte free queue index for a given cell.
843 */
844 __pure2
845 static mte_free_queue_idx_t
mteinfo_free_queue_idx(cell_t cell)
847 {
848 uint32_t free = cell_free_page_count(cell);
849 uint32_t tagged = cell.mte_page_count;
850 uint32_t used = MTE_PAGES_PER_TAG_PAGE - free - tagged;
851 uint32_t n;
852
853 if (cell.free_mask == 0) {
854 return MTE_FREE_NOT_QUEUED;
855 }
856
857 switch (cell.state) {
858 case MTE_STATE_DISABLED:
859 case MTE_STATE_PINNED:
860 case MTE_STATE_DEACTIVATING:
861 return MTE_FREE_UNTAGGABLE_0;
862
863 case MTE_STATE_CLAIMED:
864 case MTE_STATE_INACTIVE:
865 /*
866 * This is "clever" code to map:
867 * MTE_FREE_UNTAGGABLE_0: Claimed[0-16]
868 * MTE_FREE_UNTAGGABLE_1: Claimed[16-32], Inactive[0-16]
869 * MTE_FREE_UNTAGGABLE_2: Inactive[16-32]
870 */
871 n = MTE_FREE_UNTAGGABLE_0 + cell.state - MTE_STATE_CLAIMED;
872 static_assert(MTE_STATE_INACTIVE == MTE_STATE_CLAIMED + 1);
873 return n + (free > MTE_PAGES_PER_TAG_PAGE / 2);
874
875 case MTE_STATE_RECLAIMING:
876 case MTE_STATE_ACTIVATING:
877 return MTE_FREE_UNTAGGABLE_ACTIVATING;
878
879 case MTE_STATE_ACTIVE:
880 break;
881 }
882
883 /*
884 * Empirically this seems to give decent fragmentation results
885 * with alternating MTE/non-MTE workloads.
886 *
887 * This tries to find a balance between favoring buckets with mte pages
888 * allocated and to penalize buckets with untagged pages allocated,
889 * while keeping buckets with the most free pages on the fence.
890 *
891 * The distribution it generates can be printed by running the
892 * "active_buckets" subtest of tests/vm/vm_mteinfo.c
893 */
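	/*
	 * Illustrative arithmetic only (assuming a 1-based fls() where
	 * fls(0) == 0):
	 *  - free 32, tagged  0, used  0: n == 6,  bucket MTE_FREE_ACTIVE_0 + 1
	 *  - free  8, tagged 24, used  0: n == 25, bucket MTE_FREE_ACTIVE_0 + 3
	 *  - free  8, tagged  0, used 24: n == 1,  bucket MTE_FREE_ACTIVE_0 + 0
	 */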
894
895 n = tagged + free / 5;
896 n -= MIN(n, used) / 3;
897 return MTE_FREE_ACTIVE_0 + fls(n / 4);
898 }
899
900 static vm_page_queue_t
mteinfo_free_queue_head(mte_free_queue_idx_t idx, uint32_t color)
902 {
903 return &mte_free_queues[idx].vmpfq_queues[color].qhead;
904 }
905
906 /*!
907 * @abstract
908 * Computes the next bit in "circular" mask order
909 *
910 * @discussion
911 * This computes the next bit set in @c mask that is larger or equal
912 * to @c bit, or if none exist, then the smallest bit set in @c mask.
913 *
914 * This means that for a mask with positions mask={1, 5, 6, 10} set,
915 * the "next" bit for:
916 * - 4 is 5,
917 * - 10 is 10,
918 * - 12 is 1.
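 *
 * As a sketch of the arithmetic below (with MTE_PAGES_PER_TAG_PAGE == 32):
 * for mask 0x462 (bits {1, 5, 6, 10}) and bit == 12, the mask is rotated
 * right by 12, ffs() then finds former bit 1 at 1-based position 22, and
 * (12 + 22 - 1) % 32 == 1 is returned.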
919 *
 * @param mask The mask to scan. The mask must be non-zero.
921 * @param bit The bit to scan from.
922 * @returns The next bit set in "circular" order.
923 */
924 static cell_count_t
mteinfo_free_queue_next_bit(uint32_t mask, cell_count_t bit)
926 {
927 cell_count_t cur = bit % MTE_PAGES_PER_TAG_PAGE;
928
929 mask = (mask >> cur) | (mask << (32 - cur));
930 bit += ffs(mask) - 1;
931
932 return bit % MTE_PAGES_PER_TAG_PAGE;
933 }
934
935 /*!
936 * @abstract
937 * Backend for CELL_UPDATE() to manage update/requeues to the mte free queue.
938 *
939 * @param cell The new state of the cell.
940 * @param orig The original state of the cell.
941 * @param oidx The original free queue index for the cell.
942 * @param nidx The new free queue index for the cell.
943 */
944 __attribute__((noinline))
945 static void
mteinfo_free_queue_requeue(
947 cell_t *cell,
948 const cell_t orig,
949 mte_free_queue_idx_t oidx,
950 mte_free_queue_idx_t nidx)
951 {
952 ppnum_t first_pnum = cell_first_covered_pnum(cell);
953 vm_page_queue_t queue;
954 cell_count_t bit = orig.enqueue_pos;
955 vm_page_t mem;
956
957 if (oidx == MTE_FREE_NOT_QUEUED && nidx == MTE_FREE_NOT_QUEUED) {
958 cell->enqueue_pos = -1;
959 return;
960 }
961
962 if (oidx != MTE_FREE_NOT_QUEUED) {
963 mem = vm_page_find_canonical(first_pnum + bit);
964 queue = mteinfo_free_queue_head(oidx,
965 (first_pnum + bit) & vm_color_mask);
966 assert(bit_test(orig.free_mask, bit));
967
968 vm_page_queue_remove(queue, mem, vmp_pageq);
969 VM_COUNTER_DEC(&mte_free_queues[oidx].vmpfq_count);
970 if (mte_free_queues[oidx].vmpfq_count == 0) {
971 bit_clear(mte_free_queue_mask, oidx);
972 }
973 }
974
975 if (nidx == MTE_FREE_NOT_QUEUED) {
976 cell->enqueue_pos = -1;
977 } else {
978 bit = mteinfo_free_queue_next_bit(cell->free_mask, bit);
979 mem = vm_page_find_canonical(first_pnum + bit);
980 queue = mteinfo_free_queue_head(nidx,
981 (first_pnum + bit) & vm_color_mask);
982 assert(bit_test(cell->free_mask, bit));
983
984 cell->enqueue_pos = bit;
985 vm_page_queue_enter_first(queue, mem, vmp_pageq);
986 if (mte_free_queues[nidx].vmpfq_count == 0) {
987 bit_set(mte_free_queue_mask, nidx);
988 }
989 VM_COUNTER_INC(&mte_free_queues[nidx].vmpfq_count);
990 }
991 }
992
993
994 #pragma mark mte_cell_list_t
995
996 __pure2
997 static mte_cell_bucket_t
cell_list_idx_buckets(mte_cell_list_idx_t idx)
999 {
1000 switch (idx) {
1001 case MTE_LIST_INACTIVE_IDX:
1002 case MTE_LIST_CLAIMED_IDX:
1003 case MTE_LIST_ACTIVE_0_IDX:
1004 return MTE_BUCKETS_COUNT_MAX;
1005 default:
1006 return 1;
1007 }
1008 }
1009
1010 __pure2
1011 static mte_cell_list_idx_t
cell_list_idx(const cell_t cell)
1013 {
1014 if (cell.state != MTE_STATE_ACTIVE || cell.mte_page_count == 0) {
1015 return (mte_cell_list_idx_t)cell.state;
1016 }
1017
1018 return MTE_LIST_ACTIVE_IDX;
1019 }
1020
1021 __pure2
1022 static mte_cell_bucket_t
cell_list_bucket(const cell_t cell)
1024 {
1025 if (cell_list_idx_buckets(cell_list_idx(cell)) > 1) {
1026 return (cell_free_page_count(cell) + 7) / 8;
1027 }
1028 return 0;
1029 }
1030
1031 __pure2
1032 static inline bool
cell_on_claimable_queue(const cell_t cell)
1034 {
1035 if (cell.state == MTE_STATE_INACTIVE) {
1036 return cell_list_bucket(cell) <= MTE_BUCKET_1_8;
1037 }
1038 return false;
1039 }
1040
1041 __attribute__((noinline))
1042 static void
cell_list_requeue(
1044 cell_t *cell,
1045 vm_page_t tag_page,
1046 mte_cell_list_idx_t oidx,
1047 mte_cell_bucket_t obucket,
1048 mte_cell_list_idx_t nidx,
1049 mte_cell_bucket_t nbucket,
1050 int claim_requeue)
1051 {
1052 mte_cell_list_t olist = &mte_info_lists[oidx];
1053 mte_cell_list_t nlist = &mte_info_lists[nidx];
1054
1055 if (cell_queue_remove(cell)) {
1056 bit_clear(olist->mask, obucket);
1057 }
1058
1059 if (cell_queue_insert_tail(&nlist->buckets[nbucket], cell)) {
1060 bit_set(nlist->mask, nbucket);
1061 }
1062
1063 olist->buckets[obucket].head.cell_count--;
1064 nlist->buckets[nbucket].head.cell_count++;
1065
1066 if (olist != nlist) {
1067 olist->count--;
1068 nlist->count++;
1069 }
1070
1071 if (claim_requeue) {
1072 #ifndef VM_MTE_FF_VERIFY
1073 uint32_t color = VM_PAGE_GET_COLOR(tag_page);
1074 vm_page_queue_t queue;
1075
1076 queue = &mte_claimable_queue.vmpfq_queues[color].qhead;
1077 if (claim_requeue > 0) {
1078 vm_page_queue_enter(queue, tag_page, vmp_pageq);
1079 } else {
1080 vm_page_queue_remove(queue, tag_page, vmp_pageq);
1081 }
1082 VM_COUNTER_DELTA(&mte_claimable_queue.vmpfq_count, claim_requeue);
1083 #endif /* VM_MTE_FF_VERIFY */
1084 }
1085 }
1086
1087 /*!
1088 * @abstract
 * Find a page in the last non-empty bucket at or above the
 * specified bucket index.
1091 *
1092 * @param lidx The list index to scan.
1093 * @param min_bucket The minimum bucket index to consider.
1094 * @param tag_page The tag page associated with the returned cell.
1095 * @returns The cell that was found or NULL.
1096 */
1097 static cell_t *
cell_list_find_last_page(
1099 mte_cell_list_idx_t lidx,
1100 mte_cell_bucket_t min_bucket,
1101 vm_page_t *tag_page)
1102 {
1103 mte_cell_list_t list = &mte_info_lists[lidx];
1104 uint32_t mask = list->mask & ~mask(min_bucket);
1105 mte_cell_queue_t queue;
1106
1107 if (__improbable(mask == 0)) {
1108 *tag_page = VM_PAGE_NULL;
1109 return NULL;
1110 }
1111
1112 queue = &list->buckets[fls(mask) - 1];
1113 *tag_page = vm_tag_storage_page_get(cell_queue_first_idx(queue));
1114 return cell_queue_first(queue);
1115 }
1116
1117
1118 #pragma mark Tag storage space state machine
1119
1120 /*!
1121 * Assert that a cell is in one of the states specified by the mask.
1122 */
1123 #define assert_cell_state(cell, mask) \
1124 release_assert(((mask) & (1 << (cell)->state)) != 0)
1125
1126 /*!
1127 * Perform an arbitrary update on a cell, and update the MTE info queues
1128 * accordingly.
1129 *
1130 * This should be used this way:
1131 *
1132 * <code>
1133 * // Preflights and asserts here
1134 * assert_cell_state(cell_var, ...);
1135 *
1136 * CELL_UPDATE(cell_var, tag_page, cleared_bit, {
1137 * // Mutations of cell_var here
1138 * cell_var->state = ...;
1139 * });
1140 * </code>
1141 *
1142 * @param cell The cell to update.
1143 * @param tag_page The tag page corresponding to @c cell.
 * @param cleared_bit Whether a bit was cleared from @c free_mask.
1145 * @param mut Code that mutates its argument, and performs the
1146 * required update.
1147 */
1148 #define CELL_UPDATE(cell, tag_page, cleared_bit, ...) ({ \
1149 mte_cell_list_idx_t __ol, __nl; \
1150 mte_cell_bucket_t __ob, __nb; \
1151 mte_free_queue_idx_t __oi, __ni; \
1152 int __ocq, __ncq; \
1153 cell_t *__cell = (cell); \
1154 cell_t __orig = *__cell; \
1155 \
1156 __ol = cell_list_idx(__orig); \
1157 __ob = cell_list_bucket(__orig); \
1158 __ocq = cell_on_claimable_queue(__orig); \
1159 __oi = mteinfo_free_queue_idx(__orig); \
1160 \
1161 __VA_ARGS__; \
1162 \
1163 __nl = cell_list_idx(*__cell); \
1164 __nb = cell_list_bucket(*__cell); \
1165 __ncq = cell_on_claimable_queue(*__cell); \
1166 __ni = mteinfo_free_queue_idx(*__cell); \
1167 \
1168 if (__ol != __nl || __ob != __nb) { \
1169 cell_list_requeue(__cell, tag_page, __ol, __ob, __nl, __nb, \
1170 __ncq - __ocq); \
1171 } \
1172 if (__oi != __ni || (cleared_bit)) { \
1173 mteinfo_free_queue_requeue(__cell, __orig, __oi, __ni); \
1174 } \
1175 })
1176
1177 __pure2
1178 static cell_t *
cell_from_tag_storage_page(const struct vm_page *page)
1180 {
1181 cell_idx_t pidx;
1182
1183 pidx = (cell_idx_t)(page - vm_pages_tag_storage_array_internal());
1184 return cell_from_idx(pidx);
1185 }
1186
1187 __pure2
1188 __attribute__((overloadable))
1189 static cell_t *
cell_from_covered_ppnum(ppnum_t pnum)
1191 {
1192 cell_idx_t cidx = (pnum - pmap_first_pnum) / MTE_PAGES_PER_TAG_PAGE;
1193
1194 return cell_from_idx(cidx);
1195 }
1196
1197 __pure2
1198 __attribute__((overloadable))
1199 static cell_t *
cell_from_covered_ppnum(ppnum_t pnum, vm_page_t *tag_page)
1201 {
1202 cell_idx_t cidx = (pnum - pmap_first_pnum) / MTE_PAGES_PER_TAG_PAGE;
1203
1204 *tag_page = vm_tag_storage_page_get(cidx);
1205 return cell_from_idx(cidx);
1206 }
1207
1208 /*!
1209 * @function mteinfo_tag_storage_set_active()
1210 *
1211 * @abstract
1212 * Mark a tag storage page as active.
1213 *
1214 * @discussion
1215 * The page should be disabled (initial activation) or activating.
1216 *
1217 * @param tag_page The pointer to a page inside the tag storage space.
1218 * @param mte_count How many covered pages are used and tagged for @c tag_page.
1219 * @param init Whether this is the initial transition.
1220 * @returns The number of covered pages this made taggable.
1221 */
1222 static uint32_t
mteinfo_tag_storage_set_active(vm_page_t tag_page, uint32_t mte_count, bool init)
1224 {
1225 cell_t *cell = cell_from_tag_storage_page(tag_page);
1226 cell_count_t free_page_count = cell_free_page_count(*cell);
1227
1228 assert(mte_count + free_page_count <= MTE_PAGES_PER_TAG_PAGE);
1229 if (init) {
1230 assert_cell_state(cell,
1231 /* [I.1] */ MTE_MASK_DISABLED);
1232 } else {
1233 assert_cell_state(cell,
1234 /* [R.2] */ MTE_MASK_RECLAIMING |
1235 /* [A.2] */ MTE_MASK_ACTIVATING);
1236 }
1237
1238 VM_COUNTER_ADD(&vm_page_free_taggable_count, free_page_count);
1239 vm_page_tag_storage_activation_count++;
1240
1241 CELL_UPDATE(cell, tag_page, false, {
1242 cell->state = MTE_STATE_ACTIVE;
1243 cell->mte_page_count = mte_count;
1244 });
1245
1246 return free_page_count;
1247 }
1248
1249 bool
mteinfo_tag_storage_disabled(const struct vm_page *tag_page)
1251 {
1252 return cell_from_tag_storage_page(tag_page)->state == MTE_STATE_DISABLED;
1253 }
1254
1255 void
mteinfo_tag_storage_set_retired(vm_page_t tag_page)
1257 {
1258 cell_t *cell = cell_from_tag_storage_page(tag_page);
1259
1260 assert(cell->mte_page_count == 0);
1261 assert_cell_state(cell,
1262 /* [K.3] */ MTE_MASK_DISABLED |
1263 /* [K.2] */ MTE_MASK_CLAIMED |
1264 /* [K.1] */ MTE_MASK_RECLAIMING);
1265
1266 VM_COUNTER_INC(&vm_page_retired_tag_storage_count);
1267
1268 CELL_UPDATE(cell, tag_page, false, {
1269 cell->state = MTE_STATE_DISABLED;
1270 });
1271 }
1272
1273 #ifndef VM_MTE_FF_VERIFY
1274 /*!
1275 * @function mteinfo_tag_storage_set_unmanaged()
1276 *
1277 * @abstract
1278 * Mark a tag storage page as actually being disabled-unmanaged
1279 *
1280 * @discussion
1281 * The tag storage page must be claimed or inactive.
1282 *
1283 * @param cell The cell to mark as disabled.
1284 * @param tag_page The tag page corresponding to @c cell.
1285 */
1286 static void
mteinfo_tag_storage_set_unmanaged(cell_t *cell, vm_page_t tag_page)
1288 {
1289 bool queue = cell->state == MTE_STATE_INACTIVE;
1290
1291 assert(cell->mte_page_count == 0);
1292 assert(cell->free_mask == 0);
1293
1294 assert_cell_state(cell,
1295 /* [U.1] */ MTE_MASK_CLAIMED |
1296 /* [U.2] */ MTE_MASK_INACTIVE);
1297
1298 VM_COUNTER_INC(&vm_page_unmanaged_tag_storage_count);
1299
1300 CELL_UPDATE(cell, tag_page, false, {
1301 cell->state = MTE_STATE_DISABLED;
1302 });
1303
1304 if (queue) {
1305 vm_page_free_queue_enter(VM_MEMORY_CLASS_DEAD_TAG_STORAGE,
1306 tag_page, VM_PAGE_GET_PHYS_PAGE(tag_page));
1307 }
1308 }
1309 #endif /* VM_MTE_FF_VERIFY */
1310
1311 void
mteinfo_tag_storage_set_inactive(vm_page_t tag_page, bool init)
1313 {
1314 cell_t *cell = cell_from_tag_storage_page(tag_page);
1315
1316 assert(cell->mte_page_count == 0);
1317 if (init) {
1318 assert_cell_state(cell,
1319 /* [I.2] */ MTE_MASK_DISABLED);
1320 } else {
1321 assert_cell_state(cell,
1322 /* [D.2] */ MTE_MASK_DEACTIVATING |
1323 /* [F.1] */ MTE_MASK_CLAIMED |
1324 /* [F.2] */ MTE_MASK_RECLAIMING);
1325 }
1326
1327 #ifndef VM_MTE_FF_VERIFY
1328 if (cell->state == MTE_STATE_CLAIMED) {
1329 /*
1330 * This is to account for [F.1].
1331 * For [F.2], we already decremented due to [R.1]
1332 */
1333 counter_dec(&vm_cpu_claimed_count);
1334 }
1335 #endif /* VM_MTE_FF_VERIFY */
1336
1337 CELL_UPDATE(cell, tag_page, false, {
1338 cell->state = MTE_STATE_INACTIVE;
1339 });
1340 }
1341
1342 void
mteinfo_tag_storage_set_claimed(vm_page_t tag_page)
1344 {
1345 cell_t *cell = cell_from_tag_storage_page(tag_page);
1346
1347 assert(cell->mte_page_count == 0);
1348 assert_cell_state(cell,
1349 /* [C.1] */ MTE_MASK_INACTIVE |
1350 /* [R.x] */ MTE_MASK_RECLAIMING);
1351
1352 #ifndef VM_MTE_FF_VERIFY
1353 if (cell->state == MTE_STATE_RECLAIMING) {
1354 counter_inc(&vm_cpu_claimed_count);
1355 }
1356 #endif /* VM_MTE_FF_VERIFY */
1357
1358 CELL_UPDATE(cell, tag_page, false, {
1359 cell->state = MTE_STATE_CLAIMED;
1360 });
1361 }
1362
1363 /*!
1364 * @function mteinfo_tag_storage_set_reclaiming()
1365 *
1366 * @abstract
1367 * Mark a tag storage page as being reclaimed.
1368 *
1369 * @discussion
1370 * The tag storage page must be claimed.
1371 *
1372 * @param cell The cell to mark as reclaiming
1373 * @param tag_page The tag page corresponding to @c cell.
1374 */
1375 static void
mteinfo_tag_storage_set_reclaiming(cell_t *cell, vm_page_t tag_page)
1377 {
1378 assert(cell->mte_page_count == 0);
1379 assert_cell_state(cell, /* [R.1] */ MTE_MASK_CLAIMED);
1380
1381 CELL_UPDATE(cell, tag_page, false, {
1382 cell->state = MTE_STATE_RECLAIMING;
1383 });
1384
1385 #ifndef VM_MTE_FF_VERIFY
1386 counter_dec(&vm_cpu_claimed_count);
1387 #endif /* VM_MTE_FF_VERIFY */
1388 }
1389
1390 /*!
1391 * @function mteinfo_tag_storage_flush_reclaiming()
1392 *
1393 * @abstract
1394 * Empties the reclaiming queue, moving all pages on it back to claimed.
1395 */
1396 static void
mteinfo_tag_storage_flush_reclaiming(void)
1398 {
1399 mte_cell_list_t list = &mte_info_lists[MTE_LIST_RECLAIMING_IDX];
1400 mte_cell_queue_t queue = &list->buckets[0];
1401 uint32_t batch = VMP_FREE_BATCH_SIZE;
1402
1403 while (cell_queue_count(queue) > 0) {
1404 cell_idx_t idx = cell_queue_first_idx(queue);
1405 vm_page_t tag_page = vm_tag_storage_page_get(idx);
1406 cell_t *cell = cell_from_idx(idx);
1407
1408 assert_cell_state(cell, /* [R.x] */ MTE_MASK_RECLAIMING);
1409 CELL_UPDATE(cell, tag_page, false, {
1410 cell->state = MTE_STATE_CLAIMED;
1411 });
1412
1413 #ifndef VM_MTE_FF_VERIFY
1414 counter_inc(&vm_cpu_claimed_count);
1415 #endif /* VM_MTE_FF_VERIFY */
1416
1417 if (--batch == 0 && cell_queue_count(queue)) {
1418 #ifndef VM_MTE_FF_VERIFY
1419 vm_free_page_unlock();
1420 vm_free_page_lock_spin();
1421 #endif /* VM_MTE_FF_VERIFY */
1422 batch = VMP_FREE_BATCH_SIZE;
1423 }
1424 }
1425 }
1426
1427 #ifndef VM_MTE_FF_VERIFY
1428
1429 void
mteinfo_tag_storage_wakeup(vm_page_t tag_page, bool fq_locked)
1431 {
1432 cell_t *cell = cell_from_tag_storage_page(tag_page);
1433
1434 if (!fq_locked) {
1435 vm_free_page_lock_spin();
1436 }
1437
1438 assert(tag_page->vmp_ts_wanted);
1439 tag_page->vmp_ts_wanted = false;
1440
1441 assert_cell_state(cell, /* [B.2] */ MTE_MASK_PINNED);
1442 CELL_UPDATE(cell, tag_page, false, {
1443 cell->state = MTE_STATE_CLAIMED;
1444 });
1445
1446 if (cell->free_mask != 0 &&
1447 (vm_page_free_wanted_tagged_privileged || vm_page_free_wanted_tagged)) {
1448 mteinfo_wake_fill_thread();
1449 }
1450
1451 if (!fq_locked) {
1452 vm_free_page_unlock();
1453 }
1454
1455 counter_inc(&vm_cpu_claimed_count);
1456 }
1457
1458 #endif /* VM_MTE_FF_VERIFY */
1459 #pragma mark Covered pages state machine
1460
1461 bool
mteinfo_covered_page_taggable(ppnum_t pnum)
1463 {
1464 return cell_from_covered_ppnum(pnum)->state == MTE_STATE_ACTIVE;
1465 }
1466
1467 void
mteinfo_covered_page_set_free(ppnum_t pnum, bool tagged)
1469 {
1470 vm_page_t tag_page;
1471 cell_t *cell = cell_from_covered_ppnum(pnum, &tag_page);
1472 int bit = pnum % MTE_PAGES_PER_TAG_PAGE;
1473
1474 assert(cell->mte_page_count >= tagged);
1475 assert(!bit_test(cell->free_mask, bit));
1476
1477 VM_COUNTER_INC(&vm_page_free_count);
1478 if (cell->state == MTE_STATE_ACTIVE) {
1479 VM_COUNTER_INC(&vm_page_free_taggable_count);
1480 }
1481 if (tagged) {
1482 VM_COUNTER_DEC(&vm_page_tagged_count);
1483 }
1484
1485 CELL_UPDATE(cell, tag_page, false, {
1486 cell->mte_page_count -= tagged;
1487 bit_set(cell->free_mask, bit);
1488 });
1489 }
1490
1491 void
mteinfo_covered_page_set_used(ppnum_t pnum, bool tagged)
1493 {
1494 vm_page_t tag_page;
1495 cell_t *cell = cell_from_covered_ppnum(pnum, &tag_page);
1496 int bit = pnum % MTE_PAGES_PER_TAG_PAGE;
1497
1498 assert(cell->mte_page_count + tagged <= MTE_PAGES_PER_TAG_PAGE);
1499 assert(bit_test(cell->free_mask, bit));
1500
1501 VM_COUNTER_DEC(&vm_page_free_count);
1502 if (cell->state == MTE_STATE_ACTIVE) {
1503 VM_COUNTER_DEC(&vm_page_free_taggable_count);
1504 }
1505 if (tagged) {
1506 VM_COUNTER_INC(&vm_page_tagged_count);
1507 }
1508
1509 CELL_UPDATE(cell, tag_page, true, {
1510 bit_clear(cell->free_mask, bit);
1511 cell->mte_page_count += tagged;
1512 });
1513 }
1514
1515 __startup_func
1516 void
mteinfo_covered_page_set_stolen_tagged(ppnum_t pnum)
1518 {
1519 vm_page_t tag_page;
1520 cell_t *cell = cell_from_covered_ppnum(pnum, &tag_page);
1521
1522 assert(cell->mte_page_count < MTE_PAGES_PER_TAG_PAGE);
1523 assert(!bit_test(cell->free_mask, pnum % MTE_PAGES_PER_TAG_PAGE));
1524
1525 CELL_UPDATE(cell, tag_page, false, {
1526 cell->mte_page_count++;
1527 });
1528 }
1529
1530 void
mteinfo_covered_page_clear_tagged(ppnum_t pnum)
1532 {
1533 vm_page_t tag_page;
1534 cell_t *cell = cell_from_covered_ppnum(pnum, &tag_page);
1535
1536 assert(cell->mte_page_count > 0);
1537 assert(!bit_test(cell->free_mask, pnum % MTE_PAGES_PER_TAG_PAGE));
1538
1539 CELL_UPDATE(cell, tag_page, false, {
1540 cell->mte_page_count--;
1541 });
1542 }
1543
1544
1545 #pragma mark Activate
1546 #ifndef VM_MTE_FF_VERIFY
1547
1548 /*!
1549 * @function mteinfo_tag_storage_wire_locked()
1550 *
1551 * @abstract
1552 * Wire the given tag storage page.
1553 *
1554 * @discussion
1555 * The page will be wired as part of mte_tags_object.
1556 *
1557 * This must be called with the object lock and the page queues lock held.
1558 *
1559 * @param tag_page
1560 * A tag storage page.
1561 */
1562 static void
mteinfo_tag_storage_wire_locked(vm_page_t tag_page)
1564 {
1565 vm_object_offset_t page_addr = ptoa(VM_PAGE_GET_PHYS_PAGE(tag_page));
1566
1567 assert(tag_page->vmp_wire_count == 0);
1568 vm_page_wire(tag_page, VM_KERN_MEMORY_MTAG,
1569 /* Don't check memory status. */ FALSE);
1570
1571 vm_page_insert_internal(tag_page, mte_tags_object, page_addr,
1572 VM_KERN_MEMORY_MTAG,
1573 /* We already hold the queue locks. */ TRUE,
1574 /* Add this page to the hash. */ TRUE,
1575 /* Don't bother batching pmap operations. */ FALSE,
1576 /* Don't bother batching accounting. */ FALSE,
1577 /* Don't bother with delayed ledger updates. */ NULL);
1578 }
1579
1580 /*!
1581 * @function mteinfo_tag_storage_select_activating()
1582 *
1583 * @abstract
1584 * Select tag storage pages to activate toward a certain number of free covered
1585 * pages to make taggable.
1586 *
1587 * @discussion
1588 * The caller must make sure there's at least one page to activate for the
1589 * selected buckets.
1590 *
1591 * @param target how many covered taggable free pages to try to generate
1592 * as a result of this activation.
1593 * @param bucket which inactive bucket to start drawing from
1594 *
1595 * @returns the list of tag storage pages to activate
1596 * with mteinfo_tag_storage_activate_locked().
1597 */
1598 static vm_page_list_t
mteinfo_tag_storage_select_activating(uint32_t target, mte_cell_bucket_t bucket)
1600 {
1601 vm_page_list_t list = { };
1602 vm_page_t tag_page = VM_PAGE_NULL;
1603 cell_t *cell = NULL;
1604 uint32_t total = 0;
1605 uint32_t covered = 0;
1606
1607 /*
1608 * Convert the lock hold into a mutex, to signal to waiters that the
1609 * lock may be held for longer.
1610 */
1611 vm_free_page_lock_convert();
1612
1613 do {
1614 cell = cell_list_find_last_page(MTE_LIST_INACTIVE_IDX,
1615 bucket, &tag_page);
1616 if (tag_page == VM_PAGE_NULL) {
1617 break;
1618 }
1619
1620 assert_cell_state(cell, /* [A.1] */ MTE_MASK_INACTIVE);
1621 CELL_UPDATE(cell, tag_page, false, {
1622 cell->state = MTE_STATE_ACTIVATING;
1623 });
1624
1625 covered = cell_free_page_count(*cell);
1626 total += covered;
1627
1628 KDBG(VMDBG_CODE(DBG_VM_TAG_PAGE_INACTIVE) | DBG_FUNC_NONE,
1629 VM_KERNEL_ADDRHIDE(tag_page), covered);
1630
1631 tag_page->vmp_q_state = VM_PAGE_NOT_ON_Q;
1632 vm_page_list_push(&list, tag_page);
1633 } while (total < target);
1634
1635 return list;
1636 }
1637
1638 /*!
1639 * @function mteinfo_tag_storage_activate_locked()
1640 *
1641 * @abstract
1642 * Activate a list of tag storage pages in reclaiming or activating state.
1643 *
1644 * @discussion
1645 * The page free queue lock must be held, however it is dropped and retaken by
1646 * this function.
1647 *
1648 * @param list the list of pages to activate.
1649 * @param spin_mode whether to take the free page queue lock in spin mode.
1650 *
1651 * @returns how many covered pages have been made taggable.
1652 */
1653 static uint32_t
mteinfo_tag_storage_activate_locked(vm_page_list_t list, bool spin_mode)
1655 {
1656 vm_page_t tag_page = VM_PAGE_NULL;
1657 uint32_t result, total;
1658
1659 vm_free_page_unlock();
1660
1661 /*
1662 * First, retype the pages and add them to the MTE object.
1663 */
1664
1665 vm_page_list_foreach(tag_page, list) {
1666 ppnum_t tag_pnum = VM_PAGE_GET_PHYS_PAGE(tag_page);
1667
1668 assert(vm_page_is_tag_storage_pnum(tag_page, tag_pnum));
1669 pmap_make_tag_storage_page(tag_pnum);
1670 }
1671
1672 vm_object_lock(mte_tags_object);
1673 vm_page_lock_queues();
1674 vm_page_list_foreach(tag_page, list) {
1675 vm_page_t save_snext = NEXT_PAGE(tag_page);
1676
1677 NEXT_PAGE(tag_page) = VM_PAGE_NULL;
1678 mteinfo_tag_storage_wire_locked(tag_page);
1679 NEXT_PAGE(tag_page) = save_snext;
1680 }
1681 vm_page_unlock_queues();
1682 vm_object_unlock(mte_tags_object);
1683
1684 if (spin_mode) {
1685 vm_free_page_lock_spin();
1686 } else {
1687 vm_free_page_lock();
1688 }
1689
1690 /*
1691 * Second, mark all the pages as active now, which makes the
1692 * covered pages available for taggable allocation.
1693 *
1694 * And recompute how many taggable pages we really freed,
1695 * as allocations/free of untagged pages could have made
1696 * progress while we dropped the free page queue lock.
1697 */
1698
1699 total = 0;
1700 vm_page_list_foreach_consume(tag_page, &list) {
1701 total += mteinfo_tag_storage_set_active(tag_page, 0, false);
1702 }
1703 result = total;
1704
1705
1706 /*
1707 * Last, perform wakeups.
1708 *
1709 * 1. wake up other activators
1710 * 2. wake up privileged waiters
1711 * 3. wake up regular waiters
1712 *
1713 * We do not need to consider secluded pools or other waiters, because
1714 * we never prevent them from allocating the pages associated with
1715 * the tag storage we are activating during this process, which is why
1716 * we don't use vm_page_free_queue_handle_wakeups_and_unlock() but
1717 * instead have this simplified implementation.
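*
* For example, assuming this activation made 96 covered pages
* taggable (total == 96) with 16 privileged waiters and 200 regular
* waiters recorded: the privileged count drains to 0 and total drops
* to 80, then 80 regular waiters are woken and
* vm_page_free_wanted_tagged is left at 120.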
1718 */
1719
1720 if (vm_mte_activator_waiters) {
1721 vm_mte_activator_waiters = false;
1722 wakeup_all_with_inheritor(&vm_mte_activator_waiters,
1723 THREAD_AWAKENED);
1724 }
1725
1726 if (vm_page_free_wanted_tagged_privileged && total) {
1727 if (total < vm_page_free_wanted_tagged_privileged) {
1728 vm_page_free_wanted_tagged_privileged -= total;
1729 total = 0;
1730 } else {
1731 total -= vm_page_free_wanted_tagged_privileged;
1732 vm_page_free_wanted_tagged_privileged = 0;
1733 }
1734 vm_page_free_wakeup(&vm_page_free_wanted_tagged_privileged,
1735 UINT32_MAX);
1736 }
1737
1738 if (vm_page_free_wanted_tagged && total) {
1739 uint32_t wakeup = 0;
1740
1741 if (total < vm_page_free_wanted_tagged) {
1742 wakeup = total;
1743 vm_page_free_wanted_tagged -= total;
1744 total = 0;
1745 } else {
1746 total -= vm_page_free_wanted_tagged;
1747 vm_page_free_wanted_tagged = 0;
1748 wakeup = UINT32_MAX;
1749 }
1750 vm_page_free_wakeup(&vm_page_free_wanted_tagged, wakeup);
1751 }
1752
1753 return result;
1754 }
1755
1756 bool
1757 mteinfo_tag_storage_try_activate(uint32_t target, bool spin_mode)
1758 {
1759 mte_cell_bucket_t first_bucket = MTE_BUCKET_17_24;
1760 thread_t thread_self = current_thread();
1761 vm_page_list_t list = { };
1762
1763 /*
1764 * We only draw from buckets whose tag storage pages have more than half of
1765 * their covered pages free. Less full buckets are too slow
1766 * for the inline path; we rely on the refill thread for those instead.
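*
* For example, assuming MTE_BUCKET_17_24 groups cells that have 17 to
* 24 of their 32 covered pages free, the check below only passes when
* the inactive list has at least one cell in that bucket or above,
* i.e. when activating a single tag storage page can immediately
* yield at least 17 free taggable covered pages.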
1767 */
1768
1769 if (mte_info_lists[MTE_LIST_INACTIVE_IDX].mask < BIT(first_bucket)) {
1770 return false;
1771 }
1772
1773 if (vm_mte_activator) {
1774 /*
1775 * We only allow one thread to activate pages at a time, and
1776 * only wait if the caller can't make progress without
1777 * this activation.
1778 *
1779 * We do not need to consider whether the waiter is privileged
1780 * for the wait, however, because activation isn't affected
1781 * by TH_OPT_VMPRIV.
1782 */
1783
1784 if (vm_page_free_taggable_count > vm_page_free_reserved) {
1785 return false;
1786 }
1787 if (vm_page_free_taggable_count > 0 &&
1788 (thread_self->options & TH_OPT_VMPRIV)) {
1789 return false;
1790 }
1791
1792 vm_mte_activator_waiters = true;
1793 lck_mtx_sleep_with_inheritor(&vm_page_queue_free_lock,
1794 spin_mode ? LCK_SLEEP_SPIN : LCK_SLEEP_DEFAULT,
1795 &vm_mte_activator_waiters, vm_mte_activator,
1796 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
1797
1798 return true;
1799 }
1800
1801 vm_mte_activator = thread_self;
1802 list = mteinfo_tag_storage_select_activating(target, first_bucket);
1803 mteinfo_tag_storage_activate_locked(list, spin_mode);
1804 vm_mte_activator = THREAD_NULL;
1805
1806 return true;
1807 }
1808
1809
1810 #pragma mark Deactivate
1811
1812 /*!
1813 * @abstract
1814 * Returns whether the active(0.0) bucket should be drained to make inactive
1815 * pages.
1816 *
1817 * @param for_wakeup Whether the question is to wake up the refill thread
1818 * (true) or decide whether the refill thread should keep
1819 * going (false).
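*
* For example, assuming @c VMP_FREE_BATCH_SIZE is 64, waking the
* refill thread requires at least 128 fully-used active(0.0) cells,
* while an already-running refill keeps draining as long as at least
* 64 remain, which gives the drain some hysteresis.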
1820 */
1821 static bool
1822 mteinfo_tag_storage_should_drain(bool for_wakeup)
1823 {
1824 mte_cell_list_t active_0 = &mte_info_lists[MTE_LIST_ACTIVE_0_IDX];
1825 uint32_t threshold = VMP_FREE_BATCH_SIZE * (for_wakeup ? 2 : 1);
1826
1827 if (!vm_mte_enable_tag_storage_grab) {
1828 return false;
1829 }
1830
1831 if (mte_claimable_queue.vmpfq_count >= vm_free_magazine_refill_limit) {
1832 return false;
1833 }
1834
1835 if (active_0->count <= vm_page_tag_storage_reserved) {
1836 return false;
1837 }
1838
1839 return cell_queue_count(&active_0->buckets[0]) >= threshold;
1840 }
1841
1842 /*!
1843 * @function mteinfo_tag_storage_deactivate_barrier()
1844 *
1845 * @abstract
1846 * Wait until all possible untagging operations that could make deactivation
1847 * invalid have finished.
1848 *
1849 * @discussion
1850 * Before we can do any deactivation we must make sure
1851 * that no CPU has untagging activity in flight.
1852 *
1853 * See mteinfo_free_queue_grab() and mteinfo_page_list_fix_tagging().
1854 */
1855 static void
1856 mteinfo_tag_storage_deactivate_barrier(void)
1857 {
1858 mte_pcpu_t this_cpu = PERCPU_GET(mte_pcpu);
1859
1860 assert(get_preemption_level() > 0);
1861
1862 percpu_foreach(it, mte_pcpu) {
1863 if (it == this_cpu) {
1864 /*
1865 * A thread is allowed to both have pending untagging
1866 * going on and a page to deactivate.
1867 *
1868 * As a result, ignore the current core's suspension
1869 * state as it is harmless as long as the core commits
1870 * to untagging before it does its deactivations.
1871 *
1872 * If a thread fails to do that, this will reliably
1873 * panic in SPTM, so the risk of silent bugs is rather
1874 * low.
1875 */
1876 continue;
1877 }
1878
1879 if (os_atomic_load(&it->deactivate_suspend, relaxed)) {
1880 hw_wait_while_equals32(&it->deactivate_suspend, 1);
1881 }
1882 }
1883 os_atomic_thread_fence(seq_cst);
1884 }
1885
1886 /*!
1887 * @abstract
1888 * Flush a list of deactivating tag storage pages.
1889 *
1890 * @discussion
1891 * The page free queue lock must be held, but will be dropped while this
1892 * function operates.
1893 *
1894 * @param list The list of pages in @c MTE_STATE_DEACTIVATING state.
1895 */
1896 static void
1897 mteinfo_tag_storage_drain_flush(vm_page_list_t list)
1898 {
1899 vm_page_t tag_page = VM_PAGE_NULL;
1900
1901 mteinfo_tag_storage_deactivate_barrier();
1902
1903 vm_free_page_unlock();
1904
1905 vm_object_lock(mte_tags_object);
1906 vm_page_lock_queues();
1907
1908 vm_page_list_foreach(tag_page, list) {
1909 vm_page_t save_next = NEXT_PAGE(tag_page);
1910
1911
1912 /*
1913 * The unwiring path expects the page linkage to be
1914 * NULL, so transiently make it NULL. We'll restore
1915 * the linkage after the unwire is done.
1916 */
1917
1918 NEXT_PAGE(tag_page) = VM_PAGE_NULL;
1919 vm_page_unwire(tag_page,
1920 /* Don't put the page into aging queues. */ FALSE);
1921 vm_page_remove(tag_page,
1922 /* Remove the page from the hash. */ TRUE);
1923 NEXT_PAGE(tag_page) = save_next;
1924 }
1925
1926 vm_page_unlock_queues();
1927 vm_object_unlock(mte_tags_object);
1928
1929 vm_page_list_foreach(tag_page, list) {
1930 pmap_unmake_tag_storage_page(VM_PAGE_GET_PHYS_PAGE(tag_page));
1931 }
1932
1933 vm_free_page_lock_spin();
1934
1935 vm_page_tag_storage_deactivation_count += list.vmpl_count;
1936
1937 vm_page_list_foreach_consume(tag_page, &list) {
1938 vm_page_free_queue_enter(VM_MEMORY_CLASS_TAG_STORAGE,
1939 tag_page, VM_PAGE_GET_PHYS_PAGE(tag_page));
1940 }
1941 }
1942
1943 /*!
1944 * @function mteinfo_tag_storage_drain()
1945 *
1946 * @abstract
1947 * Attempt to drain the active(0.0) bucket of pages since these are always
1948 * wasted.
1949 *
1950 * @discussion
1951 * This is one of the core routines of the fill thread.
1952 *
1953 * @returns
1954 * How many tag storage pages were deactivated.
1955 */
1956 static uint32_t
1957 mteinfo_tag_storage_drain(void)
1958 {
1959 mte_cell_list_t active_0 = &mte_info_lists[MTE_LIST_ACTIVE_0_IDX];
1960 mte_cell_queue_t bucket_0 = &active_0->buckets[0];
1961 vm_page_t tag_page = VM_PAGE_NULL;
1962 cell_t *cell = NULL;
1963 uint32_t total = 0;
1964 vm_page_list_t list = { };
1965
1966 LCK_MTX_ASSERT_OWNED_SPIN(&vm_page_queue_free_lock);
1967
1968 while (mteinfo_tag_storage_should_drain(false)) {
1969 tag_page = vm_tag_storage_page_get(cell_queue_first_idx(bucket_0));
1970 cell = cell_queue_first(bucket_0);
1971
1972 assert(cell->free_mask == 0);
1973 assert_cell_state(cell, /* [D.1] */ MTE_MASK_ACTIVE);
1974 CELL_UPDATE(cell, tag_page, false, {
1975 cell->state = MTE_STATE_DEACTIVATING;
1976 });
1977
1978 vm_page_list_push(&list, tag_page);
1979
1980 if (list.vmpl_count >= VMP_FREE_BATCH_SIZE) {
1981 total += list.vmpl_count;
1982 mteinfo_tag_storage_drain_flush(list);
1983 list = (vm_page_list_t){ };
1984 }
1985 }
1986
1987 if (list.vmpl_count) {
1988 total += list.vmpl_count;
1989 mteinfo_tag_storage_drain_flush(list);
1990 }
1991
1992 return total;
1993 }
1994
1995
1996 #pragma mark Reclaim
1997
1998 /*!
1999 * @abstract
2000 * Attempt to steal a tag page from a per cpu claimed free queue.
2001 *
2002 * @discussion
2003 * The caller must have checked that the tag_page is on a local free queue,
2004 * even if this check is racy.
2005 *
2006 * @param tag_page A tag storage page appearing to sit on a per cpu queue.
2007 *
2008 * @returns Whether stealing was successful (true) or not (false).
2009 */
2010 static bool
2011 mteinfo_reclaim_tag_storage_page_try_pcpu(vm_page_t tag_page)
2012 {
2013 mte_pcpu_t mte_pcpu;
2014 uint16_t cpu;
2015
2016 cpu = os_atomic_load(&tag_page->vmp_local_id, relaxed);
2017 mte_pcpu = PERCPU_GET_WITH_BASE(other_percpu_base(cpu), mte_pcpu);
2018
2019 lck_ticket_lock(&mte_pcpu->free_claimed_lock, &vm_page_lck_grp_bucket);
2020
2021 if (tag_page->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q &&
2022 tag_page->vmp_local_id == cpu) {
2023 vm_page_queue_remove(&mte_pcpu->free_claimed_pages,
2024 tag_page, vmp_pageq);
2025 tag_page->vmp_q_state = VM_PAGE_NOT_ON_Q;
2026 tag_page->vmp_local_id = 0;
2027 counter_dec_preemption_disabled(&vm_cpu_free_claimed_count);
2028 } else {
2029 tag_page = VM_PAGE_NULL;
2030 }
2031
2032 lck_ticket_unlock(&mte_pcpu->free_claimed_lock);
2033
2034 return tag_page != VM_PAGE_NULL;
2035 }
2036
2037 /*!
2038 * @function mteinfo_reclaim_tag_storage_page()
2039 *
2040 * @abstract
2041 * Attempt to reclaim a claimed tag storage page.
2042 *
2043 * @discussion
2044 * This will try to reclaim a tag storage page by relocating its contents to a
2045 * different page, so that the tag storage page becomes (effectively) free.
2046 *
2047 * This expects a claimed tag storage page, and on success, will finish with
2048 * the page in the reclaimed state. On failure, no guarantees are made about
2049 * the state of the page (due to locking operations); the page could still be
2050 * claimed, or reclamation may have failed because the page became free in the
2051 * interim. However, if the page was not in a relocatable state, this function
2052 * will not force it out of the reclaiming state, so that the client can choose
2053 * when and why the page is returned to claimed.
2054 *
2055 * This function is called with the free page queue lock in spin mode and
2056 * returns with it held in spin mode.
2057 *
2058 * @param tag_page
2059 * The claimed tag storage page to try reclaiming.
2060 *
2061 * @returns
2062 * - KERN_SUCCESS success,
2063 *
2064 * - KERN_INVALID_OBJECT the page has no object set
2065 *
2066 * - KERN_NOT_WAITING the state of the cell/tag page changed
2067 * during evaluation.
2068 *
2069 * - KERN_ABORTED the tag page was wired; reclaiming it was
2070 * aborted and it was marked as MTE_STATE_PINNED.
2071 *
2072 * - KERN_RESOURCE_SHORTAGE from vm_page_relocate(): relocation failed due
2073 * to being out of replacement memory.
2074 *
2075 * - KERN_FAILURE from vm_page_relocate(): relocation failed due
2076 * to the page not being currently relocatable.
2077 */
2078 static kern_return_t
2079 mteinfo_reclaim_tag_storage_page(vm_page_t tag_page)
2080 {
2081 cell_t *cell = cell_from_tag_storage_page(tag_page);
2082 kern_return_t kr = KERN_FAILURE;
2083 vm_object_t object;
2084 bool compressor_locked = false;
2085 bool vm_object_trylock_failed = false;
2086
2087 /* We need to try and reclaim the tag storage page. */
2088 mteinfo_tag_storage_set_reclaiming(cell, tag_page);
2089
2090 if (tag_page->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q &&
2091 mteinfo_reclaim_tag_storage_page_try_pcpu(tag_page)) {
2092 vm_page_tag_storage_reclaim_from_cpu_count++;
2093 vm_page_tag_storage_reclaim_success_count++;
2094
2095 KDBG(VMDBG_CODE(DBG_VM_TAG_PAGE_CLAIMED) | DBG_FUNC_NONE,
2096 VM_KERNEL_ADDRHIDE(tag_page),
2097 mteinfo_tag_storage_free_pages_for_covered(tag_page));
2098
2099 return KERN_SUCCESS;
2100 }
2101
2102 vm_free_page_unlock();
2103
2104 /*
2105 * Snoop the vmp_q_state. If the page is currently used by the compressor
2106 * (VM_PAGE_USED_BY_COMPRESSOR), we'll grab the global compressor lock
2107 * for write (PAGE_REPLACEMENT_ALLOWED(TRUE)) and the compressor
2108 * object lock.
2109 *
2110 * Typically, we can't know that the object will be stable
2111 * without grabbing the object or page queues lock (see the comment on
2112 * "relocation lock dance" below), but we know that the compressor object
2113 * is stable. So, we do _not_ need to grab the page queues and object locks
2114 * in the wrong order. This ensures that we will wait our turn in case
2115 * someone else is using the compressor object lock, and there is no chance
2116 * the reclaim will fail because we can't acquire the right locks.
2117 *
2118 * The contiguous memory allocator grabs this lock before the page queues
2119 * and object lock, so we must do the same here.
2120 */
2121 if (tag_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
2122 assert(vm_mte_tag_storage_for_compressor);
2123 PAGE_REPLACEMENT_ALLOWED(TRUE);
2124 vm_object_lock(compressor_object);
2125 compressor_locked = true;
2126
2127 /*
2128 * The page state transitions into and out of VM_PAGE_USED_BY_COMPRESSOR
2129 * happen under the compressor object, so now the page state is stable.
2130 */
2131 if (tag_page->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR) {
2132 /*
2133 * The page was removed from the compressor pool. It could be
2134 * in any state now, but it's probably free and unusable. Give up.
2135 */
2136 vm_object_unlock(compressor_object);
2137 PAGE_REPLACEMENT_ALLOWED(FALSE);
2138 compressor_locked = false;
2139 vm_free_page_lock_spin();
2140 kr = KERN_FAILURE;
2141 goto locks_acquired;
2142 }
2143 }
2144
2145 /*
2146 * Do the relocation lock dance. This is a little odd; because we're
2147 * starting with a page, and trying to look up the object, we need the
2148 * queues lock to keep the object from being deallocated or changed.
2149 *
2150 * This means we need to get the object lock after the queues lock;
2151 * this inverts the lock ordering, so we can only TRY the object lock.
2152 */
2153 vm_page_lock_queues();
2154
2155 object = VM_PAGE_OBJECT(tag_page);
2156 if (compressor_locked) {
2157 assert(object == compressor_object);
2158 }
2159
2160 if (object == VM_OBJECT_NULL) {
2161 /* [PH] XXX: Can this even happen? */
2162 kr = KERN_INVALID_OBJECT;
2163 goto release_locks;
2164 } else if (!compressor_locked && !vm_object_lock_try_scan(object)) {
2165 /*
2166 * Hopefully, the next time we drain reclaiming pages, taking
2167 * that object lock will work.
2168 */
2169 vm_object_trylock_failed = true;
2170 kr = KERN_NOT_WAITING;
2171 goto release_locks;
2172 } else if (VM_PAGE_OBJECT(tag_page) != object) {
2173 /*
2174 * vm_page_insert_internal() doesn't require the page queue lock
2175 * to be held if the page is wired, so the object could change
2176 * under us.
2177 */
2178 vm_object_unlock(object);
2179
2180 kr = KERN_NOT_WAITING;
2181 goto release_locks;
2182 }
2183
2184 /*
2185 * Now that all the locking is out of the way,
2186 * see if the page is actually relocatable.
2187 */
2188 if (VM_PAGE_WIRED(tag_page) ||
2189 (tag_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR && tag_page->vmp_busy)) {
2190 /*
2191 * TODO: Relocation fails when one of these conditions is met:
2192 *
2193 * VM_PAGE_WIRED(tag_page)
2194 * tag_page->vmp_gobbled
2195 * tag_page->vmp_laundry
2196 * tag_page->vmp_wanted
2197 * tag_page->vmp_cleaning
2198 * tag_page->vmp_overwriting
2199 * tag_page->vmp_free_when_done
2200 * tag_page->vmp_busy
2201 * tag_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q
2202 *
2203 * We only handle VM_PAGE_WIRED() and when the tag page is being
2204 * swapped out (from usage in the compressor pool) for now,
2205 * because these are the most likely, but we should use vmp_ts_wanted
2206 * for all cases.
2207 *
2208 * We would need to find all places in the kernel that alter
2209 * this condition, to notice that a relocation was attempted
2210 * (vmp_ts_wanted is set) and call mteinfo_tag_storage_wakeup().
2211 */
2212
2213 /*
2214 * Take the page free lock before setting vmp_ts_wanted,
2215 * before we drop the object lock, otherwise
2216 * mteinfo_tag_storage_wakeup() might see vmp_ts_wanted
2217 * before the transition to MTE_STATE_PINNED has happened.
2218 *
2219 * Note that we should do nothing if the cell is no longer in
2220 * the MTE_STATE_RECLAIMING state, which could hypothetically
2221 * happen since we dropped the free queue lock above.
2222 */
2223 vm_free_page_lock_spin();
2224
2225 if (cell->state == MTE_STATE_RECLAIMING) {
2226 assert(tag_page->vmp_ts_wanted == false);
2227 tag_page->vmp_ts_wanted = true;
2228 kr = KERN_ABORTED;
2229 } else {
2230 kr = KERN_NOT_WAITING;
2231 }
2232
2233 vm_object_unlock(object);
2234 vm_page_unlock_queues();
2235 if (compressor_locked) {
2236 PAGE_REPLACEMENT_ALLOWED(FALSE);
2237 compressor_locked = false;
2238 }
2239
2240 if (kr == KERN_ABORTED) {
2241 assert_cell_state(cell, /* [B.1] */ MTE_MASK_RECLAIMING);
2242 CELL_UPDATE(cell, tag_page, false, {
2243 cell->state = MTE_STATE_PINNED;
2244 });
2245 if (tag_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
2246 vm_page_tag_storage_reclaim_compressor_failure_count++;
2247 } else {
2248 vm_page_tag_storage_reclaim_wired_failure_count++;
2249 }
2250 }
2251
2252 goto locks_acquired;
2253 } else if ((*vm_mte_tag_storage_for_vm_tags) &&
2254 !vm_page_is_relocatable(tag_page, VM_RELOCATE_REASON_TAG_STORAGE_RECLAIM)) {
2255 /*
2256 * If we're allowing tag storage pages to be used for specific VM tags,
2257 * those pages could be unrelocatable for reasons we haven't
2258 * expected. We're also assuming that if a tag storage page were to
2259 * be unrelocatable for whatever reason, it's (at the very least) not
2260 * because the page is wired or involved in an IO that could take a
2261 * long time, so hopefully it won't be unavailable for too long, and
2262 * the fill thread won't churn over the same set of unavailable claimed
2263 * pages.
2264 *
2265 * We'll just skip over this page and move it back to claiming at the
2266 * bottom of this function.
2267 */
2268 kr = KERN_NOT_WAITING;
2269 vm_object_unlock(object);
2270 } else {
2271 kr = vm_page_relocate(tag_page, NULL,
2272 VM_RELOCATE_REASON_TAG_STORAGE_RECLAIM, NULL);
2273 vm_object_unlock(object);
2274
2275 assert(kr != KERN_ABORTED);
2276 }
2277
2278 release_locks:
2279 if (compressor_locked) {
2280 PAGE_REPLACEMENT_ALLOWED(FALSE);
2281 }
2282 vm_page_unlock_queues();
2283 if (vm_object_trylock_failed && vm_object_lock_avoid(object)) {
2284 /*
2285 * We failed to lock the VM object, and pageout_scan
2286 * wants this object. Back off for a little bit.
2287 *
2288 * Note that the VM object may no longer be valid after releasing
2289 * the VM object lock, but `vm_object_lock_avoid` only compares
2290 * pointers and doesn't dereference them, so it's fine.
2291 */
2292 mutex_pause(2);
2293 }
2294 vm_free_page_lock_spin();
2295
2296
2297 locks_acquired:
2298 /*
2299 * Assert that all codepaths leading up to this point have the lock
2300 * held in spin mode (and therefore, preemption disabled).
2301 */
2302 LCK_MTX_ASSERT_OWNED_SPIN(&vm_page_queue_free_lock);
2303
2304 if (kr == KERN_SUCCESS) {
2305 vm_page_tag_storage_reclaim_success_count++;
2306
2307 /* We relocated the page. Now we can use it. */
2308 if (cell->state != MTE_STATE_RECLAIMING) {
2309 /*
2310 * The page was manipulated while we were relocating
2311 * it. This likely means it was freed and reallocated
2312 * between us dropping the free page lock and getting
2313 * the queues lock.
2314 *
2315 * This should be ludicrously rare, and should still
2316 * mean that the page is claimed (otherwise relocate
2317 * would have failed). Set to reclaiming for client
2318 * consistency.
2319 *
2320 * In the state diagram this corresponds to other
2321 * threads having performed [F.2 inline] followed
2322 * by [C.1 inline], possibly multiple times.
2323 */
2324 mteinfo_tag_storage_set_reclaiming(cell, tag_page);
2325 }
2326
2327 KDBG(VMDBG_CODE(DBG_VM_TAG_PAGE_CLAIMED) | DBG_FUNC_NONE,
2328 VM_KERNEL_ADDRHIDE(tag_page),
2329 mteinfo_tag_storage_free_pages_for_covered(tag_page));
2330
2331 assert(tag_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
2332 } else {
2333 vm_page_tag_storage_reclaim_failure_count++;
2334
2335 if (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_NOT_WAITING) {
2336 /*
2337 * If there was no available page to relocate the tag
2338 * storage page to, or some race happened that
2339 * changed the page state under our feet, just put the
2340 * page back as claimed if it's still reclaiming.
2341 *
2342 * It will as a result get reconsidered more quickly...
2343 * it WAS our best candidate, after all.
2344 */
2345 if (cell->state == MTE_STATE_RECLAIMING) {
2346 mteinfo_tag_storage_set_claimed(tag_page);
2347 }
2348 }
2349 }
2350
2351 return kr;
2352 }
2353
2354
2355 #pragma mark Refill Thread
2356
2357 /*!
2358 * @abstract
2359 * Returns whether the refill thread should keep refilling the active pool.
2360 *
2361 * @discussion
2362 * If we're below the free target, and there are no tagged waiters of any kind,
2363 * avoid activating any pages if the untagged pool is not extremely healthy.
2364 */
2365 static inline bool
2366 mteinfo_tag_storage_active_should_refill(void)
2367 {
2368 if (vm_page_free_taggable_count >= vm_page_free_target) {
2369 return false;
2370 }
2371
2372 if (vm_page_free_taggable_count <= vm_page_free_reserved) {
2373 return true;
2374 }
2375
2376 if (vm_page_free_wanted_tagged_privileged || vm_page_free_wanted_tagged) {
2377 return true;
2378 }
2379
2380 /*
2381 * 16/15 is ~1.07: we define "healthy" as at least 7% excess pages
2382 * over the target.
2383 *
2384 * We want some slop because a system under pressure will sometimes go
2385 * above @c vm_page_free_target and we want to avoid thrashing.
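*
* For example, assuming @c vm_page_free_target is 30000 pages, the
* no-waiter case below only allows refilling while
* @c vm_page_free_count is at least 32000 (30000 * 16 / 15).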
2386 */
2387 return vm_page_free_count * 15ull >= vm_page_free_target * 16ull;
2388 }
2389
2390 /*!
2391 * @function mteinfo_tag_storage_active_refill()
2392 *
2393 * @abstract
2394 * Attempt to fill the global free tagged covered page queue.
2395 *
2396 * @discussion
2397 * This is one of the core routines of the fill thread. It will attempt to get
2398 * the global free tagged covered page queue to or above a target value. It
2399 * will also wake threads waiting for more of these pages as appropriate.
2400 *
2401 * This function is called with the free page queue lock held in spin mode
2402 * and returns with it held in spin mode.
2403 *
2404 * @param taggablep How many free taggable pages have been added.
2405 * @returns The number of tag storage pages this function activated.
2406 */
2407 static uint32_t
2408 mteinfo_tag_storage_active_refill(uint32_t *taggablep)
2409 {
2410 mte_cell_list_t claimed_list = &mte_info_lists[MTE_LIST_CLAIMED_IDX];
2411 mte_cell_list_t inactive_list = &mte_info_lists[MTE_LIST_INACTIVE_IDX];
2412 uint32_t taggable = 0;
2413 uint32_t activated = 0;
2414
2415 LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED);
2416
2417 while (mteinfo_tag_storage_active_should_refill()) {
2418 mte_cell_bucket_t i_bucket = 0;
2419 mte_cell_bucket_t c_bucket = 0;
2420 vm_page_list_t list = { };
2421 kern_return_t kr = KERN_SUCCESS;
2422
2423 /*
2424 * Step 1: try to activate or reclaim pages.
2425 *
2426 * Pick whichever of the inactive and claimed pools will
2427 * make the fastest progress (picking inactive over
2428 * claimed for equivalent buckets, given that reclaiming
2429 * is more expensive).
2430 *
2431 * In particular, always prefer activating over reclaiming
2432 * when the best inactive bucket has more than 50% of its covered pages free.
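*
* For example, assuming MTE_BUCKET_17_24 is bucket index 3 and
* MTE_BUCKET_0 is 0: with the fullest inactive bucket at 3 and the
* fullest claimed bucket at 2, we activate from the inactive pool;
* with the fullest inactive bucket at 1 and the fullest claimed
* bucket at 2, we reclaim from the claimed pool instead.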
2433 */
2434
2435 if (inactive_list->mask) {
2436 i_bucket = fls(inactive_list->mask) - 1;
2437 } else {
2438 i_bucket = 0;
2439 }
2440 if (claimed_list->mask) {
2441 c_bucket = fls(claimed_list->mask) - 1;
2442 } else {
2443 c_bucket = 0;
2444 }
2445
2446 if (i_bucket && i_bucket >= MIN(MTE_BUCKET_17_24, c_bucket)) {
2447 list = mteinfo_tag_storage_select_activating(VMP_FREE_BATCH_SIZE,
2448 MIN(i_bucket, MTE_BUCKET_17_24));
2449 } else if (c_bucket > MTE_BUCKET_0) {
2450 mte_cell_queue_t queue = &claimed_list->buckets[c_bucket];
2451 cell_idx_t idx = cell_queue_first_idx(queue);
2452 vm_page_t page = vm_tag_storage_page_get(idx);
2453
2454 kr = mteinfo_reclaim_tag_storage_page(page);
2455 if (kr == KERN_SUCCESS) {
2456 list = vm_page_list_for_page(page);
2457 }
2458 } else {
2459 /*
2460 * There is no progress we can do here because we do not
2461 * have good candidates to activate or reclaim.
2462 *
2463 * As a result, even if the system has free untaggable
2464 * pages, they can't be converted to taggable either
2465 * because they're permanently untaggable, or because
2466 * their associated tag storage can't be reclaimed.
2467 *
2468 * Waiting in VM_PAGE_WAIT() below sounds appealing
2469 * but will result in busy loops, so we should just
2470 * go park and wait until some page free is saving us
2471 * via the "wakeup_refill_thread" cases in
2472 * @c vm_page_free_queue_handle_wakeups_and_unlock().
2473 */
2474 break;
2475 }
2476
2477 if (kr == KERN_SUCCESS) {
2478 activated += list.vmpl_count;
2479 taggable += mteinfo_tag_storage_activate_locked(list,
2480 /* spin-mode */ true);
2481 continue;
2482 }
2483
2484 /*
2485 * Step 2: wait if needed
2486 *
2487 * KERN_RESOURCE_SHORTAGE means that we were out of pages
2488 * to relocate or tag storage candidates.
2489 *
2490 * Other errors are relocation failures and we can just
2491 * retry immediately.
2492 */
2493
2494 if (kr == KERN_RESOURCE_SHORTAGE) {
2495 /*
2496 * There was no good candidate tag storage page. Wait
2497 * on the VM to make new pages available.
2498 *
2499 * TODO: This isn't a great solution; the VM doesn't
2500 * understand what we are actually waiting on. This
2501 * should converge eventually due to VM activity... but
2502 * the bigger picture fix is to make all free pages
2503 * eligible for MTE. Then our only significant concern
2504 * around tag storage pages will be tag storage pages
2505 * with ECC errors, which should be a small number.
2506 */
2507 vm_free_page_unlock();
2508 current_thread()->page_wait_class = VM_MEMORY_CLASS_REGULAR;
2509 VM_PAGE_WAIT();
2510 vm_free_page_lock_spin();
2511
2512 /*
2513 * We waited above and the system conditions changed;
2514 * flush our reclaiming queue.
2515 */
2516 mteinfo_tag_storage_flush_reclaiming();
2517 }
2518 }
2519
2520 mteinfo_tag_storage_flush_reclaiming();
2521
2522 *taggablep += taggable;
2523 return activated;
2524 }
2525
2526 /*!
2527 * @function mteinfo_fill_continue()
2528 *
2529 * @abstract
2530 * Continuation for the MTE fill thread.
2531 *
2532 * @discussion
2533 * The MTE fill thread manages the global free queue of covered tagged pages,
2534 * and moves tag storage pages between the active and inactive states.
2535 *
2536 * @param param
2537 * Unused.
2538 *
2539 * @param wr
2540 * Unused.
2541 */
2542 __dead2
2543 static void
2544 mteinfo_fill_continue(void *param __unused, wait_result_t wr __unused)
2545 {
2546 #if CONFIG_THREAD_GROUPS
2547 static bool _fill_thread_self_inited;
2548
2549 if (!_fill_thread_self_inited) {
2550 thread_group_vm_add();
2551 _fill_thread_self_inited = true;
2552 }
2553 #endif /* CONFIG_THREAD_GROUPS */
2554
2555 (void)sched_cond_ack(&fill_thread_cond);
2556 vm_mte_refill_thread_wakeups++;
2557
2558 for (;;) {
2559 uint32_t added = 0;
2560 uint32_t activated = 0;
2561 uint32_t deactivated = 0;
2562
2563 VM_DEBUG_CONSTANT_EVENT(, DBG_VM_REFILL_MTE, DBG_FUNC_START,
2564 0, 0, 0, 0);
2565
2566 /*
2567 * NB: We take the free queue lock in spin mode here because there are
2568 * a number of operations that occur during active_refill and drain
2569 * that require preemption to be disabled. For example:
2570 * - in active_refill: if the fill thread tries to reclaim a tag
2571 * storage page, it first tries to steal a free tag storage page
2572 * from the local free queue.
2573 * - in drain: when flushing the queue of deactivating tag storage
2574 * pages, the fill thread waits for all cores to finish any untagging
2575 * before proceeding. See mteinfo_tag_storage_deactivate_barrier().
2576 *
2577 * Coupling enabling/disabling preemption with acquiring/releasing the
2578 * free queue lock is easier than managing preemption by hand, so all
2579 * instances of free queue lock acquisition must be done in spin mode.
2580 */
2581 vm_free_page_lock_spin();
2582
2583 activated += mteinfo_tag_storage_active_refill(&added);
2584 deactivated += mteinfo_tag_storage_drain();
2585
2586 vm_free_page_unlock();
2587
2588 VM_DEBUG_CONSTANT_EVENT(, DBG_VM_REFILL_MTE, DBG_FUNC_END,
2589 added, activated, deactivated, 0);
2590
2591 sched_cond_wait_parameter(&fill_thread_cond, THREAD_UNINT,
2592 mteinfo_fill_continue, NULL);
2593 }
2594 }
2595
2596 void
2597 mteinfo_wake_fill_thread(void)
2598 {
2599 if (is_mte_enabled) {
2600 sched_cond_signal(&fill_thread_cond, vm_mte_fill_thread);
2601 }
2602 }
2603
2604
2605 #pragma mark Alloc
2606
2607 /*!
2608 * @abstract
2609 * Returns whether @c mteinfo_free_queue_grab() should refill the per-cpu
2610 * claimable queue.
2611 *
2612 * @discussion
2613 * The policy is to refill if the per-cpu queue is empty and the claimable
2614 * queue has a full batch of @c VMP_FREE_BATCH_SIZE free pages.
2615 *
2616 * This is chosen so that the taking of the spinlock it implies is amortized
2617 * well, which reduces thrashing.
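*
* For example, assuming @c VMP_FREE_BATCH_SIZE is 64, an empty per-cpu
* claimed queue is refilled with 64 pages in a single pass, so the
* spinlock is taken once per 64 claimed pages rather than once per
* page.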
2618 *
2619 * The function must be called with preemption disabled.
2620 *
2621 * @param mte_pcpu The current CPU's mte_pcpu_t data structure.
2622 */
2623 static bool
2624 mteinfo_tag_storage_claimable_should_refill(mte_pcpu_t mte_pcpu)
2625 {
2626 if (__improbable(!vm_mte_enable_tag_storage_grab)) {
2627 return false;
2628 }
2629
2630 if (!vm_page_queue_empty(&mte_pcpu->free_claimed_pages)) {
2631 return false;
2632 }
2633
2634 return mte_claimable_queue.vmpfq_count >= VMP_FREE_BATCH_SIZE;
2635 }
2636
2637 /*!
2638 * @abstract
2639 * Refill the current CPU's claimed free queue.
2640 *
2641 * @discussion
2642 * This is done opportunistically by @c mteinfo_free_queue_grab()
2643 * when it notices that it should refill the claimable queue
2644 * (see @c mteinfo_tag_storage_claimable_should_refill()).
2645 *
2646 * The function must be called with preemption disabled.
2647 *
2648 * @param mte_pcpu The current CPU's mte_pcpu_t data structure.
2649 * @param target The number of tag storage pages to grab.
2650 * @param colorp A pointer to the current color selector.
2651 */
2652 static void
2653 mteinfo_tag_storage_claimable_refill(
2654 mte_pcpu_t mte_pcpu,
2655 uint32_t target,
2656 uint32_t *colorp)
2657 {
2658 const int cpu = cpu_number();
2659 vm_page_queue_t queue;
2660 ppnum_t pnum;
2661 vm_page_t mem;
2662
2663 lck_ticket_lock_nopreempt(&mte_pcpu->free_claimed_lock,
2664 &vm_page_lck_grp_bucket);
2665
2666 for (uint32_t i = target; i-- > 0;) {
2667 queue = &mte_claimable_queue.vmpfq_queues[*colorp].qhead;
2668 while (vm_page_queue_empty(queue)) {
2669 *colorp = (*colorp + 1) & vm_color_mask;
2670 queue = &mte_claimable_queue.vmpfq_queues[*colorp].qhead;
2671 }
2672
2673 mem = (vm_page_t)vm_page_queue_first(queue);
2674 pnum = VM_PAGE_GET_PHYS_PAGE(mem);
2675
2676 assert(mem->vmp_q_state == VM_PAGE_ON_FREE_Q);
2677 mteinfo_tag_storage_set_claimed(mem);
2678 mem->vmp_q_state = VM_PAGE_ON_FREE_LOCAL_Q;
2679 mem->vmp_local_id = (uint16_t)cpu;
2680 vm_page_queue_enter(&mte_pcpu->free_claimed_pages, mem, vmp_pageq);
2681 }
2682
2683 lck_ticket_unlock_nopreempt(&mte_pcpu->free_claimed_lock);
2684
2685 counter_add_preemption_disabled(&vm_cpu_free_claimed_count,
2686 target);
2687 }
2688
2689 vm_page_list_t
2690 mteinfo_free_queue_grab(
2691 vm_grab_options_t options,
2692 vm_memory_class_t class,
2693 unsigned int num_pages,
2694 vm_page_q_state_t q_state)
2695 {
2696 mte_pcpu_t mte_pcpu = PERCPU_GET(mte_pcpu);
2697 unsigned int *colorp;
2698 unsigned int color;
2699 vm_page_list_t list = { };
2700 mte_free_queue_idx_t idx;
2701
2702 assert(!mte_pcpu->deactivate_suspend && get_preemption_level() > 0);
2703
2704 if (class == VM_MEMORY_CLASS_REGULAR) {
2705 /*
2706 * VM_MEMORY_CLASS_DEAD_TAG_STORAGE is not part of
2707 * vm_page_free_count, which means the caller didn't take them
2708 * into account when making this allocation ask.
2709 *
2710 * As a result, do not respect num_pages. However, these are
2711 * different from the regular claimable pool because we can
2712 * always safely wire them.
2713 */
2714 if (vm_page_queue_free.vmpfq_count) {
2715 list = vm_page_free_queue_grab(options,
2716 VM_MEMORY_CLASS_DEAD_TAG_STORAGE,
2717 MIN(vm_free_magazine_refill_limit / 2,
2718 vm_page_queue_free.vmpfq_count), q_state);
2719 }
2720
2721 assert(num_pages <= vm_page_free_count);
2722 } else {
2723 assert(num_pages <= vm_page_free_taggable_count);
2724 }
2725
2726 colorp = PERCPU_GET(start_color);
2727 color = *colorp;
2728
2729 if (mteinfo_tag_storage_claimable_should_refill(mte_pcpu)) {
2730 mteinfo_tag_storage_claimable_refill(mte_pcpu,
2731 VMP_FREE_BATCH_SIZE, &color);
2732 }
2733
2734 while (list.vmpl_count < num_pages) {
2735 vm_page_queue_t queue;
2736 cell_count_t bit;
2737 vm_page_t tag_page;
2738 vm_page_t mem;
2739 uint32_t count;
2740 ppnum_t first_pnum;
2741 cell_t orig;
2742 cell_t *cell;
2743
2744 /*
2745 * Select which queue we dequeue from
2746 *
2747 * Regular allocations can allocate from any bucket.
2748 * Tagged allocations must draw from an MTE_FREE_ACTIVE_* one.
2749 */
2750
2751 if (class == VM_MEMORY_CLASS_REGULAR) {
2752 idx = ffs(mte_free_queue_mask) - 1;
2753 } else {
2754 uint32_t mask = mte_free_queue_mask;
2755
2756 mask &= BIT(MTE_FREE_ACTIVE_0) |
2757 BIT(MTE_FREE_ACTIVE_1) |
2758 BIT(MTE_FREE_ACTIVE_2) |
2759 BIT(MTE_FREE_ACTIVE_3);
2760
2761 assert(mask);
2762 idx = fls(mask) - 1;
2763 }
2764
2765 queue = mteinfo_free_queue_head(idx, color);
2766 while (vm_page_queue_empty(queue)) {
2767 color = (color + 1) & vm_color_mask;
2768 queue = mteinfo_free_queue_head(idx, color);
2769 }
2770
2771 /*
2772 * Dequeue the linkage, find the page of the right color.
2773 */
2774
2775 vm_page_queue_remove_first(queue, mem, vmp_pageq);
2776
2777 VM_COUNTER_DEC(&mte_free_queues[idx].vmpfq_count);
2778 if (mte_free_queues[idx].vmpfq_count == 0) {
2779 bit_clear(mte_free_queue_mask, idx);
2780 }
2781
2782 first_pnum = VM_PAGE_GET_PHYS_PAGE(mem) & -MTE_PAGES_PER_TAG_PAGE;
2783 cell = cell_from_covered_ppnum(first_pnum, &tag_page);
2784 orig = *cell;
2785 bit = orig.enqueue_pos;
2786 count = 0;
2787 assert((orig.enqueue_pos & vm_color_mask) ==
2788 color % MTE_PAGES_PER_TAG_PAGE);
2789
2790 /*
2791 * Dequeue a span of covered pages from that tag storage
2792 *
2793 * If we have a contiguous run of free pages and we need more,
2794 * we know this tag storage page is going to be the one we pick
2795 * next.
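*
* For example, assuming the cell's free_mask has bits 2 through 5 set
* and its enqueue_pos is 2, this loop dequeues the covered pages at
* offsets 2, 3, 4 and 5 from the cell's first covered pnum, stopping
* earlier if num_pages is satisfied first.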
2796 */
2797
2798 for (;;) {
2799 assert(bit_test(orig.free_mask, bit));
2800 bit_clear(cell->free_mask, bit);
2801
2802 mem->vmp_q_state = q_state;
2803 vm_page_list_push(&list, mem);
2804
2805 count += 1;
2806 bit += 1;
2807
2808 if (!bit_test(cell->free_mask, bit) ||
2809 list.vmpl_count >= num_pages) {
2810 break;
2811 }
2812
2813 mem = vm_page_find_canonical(first_pnum + bit);
2814 }
2815
2816 color = (color + count) & vm_color_mask;
2817
2818 /*
2819 * Update counters (see mteinfo_covered_page_set_used())
2820 */
2821
2822 VM_COUNTER_SUB(&vm_page_free_count, count);
2823 if (idx >= MTE_FREE_ACTIVE_0 && idx <= MTE_FREE_ACTIVE_3) {
2824 VM_COUNTER_SUB(&vm_page_free_taggable_count, count);
2825 }
2826 if (class != VM_MEMORY_CLASS_REGULAR) {
2827 VM_COUNTER_ADD(&vm_page_tagged_count, count);
2828 cell->mte_page_count += count;
2829 }
2830
2831 /*
2832 * Requeue the tag storage (tail end of CELL_UPDATE())
2833 */
2834
2835 if (cell_list_idx(orig) != cell_list_idx(*cell) ||
2836 cell_list_bucket(orig) != cell_list_bucket(*cell)) {
2837 cell_list_requeue(cell, tag_page,
2838 cell_list_idx(orig), cell_list_bucket(orig),
2839 cell_list_idx(*cell), cell_list_bucket(*cell),
2840 (int)cell_on_claimable_queue(*cell) -
2841 (int)cell_on_claimable_queue(orig));
2842 }
2843
2844 mteinfo_free_queue_requeue(cell, orig, MTE_FREE_NOT_QUEUED,
2845 mteinfo_free_queue_idx(*cell));
2846 }
2847
2848 *colorp = color;
2849
2850 /*
2851 * Some existing driver/IOKit code deals badly with getting physically
2852 * contiguous memory... which this alloc code is rather likely to
2853 * provide by accident immediately after boot.
2854 *
2855 * To avoid hitting issues related to this, we'll invert the order of
2856 * the list we return. This code should be removed once we've tracked
2857 * down the various driver issues.
2858 */
2859 vm_page_list_reverse(&list);
2860
2861 if (class == VM_MEMORY_CLASS_REGULAR && list.vmpl_has_tagged) {
2862 /*
2863 * We are pulling pages from the taggable free queue
2864 * to use them as untagged.
2865 *
2866 * This breaks the invariant that pages with vmp_using_mte
2867 * set are either free pages on the free queue that were left
2868 * tagged after being freed (covered by the cell "free_mask"),
2869 * or used tagged pages (covered by the cell "mte_page_count"
2870 * counter).
2871 *
2872 * The caller has allocated these pages from the free queue
2873 * (clearing the proper "free_mask" bit) but didn't increment
2874 * the "mte_page_count". It will then proceed with untagging
2875 * these pages without holding any locks, and doesn't want to
2876 * re-take the free page queue lock for book-keeping.
2877 *
2878 * As a result, invariants are broken for a little while,
2879 * and we need to suspend the deactivation path on this core,
2880 * which currently has this invariant broken, until
2881 * the untagging is finished; otherwise, the deactivating
2882 * thread would not consider these pages as tagged, and would
2883 * retype the page to XNU_DEFAULT, causing an SPTM panic.
2884 *
2885 * mteinfo_page_list_fix_tagging() will resume deactivations
2886 * when it is called on the same core.
2887 *
2888 * mteinfo_tag_storage_deactivate_barrier() is called by any
2889 * path performing a deactivation to synchronize with this.
2890 */
2891 os_atomic_store(&mte_pcpu->deactivate_suspend, 1,
2892 compiler_acquire);
2893 }
2894
2895 /*
2896 * If pulling untagged pages tapped the active(0) pool or above,
2897 * and there are "active(0)" pages around, then we wake up
2898 * the refill thread to drain that pool in order to make some
2899 * claimable pages available.
2900 */
2901 if (vm_mte_enable_tag_storage_grab &&
2902 class == VM_MEMORY_CLASS_REGULAR &&
2903 idx >= MTE_FREE_ACTIVE_0 &&
2904 mteinfo_tag_storage_should_drain(true)) {
2905 mteinfo_wake_fill_thread();
2906 }
2907
2908 return list;
2909 }
2910
2911 void
2912 mteinfo_page_list_fix_tagging(vm_memory_class_t class, vm_page_list_t *list)
2913 {
2914 const unified_page_list_t pmap_batch_list = {
2915 .page_slist = list->vmpl_head,
2916 .type = UNIFIED_PAGE_LIST_TYPE_VM_PAGE_LIST,
2917 };
2918 mte_pcpu_t mte_pcpu = PERCPU_GET(mte_pcpu);
2919 vm_page_t mem;
2920
2921 assert(get_preemption_level() > 0);
2922
2923 if (class == VM_MEMORY_CLASS_REGULAR && list->vmpl_has_tagged) {
2924 pmap_unmake_tagged_pages(&pmap_batch_list);
2925 vm_page_list_foreach(mem, *list) {
2926 mem->vmp_using_mte = false;
2927 }
2928
2929 /*
2930 * Invariants related to tagged pages are resolved,
2931 * we can allow deactivations again.
2932 */
2933 os_atomic_store(&mte_pcpu->deactivate_suspend, 0, release);
2934 }
2935
2936 if (class == VM_MEMORY_CLASS_TAGGED && list->vmpl_has_untagged) {
2937 pmap_make_tagged_pages(&pmap_batch_list);
2938 vm_page_list_foreach(mem, *list) {
2939 mem->vmp_using_mte = true;
2940 }
2941 }
2942
2943 assert(!mte_pcpu->deactivate_suspend);
2944 }
2945
2946 #endif /* VM_MTE_FF_VERIFY */
2947 #pragma mark Bootstrap
2948
2949 static mte_cell_queue_t
2950 cell_list_init(
2951 mte_cell_queue_t qhp,
2952 mte_cell_state_t state,
2953 mte_cell_list_idx_t lidx)
2954 {
2955 mte_cell_bucket_t buckets = cell_list_idx_buckets(lidx);
2956
2957 mte_info_lists[lidx].buckets = qhp;
2958
2959 for (mte_cell_bucket_t i = 0; i < buckets; i++, qhp++) {
2960 qhp->head = (cell_t){
2961 .prev = cell_idx(qhp),
2962 .next = cell_idx(qhp),
2963 .state = state,
2964 .enqueue_pos = -1,
2965 };
2966 }
2967
2968 return qhp;
2969 }
2970
2971 __startup_func
2972 void
2973 mteinfo_init(uint32_t num_tag_pages)
2974 {
2975 assert(2 * num_tag_pages < (1UL << MTE_FF_CELL_INDEX_BITS));
2976 assert(atop(mte_tag_storage_end - mte_tag_storage_start) == num_tag_pages);
2977 assert(num_tag_pages == mte_tag_storage_count);
2978
2979 vm_size_t size = sizeof(cell_t) * (MTE_QUEUES_COUNT + num_tag_pages);
2980 mte_cell_queue_t queue;
2981 mte_cell_list_t list;
2982
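/*
* The queue heads and the per tag storage page cells share one
* contiguous allocation: MTE_QUEUES_COUNT queue heads first,
* immediately followed by one cell per tag storage page, which is
* where mte_info_cells points.
*/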
2983 queue = pmap_steal_memory(size, 8);
2984 mte_info_cells = &(queue + MTE_QUEUES_COUNT)->head;
2985
2986 queue = cell_list_init(queue, MTE_STATE_DISABLED, MTE_LIST_DISABLED_IDX);
2987 queue = cell_list_init(queue, MTE_STATE_PINNED, MTE_LIST_PINNED_IDX);
2988 queue = cell_list_init(queue, MTE_STATE_DEACTIVATING, MTE_LIST_DEACTIVATING_IDX);
2989 queue = cell_list_init(queue, MTE_STATE_CLAIMED, MTE_LIST_CLAIMED_IDX);
2990 queue = cell_list_init(queue, MTE_STATE_INACTIVE, MTE_LIST_INACTIVE_IDX);
2991 queue = cell_list_init(queue, MTE_STATE_RECLAIMING, MTE_LIST_RECLAIMING_IDX);
2992 queue = cell_list_init(queue, MTE_STATE_ACTIVATING, MTE_LIST_ACTIVATING_IDX);
2993 queue = cell_list_init(queue, MTE_STATE_ACTIVE, MTE_LIST_ACTIVE_0_IDX);
2994 queue = cell_list_init(queue, MTE_STATE_ACTIVE, MTE_LIST_ACTIVE_IDX);
2995
2996 assert(&queue->head == mte_info_cells);
2997
2998 /*
2999 * Quickly create a list of all possible cells and place it into the
3000 * disabled queue.
3001 */
3002
3003 for (cell_idx_t i = 0; i < num_tag_pages; i++) {
3004 *cell_from_idx(i) = (cell_t){
3005 .prev = i - 1,
3006 .next = i + 1,
3007 .enqueue_pos = -1,
3008 .mte_page_count = 0,
3009 .state = MTE_STATE_DISABLED,
3010 };
3011 }
3012
3013 list = &mte_info_lists[MTE_LIST_DISABLED_IDX];
3014 queue = &list->buckets[0];
3015 queue->head.next = 0;
3016 queue->head.prev = num_tag_pages - 1;
3017 queue->head.cell_count = num_tag_pages;
3018 cell_from_idx(0)->prev = cell_idx(queue);
3019 cell_from_idx(num_tag_pages - 1)->next = cell_idx(queue);
3020 bit_set(list->mask, 0);
3021 list->count = num_tag_pages;
3022
3023 for (mte_free_queue_idx_t idx = MTE_FREE_UNTAGGABLE_0;
3024 idx < MTE_FREE_NOT_QUEUED; idx++) {
3025 for (uint32_t i = 0; i < MAX_COLORS; i++) {
3026 vm_page_queue_init(mteinfo_free_queue_head(idx, i));
3027 }
3028 }
3029
3030 #ifndef VM_MTE_FF_VERIFY
3031 vm_page_free_queue_init(&mte_claimable_queue);
3032 #endif /* VM_MTE_FF_VERIFY */
3033 }
3034
3035 #if HIBERNATION
3036
3037 void
3038 mteinfo_free_queue_foreach(void (^block)(vm_page_t))
3039 {
3040 for (cell_idx_t cidx = 0; cidx < mte_tag_storage_count; cidx++) {
3041 cell_t *cell = cell_from_idx(cidx);
3042 ppnum_t pnum = cell_first_covered_pnum(cell);
3043 uint32_t mask = cell->free_mask;
3044
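/*
* Walk every set bit of the free mask: ffs() yields the lowest set
* bit (1-based), and "mask &= mask - 1" clears it.
*/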
3045 while (mask) {
3046 block(vm_page_find_canonical(pnum + ffs(mask) - 1));
3047 mask &= mask - 1;
3048 }
3049
3050 if (cell->state == MTE_STATE_INACTIVE) {
3051 block(vm_tag_storage_page_get(cidx));
3052 }
3053 }
3054 }
3055
3056 #endif /* HIBERNATION */
3057 #ifndef VM_MTE_FF_VERIFY
3058
3059 /* List that tracks tag storage pages until mte_tags_object is initialized. */
3060 __startup_data
3061 static vm_page_list_t mte_tag_storage_startup_list;
3062
3063 void
3064 mteinfo_tag_storage_release_startup(vm_page_t tag_page)
3065 {
3066 cell_t *cell = cell_from_tag_storage_page(tag_page);
3067 ppnum_t tag_pnum = VM_PAGE_GET_PHYS_PAGE(tag_page);
3068 ppnum_t first_pnum = cell_first_covered_pnum(cell);
3069 vm_memory_class_t class = VM_MEMORY_CLASS_TAG_STORAGE;
3070 bool deactivate = true;
3071 uint32_t mte_count = 0;
3072
3073 /*
3074 * If this is a tag storage page we won't even classify as tag
3075 * storage. Just give it to the normal free queues.
3076 *
3077 * Otherwise, keep about 1/8 of the tag storage pages around,
3078 * which should be vastly sufficient to boot. The refill thread
3079 * and various passive policies will rebalance this later.
3080 *
3081 * Note that this code implicitly relies on the fact that
3082 * the tag storage is toward the end of the vm pages array:
3083 * we only keep tag storage pages around that have all 32 covered pages free,
3084 * but pages that haven't been created yet appear as "used".
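*
* For example, assuming mte_tag_storage_count is 4096, the "about
* 1/8" cap above works out to 512 active tag storage pages.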
3085 */
3086
3087 assert(pmap_is_tag_storage_page(tag_pnum));
3088
3089 if (pmap_tag_storage_is_discarded(tag_pnum)) {
3090 mteinfo_tag_storage_set_retired(tag_page);
3091 return;
3092 } else if (pmap_tag_storage_is_recursive(tag_pnum)) {
3093 VM_COUNTER_INC(&vm_page_recursive_tag_storage_count);
3094 class = VM_MEMORY_CLASS_DEAD_TAG_STORAGE;
3095 } else if (pmap_tag_storage_is_unmanaged(tag_pnum)) {
3096 VM_COUNTER_INC(&vm_page_unmanaged_tag_storage_count);
3097 class = VM_MEMORY_CLASS_DEAD_TAG_STORAGE;
3098 } else {
3099 for (uint32_t i = 0; i < MTE_PAGES_PER_TAG_PAGE; i++) {
3100 mte_count += pmap_is_tagged_page(first_pnum + i);
3101 }
3102
3103 if (cell_free_page_count(*cell) == MTE_PAGES_PER_TAG_PAGE &&
3104 mteinfo_tag_storage_active(true) < mte_tag_storage_count / 8) {
3105 deactivate = false;
3106 } else if (mte_count) {
3107 deactivate = false;
3108 }
3109 }
3110
3111 if (deactivate) {
3112 pmap_unmake_tag_storage_page(tag_pnum);
3113 if (class == VM_MEMORY_CLASS_DEAD_TAG_STORAGE) {
3114 vm_page_free_queue_enter(class, tag_page, tag_pnum);
3115 } else {
3116 tag_page->vmp_q_state = VM_PAGE_ON_FREE_Q;
3117 mteinfo_tag_storage_set_inactive(tag_page, true);
3118 }
3119 return;
3120 }
3121
3122 mteinfo_tag_storage_set_active(tag_page, mte_count, true);
3123 vm_page_list_push(&mte_tag_storage_startup_list, tag_page);
3124 }
3125
3126 /*!
3127 * @function mteinfo_tag_storage_startup_list_flush()
3128 *
3129 * @abstract
3130 * Adds active tag storage pages to the mte_tags_object.
3131 *
3132 * @discussion
3133 * Adds the list of active tag storage pages updated by @see
3134 * mteinfo_tag_storage_release_startup to the mte_tags_object. This must be
3135 * called at some point after the last @see mteinfo_tag_storage_release_startup
3136 * call.
3137 */
3138 __startup_func
3139 static void
3140 mteinfo_tag_storage_startup_list_flush(void)
3141 {
3142 vm_page_t page;
3143
3144 vm_object_lock(mte_tags_object);
3145 vm_page_lock_queues();
3146
3147 vm_page_list_foreach_consume(page, &mte_tag_storage_startup_list) {
3148 mteinfo_tag_storage_wire_locked(page);
3149 }
3150
3151 vm_page_unlock_queues();
3152 vm_object_unlock(mte_tags_object);
3153 }
3154 STARTUP(KMEM, STARTUP_RANK_FIRST, mteinfo_tag_storage_startup_list_flush);
3155
3156 /*!
3157 * @abstract
3158 * Initializes the percpu mte queues and locks.
3159 */
3160 __startup_func
3161 static void
3162 mteinfo_tag_storage_lock_init(void)
3163 {
3164 percpu_foreach(mte_pcpu, mte_pcpu) {
3165 lck_ticket_init(&mte_pcpu->free_claimed_lock,
3166 &vm_page_lck_grp_bucket);
3167 vm_page_queue_init(&mte_pcpu->free_claimed_pages);
3168 }
3169 }
3170 STARTUP(PERCPU, STARTUP_RANK_MIDDLE, mteinfo_tag_storage_lock_init);
3171
3172 /*!
3173 * @function mteinfo_init_fill_thread
3174 *
3175 * @abstract
3176 * Creates the MTE fill thread.
3177 */
3178 __startup_func
3179 static void
3180 mteinfo_init_fill_thread(void)
3181 {
3182 kern_return_t result;
3183
3184 if (!is_mte_enabled) {
3185 return;
3186 }
3187
3188 result = kernel_thread_start_priority(mteinfo_fill_continue, NULL, BASEPRI_VM,
3189 &vm_mte_fill_thread);
3190
3191 if (result != KERN_SUCCESS) {
3192 panic("Failed to create MTE fill thread.");
3193 }
3194
3195 thread_set_thread_name(vm_mte_fill_thread, "VM_mte_fill");
3196 thread_deallocate(vm_mte_fill_thread);
3197 }
3198 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, mteinfo_init_fill_thread);
3199
3200 static ppnum_t
3201 mteinfo_tag_storage_mark_unmanaged_range(cell_idx_t idx, ppnum_t pnum)
3202 {
3203 cell_t *end_cell = cell_from_covered_ppnum(pnum);
3204 cell_idx_t end_idx = cell_idx(end_cell);
3205 bool locked = false;
3206
3207 for (; idx < end_idx; idx++) {
3208 cell_t *cell = cell_from_idx(idx);
3209 vm_page_t tag_page = vm_tag_storage_page_get(idx);
3210
3211 if (!locked) {
3212 vm_free_page_lock_spin();
3213 locked = true;
3214 }
3215
3216 if (pmap_tag_storage_is_discarded(VM_PAGE_GET_PHYS_PAGE(tag_page))) {
3217 mteinfo_tag_storage_set_retired(tag_page);
3218 continue;
3219 }
3220
3221 if (cell->mte_page_count != 0) {
3222 /*
3223 * This can happen if some tagged pmap steal
3224 * has not ml_static_mfree()d these pages back
3225 */
3226 continue;
3227 }
3228
3229 if (cell->state == MTE_STATE_DISABLED) {
3230 /*
3231 * Probably an ECC retired page.
3232 */
3233 continue;
3234 }
3235
3236 mteinfo_tag_storage_set_unmanaged(cell,
3237 vm_tag_storage_page_get(idx));
3238 }
3239
3240 if (locked) {
3241 vm_free_page_unlock();
3242 }
3243
3244 return end_idx + 1;
3245 }
3246
3247 static void
3248 mteinfo_tag_storage_unmanaged_discover(void)
3249 {
3250 uint32_t count = vm_page_unmanaged_tag_storage_count;
3251 cell_idx_t cur_idx = 0;
3252 ppnum_t pnum;
3253
3254 if (!is_mte_enabled) {
3255 return;
3256 }
3257
3258 vm_pages_radix_for_each_pnum(pnum) {
3259 cur_idx = mteinfo_tag_storage_mark_unmanaged_range(cur_idx, pnum);
3260 }
3261 mteinfo_tag_storage_mark_unmanaged_range(cur_idx,
3262 vm_pages_first_pnum);
3263
3264 printf("MTE: discovered %d tag storage pages for unmanaged memory\n",
3265 vm_page_unmanaged_tag_storage_count - count);
3266 }
3267 STARTUP(LOCKDOWN, STARTUP_RANK_LAST, mteinfo_tag_storage_unmanaged_discover);
3268
3269 extern boolean_t get_range_bounds(char *c, int64_t *lower, int64_t *upper);
3270 static void
3271 mteinfo_tag_storage_process_vm_tags(void)
3272 {
3273 char *vm_tags_str;
3274
3275 if (!vm_mte_enable_tag_storage_grab) {
3276 return;
3277 }
3278
3279 vm_tags_str = vm_mte_tag_storage_for_vm_tags;
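/*
* For example, assuming get_range_bounds() parses "2-5" into a lower
* bound of 2 and an upper bound of 5, a boot-arg value of "2-5" sets
* bits 2 through 5 in vm_mte_tag_storage_for_vm_tags_mask, allowing
* allocations with those VM tags to be placed in tag storage pages.
*/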
3280 while (*vm_tags_str) {
3281 uint64_t loop_end;
3282 boolean_t ret;
3283 int64_t start = 1, end = VM_MEMORY_COUNT;
3284
3285 ret = get_range_bounds(vm_tags_str, &start, &end);
3286 loop_end = (ret) ? end : start;
3287 for (int64_t i = start; i <= loop_end; i++) {
3288 bitmap_set(vm_mte_tag_storage_for_vm_tags_mask, (uint)i);
3289 }
3290
3291 /* Skip to the next ',' */
3292 while (*vm_tags_str != ',') {
3293 if (*vm_tags_str == '\0') {
3294 break;
3295 }
3296 vm_tags_str++;
3297 }
3298
3299 if (*vm_tags_str == ',') {
3300 vm_tags_str++;
3301 } else {
3302 assert(*vm_tags_str == '\0');
3303 break;
3304 }
3305 }
3306 }
3307 STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, mteinfo_tag_storage_process_vm_tags);
3308
3309 #pragma mark Counter methods
3310
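/*!
 * @abstract
 * Returns a fragmentation metric for active tag storage, in permille.
 *
 * @discussion
 * A rough sketch of the math below: occupancy is computed as
 * 1000 * vm_page_tagged_count / (active tag storage pages *
 * MTE_PAGES_PER_TAG_PAGE), and the returned value is 1000 minus that.
 * For example, with 100 active tag storage pages and 2400 tagged
 * pages, assuming MTE_PAGES_PER_TAG_PAGE is 32, occupancy is 750 and
 * the result is 250.
 *
 * @param actual Whether tag storage pages on the active(0) list
 * should also be counted as active.
 */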
3311 uint32_t
3312 mteinfo_tag_storage_fragmentation(bool actual)
3313 {
3314 uint32_t ts_active;
3315 uint32_t value;
3316
3317 vm_free_page_lock_spin();
3318 ts_active = mte_info_lists[MTE_LIST_ACTIVE_IDX].count;
3319 if (actual) {
3320 ts_active += mte_info_lists[MTE_LIST_ACTIVE_0_IDX].count;
3321 }
3322 if (ts_active) {
3323 value = 1000 * vm_page_tagged_count;
3324 value /= (ts_active * MTE_PAGES_PER_TAG_PAGE);
3325 } else {
3326 value = 1000;
3327 }
3328 vm_free_page_unlock();
3329
3330 return 1000 - value;
3331 }
3332
3333 uint32_t
3334 mteinfo_tag_storage_active(bool fq_locked)
3335 {
3336 uint32_t active;
3337
3338 if (!fq_locked) {
3339 vm_free_page_lock_spin();
3340 }
3341
3342 active = mte_info_lists[MTE_LIST_ACTIVE_0_IDX].count +
3343 mte_info_lists[MTE_LIST_ACTIVE_IDX].count;
3344
3345 if (!fq_locked) {
3346 vm_free_page_unlock();
3347 }
3348
3349 return active;
3350 }
3351
3352 uint32_t
3353 mteinfo_tag_storage_free_pages_for_covered(const struct vm_page *page)
3354 {
3355 ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(page);
3356
3357 return cell_free_page_count(*cell_from_covered_ppnum(pnum));
3358 }
3359
3360 void
3361 mteinfo_increment_wire_count(vm_page_t tag_page)
3362 {
3363 if (vm_page_in_tag_storage_array(tag_page) &&
3364 vm_page_is_tag_storage(tag_page)) {
3365 VM_COUNTER_ATOMIC_INC(&vm_page_wired_tag_storage_count);
3366
3367 DTRACE_VM1(vm_tag_storage_wired, vm_page_t, tag_page);
3368 }
3369 }
3370
3371 void
3372 mteinfo_decrement_wire_count(vm_page_t tag_page, bool pqs_locked)
3373 {
3374 LCK_MTX_ASSERT(&vm_page_queue_lock,
3375 pqs_locked ? LCK_MTX_ASSERT_OWNED : LCK_MTX_ASSERT_NOTOWNED);
3376 LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_NOTOWNED);
3377
3378 if (vm_page_in_tag_storage_array(tag_page) &&
3379 VM_PAGE_OBJECT(tag_page) != mte_tags_object &&
3380 vm_page_is_tag_storage(tag_page)) {
3381 VM_COUNTER_ATOMIC_DEC(&vm_page_wired_tag_storage_count);
3382
3383 DTRACE_VM1(vm_tag_storage_unwired, vm_page_t, tag_page);
3384
3385 if (tag_page->vmp_ts_wanted) {
3386 /*
3387 * Many callers have the page queue lock held in spin
3388 * when calling this, and mteinfo_tag_storage_wakeup()
3389 * needs to acquire a mutex.
3390 */
3391 if (pqs_locked) {
3392 vm_page_lockconvert_queues();
3393 }
3394 mteinfo_tag_storage_wakeup(tag_page, false);
3395 }
3396 }
3397 }
3398
3399 bool
3400 mteinfo_vm_tag_can_use_tag_storage(vm_tag_t vm_tag)
3401 {
3402 return bitmap_test(vm_mte_tag_storage_for_vm_tags_mask, (uint)vm_tag);
3403 }
3404
3405
3406 void
3407 kdp_mteinfo_snapshot(struct mte_info_cell * __counted_by(count) cells, size_t count)
3408 {
3409 release_assert(count == mte_tag_storage_count);
3410
3411 if (not_in_kdp) {
3412 panic("panic: kdp_mteinfo_fill called outside of kernel debugger");
3413 }
3414
3415 for (cell_idx_t cidx = 0; cidx < mte_tag_storage_count; cidx++) {
3416 cell_t *cell = cell_from_idx(cidx);
3417 ppnum_t pnum = cell_first_covered_pnum(cell);
3418 vm_page_t mem;
3419 uint8_t wired_count = 0, wired_tagged_count = 0, kernel_wired_tagged_count = 0;
3420
3421 for (ppnum_t i = 0; i < MTE_PAGES_PER_TAG_PAGE; i++) {
3422 mem = vm_page_find_canonical(pnum + i);
3423 if (mem && VM_PAGE_WIRED(mem)) {
3424 wired_count++;
3425 if (mem->vmp_using_mte) {
3426 if (VM_PAGE_OBJECT(mem) == kernel_object_tagged) {
3427 kernel_wired_tagged_count++;
3428 } else {
3429 wired_tagged_count++;
3430 }
3431 }
3432 }
3433 }
3434
3435 cells[cidx] = (struct mte_info_cell) {
3436 .mic_state = cell->state,
3437 .mic_tagged_count = cell->mte_page_count,
3438 .mic_free_count = (uint8_t)cell_free_page_count(*cell),
3439 .mic_wired_count = wired_count,
3440 .mic_wired_tagged_count = wired_tagged_count,
3441 .mic_kernel_wired_tagged_count = kernel_wired_tagged_count
3442 };
3443 }
3444 }
3445 #endif /* VM_MTE_FF_VERIFY */
3446
3447 #endif /* HAS_MTE */
3448