/*
 * Copyright (c) 2000-2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <mach_assert.h>

#include <vm/pmap.h>
#include <vm/vm_map_xnu.h>
#include <vm/vm_kern_xnu.h>
#include <vm/vm_page_internal.h>
#include <kern/ledger.h>
#include <kern/zalloc_internal.h>
#include <i386/pmap_internal.h>

void pmap_remove_range(
	pmap_t pmap,
	vm_map_offset_t va,
	pt_entry_t *spte,
	pt_entry_t *epte);

static void pmap_remove_range_options(
	pmap_t pmap,
	vm_map_offset_t va,
	pt_entry_t *spte,
	pt_entry_t *epte,
	int options);

void pmap_reusable_range(
	pmap_t pmap,
	vm_map_offset_t va,
	pt_entry_t *spte,
	pt_entry_t *epte,
	boolean_t reusable);

pt_entry_t *PTE_corrupted_ptr;

#if DEVELOPMENT || DEBUG
int pmap_inject_pte_corruption;
uint32_t pmap_update_clear_pte_count;
uint32_t pmap_update_invalid_pte_count;
#endif

/*
 * The Intel platform can nest at the PDE level, so NBPDE (i.e. 2MB) at a time,
 * on an NBPDE boundary.
 */

uint64_t
pmap_shared_region_size_min(__unused pmap_t pmap)
{
	return NBPDE;
}

uint64_t
pmap_commpage_size_min(__unused pmap_t pmap)
{
	return NBPDE;
}

void
pmap_set_shared_region(
	pmap_t grand __unused,
	pmap_t subord __unused,
	addr64_t vstart __unused,
	uint64_t size __unused)
{
}

kern_return_t
pmap_fork_nest(
	pmap_t old_pmap __unused,
	pmap_t new_pmap __unused)
{
	return KERN_SUCCESS;
}

/*
 * kern_return_t pmap_nest(grand, subord, va_start, size)
 *
 *	grand    = the pmap that we will nest subord into
 *	subord   = the pmap that goes into the grand
 *	va_start = start of range in pmap to be inserted
 *	size     = size of nest area (up to 16TB)
 *
 *	Inserts a pmap into another. This is used to implement shared segments.
 *
 *	Note that we depend upon higher-level VM locks to ensure that things
 *	don't change while we are doing this. For example, the VM should not be
 *	doing any pmap enters while it is nesting, nor performing two nests at
 *	once.
 */

/*
 * This routine can nest subtrees either at the PDPT level (1GiB) or at the
 * PDE level (2MiB). We currently disallow disparate offsets for the "subord"
 * container and the "grand" parent. A minor optimization to consider for the
 * future: make the "subord" truly a container rather than a full-fledged
 * pagetable hierarchy which can be unnecessarily sparse (DRK).
 */

kern_return_t
pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, uint64_t size)
{
	vm_map_offset_t vaddr;
	pd_entry_t *pde, *npde;
	unsigned int i;
	uint64_t num_pde;

	assert(!is_ept_pmap(grand));
	assert(!is_ept_pmap(subord));

	if ((size & (pmap_shared_region_size_min(grand) - 1)) ||
	    (va_start & (pmap_shared_region_size_min(grand) - 1)) ||
	    ((size >> 28) > 65536)) {   /* Max size we can nest is 16TB */
		return KERN_INVALID_VALUE;
	}

	if (size == 0) {
		panic("pmap_nest: size is invalid - %016llX", size);
	}

	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(va_start));

	vaddr = (vm_map_offset_t)va_start;
	num_pde = size >> PDESHIFT;

	PMAP_LOCK_EXCLUSIVE(subord);

	subord->pm_shared = TRUE;

	for (i = 0; i < num_pde;) {
		if (((vaddr & PDPTMASK) == 0) && (num_pde - i) >= NPDEPG) {
			npde = pmap64_pdpt(subord, vaddr);

			while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
				PMAP_UNLOCK_EXCLUSIVE(subord);
				pmap_expand_pdpt(subord, vaddr, PMAP_EXPAND_OPTIONS_NONE);
				PMAP_LOCK_EXCLUSIVE(subord);
				npde = pmap64_pdpt(subord, vaddr);
			}
			*npde |= INTEL_PDPTE_NESTED;
			vaddr += NBPDPT;
			i += (uint32_t)NPDEPG;
		} else {
			npde = pmap_pde(subord, vaddr);

			while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
				PMAP_UNLOCK_EXCLUSIVE(subord);
				pmap_expand(subord, vaddr, PMAP_EXPAND_OPTIONS_NONE);
				PMAP_LOCK_EXCLUSIVE(subord);
				npde = pmap_pde(subord, vaddr);
			}
			vaddr += NBPDE;
			i++;
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(subord);

	vaddr = (vm_map_offset_t)va_start;

	PMAP_LOCK_EXCLUSIVE(grand);

	for (i = 0; i < num_pde;) {
		pd_entry_t tpde;

		if (((vaddr & PDPTMASK) == 0) && ((num_pde - i) >= NPDEPG)) {
			npde = pmap64_pdpt(subord, vaddr);
			if (npde == 0) {
				panic("pmap_nest: no PDPT, subord %p nstart 0x%llx", subord, vaddr);
			}
			tpde = *npde;
			pde = pmap64_pdpt(grand, vaddr);
			if (0 == pde) {
				PMAP_UNLOCK_EXCLUSIVE(grand);
				pmap_expand_pml4(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
				PMAP_LOCK_EXCLUSIVE(grand);
				pde = pmap64_pdpt(grand, vaddr);
			}
			if (pde == 0) {
				panic("pmap_nest: no PDPT, grand %p vaddr 0x%llx", grand, vaddr);
			}
			pmap_store_pte(FALSE, pde, tpde);
			vaddr += NBPDPT;
			i += (uint32_t) NPDEPG;
		} else {
			npde = pmap_pde(subord, vaddr);
			if (npde == 0) {
				panic("pmap_nest: no npde, subord %p vaddr 0x%llx", subord, vaddr);
			}
			tpde = *npde;
			pde = pmap_pde(grand, vaddr);
			if (0 == pde) {
				PMAP_UNLOCK_EXCLUSIVE(grand);
				pmap_expand_pdpt(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
				PMAP_LOCK_EXCLUSIVE(grand);
				pde = pmap_pde(grand, vaddr);
			}

			if (pde == 0) {
				panic("pmap_nest: no pde, grand %p vaddr 0x%llx", grand, vaddr);
			}
			vaddr += NBPDE;
			pmap_store_pte(FALSE, pde, tpde);
			i++;
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(grand);

	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, KERN_SUCCESS);

	return KERN_SUCCESS;
}
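
/*
 * Illustrative sketch (not compiled; the caller pmaps and base address are
 * hypothetical): nesting a shared-region pmap into a task pmap. Only the
 * alignment and size constraints shown here are actually enforced by
 * pmap_nest() above.
 */
#if 0
static void
pmap_nest_example(pmap_t task_pmap, pmap_t shared_pmap, addr64_t base)
{
	uint64_t size = 16 * NBPDE;     /* must be a multiple of NBPDE (2MiB) */

	assert((base & (NBPDE - 1)) == 0);      /* base must be NBPDE-aligned */

	kern_return_t kr = pmap_nest(task_pmap, shared_pmap, base, size);
	if (kr != KERN_SUCCESS) {
		panic("pmap_nest_example: nest failed: %d", kr);
	}
}
#endif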

/*
 * kern_return_t pmap_unnest(grand, vaddr)
 *
 *	grand = the pmap that we will un-nest subord from
 *	vaddr = start of range in pmap to be unnested
 *
 *	Removes a pmap from another. This is used to implement shared segments.
 */

kern_return_t
pmap_unnest(pmap_t grand, addr64_t vaddr, uint64_t size)
{
	pd_entry_t *pde;
	unsigned int i;
	uint64_t num_pde;
	addr64_t va_start, va_end;
	uint64_t npdpt = PMAP_INVALID_PDPTNUM;

	PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));

	if ((size & (pmap_shared_region_size_min(grand) - 1)) ||
	    (vaddr & (pmap_shared_region_size_min(grand) - 1))) {
		panic("pmap_unnest(%p,0x%llx,0x%llx): unaligned...",
		    grand, vaddr, size);
	}

	assert(!is_ept_pmap(grand));

	/* align everything to PDE boundaries */
	va_start = vaddr & ~(NBPDE - 1);

	if (os_add_overflow(vaddr, size + NBPDE - 1, &va_end)) {
		panic("pmap_unnest: Overflow when calculating range end: s=0x%llx sz=0x%llx\n", vaddr, size);
	}

	va_end &= ~(NBPDE - 1);
	size = va_end - va_start;

	PMAP_LOCK_EXCLUSIVE(grand);

	num_pde = size >> PDESHIFT;
	vaddr = va_start;

	for (i = 0; i < num_pde;) {
		if (pdptnum(grand, vaddr) != npdpt) {
			npdpt = pdptnum(grand, vaddr);
			pde = pmap64_pdpt(grand, vaddr);
			if (pde && (*pde & INTEL_PDPTE_NESTED)) {
				pmap_store_pte(FALSE, pde, (pd_entry_t)0);
				i += (uint32_t) NPDEPG;
				vaddr += NBPDPT;
				continue;
			}
		}
		pde = pmap_pde(grand, (vm_map_offset_t)vaddr);
		if (pde == 0) {
			panic("pmap_unnest: no pde, grand %p vaddr 0x%llx", grand, vaddr);
		}
		pmap_store_pte(FALSE, pde, (pd_entry_t)0);
		i++;
		vaddr += NBPDE;
	}

	PMAP_UPDATE_TLBS(grand, va_start, va_end);

	PMAP_UNLOCK_EXCLUSIVE(grand);

	PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);

	return KERN_SUCCESS;
}
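
/*
 * Illustrative sketch (not compiled; arguments are hypothetical): undoing a
 * nesting established by pmap_nest(). pmap_unnest() rounds the range out to
 * NBPDE boundaries itself, but panics if the arguments are not aligned to
 * pmap_shared_region_size_min().
 */
#if 0
static void
pmap_unnest_example(pmap_t task_pmap, addr64_t base, uint64_t size)
{
	kern_return_t kr = pmap_unnest(task_pmap, base, size);
	assert(kr == KERN_SUCCESS);
}
#endif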

kern_return_t
pmap_unnest_options(
	pmap_t grand,
	addr64_t vaddr,
	__unused uint64_t size,
	__unused unsigned int options)
{
	return pmap_unnest(grand, vaddr, size);
}

/* Invoked by the Mach VM to determine the platform-specific unnest region */

boolean_t
pmap_adjust_unnest_parameters(pmap_t p, vm_map_offset_t *s, vm_map_offset_t *e)
{
	pd_entry_t *pdpte;
	boolean_t rval = FALSE;

	PMAP_LOCK_EXCLUSIVE(p);

	pdpte = pmap64_pdpt(p, *s);
	if (pdpte && (*pdpte & INTEL_PDPTE_NESTED)) {
		*s &= ~(NBPDPT - 1);
		rval = TRUE;
	}

	pdpte = pmap64_pdpt(p, *e);
	if (pdpte && (*pdpte & INTEL_PDPTE_NESTED)) {
		*e = ((*e + NBPDPT) & ~(NBPDPT - 1));
		rval = TRUE;
	}

	PMAP_UNLOCK_EXCLUSIVE(p);

	return rval;
}

pmap_paddr_t
pmap_find_pa(pmap_t pmap, addr64_t va)
{
	pt_entry_t *ptp;
	pd_entry_t *pdep;
	pd_entry_t pde;
	pt_entry_t pte;
	boolean_t is_ept, locked = FALSE;
	pmap_paddr_t pa = 0;

	is_ept = is_ept_pmap(pmap);

	if ((pmap != kernel_pmap) && not_in_kdp) {
		PMAP_LOCK_EXCLUSIVE(pmap);
		locked = TRUE;
	} else {
		mp_disable_preemption();
	}

	if (os_ref_get_count(&pmap->ref_count) == 0) {
		goto pfp_exit;
	}

	pdep = pmap_pde(pmap, va);

	if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & PTE_VALID_MASK(is_ept))) {
		if (pde & PTE_PS) {
			pa = pte_to_pa(pde) + (va & I386_LPGMASK);
		} else {
			ptp = pmap_pte(pmap, va);
			if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & PTE_VALID_MASK(is_ept)) != 0)) {
				pa = pte_to_pa(pte) + (va & PAGE_MASK);
			}
		}
	}
pfp_exit:
	if (locked) {
		PMAP_UNLOCK_EXCLUSIVE(pmap);
	} else {
		mp_enable_preemption();
	}

	return pa;
}

/*
 * pmap_find_phys returns the (4K) physical page number containing a
 * given virtual address in a given pmap.
 * Note that pmap_pte may return a pde if this virtual address is
 * mapped by a large page, and this is taken into account in order
 * to return the correct page number in this case.
 */
ppnum_t
pmap_find_phys(pmap_t pmap, addr64_t va)
{
	ppnum_t ppn = 0;
	pmap_paddr_t pa = 0;

	pa = pmap_find_pa(pmap, va);
	ppn = (ppnum_t) i386_btop(pa);

	return ppn;
}
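
/*
 * Illustrative sketch (not compiled; the helper name is hypothetical):
 * pmap_find_phys() returns 0 when no valid translation exists, so a caller
 * can use it to distinguish mapped from unmapped kernel virtual addresses.
 */
#if 0
static boolean_t
example_va_is_mapped(addr64_t va)
{
	ppnum_t ppn = pmap_find_phys(kernel_pmap, va);

	return ppn != 0;        /* 0 => no valid mapping at 'va' */
}
#endif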

ppnum_t
pmap_find_phys_nofault(pmap_t pmap, addr64_t va)
{
	if ((pmap == kernel_pmap) ||
	    ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map)))) {
		return pmap_find_phys(pmap, va);
	}
	return 0;
}

/*
 * pmap_get_prot returns the equivalent VM page protections
 * set on a given address, 'va'. This function is used in the
 * ml_static_verify_page_protections() routine, which is used
 * by the kext loading code to validate that the TEXT segment
 * of a kext is mapped executable.
 */
kern_return_t
pmap_get_prot(pmap_t pmap, addr64_t va, vm_prot_t *protp)
{
	pt_entry_t *ptp;
	pd_entry_t *pdep;
	pd_entry_t pde;
	pt_entry_t pte;
	boolean_t is_ept, locked = FALSE;
	kern_return_t retval = KERN_FAILURE;
	vm_prot_t prot = 0;

	is_ept = is_ept_pmap(pmap);

	if ((pmap != kernel_pmap) && not_in_kdp) {
		PMAP_LOCK_EXCLUSIVE(pmap);
		locked = TRUE;
	} else {
		mp_disable_preemption();
	}

	if (os_ref_get_count(&pmap->ref_count) == 0) {
		goto pfp_exit;
	}

	pdep = pmap_pde(pmap, va);

	if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & PTE_VALID_MASK(is_ept))) {
		if (pde & PTE_PS) {
			prot = VM_PROT_READ;

			if (pde & PTE_WRITE(is_ept)) {
				prot |= VM_PROT_WRITE;
			}
			if (PTE_IS_EXECUTABLE(is_ept, pde)) {
				prot |= VM_PROT_EXECUTE;
			}
			retval = KERN_SUCCESS;
		} else {
			ptp = pmap_pte(pmap, va);
			if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & PTE_VALID_MASK(is_ept)) != 0)) {
				prot = VM_PROT_READ;

				if (pte & PTE_WRITE(is_ept)) {
					prot |= VM_PROT_WRITE;
				}
				if (PTE_IS_EXECUTABLE(is_ept, pte)) {
					prot |= VM_PROT_EXECUTE;
				}
				retval = KERN_SUCCESS;
			}
		}
	}

pfp_exit:
	if (locked) {
		PMAP_UNLOCK_EXCLUSIVE(pmap);
	} else {
		mp_enable_preemption();
	}

	if (protp) {
		*protp = prot;
	}

	return retval;
}
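
/*
 * Illustrative sketch (not compiled; the helper name is hypothetical): the
 * kind of check ml_static_verify_page_protections() performs via
 * pmap_get_prot(), confirming that a kext TEXT page is mapped executable.
 */
#if 0
static boolean_t
example_page_is_executable(addr64_t va)
{
	vm_prot_t prot = VM_PROT_NONE;

	if (pmap_get_prot(kernel_pmap, va, &prot) != KERN_SUCCESS) {
		return FALSE;   /* no valid mapping at 'va' */
	}
	return (prot & VM_PROT_EXECUTE) != 0;
}
#endif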

/*
 * Update cache attributes for all extant managed mappings.
 * Assumes the PV list for this page is locked, and that the page
 * is managed. We assume that this physical page may be mapped in
 * both EPT and normal Intel PTEs, so we convert the attributes
 * to the corresponding format for each pmap.
 *
 * We assert that the passed set of attributes is a subset of the
 * PHYS_CACHEABILITY_MASK.
 */
void
pmap_update_cache_attributes_locked(ppnum_t pn, unsigned attributes)
{
	pv_rooted_entry_t pv_h, pv_e;
	pv_hashed_entry_t pvh_e, nexth;
	vm_map_offset_t vaddr;
	pmap_t pmap;
	pt_entry_t *ptep;
	boolean_t is_ept;
	unsigned ept_attributes;

	assert(IS_MANAGED_PAGE(pn));
	assert(((~PHYS_CACHEABILITY_MASK) & attributes) == 0);

	/* We don't support the PAT bit for EPT PTEs */
	if (attributes & INTEL_PTE_NCACHE) {
		ept_attributes = INTEL_EPT_NCACHE;
	} else {
		ept_attributes = INTEL_EPT_WB;
	}

	pv_h = pai_to_pvh(pn);
	/* TODO: translate the PHYS_* bits to PTE bits; while they're
	 * currently identical, they may not remain so.
	 * Potential optimization (here and in page_protect):
	 * parallel shootdowns, check for redundant
	 * attribute modifications.
	 */

	/*
	 * Alter attributes on all mappings
	 */
	if (pv_h->pmap != PMAP_NULL) {
		pv_e = pv_h;
		pvh_e = (pv_hashed_entry_t)pv_e;

		do {
			pmap = pv_e->pmap;
			vaddr = PVE_VA(pv_e);
			ptep = pmap_pte(pmap, vaddr);

			if (0 == ptep) {
				panic("pmap_update_cache_attributes_locked: Missing PTE, pmap: %p, pn: 0x%x vaddr: 0x%llx kernel_pmap: %p", pmap, pn, vaddr, kernel_pmap);
			}

			is_ept = is_ept_pmap(pmap);

			nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink);
			if (!is_ept) {
				pmap_update_pte(is_ept, ptep, PHYS_CACHEABILITY_MASK, attributes, true);
			} else {
				pmap_update_pte(is_ept, ptep, INTEL_EPT_CACHE_MASK, ept_attributes, true);
			}
			PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
			pvh_e = nexth;
		} while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h);
	}
}

void
x86_filter_TLB_coherency_interrupts(boolean_t dofilter)
{
	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);

	if (dofilter) {
		CPU_CR3_MARK_INACTIVE();
	} else {
		CPU_CR3_MARK_ACTIVE();
		mfence();
		pmap_update_interrupt();
	}
}


/*
 * Insert the given physical page (p) at
 * the specified virtual address (v) in the
 * target physical map with the protection requested.
 *
 * If specified, the page will be wired down, meaning
 * that the related pte cannot be reclaimed.
 *
 * NB: This is the only routine which MAY NOT lazy-evaluate
 * or lose information. That is, this routine must actually
 * insert this page into the given map NOW.
 */

kern_return_t
pmap_enter(
	pmap_t pmap,
	vm_map_offset_t vaddr,
	ppnum_t pn,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	pmap_mapping_type_t mapping_type)
{
	return pmap_enter_options(pmap, vaddr, pn, prot, fault_type, flags, wired, PMAP_EXPAND_OPTIONS_NONE, NULL, mapping_type);
}
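
/*
 * Illustrative sketch (not compiled; vaddr/pn are hypothetical, and
 * PMAP_MAPPING_TYPE_INFER is assumed to be an acceptable mapping type):
 * a minimal pmap_enter() call wiring one writable, cacheable page into
 * the kernel pmap.
 */
#if 0
static void
pmap_enter_example(vm_map_offset_t vaddr, ppnum_t pn)
{
	kern_return_t kr;

	kr = pmap_enter(kernel_pmap, vaddr, pn,
	    VM_PROT_READ | VM_PROT_WRITE,       /* protection */
	    VM_PROT_NONE,                       /* fault type */
	    0,                                  /* flags: default cacheability */
	    TRUE,                               /* wired */
	    PMAP_MAPPING_TYPE_INFER);           /* assumed mapping type */
	assert(kr == KERN_SUCCESS);
}
#endif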

#define PTE_LOCK(EPT) INTEL_PTE_SWLOCK

static inline void PTE_LOCK_LOCK(pt_entry_t *);
static inline void PTE_LOCK_UNLOCK(pt_entry_t *);

void
PTE_LOCK_LOCK(pt_entry_t *lpte)
{
	pt_entry_t pte;
plretry:
	while ((pte = __c11_atomic_load((_Atomic pt_entry_t *)lpte, memory_order_relaxed)) & PTE_LOCK(0)) {
		__builtin_ia32_pause();
	}
	if (__c11_atomic_compare_exchange_strong((_Atomic pt_entry_t *)lpte, &pte, pte | PTE_LOCK(0), memory_order_acquire_smp, TRUE)) {
		return;
	}

	goto plretry;
}

void
PTE_LOCK_UNLOCK(pt_entry_t *lpte)
{
	__c11_atomic_fetch_and((_Atomic pt_entry_t *)lpte, ~PTE_LOCK(0), memory_order_release_smp);
}
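
/*
 * Illustrative sketch (not compiled): the acquire/release pattern used around
 * PTE updates in pmap_enter_options() below. The software lock bit serializes
 * concurrent modifications of a single PTE while the pmap lock is only held
 * shared.
 */
#if 0
static void
pte_lock_example(pt_entry_t *ptep)
{
	PTE_LOCK_LOCK(ptep);    /* spin until the PTE_LOCK bit is acquired */
	/* ... read-modify-write *ptep, preserving the lock bit ... */
	PTE_LOCK_UNLOCK(ptep);  /* atomically clear the lock bit (release) */
}
#endif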

kern_return_t
pmap_enter_options_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	__unused void *arg,
	pmap_mapping_type_t mapping_type)
{
	return pmap_enter_options(pmap, v, intel_btop(pa), prot, fault_type, flags, wired, options, arg, mapping_type);
}

kern_return_t
pmap_enter_options(
	pmap_t pmap,
	vm_map_offset_t vaddr,
	ppnum_t pn,
	vm_prot_t prot,
	__unused vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	void *arg,
	__unused pmap_mapping_type_t mapping_type)
{
	pt_entry_t *pte = NULL;
	pv_rooted_entry_t pv_h;
	ppnum_t pai;
	pv_hashed_entry_t pvh_e;
	pv_hashed_entry_t pvh_new;
	pt_entry_t template;
	pmap_paddr_t old_pa;
	pmap_paddr_t pa = (pmap_paddr_t) i386_ptob(pn);
	boolean_t need_tlbflush = FALSE;
	boolean_t set_NX;
	char oattr;
	boolean_t old_pa_locked;
	/* 2MiB mappings are confined to x86_64 by VM */
	boolean_t superpage = flags & VM_MEM_SUPERPAGE;
	vm_object_t delpage_pm_obj = NULL;
	uint64_t delpage_pde_index = 0;
	pt_entry_t old_pte;
	kern_return_t kr = KERN_FAILURE;
	boolean_t is_ept;
	boolean_t is_altacct;
	boolean_t ptelocked = FALSE;

	pmap_intr_assert();

	if (__improbable(pmap == PMAP_NULL)) {
		return KERN_INVALID_ARGUMENT;
	}
	if (__improbable(pn == vm_page_guard_addr)) {
		return KERN_INVALID_ARGUMENT;
	}

	is_ept = is_ept_pmap(pmap);

	/* N.B. We can be supplied a zero page frame in the NOENTER case; it's an
	 * unused value for that scenario.
	 */
	assert(pn != vm_page_fictitious_addr);


	PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(vaddr), pn,
	    prot);

	if ((prot & VM_PROT_EXECUTE) || __improbable(is_ept && (prot & VM_PROT_UEXEC))) {
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

#if DEVELOPMENT || DEBUG
	if (__improbable(set_NX && (!nx_enabled || !pmap->nx_enabled))) {
		set_NX = FALSE;
	}

	if (__improbable(set_NX && (pmap == kernel_pmap) &&
	    ((pmap_disable_kstack_nx && (flags & VM_MEM_STACK)) ||
	    (pmap_disable_kheap_nx && !(flags & VM_MEM_STACK))))) {
		set_NX = FALSE;
	}
#endif

	pvh_new = PV_HASHED_ENTRY_NULL;
Retry:
	pvh_e = PV_HASHED_ENTRY_NULL;

	PMAP_LOCK_SHARED(pmap);

	/*
	 * Expand pmap to include this pte. Assume that
	 * pmap is always expanded to include enough hardware
	 * pages to map one VM page.
	 */
	if (__improbable(superpage)) {
		while ((pte = pmap_pde(pmap, vaddr)) == PD_ENTRY_NULL) {
			/* need room for another pde entry */
			PMAP_UNLOCK_SHARED(pmap);
			kr = pmap_expand_pdpt(pmap, vaddr, options);
			if (kr != KERN_SUCCESS) {
				goto done1;
			}
			PMAP_LOCK_SHARED(pmap);
		}
	} else {
		while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
			/*
			 * Must unlock to expand the pmap;
			 * going to grow pde level page(s)
			 */
			PMAP_UNLOCK_SHARED(pmap);
			kr = pmap_expand(pmap, vaddr, options);
			if (kr != KERN_SUCCESS) {
				goto done1;
			}
			PMAP_LOCK_SHARED(pmap);
		}
	}

	if (__improbable(options & PMAP_EXPAND_OPTIONS_NOENTER)) {
		PMAP_UNLOCK_SHARED(pmap);
		kr = KERN_SUCCESS;
		goto done1;
	}

	if (__improbable(superpage && *pte && !(*pte & PTE_PS))) {
		/*
		 * There is still an empty page table mapped that
		 * was used for a previous base page mapping.
		 * Remember the PDE and the PDE index, so that we
		 * can free the page at the end of this function.
		 */
		delpage_pde_index = pdeidx(pmap, vaddr);
		delpage_pm_obj = pmap->pm_obj;
		pmap_store_pte(is_ept, pte, 0);
	}

	PTE_LOCK_LOCK(pte);
	ptelocked = TRUE;

	old_pa = pte_to_pa(*pte);
	pai = pa_index(old_pa);
	old_pa_locked = FALSE;

	if (old_pa == 0 &&
	    PTE_IS_COMPRESSED(*pte, pte, pmap, vaddr)) {
		/*
		 * "pmap" should be locked at this point, so this should
		 * not race with another pmap_enter() or pmap_remove_range().
		 */
		assert(pmap != kernel_pmap);

		/* one less "compressed" */
		pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
		    PAGE_SIZE);
		if (*pte & PTE_COMPRESSED_ALT) {
			pmap_ledger_debit(
				pmap,
				task_ledgers.alternate_accounting_compressed,
				PAGE_SIZE);
		} else {
			/* was part of the footprint */
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
			    PAGE_SIZE);
		}
		/* marker will be cleared below */
	}

	/*
	 * If we have a previous managed page, lock the pv entry now. After
	 * we lock it, check to see if someone beat us to the lock and, if so,
	 * drop the lock.
	 */
	if ((0 != old_pa) && IS_MANAGED_PAGE(pai)) {
		LOCK_PVH(pai);
		old_pa_locked = TRUE;
		old_pa = pte_to_pa(*pte);
		if (0 == old_pa) {
			UNLOCK_PVH(pai);        /* another path beat us to it */
			old_pa_locked = FALSE;
		}
	}

	/*
	 * Special case if the incoming physical page is already mapped
	 * at this address.
	 */
	if (old_pa == pa) {
		pt_entry_t old_attributes =
		    *pte & ~(PTE_REF(is_ept) | PTE_MOD(is_ept) | PTE_LOCK(is_ept));

		/*
		 * May be changing its wired attribute or protection
		 */

		template = pa_to_pte(pa);

		if (__probable(!is_ept)) {
			template |= INTEL_PTE_VALID;
		} else {
			template |= INTEL_EPT_IPAT;
		}

		template |= pmap_get_cache_attributes(pa_index(pa), is_ept);

		/*
		 * We don't support passing VM_MEM_NOT_CACHEABLE flags for EPT PTEs
		 */
		if (!is_ept && (VM_MEM_NOT_CACHEABLE ==
		    (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)))) {
			if (!(flags & VM_MEM_GUARDED)) {
				template |= INTEL_PTE_PAT;
			}
			template |= INTEL_PTE_NCACHE;
		}
		if (pmap != kernel_pmap && !is_ept) {
			template |= INTEL_PTE_USER;
		}

		if (prot & VM_PROT_READ) {
			template |= PTE_READ(is_ept);
		}

		if (prot & VM_PROT_WRITE) {
			template |= PTE_WRITE(is_ept);
			if (is_ept && !pmap_ept_support_ad) {
				template |= PTE_MOD(is_ept);
				if (old_pa_locked) {
					assert(IS_MANAGED_PAGE(pai));
					pmap_phys_attributes[pai] |= PHYS_MODIFIED;
				}
			}
		}

		if (prot & VM_PROT_EXECUTE) {
			assert(set_NX == 0);
			template = pte_set_ex(template, is_ept);
		}

		if (__improbable(is_ept && (prot & VM_PROT_UEXEC))) {
			assert(set_NX == 0);
			template = pte_set_uex(template);
		}

		if (set_NX) {
			template = pte_remove_ex(template, is_ept);
		}

		if (wired) {
			template |= PTE_WIRED;
			if (!iswired(old_attributes)) {
				pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
			}
		} else {
			if (iswired(old_attributes)) {
				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
			}
		}

		if (superpage) {        /* this path can not be used */
			template |= PTE_PS;     /* to change the page size! */
		}
		if (old_attributes == template) {
			goto dont_update_pte;
		}

		/* Determine delta, PV locked */
		need_tlbflush =
		    ((old_attributes ^ template) != PTE_WIRED);

		/* Optimization: avoid TLB flush when adding writability */
		if (need_tlbflush == TRUE && !(old_attributes & PTE_WRITE(is_ept))) {
			if ((old_attributes ^ template) == PTE_WRITE(is_ept)) {
				need_tlbflush = FALSE;
			}
		}

		/* For hardware that doesn't have EPT AD support, we always set REFMOD for EPT PTEs */
		if (__improbable(is_ept && !pmap_ept_support_ad)) {
			template |= PTE_REF(is_ept);
			if (old_pa_locked) {
				assert(IS_MANAGED_PAGE(pai));
				pmap_phys_attributes[pai] |= PHYS_REFERENCED;
			}
		}

		/* store modified PTE and preserve RC bits */
		pt_entry_t npte, opte;

		assert((*pte & PTE_LOCK(is_ept)) != 0);

		do {
			opte = *pte;
			npte = template | (opte & (PTE_REF(is_ept) |
			    PTE_MOD(is_ept))) | PTE_LOCK(is_ept);
		} while (!pmap_cmpx_pte(pte, opte, npte));

		DTRACE_VM3(set_pte, uint64_t, vaddr, uint64_t, opte, uint64_t, npte);

dont_update_pte:
		if (old_pa_locked) {
			UNLOCK_PVH(pai);
			old_pa_locked = FALSE;
		}
		goto done2;
	}

	/*
	 * Outline of code from here:
	 *	1) If va was mapped, update TLBs, remove the mapping
	 *	   and remove old pvlist entry.
	 *	2) Add pvlist entry for new mapping
	 *	3) Enter new mapping.
	 *
	 * If the old physical page is not managed step 1) is skipped
	 * (except for updating the TLBs), and the mapping is
	 * overwritten at step 3). If the new physical page is not
	 * managed, step 2) is skipped.
	 */
	/* TODO: add opportunistic refmod collect */
	if (old_pa != (pmap_paddr_t) 0) {
		boolean_t was_altacct = FALSE;

		/*
		 * Don't do anything to pages outside valid memory here.
		 * Instead convince the code that enters a new mapping
		 * to overwrite the old one.
		 */

		/* invalidate the PTE */
		pmap_update_pte(is_ept, pte, PTE_VALID_MASK(is_ept), 0, true);
		/* propagate invalidate everywhere */
		PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
		/* remember reference and change */
		old_pte = *pte;
		oattr = (char) (old_pte & (PTE_MOD(is_ept) | PTE_REF(is_ept)));
		/* completely invalidate the PTE */
		pmap_store_pte(is_ept, pte, PTE_LOCK(is_ept));

		if (IS_MANAGED_PAGE(pai)) {
			/*
			 * Remove the mapping from the pvlist for
			 * this physical page.
			 * We'll end up with either a rooted pv or a
			 * hashed pv
			 */
			pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, &old_pte, &was_altacct);
		}

		if (IS_MANAGED_PAGE(pai)) {
			pmap_assert(old_pa_locked == TRUE);
			pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
			if (pmap != kernel_pmap) {
				/* update ledgers */
				if (was_altacct) {
					assert(IS_INTERNAL_PAGE(pai));
					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
				} else if (IS_REUSABLE_PAGE(pai)) {
					assert(!was_altacct);
					assert(IS_INTERNAL_PAGE(pai));
					pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
					/* was already not in phys_footprint */
				} else if (IS_INTERNAL_PAGE(pai)) {
					assert(!was_altacct);
					assert(!IS_REUSABLE_PAGE(pai));
					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
				} else {
					/* not an internal page */
					pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
				}
			}
			if (iswired(*pte)) {
				pmap_ledger_debit(pmap, task_ledgers.wired_mem,
				    PAGE_SIZE);
			}

			if (!is_ept) {
				pmap_phys_attributes[pai] |= oattr;
			} else {
				pmap_phys_attributes[pai] |= ept_refmod_to_physmap(oattr);
			}
		} else {
			/*
			 * old_pa is not managed.
			 * Do removal part of accounting.
			 */

			if (pmap != kernel_pmap) {
#if 00
				assert(pmap->stats.device > 0);
				OSAddAtomic(-1, &pmap->stats.device);
#endif
			}
			if (iswired(*pte)) {
				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
			}
		}
	}

	/*
	 * If we had a previously managed page locked, unlock it now.
	 */
	if (old_pa_locked) {
		UNLOCK_PVH(pai);
		old_pa_locked = FALSE;
	}

	pai = pa_index(pa);     /* now working with new incoming phys page */
	if (IS_MANAGED_PAGE(pai)) {
		/*
		 * Step 2) Enter the mapping in the PV list for this
		 * physical page.
		 */
		pv_h = pai_to_pvh(pai);

		LOCK_PVH(pai);

		if (pv_h->pmap == PMAP_NULL) {
			/*
			 * No mappings yet, use rooted pv
			 */
			pv_h->va_and_flags = vaddr;
			pv_h->pmap = pmap;
			queue_init(&pv_h->qlink);

			if (options & PMAP_OPTIONS_INTERNAL) {
				pmap_phys_attributes[pai] |= PHYS_INTERNAL;
			} else {
				pmap_phys_attributes[pai] &= ~PHYS_INTERNAL;
			}
			if (options & PMAP_OPTIONS_REUSABLE) {
				pmap_phys_attributes[pai] |= PHYS_REUSABLE;
			} else {
				pmap_phys_attributes[pai] &= ~PHYS_REUSABLE;
			}
			if ((options & PMAP_OPTIONS_ALT_ACCT) &&
			    IS_INTERNAL_PAGE(pai)) {
				pv_h->va_and_flags |= PVE_IS_ALTACCT;
				is_altacct = TRUE;
			} else {
				pv_h->va_and_flags &= ~PVE_IS_ALTACCT;
				is_altacct = FALSE;
			}
		} else {
			/*
			 * Add new pv_hashed_entry after header.
			 */
			if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
				pvh_e = pvh_new;
				pvh_new = PV_HASHED_ENTRY_NULL;
			} else if (PV_HASHED_ENTRY_NULL == pvh_e) {
				PV_HASHED_ALLOC(&pvh_e);
				if (PV_HASHED_ENTRY_NULL == pvh_e) {
					/*
					 * The pv list is empty. If we are on
					 * the kernel pmap we'll use one of
					 * the special private kernel pv_e's;
					 * otherwise, we need to unlock
					 * everything, zalloc a pv_e, and
					 * restart, bringing the pv_e in
					 * with us.
					 */
					if (kernel_pmap == pmap) {
						PV_HASHED_KERN_ALLOC(&pvh_e);
					} else {
						UNLOCK_PVH(pai);
						PTE_LOCK_UNLOCK(pte);
						PMAP_UNLOCK_SHARED(pmap);
						pmap_pv_throttle(pmap);
						pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
						goto Retry;
					}
				}
			}

			if (PV_HASHED_ENTRY_NULL == pvh_e) {
				panic("Mapping alias chain exhaustion, possibly induced by numerous kernel virtual double mappings");
			}

			pvh_e->va_and_flags = vaddr;
			pvh_e->pmap = pmap;
			pvh_e->ppn = pn;
			if ((options & PMAP_OPTIONS_ALT_ACCT) &&
			    IS_INTERNAL_PAGE(pai)) {
				pvh_e->va_and_flags |= PVE_IS_ALTACCT;
				is_altacct = TRUE;
			} else {
				pvh_e->va_and_flags &= ~PVE_IS_ALTACCT;
				is_altacct = FALSE;
			}
			pv_hash_add(pvh_e, pv_h);

			/*
			 * Remember that we used the pvlist entry.
			 */
			pvh_e = PV_HASHED_ENTRY_NULL;
		}

		/*
		 * only count the mapping
		 * for 'managed memory'
		 */
		pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
		if (pmap != kernel_pmap) {
			/* update ledgers */
			if (is_altacct) {
				/* internal but also alternate accounting */
				assert(IS_INTERNAL_PAGE(pai));
				pmap_ledger_credit(pmap, task_ledgers.internal, PAGE_SIZE);
				pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
				/* alternate accounting, so not in footprint */
			} else if (IS_REUSABLE_PAGE(pai)) {
				assert(!is_altacct);
				assert(IS_INTERNAL_PAGE(pai));
				pmap_ledger_credit(pmap, task_ledgers.reusable, PAGE_SIZE);
				/* internal but reusable: not in footprint */
			} else if (IS_INTERNAL_PAGE(pai)) {
				assert(!is_altacct);
				assert(!IS_REUSABLE_PAGE(pai));
				/* internal: add to footprint */
				pmap_ledger_credit(pmap, task_ledgers.internal, PAGE_SIZE);
				pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
			} else {
				/* not internal: not in footprint */
				pmap_ledger_credit(pmap, task_ledgers.external, PAGE_SIZE);
			}
		}
	} else if (last_managed_page == 0) {
		/* Account for early mappings created before "managed pages"
		 * are determined. Consider consulting the available DRAM map.
		 */
		pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
		if (pmap != kernel_pmap) {
#if 00
			OSAddAtomic(+1, &pmap->stats.device);
			PMAP_STATS_PEAK(pmap->stats.device);
#endif
		}
	}
	/*
	 * Step 3) Enter the mapping.
	 *
	 * Build a template to speed up entering -
	 * only the pfn changes.
	 */
	template = pa_to_pte(pa);

	if (!is_ept) {
		template |= INTEL_PTE_VALID;
	} else {
		template |= INTEL_EPT_IPAT;
	}

	/*
	 * DRK: It may be worth asserting on cache attribute flags that diverge
	 * from the existing physical page attributes.
	 */

	template |= pmap_get_cache_attributes(pa_index(pa), is_ept);

	/*
	 * We don't support passing VM_MEM_NOT_CACHEABLE flags for EPT PTEs
	 */
	if (!is_ept && (flags & VM_MEM_NOT_CACHEABLE)) {
		if (!(flags & VM_MEM_GUARDED)) {
			template |= INTEL_PTE_PAT;
		}
		template |= INTEL_PTE_NCACHE;
	}
	if (pmap != kernel_pmap && !is_ept) {
		template |= INTEL_PTE_USER;
	}
	if (prot & VM_PROT_READ) {
		template |= PTE_READ(is_ept);
	}
	if (prot & VM_PROT_WRITE) {
		template |= PTE_WRITE(is_ept);
		if (is_ept && !pmap_ept_support_ad) {
			template |= PTE_MOD(is_ept);
			if (IS_MANAGED_PAGE(pai)) {
				pmap_phys_attributes[pai] |= PHYS_MODIFIED;
			}
		}
	}
	if (prot & VM_PROT_EXECUTE) {
		assert(set_NX == 0);
		template = pte_set_ex(template, is_ept);
	}
	if (__improbable(is_ept && (prot & VM_PROT_UEXEC))) {
		assert(set_NX == 0);
		template = pte_set_uex(template);
	}

	if (set_NX) {
		template = pte_remove_ex(template, is_ept);
	}
	if (wired) {
		template |= INTEL_PTE_WIRED;
		pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
	}
	if (__improbable(superpage)) {
		template |= INTEL_PTE_PS;
	}

	/* For hardware that doesn't have EPT AD support, we always set REFMOD for EPT PTEs */
	if (__improbable(is_ept && !pmap_ept_support_ad)) {
		template |= PTE_REF(is_ept);
		if (IS_MANAGED_PAGE(pai)) {
			pmap_phys_attributes[pai] |= PHYS_REFERENCED;
		}
	}
	template |= PTE_LOCK(is_ept);
	pmap_store_pte(is_ept, pte, template);
	DTRACE_VM3(set_pte, uint64_t, vaddr, uint64_t, 0, uint64_t, template);

	/*
	 * If this was a managed page we delayed unlocking the pv until here
	 * to prevent pmap_page_protect et al from finding it until the pte
	 * has been stored.
	 */
	if (IS_MANAGED_PAGE(pai)) {
		UNLOCK_PVH(pai);
	}
done2:
	if (need_tlbflush == TRUE) {
		if (options & PMAP_OPTIONS_NOFLUSH) {
			PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
		} else {
			PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
		}
	}
	if (ptelocked) {
		PTE_LOCK_UNLOCK(pte);
	}
	PMAP_UNLOCK_SHARED(pmap);

	if (pvh_e != PV_HASHED_ENTRY_NULL) {
		PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
	}
	if (pvh_new != PV_HASHED_ENTRY_NULL) {
		PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
	}

	if (delpage_pm_obj) {
		vm_page_t m;

		vm_object_lock(delpage_pm_obj);
		m = vm_page_lookup(delpage_pm_obj, (delpage_pde_index * PAGE_SIZE));
		if (m == VM_PAGE_NULL) {
			panic("pmap_enter: pte page not in object");
		}
		VM_PAGE_FREE(m);
		vm_object_unlock(delpage_pm_obj);
		OSAddAtomic(-1, &inuse_ptepages_count);
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
	}

	kr = KERN_SUCCESS;
done1:
	if (__improbable((kr == KERN_SUCCESS) && (pmap == kernel_pmap) &&
	    zone_spans_ro_va(vaddr, vaddr + PAGE_SIZE))) {
		pmap_page_protect((ppnum_t)atop_kernel(kvtophys(vaddr)), VM_PROT_READ);
	}
	PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
	return kr;
}

/*
 * Remove a range of hardware page-table entries.
 * The entries given are the first (inclusive)
 * and last (exclusive) entries for the VM pages.
 * The virtual address is the va for the first pte.
 *
 * The pmap must be locked.
 * If the pmap is not the kernel pmap, the range must lie
 * entirely within one pte-page. This is NOT checked.
 * Assumes that the pte-page exists.
 */

void
pmap_remove_range(
	pmap_t pmap,
	vm_map_offset_t start_vaddr,
	pt_entry_t *spte,
	pt_entry_t *epte)
{
	pmap_remove_range_options(pmap, start_vaddr, spte, epte,
	    PMAP_OPTIONS_REMOVE);
}

static void
pmap_remove_range_options(
	pmap_t pmap,
	vm_map_offset_t start_vaddr,
	pt_entry_t *spte,
	pt_entry_t *epte,
	int options)
{
	pt_entry_t *cpte;
	pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
	pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
	pv_hashed_entry_t pvh_e;
	int pvh_cnt = 0;
	int num_removed, num_unwired, num_found, num_invalid;
	int ledgers_external, ledgers_reusable, ledgers_internal, ledgers_alt_internal;
	uint64_t ledgers_compressed, ledgers_alt_compressed;
	ppnum_t pai;
	pmap_paddr_t pa;
	vm_map_offset_t vaddr;
	boolean_t is_ept = is_ept_pmap(pmap);
	boolean_t was_altacct;

	num_removed = 0;
	num_unwired = 0;
	num_found = 0;
	num_invalid = 0;
	ledgers_external = 0;
	ledgers_reusable = 0;
	ledgers_internal = 0;
	ledgers_compressed = 0;
	ledgers_alt_internal = 0;
	ledgers_alt_compressed = 0;

	/* invalidate the PTEs first to "freeze" them */
	for (cpte = spte, vaddr = start_vaddr;
	    cpte < epte;
	    cpte++, vaddr += PAGE_SIZE_64) {
		pt_entry_t p = *cpte;

		pa = pte_to_pa(p);
		if (pa == 0) {
			if ((options & PMAP_OPTIONS_REMOVE) &&
			    (PTE_IS_COMPRESSED(p, cpte, pmap, vaddr))) {
				assert(pmap != kernel_pmap);
				/* one less "compressed"... */
				ledgers_compressed++;
				if (p & PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					ledgers_alt_compressed++;
				}
				/* clear marker(s) */
				/* XXX probably does not need to be atomic! */
				pmap_update_pte(is_ept, cpte, INTEL_PTE_COMPRESSED_MASK, 0, true);
			}
			continue;
		}
		num_found++;

		if (iswired(p)) {
			num_unwired++;
		}

		pai = pa_index(pa);

		if (!IS_MANAGED_PAGE(pai)) {
			/*
			 * Outside range of managed physical memory.
			 * Just remove the mappings.
			 */
			pmap_store_pte(is_ept, cpte, 0);
			continue;
		}

		if ((p & PTE_VALID_MASK(is_ept)) == 0) {
			num_invalid++;
		}

		/* invalidate the PTE */
		pmap_update_pte(is_ept, cpte, PTE_VALID_MASK(is_ept), 0, true);
	}

	if (num_found == 0) {
		/* nothing was changed: we're done */
		goto update_counts;
	}

	/* propagate the invalidates to other CPUs */

	PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr);

	for (cpte = spte, vaddr = start_vaddr;
	    cpte < epte;
	    cpte++, vaddr += PAGE_SIZE_64) {
		pa = pte_to_pa(*cpte);
		if (pa == 0) {
check_pte_for_compressed_marker:
			/*
			 * This PTE could have been replaced with a
			 * "compressed" marker after our first "freeze"
			 * loop above, so check again.
			 */
			if ((options & PMAP_OPTIONS_REMOVE) &&
			    (PTE_IS_COMPRESSED(*cpte, cpte, pmap, vaddr))) {
				assert(pmap != kernel_pmap);
				/* one less "compressed"... */
				ledgers_compressed++;
				if (*cpte & PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					ledgers_alt_compressed++;
				}
				pmap_store_pte(is_ept, cpte, 0);
			}
			continue;
		}

		pai = pa_index(pa);

		LOCK_PVH(pai);

		pa = pte_to_pa(*cpte);
		if (pa == 0) {
			UNLOCK_PVH(pai);
			goto check_pte_for_compressed_marker;
		}

		/*
		 * Remove the mapping from the pvlist for this physical page.
		 */
		pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, cpte, &was_altacct);

		num_removed++;
		/* update ledgers */
		if (was_altacct) {
			/* internal and alternate accounting */
			assert(IS_INTERNAL_PAGE(pai));
			ledgers_internal++;
			ledgers_alt_internal++;
		} else if (IS_REUSABLE_PAGE(pai)) {
			/* internal but reusable */
			assert(!was_altacct);
			assert(IS_INTERNAL_PAGE(pai));
			ledgers_reusable++;
		} else if (IS_INTERNAL_PAGE(pai)) {
			/* internal */
			assert(!was_altacct);
			assert(!IS_REUSABLE_PAGE(pai));
			ledgers_internal++;
		} else {
			/* not internal */
			ledgers_external++;
		}

		/*
		 * Get the modify and reference bits, then
		 * nuke the entry in the page table
		 */
		/* remember reference and change */
		if (!is_ept) {
			pmap_phys_attributes[pai] |=
			    *cpte & (PHYS_MODIFIED | PHYS_REFERENCED);
		} else {
			pmap_phys_attributes[pai] |=
			    ept_refmod_to_physmap((*cpte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
		}

		/* completely invalidate the PTE */
		pmap_store_pte(is_ept, cpte, 0);

		UNLOCK_PVH(pai);

		if (pvh_e != PV_HASHED_ENTRY_NULL) {
			pvh_e->qlink.next = (queue_entry_t) pvh_eh;
			pvh_eh = pvh_e;

			if (pvh_et == PV_HASHED_ENTRY_NULL) {
				pvh_et = pvh_e;
			}
			pvh_cnt++;
		}
		/* We can encounter at most 'num_found' PTEs for this level.
		 * Fewer may be encountered if some were replaced by
		 * compressed markers. No new valid PTEs can be created
		 * since the pmap lock is held exclusively.
		 */
		if (num_removed == num_found) {
			break;
		}
	} /* for loop */

	if (pvh_eh != PV_HASHED_ENTRY_NULL) {
		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
	}
update_counts:
	/*
	 * Update the counts
	 */
#if TESTING
	if (pmap->stats.resident_count < num_removed) {
		panic("pmap_remove_range: resident_count");
	}
#endif
	if (num_removed) {
		pmap_ledger_debit(pmap, task_ledgers.phys_mem, machine_ptob(num_removed));
	}

	if (pmap != kernel_pmap) {
		if (ledgers_external) {
			pmap_ledger_debit(pmap,
			    task_ledgers.external,
			    machine_ptob(ledgers_external));
		}
		if (ledgers_reusable) {
			pmap_ledger_debit(pmap,
			    task_ledgers.reusable,
			    machine_ptob(ledgers_reusable));
		}
		if (ledgers_internal) {
			pmap_ledger_debit(pmap,
			    task_ledgers.internal,
			    machine_ptob(ledgers_internal));
		}
		if (ledgers_compressed) {
			pmap_ledger_debit(pmap,
			    task_ledgers.internal_compressed,
			    machine_ptob(ledgers_compressed));
		}
		if (ledgers_alt_internal) {
			pmap_ledger_debit(pmap,
			    task_ledgers.alternate_accounting,
			    machine_ptob(ledgers_alt_internal));
		}
		if (ledgers_alt_compressed) {
			pmap_ledger_debit(pmap,
			    task_ledgers.alternate_accounting_compressed,
			    machine_ptob(ledgers_alt_compressed));
		}

		uint64_t net_debit = (ledgers_internal - ledgers_alt_internal) + (ledgers_compressed - ledgers_alt_compressed);
		if (net_debit) {
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, machine_ptob(net_debit));
		}
	}

	if (num_unwired != 0) {
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, machine_ptob(num_unwired));
	}
	return;
}


/*
 * Remove the given range of addresses
 * from the specified map.
 *
 * It is assumed that the start and end are properly
 * rounded to the hardware page size.
 */
void
pmap_remove(
	pmap_t map,
	addr64_t s64,
	addr64_t e64)
{
	pmap_remove_options(map, s64, e64, PMAP_OPTIONS_REMOVE);
}
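
/*
 * Illustrative sketch (not compiled; the address and page count are
 * hypothetical): removing a page-aligned range of mappings. pmap_remove()
 * expects both ends of the range to be rounded to the hardware page size.
 */
#if 0
static void
pmap_remove_example(pmap_t pmap, addr64_t start, uint64_t npages)
{
	assert((start & PAGE_MASK) == 0);       /* must be page-aligned */

	pmap_remove(pmap, start, start + npages * PAGE_SIZE);
}
#endif
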
#define PLCHECK_THRESHOLD (2)

void
pmap_remove_options(
	pmap_t map,
	addr64_t s64,
	addr64_t e64,
	int options)
{
	pt_entry_t *pde;
	pt_entry_t *spte, *epte;
	addr64_t l64;
	uint64_t deadline = 0;
	boolean_t is_ept;

	pmap_intr_assert();

	if (map == PMAP_NULL || s64 == e64) {
		return;
	}

	is_ept = is_ept_pmap(map);

	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(map), VM_KERNEL_ADDRHIDE(s64),
	    VM_KERNEL_ADDRHIDE(e64));

	PMAP_LOCK_EXCLUSIVE(map);
	uint32_t traverse_count = 0;

	while (s64 < e64) {
		pml4_entry_t *pml4e = pmap64_pml4(map, s64);
		if ((pml4e == NULL) ||
		    ((*pml4e & PTE_VALID_MASK(is_ept)) == 0)) {
			if (os_add_overflow(s64, NBPML4, &s64)) {
				/* wrap; clip s64 to e64 */
				s64 = e64;
				break;
			}
			s64 &= ~(PML4MASK);
			continue;
		}
		pdpt_entry_t *pdpte = pmap64_pdpt(map, s64);
		if ((pdpte == NULL) ||
		    ((*pdpte & PTE_VALID_MASK(is_ept)) == 0)) {
			if (os_add_overflow(s64, NBPDPT, &s64)) {
				/* wrap; clip s64 to e64 */
				s64 = e64;
				break;
			}
			s64 &= ~(PDPTMASK);
			continue;
		}

		if (os_add_overflow(s64, PDE_MAPPED_SIZE, &l64)) {
			l64 = e64;
		} else {
			l64 &= ~(PDE_MAPPED_SIZE - 1);

			if (l64 > e64) {
				l64 = e64;
			}
		}

		pde = pmap_pde(map, s64);

		if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
			if (*pde & PTE_PS) {
				/*
				 * If we're removing a superpage, pmap_remove_range()
				 * must work on level 2 instead of level 1; and we're
				 * only passing a single level 2 entry instead of a
				 * level 1 range.
				 */
				spte = pde;
				epte = spte + 1;        /* excluded */
			} else {
				spte = pmap_pte(map, (s64 & ~(PDE_MAPPED_SIZE - 1)));
				spte = &spte[ptenum(s64)];
				epte = &spte[intel_btop(l64 - s64)];
			}
			pmap_remove_range_options(map, s64, spte, epte,
			    options);
		}
		s64 = l64;

		if ((s64 < e64) && (traverse_count++ > PLCHECK_THRESHOLD)) {
			if (deadline == 0) {
				deadline = rdtsc64_nofence() + max_preemption_latency_tsc;
			} else {
				if (rdtsc64_nofence() > deadline) {
					PMAP_UNLOCK_EXCLUSIVE(map);
					__builtin_ia32_pause();
					PMAP_LOCK_EXCLUSIVE(map);
					deadline = rdtsc64_nofence() + max_preemption_latency_tsc;
				}
			}
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(map);

	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}

void
pmap_page_protect(
	ppnum_t pn,
	vm_prot_t prot)
{
	pmap_page_protect_options(pn, prot, 0, NULL);
}
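
/*
 * Illustrative sketch (not compiled; the helper name is hypothetical):
 * downgrading every mapping of a physical page to read-only, as done for
 * read-only zone pages at the end of pmap_enter_options() above. A protection
 * without VM_PROT_WRITE write-protects; VM_PROT_NONE would instead remove the
 * mappings entirely.
 */
#if 0
static void
example_make_page_readonly(ppnum_t pn)
{
	pmap_page_protect(pn, VM_PROT_READ);
}
#endif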
1712
1713 /*
1714 * Routine: pmap_page_protect_options
1715 *
1716 * Function:
1717 * Lower the permission for all mappings to a given
1718 * page.
1719 */
1720 void
pmap_page_protect_options(ppnum_t pn,vm_prot_t prot,unsigned int options,void * arg)1721 pmap_page_protect_options(
1722 ppnum_t pn,
1723 vm_prot_t prot,
1724 unsigned int options,
1725 void *arg)
1726 {
1727 pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
1728 pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
1729 pv_hashed_entry_t nexth;
1730 int pvh_cnt = 0;
1731 pv_rooted_entry_t pv_h;
1732 pv_rooted_entry_t pv_e;
1733 pv_hashed_entry_t pvh_e;
1734 pt_entry_t *pte;
1735 int pai;
1736 pmap_t pmap;
1737 boolean_t remove;
1738 pt_entry_t new_pte_value;
1739 boolean_t is_ept;
1740
1741 pmap_intr_assert();
1742 assert(pn != vm_page_fictitious_addr);
1743 if (pn == vm_page_guard_addr) {
1744 return;
1745 }
1746
1747 pai = ppn_to_pai(pn);
1748
1749 if (!IS_MANAGED_PAGE(pai)) {
1750 /*
1751 * Not a managed page.
1752 */
1753 return;
1754 }
1755
1756 PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, pn, prot);
1757
1758 /*
1759 * Determine the new protection.
1760 */
1761 switch (prot) {
1762 case VM_PROT_READ:
1763 case VM_PROT_READ | VM_PROT_EXECUTE:
1764 remove = FALSE;
1765 break;
1766 case VM_PROT_ALL:
1767 return; /* nothing to do */
1768 default:
1769 remove = TRUE;
1770 break;
1771 }
1772
1773 pv_h = pai_to_pvh(pai);
1774
1775 LOCK_PVH(pai);
1776
1777
1778 /*
1779 * Walk down PV list, if any, changing or removing all mappings.
1780 */
1781 if (pv_h->pmap == PMAP_NULL) {
1782 goto done;
1783 }
1784
1785 pv_e = pv_h;
1786 pvh_e = (pv_hashed_entry_t) pv_e; /* cheat */
1787
1788 do {
1789 vm_map_offset_t vaddr;
1790
1791 if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED) &&
1792 (pmap_phys_attributes[pai] & PHYS_MODIFIED)) {
1793 /* page was modified, so it will be compressed */
1794 options &= ~PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1795 options |= PMAP_OPTIONS_COMPRESSOR;
1796 }
1797
1798 pmap = pv_e->pmap;
1799 is_ept = is_ept_pmap(pmap);
1800 vaddr = PVE_VA(pv_e);
1801 pte = pmap_pte(pmap, vaddr);
1802
1803 pmap_assert2((pa_index(pte_to_pa(*pte)) == pn),
1804 "pmap_page_protect: PTE mismatch, pn: 0x%x, pmap: %p, vaddr: 0x%llx, pte: 0x%llx", pn, pmap, vaddr, *pte);
1805
1806 if (0 == pte) {
1807 panic("pmap_page_protect() "
1808 "pmap=%p pn=0x%x vaddr=0x%llx\n",
1809 pmap, pn, vaddr);
1810 }
1811 nexth = (pv_hashed_entry_t) queue_next(&pvh_e->qlink);
1812
1813 /*
1814 * Remove the mapping if new protection is NONE
1815 */
1816 if (remove) {
1817 /* Remove per-pmap wired count */
1818 if (iswired(*pte)) {
1819 pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
1820 }
1821
1822 if (pmap != kernel_pmap &&
1823 (options & PMAP_OPTIONS_COMPRESSOR) &&
1824 IS_INTERNAL_PAGE(pai)) {
1825 assert(!PTE_IS_COMPRESSED(*pte, pte, pmap, vaddr));
1826 /* mark this PTE as having been "compressed" */
1827 new_pte_value = PTE_COMPRESSED;
1828 if (IS_ALTACCT_PAGE(pai, pv_e)) {
1829 new_pte_value |= PTE_COMPRESSED_ALT;
1830 }
1831 } else {
1832 new_pte_value = 0;
1833 }
1834
1835 if (options & PMAP_OPTIONS_NOREFMOD) {
1836 pmap_store_pte(is_ept, pte, new_pte_value);
1837
1838 if (options & PMAP_OPTIONS_NOFLUSH) {
1839 PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
1840 } else {
1841 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1842 }
1843 } else {
1844 /*
1845 * Remove the mapping, collecting dirty bits.
1846 */
1847 pmap_update_pte(is_ept, pte, PTE_VALID_MASK(is_ept), 0, true);
1848
1849 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1850 if (!is_ept) {
1851 pmap_phys_attributes[pai] |=
1852 *pte & (PHYS_MODIFIED | PHYS_REFERENCED);
1853 } else {
1854 pmap_phys_attributes[pai] |=
1855 ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
1856 }
1857 if ((options &
1858 PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED) &&
1859 IS_INTERNAL_PAGE(pai) &&
1860 (pmap_phys_attributes[pai] &
1861 PHYS_MODIFIED)) {
1862 /*
1863 * Page is actually "modified" and
1864 * will be compressed. Start
1865 * accounting for it as "compressed".
1866 */
1867 assert(!(options & PMAP_OPTIONS_COMPRESSOR));
1868 options &= ~PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1869 options |= PMAP_OPTIONS_COMPRESSOR;
1870 assert(new_pte_value == 0);
					if (pmap != kernel_pmap) {
						new_pte_value = PTE_COMPRESSED;
						if (IS_ALTACCT_PAGE(pai, pv_e)) {
							new_pte_value |= PTE_COMPRESSED_ALT;
						}
					}
				}
				pmap_store_pte(is_ept, pte, new_pte_value);
			}

#if TESTING
			if (pmap->stats.resident_count < 1) {
				panic("pmap_page_protect: resident_count");
			}
#endif
			pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE);

			/*
			 * We only ever compress internal pages.
			 */
			if (options & PMAP_OPTIONS_COMPRESSOR) {
				assert(IS_INTERNAL_PAGE(pai));
			}
			if (pmap != kernel_pmap) {
				/* update ledgers */
				if (IS_ALTACCT_PAGE(pai, pv_e)) {
					assert(IS_INTERNAL_PAGE(pai));
					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, PAGE_SIZE);
					}
				} else if (IS_REUSABLE_PAGE(pai)) {
					assert(!IS_ALTACCT_PAGE(pai, pv_e));
					assert(IS_INTERNAL_PAGE(pai));
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
						/* was not in footprint, but is now */
						pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
					}
					pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
				} else if (IS_INTERNAL_PAGE(pai)) {
					assert(!IS_ALTACCT_PAGE(pai, pv_e));
					assert(!IS_REUSABLE_PAGE(pai));
					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
					/*
					 * Update all stats related to physical
					 * footprint, which only deals with
					 * internal pages.
					 */
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						/*
						 * This removal is only being
						 * done so we can send this page
						 * to the compressor; therefore
						 * it mustn't affect total task
						 * footprint.
						 */
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
					} else {
						/*
						 * This internal page isn't
						 * going to the compressor,
						 * so adjust stats to keep
						 * phys_footprint up to date.
						 */
						pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
					}
				} else {
					pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
				}
			}

			/*
			 * Deal with the pv_rooted_entry.
			 */

			if (pv_e == pv_h) {
				/*
				 * Fix up head later.
				 */
				pv_h->pmap = PMAP_NULL;
			} else {
				/*
				 * Delete this entry.
				 */
				pv_hash_remove(pvh_e);
				pvh_e->qlink.next = (queue_entry_t) pvh_eh;
				pvh_eh = pvh_e;

				if (pvh_et == PV_HASHED_ENTRY_NULL) {
					pvh_et = pvh_e;
				}
				pvh_cnt++;
			}
		} else {
			/*
			 * Write-protect, after opportunistic refmod collect
			 */
			if (!is_ept) {
				pmap_phys_attributes[pai] |=
				    *pte & (PHYS_MODIFIED | PHYS_REFERENCED);
			} else {
				pmap_phys_attributes[pai] |=
				    ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
			}

			pmap_update_pte(is_ept, pte, PTE_WRITE(is_ept), 0, true);
			if (options & PMAP_OPTIONS_NOFLUSH) {
				PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
			} else {
				PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
			}
		}
		pvh_e = nexth;
	} while ((pv_e = (pv_rooted_entry_t) nexth) != pv_h);

	/*
	 * If pv_head mapping was removed, fix it up.
	 */
	if (pv_h->pmap == PMAP_NULL) {
		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);

		if (pvh_e != (pv_hashed_entry_t) pv_h) {
			pv_hash_remove(pvh_e);
			pv_h->pmap = pvh_e->pmap;
			pv_h->va_and_flags = pvh_e->va_and_flags;
			pvh_e->qlink.next = (queue_entry_t) pvh_eh;
			pvh_eh = pvh_e;

			if (pvh_et == PV_HASHED_ENTRY_NULL) {
				pvh_et = pvh_e;
			}
			pvh_cnt++;
		}
	}
	if (pvh_eh != PV_HASHED_ENTRY_NULL) {
		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
	}
done:
	UNLOCK_PVH(pai);

	PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
}
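
/*
 * Illustrative sketch (not compiled): the removal-for-compressor path
 * above is normally reached through pmap_disconnect_options() with
 * PMAP_OPTIONS_COMPRESSOR, which severs every mapping of the page,
 * marks the PTEs "compressed", and shifts the ledgers from
 * internal/reusable to internal_compressed as shown above. The helper
 * name below is hypothetical.
 */
#if 0
static void
example_send_page_to_compressor(ppnum_t pn)
{
	unsigned int refmod;

	/* Sever all mappings and collect the page's ref/mod state. */
	refmod = pmap_disconnect_options(pn, PMAP_OPTIONS_COMPRESSOR, NULL);
	if (refmod & VM_MEM_MODIFIED) {
		/* dirty contents: the caller must actually compress them */
	}
}
#endif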

/*
 * Clear specified attribute bits.
 */
void
phys_attribute_clear(
	ppnum_t pn,
	int bits,
	unsigned int options,
	void *arg)
{
	pv_rooted_entry_t pv_h;
	pv_hashed_entry_t pv_e;
	pt_entry_t *pte = NULL;
	int pai;
	pmap_t pmap;
	char attributes = 0;
	boolean_t is_internal, is_reusable, is_altacct, is_ept;
	int ept_bits_to_clear;
	boolean_t ept_keep_global_mod = FALSE;

	if ((bits & PHYS_MODIFIED) &&
	    (options & PMAP_OPTIONS_NOFLUSH) &&
	    arg == NULL) {
		panic("phys_attribute_clear(0x%x,0x%x,0x%x,%p): "
		    "should not clear 'modified' without flushing TLBs\n",
		    pn, bits, options, arg);
	}

	/* We only support converting MOD and REF bits for EPT PTEs in this function */
	assert((bits & ~(PHYS_REFERENCED | PHYS_MODIFIED)) == 0);

	ept_bits_to_clear = (unsigned)physmap_refmod_to_ept(bits & (PHYS_MODIFIED | PHYS_REFERENCED));

	pmap_intr_assert();
	assert(pn != vm_page_fictitious_addr);
	if (pn == vm_page_guard_addr) {
		return;
	}

	pai = ppn_to_pai(pn);

	if (!IS_MANAGED_PAGE(pai)) {
		/*
		 * Not a managed page.
		 */
		return;
	}

	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

	pv_h = pai_to_pvh(pai);

	LOCK_PVH(pai);

	/*
	 * Walk down the PV list, clearing all modify or reference bits.
	 * We do not have to lock the pv_list separately because we hold
	 * the PV lock for this physical page (LOCK_PVH above).
	 */
	if (pv_h->pmap != PMAP_NULL) {
		/*
		 * There are some mappings.
		 */

		is_internal = IS_INTERNAL_PAGE(pai);
		is_reusable = IS_REUSABLE_PAGE(pai);

		pv_e = (pv_hashed_entry_t)pv_h;

		do {
			vm_map_offset_t va;
			char pte_bits;

			pmap = pv_e->pmap;
			is_ept = is_ept_pmap(pmap);
			is_altacct = IS_ALTACCT_PAGE(pai, pv_e);
			va = PVE_VA(pv_e);
			pte_bits = 0;

			if (bits) {
				pte = pmap_pte(pmap, va);
				/* grab ref/mod bits from this PTE */
				pte_bits = (*pte & (PTE_REF(is_ept) | PTE_MOD(is_ept)));
				/* propagate to page's global attributes */
				if (!is_ept) {
					attributes |= pte_bits;
				} else {
					attributes |= ept_refmod_to_physmap(pte_bits);
					if (!pmap_ept_support_ad && (pte_bits & INTEL_EPT_MOD)) {
						ept_keep_global_mod = TRUE;
					}
				}
				/* which bits to clear for this PTE? */
				if (!is_ept) {
					pte_bits &= bits;
				} else {
					pte_bits &= ept_bits_to_clear;
				}
			}
			if (options & PMAP_OPTIONS_CLEAR_WRITE) {
				pte_bits |= PTE_WRITE(is_ept);
			}

			/*
			 * Clear modify and/or reference bits.
			 */
			if (pte_bits) {
				pmap_update_pte(is_ept, pte, pte_bits, 0, true);

				/* Ensure all processors using this translation
				 * invalidate this TLB entry. The invalidation
				 * *must* follow the PTE update, to ensure that
				 * the TLB shadow of the 'D' bit (in particular)
				 * is synchronized with the updated PTE.
				 */
				if (!(options & PMAP_OPTIONS_NOFLUSH)) {
					/* flush TLBs now */
					PMAP_UPDATE_TLBS(pmap,
					    va,
					    va + PAGE_SIZE);
				} else if (arg) {
					/* delayed TLB flush: add "pmap" info */
					PMAP_UPDATE_TLBS_DELAYED(
						pmap,
						va,
						va + PAGE_SIZE,
						(pmap_flush_context *)arg);
				} else {
					/* no TLB flushing at all */
				}
			}

			/* update pmap "reusable" stats */
			if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
			    is_reusable &&
			    pmap != kernel_pmap) {
				/* one less "reusable" */
				pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
				if (is_internal) {
					/* one more "internal" */
					if (is_altacct) {
						/* no impact on ledgers */
					} else {
						pmap_ledger_credit(pmap,
						    task_ledgers.internal,
						    PAGE_SIZE);
						pmap_ledger_credit(
							pmap,
							task_ledgers.phys_footprint,
							PAGE_SIZE);
					}
				} else {
					/* one more "external" */
					pmap_ledger_credit(pmap, task_ledgers.external, PAGE_SIZE);
				}
			} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
			    !is_reusable &&
			    pmap != kernel_pmap) {
				/* one more "reusable" */
				pmap_ledger_credit(pmap, task_ledgers.reusable, PAGE_SIZE);
				if (is_internal) {
					/* one less "internal" */
					if (is_altacct) {
						/* no impact on footprint */
					} else {
						pmap_ledger_debit(pmap,
						    task_ledgers.internal,
						    PAGE_SIZE);
						pmap_ledger_debit(
							pmap,
							task_ledgers.phys_footprint,
							PAGE_SIZE);
					}
				} else {
					/* one less "external" */
					pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
				}
			}

			pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
		} while (pv_e != (pv_hashed_entry_t)pv_h);
	}
	/* Opportunistic refmod collection, annulled
	 * if both REF and MOD are being cleared.
	 */

	pmap_phys_attributes[pai] |= attributes;

	if (ept_keep_global_mod) {
		/*
		 * If the hardware doesn't support AD bits for EPT PTEs and someone is
		 * requesting that we clear the modified bit for a phys page, we need
		 * to ensure that there are no EPT mappings for the page with the
		 * modified bit set. If there are, we cannot clear the global modified bit.
		 */
		bits &= ~PHYS_MODIFIED;
	}
	pmap_phys_attributes[pai] &= ~(bits);

	/* update this page's "reusable" status */
	if (options & PMAP_OPTIONS_CLEAR_REUSABLE) {
		pmap_phys_attributes[pai] &= ~PHYS_REUSABLE;
	} else if (options & PMAP_OPTIONS_SET_REUSABLE) {
		pmap_phys_attributes[pai] |= PHYS_REUSABLE;
	}

	UNLOCK_PVH(pai);

	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
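
/*
 * Illustrative sketch (not compiled): batching TLB shootdowns across
 * many pages with a pmap_flush_context, which is what the
 * PMAP_OPTIONS_NOFLUSH/arg path above supports. phys_attribute_clear()
 * is normally reached via the pmap_clear_refmod_options() wrapper; the
 * helper name below is hypothetical.
 */
#if 0
static void
example_clear_modified_batched(ppnum_t *pages, unsigned int count)
{
	pmap_flush_context pfc;
	unsigned int i;

	pmap_flush_context_init(&pfc);
	for (i = 0; i < count; i++) {
		/* defer each page's TLB invalidation into the context */
		pmap_clear_refmod_options(pages[i], VM_MEM_MODIFIED,
		    PMAP_OPTIONS_NOFLUSH, (void *)&pfc);
	}
	/* one shootdown round covers all deferred invalidations */
	pmap_flush(&pfc);
}
#endif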

/*
 * Check specified attribute bits.
 */
int
phys_attribute_test(
	ppnum_t pn,
	int bits)
{
	pv_rooted_entry_t pv_h;
	pv_hashed_entry_t pv_e;
	pt_entry_t *pte;
	int pai;
	pmap_t pmap;
	int attributes = 0;
	boolean_t is_ept;

	pmap_intr_assert();
	assert(pn != vm_page_fictitious_addr);
	assert((bits & ~(PHYS_MODIFIED | PHYS_REFERENCED)) == 0);
	if (pn == vm_page_guard_addr) {
		return 0;
	}

	pai = ppn_to_pai(pn);

	if (!IS_MANAGED_PAGE(pai)) {
		/*
		 * Not a managed page.
		 */
		return 0;
	}

	/*
	 * Fast check: if the bits have already been collected, there is
	 * no need to take any locks. If they are not set, we must recheck
	 * after taking the lock, in case they were pulled in while we
	 * were waiting for it.
	 */
	if ((pmap_phys_attributes[pai] & bits) == bits) {
		return bits;
	}

	pv_h = pai_to_pvh(pai);

	LOCK_PVH(pai);

	attributes = pmap_phys_attributes[pai] & bits;

	/*
	 * Walk down the PV list, checking the mappings until we
	 * reach the end or we've found the desired attributes.
	 */
	if (attributes != bits &&
	    pv_h->pmap != PMAP_NULL) {
		/*
		 * There are some mappings.
		 */
		pv_e = (pv_hashed_entry_t)pv_h;
		do {
			vm_map_offset_t va;

			pmap = pv_e->pmap;
			is_ept = is_ept_pmap(pmap);
			va = PVE_VA(pv_e);
			/*
			 * pick up modify and/or reference bits from mapping
			 */

			pte = pmap_pte(pmap, va);
			if (!is_ept) {
				attributes |= (int)(*pte & bits);
			} else {
				attributes |= (int)(ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED));
			}

			pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
		} while ((attributes != bits) &&
		    (pv_e != (pv_hashed_entry_t)pv_h));
	}
	pmap_phys_attributes[pai] |= attributes;

	UNLOCK_PVH(pai);
	return attributes;
}
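
/*
 * Illustrative sketch (not compiled): a pmap_get_refmod()-style wrapper
 * over phys_attribute_test(), translating the pmap-internal PHYS_* bits
 * into the VM_MEM_* flags the VM layer expects. The helper name is
 * hypothetical.
 */
#if 0
static unsigned int
example_get_refmod(ppnum_t pn)
{
	int refmod;
	unsigned int retval = 0;

	refmod = phys_attribute_test(pn, PHYS_MODIFIED | PHYS_REFERENCED);
	if (refmod & PHYS_MODIFIED) {
		retval |= VM_MEM_MODIFIED;
	}
	if (refmod & PHYS_REFERENCED) {
		retval |= VM_MEM_REFERENCED;
	}
	return retval;
}
#endif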

/*
 * Routine:	pmap_change_wiring
 * Function:	Change the wiring attribute for a map/virtual-address
 *		pair.
 * In/out conditions:
 *		The mapping must already exist in the pmap.
 */
void
pmap_change_wiring(
	pmap_t map,
	vm_map_offset_t vaddr,
	boolean_t wired)
{
	pt_entry_t *pte;

	PMAP_LOCK_SHARED(map);

	if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL) {
		panic("pmap_change_wiring(%p,0x%llx,%d): pte missing",
		    map, vaddr, wired);
	}

	if (wired && !iswired(*pte)) {
		/*
		 * wiring down mapping
		 */
		pmap_ledger_credit(map, task_ledgers.wired_mem, PAGE_SIZE);
		pmap_update_pte(is_ept_pmap(map), pte, 0, PTE_WIRED, false);
	} else if (!wired && iswired(*pte)) {
		/*
		 * unwiring mapping
		 */
		pmap_ledger_debit(map, task_ledgers.wired_mem, PAGE_SIZE);
		pmap_update_pte(is_ept_pmap(map), pte, PTE_WIRED, 0, false);
	}

	PMAP_UNLOCK_SHARED(map);
}
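
/*
 * Illustrative sketch (not compiled): how a wiring operation might walk
 * a region page by page, as the VM fault/wire path does. Every page of
 * the range must already be mapped, or pmap_change_wiring() panics. The
 * helper name is hypothetical.
 */
#if 0
static void
example_wire_range(pmap_t map, vm_map_offset_t start, vm_map_offset_t end)
{
	vm_map_offset_t va;

	for (va = start; va < end; va += PAGE_SIZE) {
		/* each newly wired page credits wired_mem by PAGE_SIZE */
		pmap_change_wiring(map, va, TRUE);
	}
}
#endif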

/*
 * "Backdoor" direct map routine for early mappings.
 * Useful for mapping memory outside the range managed by the VM system,
 * e.g. device memory. Sets the referenced (A) and modified (D) bits and,
 * if requested, the not-cacheable (NC) attribute.
 */

vm_offset_t
pmap_map_bd(
	vm_offset_t virt,
	vm_map_offset_t start_addr,
	vm_map_offset_t end_addr,
	vm_prot_t prot,
	unsigned int flags)
{
	pt_entry_t template;
	pt_entry_t *ptep;

	vm_offset_t base = virt;
	boolean_t doflush = FALSE;

	template = pa_to_pte(start_addr)
	    | INTEL_PTE_REF
	    | INTEL_PTE_MOD
	    | INTEL_PTE_WIRED
	    | INTEL_PTE_VALID;

	if ((flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)) == VM_MEM_NOT_CACHEABLE) {
		template |= INTEL_PTE_NCACHE;
		if (!(flags & (VM_MEM_GUARDED))) {
			template |= INTEL_PTE_PAT;
		}
	}

	if ((prot & VM_PROT_EXECUTE) == 0) {
		template |= INTEL_PTE_NX;
	}

	if (prot & VM_PROT_WRITE) {
		template |= INTEL_PTE_WRITE;
	}
	vm_map_offset_t caddr = start_addr;
	while (caddr < end_addr) {
		ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)virt);
		if (ptep == PT_ENTRY_NULL) {
			panic("pmap_map_bd: Invalid kernel address");
		}
		if (pte_to_pa(*ptep)) {
			doflush = TRUE;
		}
		pmap_store_pte(FALSE, ptep, template);
		pte_increment_pa(template);
		virt += PAGE_SIZE;
		caddr += PAGE_SIZE;
	}
	if (doflush) {
		pmap_tlbi_range(0, ~0ULL, true, 0);
		PMAP_UPDATE_TLBS(kernel_pmap, base, base + end_addr - start_addr);
	}
	return virt;
}
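
/*
 * Illustrative sketch (not compiled): early-boot mapping of a
 * memory-mapped device region, uncached. Both the VA cursor and the
 * helper name are hypothetical; real callers track the next free early
 * virtual address themselves.
 */
#if 0
static vm_offset_t example_early_va;	/* hypothetical VA cursor */

static vm_offset_t
example_map_device(vm_map_offset_t phys_start, vm_map_offset_t phys_end)
{
	vm_offset_t va = example_early_va;

	/* pmap_map_bd() returns the first VA past the new mapping */
	example_early_va = pmap_map_bd(va, phys_start, phys_end,
	    VM_PROT_READ | VM_PROT_WRITE, VM_MEM_NOT_CACHEABLE);
	return va;	/* VA now mapping phys_start */
}
#endif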

/*
 * Create a virtual alias beginning at 'ava' of the specified kernel virtual
 * range. The aliased pagetable range is expanded if
 * PMAP_EXPAND_OPTIONS_ALIASMAP is specified. Performs no synchronization;
 * the caller is assumed to have stabilized the source and destination
 * ranges. Currently used to populate sections of the trampoline "doublemap"
 * at CPU startup.
 */

void
pmap_alias(
	vm_offset_t ava,
	vm_map_offset_t start_addr,
	vm_map_offset_t end_addr,
	vm_prot_t prot,
	unsigned int eoptions)
{
	pt_entry_t prot_template, template;
	pt_entry_t *aptep, *sptep;

	prot_template = INTEL_PTE_REF | INTEL_PTE_MOD | INTEL_PTE_WIRED | INTEL_PTE_VALID;
	if ((prot & VM_PROT_EXECUTE) == 0) {
		prot_template |= INTEL_PTE_NX;
	}

	if (prot & VM_PROT_WRITE) {
		prot_template |= INTEL_PTE_WRITE;
	}
	assert(((start_addr | end_addr) & PAGE_MASK) == 0);
	while (start_addr < end_addr) {
		aptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ava);
		if (aptep == PT_ENTRY_NULL) {
			if (eoptions & PMAP_EXPAND_OPTIONS_ALIASMAP) {
				pmap_expand(kernel_pmap, ava, PMAP_EXPAND_OPTIONS_ALIASMAP);
				aptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ava);
			} else {
				panic("pmap_alias: Invalid alias address");
			}
		}
		/* The aliased range should not have any active mappings */
		assert(pte_to_pa(*aptep) == 0);

		sptep = pmap_pte(kernel_pmap, start_addr);
		assert(sptep != PT_ENTRY_NULL && (pte_to_pa(*sptep) != 0));
		template = pa_to_pte(pte_to_pa(*sptep)) | prot_template;
		pmap_store_pte(FALSE, aptep, template);

		ava += PAGE_SIZE;
		start_addr += PAGE_SIZE;
	}
}
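
/*
 * Illustrative sketch (not compiled): aliasing one wired kernel page at
 * a second virtual address, expanding the alias page tables on demand,
 * in the spirit of the trampoline doublemap setup. The helper name is
 * hypothetical.
 */
#if 0
static void
example_alias_page(vm_offset_t alias_va, vm_offset_t kva)
{
	/* source must be mapped and stable; the alias slot must be empty */
	pmap_alias(alias_va, kva, kva + PAGE_SIZE,
	    VM_PROT_READ | VM_PROT_EXECUTE, PMAP_EXPAND_OPTIONS_ALIASMAP);
}
#endif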

mach_vm_size_t
pmap_query_resident(
	pmap_t pmap,
	addr64_t s64,
	addr64_t e64,
	mach_vm_size_t *compressed_bytes_p)
{
	pt_entry_t *pde;
	pt_entry_t *spte, *epte;
	addr64_t l64;
	uint64_t deadline = 0;
	mach_vm_size_t resident_bytes;
	mach_vm_size_t compressed_bytes;
	boolean_t is_ept;

	pmap_intr_assert();

	if (pmap == PMAP_NULL || pmap == kernel_pmap || s64 == e64) {
		if (compressed_bytes_p) {
			*compressed_bytes_p = 0;
		}
		return 0;
	}

	is_ept = is_ept_pmap(pmap);

	PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(s64),
	    VM_KERNEL_ADDRHIDE(e64));

	resident_bytes = 0;
	compressed_bytes = 0;

	PMAP_LOCK_EXCLUSIVE(pmap);
	uint32_t traverse_count = 0;

	while (s64 < e64) {
		if (os_add_overflow(s64, PDE_MAPPED_SIZE, &l64)) {
			l64 = e64;
		} else {
			l64 &= ~(PDE_MAPPED_SIZE - 1);

			if (l64 > e64) {
				l64 = e64;
			}
		}

		pde = pmap_pde(pmap, s64);

		if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
			if (*pde & PTE_PS) {
				/* superpage: not supported */
			} else {
				spte = pmap_pte(pmap,
				    (s64 & ~(PDE_MAPPED_SIZE - 1)));
				spte = &spte[ptenum(s64)];
				epte = &spte[intel_btop(l64 - s64)];

				for (; spte < epte; spte++) {
					if (pte_to_pa(*spte) != 0) {
						resident_bytes += PAGE_SIZE;
					} else if (*spte & PTE_COMPRESSED) {
						compressed_bytes += PAGE_SIZE;
					}
				}
			}
		}
		s64 = l64;

		if ((s64 < e64) && (traverse_count++ > PLCHECK_THRESHOLD)) {
			if (deadline == 0) {
				deadline = rdtsc64() + max_preemption_latency_tsc;
			} else {
				if (rdtsc64() > deadline) {
					PMAP_UNLOCK_EXCLUSIVE(pmap);
					__builtin_ia32_pause();
					PMAP_LOCK_EXCLUSIVE(pmap);
					deadline = rdtsc64() + max_preemption_latency_tsc;
				}
			}
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(pmap);

	PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
	    resident_bytes);

	if (compressed_bytes_p) {
		*compressed_bytes_p = compressed_bytes;
	}
	return resident_bytes;
}
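
/*
 * Illustrative sketch (not compiled): reporting how much of a user VA
 * range is resident versus compressed, the way footprint tooling might
 * consume pmap_query_resident(). The helper name is hypothetical.
 */
#if 0
static void
example_report_range(pmap_t pmap, addr64_t start, addr64_t end)
{
	mach_vm_size_t resident_bytes;
	mach_vm_size_t compressed_bytes = 0;

	resident_bytes = pmap_query_resident(pmap, start, end,
	    &compressed_bytes);
	printf("0x%llx-0x%llx: %llu bytes resident, %llu bytes compressed\n",
	    start, end, (uint64_t)resident_bytes, (uint64_t)compressed_bytes);
}
#endif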

uint64_t pmap_query_page_info_retries;

kern_return_t
pmap_query_page_info(
	pmap_t pmap,
	vm_map_offset_t va,
	int *disp_p)
{
	int disp;
	boolean_t is_ept;
	pmap_paddr_t pa;
	ppnum_t pai;
	pd_entry_t *pde_p;
	pt_entry_t *pte_p, pte;

	pmap_intr_assert();
	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		*disp_p = 0;
		return KERN_INVALID_ARGUMENT;
	}

	disp = 0;
	is_ept = is_ept_pmap(pmap);

	PMAP_LOCK_EXCLUSIVE(pmap);

	pde_p = pmap_pde(pmap, va);
	if (!pde_p ||
	    !(*pde_p & PTE_VALID_MASK(is_ept)) ||
	    (*pde_p & PTE_PS)) {
		goto done;
	}

try_again:
	disp = 0;

	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}

	pte = *pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		if (PTE_IS_COMPRESSED(pte, pte_p, pmap, va)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		if (!IS_MANAGED_PAGE(pai)) {
			/* not a managed page: no further disposition to report */
		} else if (pmap_pv_is_altacct(pmap, va, pai)) {
			assert(IS_INTERNAL_PAGE(pai));
			disp |= PMAP_QUERY_PAGE_INTERNAL;
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (IS_REUSABLE_PAGE(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (IS_INTERNAL_PAGE(pai)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
	}
	if (__improbable(pte_p != pmap_pte(pmap, va) || pte != *pte_p)) {
		/* something changed: try again */
		pmap_query_page_info_retries++;
		goto try_again;
	}
done:
	PMAP_UNLOCK_EXCLUSIVE(pmap);
	*disp_p = disp;
	return KERN_SUCCESS;
}
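
/*
 * Illustrative sketch (not compiled): decoding the disposition bits
 * returned above, mirroring how the VM layer classifies a page for
 * footprint reporting. The helper name is hypothetical.
 */
#if 0
static const char *
example_page_disposition(pmap_t pmap, vm_map_offset_t va)
{
	int disp = 0;

	if (pmap_query_page_info(pmap, va, &disp) != KERN_SUCCESS) {
		return "invalid";
	}
	if (disp & PMAP_QUERY_PAGE_PRESENT) {
		return (disp & PMAP_QUERY_PAGE_INTERNAL) ?
		       "resident internal" : "resident external";
	}
	if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
		return "compressed";
	}
	return "absent";
}
#endif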

void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
	PMAP_LOCK_EXCLUSIVE(pmap);
	pmap->pm_vm_map_cs_enforced = new_value;
	PMAP_UNLOCK_EXCLUSIVE(pmap);
}

extern int cs_process_enforcement_enable;
bool
pmap_get_vm_map_cs_enforced(
	pmap_t pmap)
{
	if (cs_process_enforcement_enable) {
		return true;
	}
	return pmap->pm_vm_map_cs_enforced;
}

void
pmap_set_jit_entitled(__unused pmap_t pmap)
{
	/* The x86 pmap layer does not care if a map has a JIT entry. */
	return;
}

bool
pmap_get_jit_entitled(__unused pmap_t pmap)
{
	/* The x86 pmap layer does not care if a map is using JIT. */
	return false;
}

void
pmap_set_tpro(__unused pmap_t pmap)
{
	/* The x86 pmap layer does not care if a map is using TPRO. */
	return;
}

bool
pmap_get_tpro(__unused pmap_t pmap)
{
	/* The x86 pmap layer does not care if a map is using TPRO. */
	return false;
}

bool
pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
{
	/*
	 * The x86 pmap layer does not apply any policy to any protection
	 * types.
	 */
	return false;
}

uint64_t
pmap_release_pages_fast(void)
{
	return 0;
}

void
pmap_trim(__unused pmap_t grand, __unused pmap_t subord, __unused addr64_t vstart, __unused uint64_t size)
{
	return;
}

__dead2
void
pmap_ledger_verify_size(size_t size)
{
	panic("%s: unsupported, "
	    "size=%lu",
	    __func__, size);
}

__dead2
ledger_t
pmap_ledger_alloc(void)
{
	panic("%s: unsupported",
	    __func__);
}

__dead2
void
pmap_ledger_free(ledger_t ledger)
{
	panic("%s: unsupported, "
	    "ledger=%p",
	    __func__, ledger);
}

kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}

void *
pmap_map_compressor_page(ppnum_t pn)
{
	assertf(IS_MANAGED_PAGE(ppn_to_pai(pn)), "%s called on non-managed page 0x%08x", __func__, pn);
	return PHYSMAP_PTOV((uint64_t)pn << (uint64_t)PAGE_SHIFT);
}

void
pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
{
}

bool
pmap_clear_refmod_range_options(
	pmap_t pmap __unused,
	vm_map_address_t start __unused,
	vm_map_address_t end __unused,
	unsigned int mask __unused,
	unsigned int options __unused)
{
	/*
	 * x86 doesn't have ranged TLB-invalidate instructions, and we
	 * already have the pmap_flush_context. This operation isn't
	 * implemented.
	 */
	return false;
}

bool
pmap_supported_feature(pmap_t pmap, pmap_feature_flags_t feat)
{
	switch (feat) {
	case PMAP_FEAT_UEXEC:
		return pmap != NULL && is_ept_pmap(pmap);
	default:
		return false;
	}
}