1 /*
2 * Copyright (c) 2000-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <mach_assert.h>
30
31 #include <vm/pmap.h>
32 #include <vm/vm_map.h>
33 #include <vm/vm_kern.h>
34 #include <kern/ledger.h>
35 #include <kern/zalloc_internal.h>
36 #include <i386/pmap_internal.h>
37
/* Forward declarations for removal/reuse helpers defined later in this file. */
void pmap_remove_range(
	pmap_t pmap,
	vm_map_offset_t va,
	pt_entry_t *spte,
	pt_entry_t *epte);

static void pmap_remove_range_options(
	pmap_t pmap,
	vm_map_offset_t va,
	pt_entry_t *spte,
	pt_entry_t *epte,
	int options);

void pmap_reusable_range(
	pmap_t pmap,
	vm_map_offset_t va,
	pt_entry_t *spte,
	pt_entry_t *epte,
	boolean_t reusable);

/* Address of a PTE observed to be corrupted (diagnostic breadcrumb; set elsewhere). */
pt_entry_t *PTE_corrupted_ptr;

#if DEVELOPMENT || DEBUG
/* NOTE(review): presumably a debug knob to inject PTE corruption for testing — set/read elsewhere. */
int pmap_inject_pte_corruption;
/* Counters exposed on DEV/DEBUG kernels; likely incremented by PTE update paths — confirm at use sites. */
uint32_t pmap_update_clear_pte_count;
uint32_t pmap_update_invalid_pte_count;
#endif
65
66 /*
67 * The Intel platform can nest at the PDE level, so NBPDE (i.e. 2MB) at a time,
68 * on a NBPDE boundary.
69 */
70
uint64_t
pmap_shared_region_size_min(__unused pmap_t pmap)
{
	/*
	 * Intel nests at the PDE level, so the minimum shared-region
	 * granule is NBPDE (2MiB) for every pmap.
	 */
	return NBPDE;
}
76
uint64_t
pmap_commpage_size_min(__unused pmap_t pmap)
{
	/* The commpage granule matches the PDE nesting granule (2MiB). */
	return NBPDE;
}
82
83 /*
84 * kern_return_t pmap_nest(grand, subord, va_start, size)
85 *
86 * grand = the pmap that we will nest subord into
87 * subord = the pmap that goes into the grand
88 * va_start = start of range in pmap to be inserted
89 * size = Size of nest area (up to 16TB)
90 *
91 * Inserts a pmap into another. This is used to implement shared segments.
92 *
93 * Note that we depend upon higher level VM locks to insure that things don't change while
94 * we are doing this. For example, VM should not be doing any pmap enters while it is nesting
95 * or do 2 nests at once.
96 */
97
98 /*
99 * This routine can nest subtrees either at the PDPT level (1GiB) or at the
100 * PDE level (2MiB). We currently disallow disparate offsets for the "subord"
101 * container and the "grand" parent. A minor optimization to consider for the
102 * future: make the "subord" truly a container rather than a full-fledged
103 * pagetable hierarchy which can be unnecessarily sparse (DRK).
104 */
105
kern_return_t
pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, uint64_t size)
{
	vm_map_offset_t vaddr;
	pd_entry_t *pde, *npde;
	unsigned int i;
	uint64_t num_pde;

	/* Nesting is only supported for regular (non-EPT) pmaps. */
	assert(!is_ept_pmap(grand));
	assert(!is_ept_pmap(subord));

	/*
	 * Both the start address and the size must be aligned to the
	 * minimum nesting granule, and the total size is capped.
	 */
	if ((size & (pmap_shared_region_size_min(grand) - 1)) ||
	    (va_start & (pmap_shared_region_size_min(grand) - 1)) ||
	    ((size >> 28) > 65536)) { /* Max size we can nest is 16TB */
		return KERN_INVALID_VALUE;
	}

	if (size == 0) {
		panic("pmap_nest: size is invalid - %016llX", size);
	}

	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(va_start));

	vaddr = (vm_map_offset_t)va_start;
	num_pde = size >> PDESHIFT;

	PMAP_LOCK_EXCLUSIVE(subord);

	subord->pm_shared = TRUE;

	/*
	 * Pass 1: ensure the subordinate pmap has valid paging structures
	 * covering the whole range. 1GiB-aligned whole spans are handled
	 * at the PDPT level (and tagged INTEL_PDPTE_NESTED); the remainder
	 * is expanded one 2MiB PDE at a time.
	 */
	for (i = 0; i < num_pde;) {
		if (((vaddr & PDPTMASK) == 0) && (num_pde - i) >= NPDEPG) {
			npde = pmap64_pdpt(subord, vaddr);

			/*
			 * Expansion drops the lock, so re-look-up and
			 * re-check validity each time around.
			 */
			while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
				PMAP_UNLOCK_EXCLUSIVE(subord);
				pmap_expand_pdpt(subord, vaddr, PMAP_EXPAND_OPTIONS_NONE);
				PMAP_LOCK_EXCLUSIVE(subord);
				npde = pmap64_pdpt(subord, vaddr);
			}
			*npde |= INTEL_PDPTE_NESTED;
			vaddr += NBPDPT;
			i += (uint32_t)NPDEPG;
		} else {
			npde = pmap_pde(subord, vaddr);

			/* Same drop-lock/expand/re-lookup pattern at PDE level. */
			while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
				PMAP_UNLOCK_EXCLUSIVE(subord);
				pmap_expand(subord, vaddr, PMAP_EXPAND_OPTIONS_NONE);
				PMAP_LOCK_EXCLUSIVE(subord);
				npde = pmap_pde(subord, vaddr);
			}
			vaddr += NBPDE;
			i++;
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(subord);

	vaddr = (vm_map_offset_t)va_start;

	PMAP_LOCK_EXCLUSIVE(grand);

	/*
	 * Pass 2: copy the subordinate's PDPT/PDE entries into "grand" so
	 * both hierarchies share the same leaf page tables, expanding the
	 * grand pmap's upper levels as needed.
	 */
	for (i = 0; i < num_pde;) {
		pd_entry_t tpde;

		if (((vaddr & PDPTMASK) == 0) && ((num_pde - i) >= NPDEPG)) {
			npde = pmap64_pdpt(subord, vaddr);
			if (npde == 0) {
				panic("pmap_nest: no PDPT, subord %p nstart 0x%llx", subord, vaddr);
			}
			tpde = *npde;
			pde = pmap64_pdpt(grand, vaddr);
			if (0 == pde) {
				PMAP_UNLOCK_EXCLUSIVE(grand);
				pmap_expand_pml4(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
				PMAP_LOCK_EXCLUSIVE(grand);
				pde = pmap64_pdpt(grand, vaddr);
			}
			if (pde == 0) {
				panic("pmap_nest: no PDPT, grand %p vaddr 0x%llx", grand, vaddr);
			}
			/* Share the subordinate's PDPT entry in grand. */
			pmap_store_pte(FALSE, pde, tpde);
			vaddr += NBPDPT;
			i += (uint32_t) NPDEPG;
		} else {
			npde = pmap_pde(subord, vaddr);
			if (npde == 0) {
				panic("pmap_nest: no npde, subord %p vaddr 0x%llx", subord, vaddr);
			}
			tpde = *npde;
			pde = pmap_pde(grand, vaddr);
			if (0 == pde) {
				PMAP_UNLOCK_EXCLUSIVE(grand);
				pmap_expand_pdpt(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
				PMAP_LOCK_EXCLUSIVE(grand);
				pde = pmap_pde(grand, vaddr);
			}

			if (pde == 0) {
				panic("pmap_nest: no pde, grand %p vaddr 0x%llx", grand, vaddr);
			}
			vaddr += NBPDE;
			/* Share the subordinate's PDE entry in grand. */
			pmap_store_pte(FALSE, pde, tpde);
			i++;
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(grand);

	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, KERN_SUCCESS);

	return KERN_SUCCESS;
}
222
223 /*
224 * kern_return_t pmap_unnest(grand, vaddr)
225 *
226 * grand = the pmap that we will un-nest subord from
227 * vaddr = start of range in pmap to be unnested
228 *
229 * Removes a pmap from another. This is used to implement shared segments.
230 */
231
232 kern_return_t
pmap_unnest(pmap_t grand,addr64_t vaddr,uint64_t size)233 pmap_unnest(pmap_t grand, addr64_t vaddr, uint64_t size)
234 {
235 pd_entry_t *pde;
236 unsigned int i;
237 uint64_t num_pde;
238 addr64_t va_start, va_end;
239 uint64_t npdpt = PMAP_INVALID_PDPTNUM;
240
241 PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
242 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
243
244 if ((size & (pmap_shared_region_size_min(grand) - 1)) ||
245 (vaddr & (pmap_shared_region_size_min(grand) - 1))) {
246 panic("pmap_unnest(%p,0x%llx,0x%llx): unaligned...",
247 grand, vaddr, size);
248 }
249
250 assert(!is_ept_pmap(grand));
251
252 /* align everything to PDE boundaries */
253 va_start = vaddr & ~(NBPDE - 1);
254
255 if (os_add_overflow(vaddr, size + NBPDE - 1, &va_end)) {
256 panic("pmap_unnest: Overflow when calculating range end: s=0x%llx sz=0x%llx\n", vaddr, size);
257 }
258
259 va_end &= ~(NBPDE - 1);
260 size = va_end - va_start;
261
262 PMAP_LOCK_EXCLUSIVE(grand);
263
264 num_pde = size >> PDESHIFT;
265 vaddr = va_start;
266
267 for (i = 0; i < num_pde;) {
268 if (pdptnum(grand, vaddr) != npdpt) {
269 npdpt = pdptnum(grand, vaddr);
270 pde = pmap64_pdpt(grand, vaddr);
271 if (pde && (*pde & INTEL_PDPTE_NESTED)) {
272 pmap_store_pte(FALSE, pde, (pd_entry_t)0);
273 i += (uint32_t) NPDEPG;
274 vaddr += NBPDPT;
275 continue;
276 }
277 }
278 pde = pmap_pde(grand, (vm_map_offset_t)vaddr);
279 if (pde == 0) {
280 panic("pmap_unnest: no pde, grand %p vaddr 0x%llx", grand, vaddr);
281 }
282 pmap_store_pte(FALSE, pde, (pd_entry_t)0);
283 i++;
284 vaddr += NBPDE;
285 }
286
287 PMAP_UPDATE_TLBS(grand, va_start, va_end);
288
289 PMAP_UNLOCK_EXCLUSIVE(grand);
290
291 PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
292
293 return KERN_SUCCESS;
294 }
295
296 kern_return_t
pmap_unnest_options(pmap_t grand,addr64_t vaddr,__unused uint64_t size,__unused unsigned int options)297 pmap_unnest_options(
298 pmap_t grand,
299 addr64_t vaddr,
300 __unused uint64_t size,
301 __unused unsigned int options)
302 {
303 return pmap_unnest(grand, vaddr, size);
304 }
305
306 /* Invoked by the Mach VM to determine the platform specific unnest region */
307
308 boolean_t
pmap_adjust_unnest_parameters(pmap_t p,vm_map_offset_t * s,vm_map_offset_t * e)309 pmap_adjust_unnest_parameters(pmap_t p, vm_map_offset_t *s, vm_map_offset_t *e)
310 {
311 pd_entry_t *pdpte;
312 boolean_t rval = FALSE;
313
314 PMAP_LOCK_EXCLUSIVE(p);
315
316 pdpte = pmap64_pdpt(p, *s);
317 if (pdpte && (*pdpte & INTEL_PDPTE_NESTED)) {
318 *s &= ~(NBPDPT - 1);
319 rval = TRUE;
320 }
321
322 pdpte = pmap64_pdpt(p, *e);
323 if (pdpte && (*pdpte & INTEL_PDPTE_NESTED)) {
324 *e = ((*e + NBPDPT) & ~(NBPDPT - 1));
325 rval = TRUE;
326 }
327
328 PMAP_UNLOCK_EXCLUSIVE(p);
329
330 return rval;
331 }
332
/*
 * Return the physical address backing "va" in "pmap", or 0 if there is
 * no valid translation. Handles both 4K mappings and 2MiB superpages
 * (PTE_PS set in the PDE), for regular and EPT pmaps.
 */
pmap_paddr_t
pmap_find_pa(pmap_t pmap, addr64_t va)
{
	pt_entry_t *ptp;
	pd_entry_t *pdep;
	pd_entry_t pde;
	pt_entry_t pte;
	boolean_t is_ept, locked = FALSE;
	pmap_paddr_t pa = 0;

	is_ept = is_ept_pmap(pmap);

	/*
	 * User pmaps are locked; for the kernel pmap, or when running in
	 * the kernel debugger (!not_in_kdp), only disable preemption.
	 */
	if ((pmap != kernel_pmap) && not_in_kdp) {
		PMAP_LOCK_EXCLUSIVE(pmap);
		locked = TRUE;
	} else {
		mp_disable_preemption();
	}

	/* A dead pmap (refcount 0) has no translations worth reporting. */
	if (os_ref_get_count(&pmap->ref_count) == 0) {
		goto pfp_exit;
	}

	pdep = pmap_pde(pmap, va);

	if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & PTE_VALID_MASK(is_ept))) {
		if (pde & PTE_PS) {
			/* 2MiB superpage: add the offset within the large page. */
			pa = pte_to_pa(pde) + (va & I386_LPGMASK);
		} else {
			ptp = pmap_pte(pmap, va);
			if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & PTE_VALID_MASK(is_ept)) != 0)) {
				pa = pte_to_pa(pte) + (va & PAGE_MASK);
			}
		}
	}
pfp_exit:
	if (locked) {
		PMAP_UNLOCK_EXCLUSIVE(pmap);
	} else {
		mp_enable_preemption();
	}

	return pa;
}
377
378 /*
379 * pmap_find_phys returns the (4K) physical page number containing a
380 * given virtual address in a given pmap.
381 * Note that pmap_pte may return a pde if this virtual address is
382 * mapped by a large page and this is taken into account in order
383 * to return the correct page number in this case.
384 */
385 ppnum_t
pmap_find_phys(pmap_t pmap,addr64_t va)386 pmap_find_phys(pmap_t pmap, addr64_t va)
387 {
388 ppnum_t ppn = 0;
389 pmap_paddr_t pa = 0;
390
391 pa = pmap_find_pa(pmap, va);
392 ppn = (ppnum_t) i386_btop(pa);
393
394 return ppn;
395 }
396
397 ppnum_t
pmap_find_phys_nofault(pmap_t pmap,addr64_t va)398 pmap_find_phys_nofault(pmap_t pmap, addr64_t va)
399 {
400 if ((pmap == kernel_pmap) ||
401 ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map)))) {
402 return pmap_find_phys(pmap, va);
403 }
404 return 0;
405 }
406
407 /*
408 * pmap_get_prot returns the equivalent Vm page protections
409 * set on a given address, 'va'. This function is used in the
410 * ml_static_verify_page_protections() routine which is used
411 * by the kext loading code to validate that the TEXT segment
412 * of a kext is mapped executable.
413 */
414 kern_return_t
pmap_get_prot(pmap_t pmap,addr64_t va,vm_prot_t * protp)415 pmap_get_prot(pmap_t pmap, addr64_t va, vm_prot_t *protp)
416 {
417 pt_entry_t *ptp;
418 pd_entry_t *pdep;
419 pd_entry_t pde;
420 pt_entry_t pte;
421 boolean_t is_ept, locked = FALSE;
422 kern_return_t retval = KERN_FAILURE;
423 vm_prot_t prot = 0;
424
425 is_ept = is_ept_pmap(pmap);
426
427 if ((pmap != kernel_pmap) && not_in_kdp) {
428 PMAP_LOCK_EXCLUSIVE(pmap);
429 locked = TRUE;
430 } else {
431 mp_disable_preemption();
432 }
433
434 if (os_ref_get_count(&pmap->ref_count) == 0) {
435 goto pfp_exit;
436 }
437
438 pdep = pmap_pde(pmap, va);
439
440 if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & PTE_VALID_MASK(is_ept))) {
441 if (pde & PTE_PS) {
442 prot = VM_PROT_READ;
443
444 if (pde & PTE_WRITE(is_ept)) {
445 prot |= VM_PROT_WRITE;
446 }
447 if (PTE_IS_EXECUTABLE(is_ept, pde)) {
448 prot |= VM_PROT_EXECUTE;
449 }
450 retval = KERN_SUCCESS;
451 } else {
452 ptp = pmap_pte(pmap, va);
453 if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & PTE_VALID_MASK(is_ept)) != 0)) {
454 prot = VM_PROT_READ;
455
456 if (pte & PTE_WRITE(is_ept)) {
457 prot |= VM_PROT_WRITE;
458 }
459 if (PTE_IS_EXECUTABLE(is_ept, pte)) {
460 prot |= VM_PROT_EXECUTE;
461 }
462 retval = KERN_SUCCESS;
463 }
464 }
465 }
466
467 pfp_exit:
468 if (locked) {
469 PMAP_UNLOCK_EXCLUSIVE(pmap);
470 } else {
471 mp_enable_preemption();
472 }
473
474 if (protp) {
475 *protp = prot;
476 }
477
478 return retval;
479 }
480
481 /*
482 * Update cache attributes for all extant managed mappings.
483 * Assumes PV for this page is locked, and that the page
484 * is managed. We assume that this physical page may be mapped in
485 * both EPT and normal Intel PTEs, so we convert the attributes
486 * to the corresponding format for each pmap.
487 *
488 * We assert that the passed set of attributes is a subset of the
489 * PHYS_CACHEABILITY_MASK.
490 */
void
pmap_update_cache_attributes_locked(ppnum_t pn, unsigned attributes)
{
	pv_rooted_entry_t pv_h, pv_e;
	pv_hashed_entry_t pvh_e, nexth;
	vm_map_offset_t vaddr;
	pmap_t pmap;
	pt_entry_t *ptep;
	boolean_t is_ept;
	unsigned ept_attributes;

	assert(IS_MANAGED_PAGE(pn));
	assert(((~PHYS_CACHEABILITY_MASK) & attributes) == 0);

	/* We don't support the PAT bit for EPT PTEs */
	if (attributes & INTEL_PTE_NCACHE) {
		ept_attributes = INTEL_EPT_NCACHE;
	} else {
		ept_attributes = INTEL_EPT_WB;
	}

	pv_h = pai_to_pvh(pn);
	/* TODO: translate the PHYS_* bits to PTE bits, while they're
	 * currently identical, they may not remain so
	 * Potential optimization (here and in page_protect),
	 * parallel shootdowns, check for redundant
	 * attribute modifications.
	 */

	/*
	 * Alter attributes on all mappings
	 */
	if (pv_h->pmap != PMAP_NULL) {
		pv_e = pv_h;
		pvh_e = (pv_hashed_entry_t)pv_e;

		/* Walk the circular PV list rooted at pv_h. */
		do {
			pmap = pv_e->pmap;
			vaddr = PVE_VA(pv_e);
			ptep = pmap_pte(pmap, vaddr);

			/* Every PV entry must have a backing PTE. */
			if (0 == ptep) {
				panic("pmap_update_cache_attributes_locked: Missing PTE, pmap: %p, pn: 0x%x vaddr: 0x%llx kernel_pmap: %p", pmap, pn, vaddr, kernel_pmap);
			}

			is_ept = is_ept_pmap(pmap);

			/* Capture the successor before touching this entry. */
			nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink);
			if (!is_ept) {
				pmap_update_pte(is_ept, ptep, PHYS_CACHEABILITY_MASK, attributes, true);
			} else {
				/* EPT mappings use the converted attribute encoding. */
				pmap_update_pte(is_ept, ptep, INTEL_EPT_CACHE_MASK, ept_attributes, true);
			}
			/* Shoot down any stale translations for this mapping. */
			PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
			pvh_e = nexth;
		} while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h);
	}
}
549
550 void
x86_filter_TLB_coherency_interrupts(boolean_t dofilter)551 x86_filter_TLB_coherency_interrupts(boolean_t dofilter)
552 {
553 assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
554
555 if (dofilter) {
556 CPU_CR3_MARK_INACTIVE();
557 } else {
558 CPU_CR3_MARK_ACTIVE();
559 mfence();
560 pmap_update_interrupt();
561 }
562 }
563
564
565 /*
566 * Insert the given physical page (p) at
567 * the specified virtual address (v) in the
568 * target physical map with the protection requested.
569 *
570 * If specified, the page will be wired down, meaning
571 * that the related pte cannot be reclaimed.
572 *
573 * NB: This is the only routine which MAY NOT lazy-evaluate
574 * or lose information. That is, this routine must actually
575 * insert this page into the given map NOW.
576 */
577
kern_return_t
pmap_enter(
	pmap_t pmap,
	vm_map_offset_t vaddr,
	ppnum_t pn,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired)
{
	/* Convenience wrapper: defer to pmap_enter_options() with no options. */
	return pmap_enter_options(pmap, vaddr, pn, prot, fault_type, flags, wired, PMAP_EXPAND_OPTIONS_NONE, NULL);
}
590
591 #define PTE_LOCK(EPT) INTEL_PTE_SWLOCK
592
593 static inline void PTE_LOCK_LOCK(pt_entry_t *);
594 static inline void PTE_LOCK_UNLOCK(pt_entry_t *);
595
596 void
PTE_LOCK_LOCK(pt_entry_t * lpte)597 PTE_LOCK_LOCK(pt_entry_t *lpte)
598 {
599 pt_entry_t pte;
600 plretry:
601 while ((pte = __c11_atomic_load((_Atomic pt_entry_t *)lpte, memory_order_relaxed)) & PTE_LOCK(0)) {
602 __builtin_ia32_pause();
603 }
604 if (__c11_atomic_compare_exchange_strong((_Atomic pt_entry_t *)lpte, &pte, pte | PTE_LOCK(0), memory_order_acquire_smp, TRUE)) {
605 return;
606 }
607
608 goto plretry;
609 }
610
void
PTE_LOCK_UNLOCK(pt_entry_t *lpte)
{
	/* Clear the per-PTE software lock bit with release semantics. */
	__c11_atomic_fetch_and((_Atomic pt_entry_t *)lpte, ~PTE_LOCK(0), memory_order_release_smp);
}
616
kern_return_t
pmap_enter_options_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	__unused void *arg)
{
	/*
	 * Variant of pmap_enter_options() that accepts a physical address
	 * rather than a physical page number.
	 */
	return pmap_enter_options(pmap, v, intel_btop(pa), prot, fault_type, flags, wired, options, arg);
}
631
632 kern_return_t
pmap_enter_options(pmap_t pmap,vm_map_offset_t vaddr,ppnum_t pn,vm_prot_t prot,__unused vm_prot_t fault_type,unsigned int flags,boolean_t wired,unsigned int options,void * arg)633 pmap_enter_options(
634 pmap_t pmap,
635 vm_map_offset_t vaddr,
636 ppnum_t pn,
637 vm_prot_t prot,
638 __unused vm_prot_t fault_type,
639 unsigned int flags,
640 boolean_t wired,
641 unsigned int options,
642 void *arg)
643 {
644 pt_entry_t *pte = NULL;
645 pv_rooted_entry_t pv_h;
646 ppnum_t pai;
647 pv_hashed_entry_t pvh_e;
648 pv_hashed_entry_t pvh_new;
649 pt_entry_t template;
650 pmap_paddr_t old_pa;
651 pmap_paddr_t pa = (pmap_paddr_t) i386_ptob(pn);
652 boolean_t need_tlbflush = FALSE;
653 boolean_t set_NX;
654 char oattr;
655 boolean_t old_pa_locked;
656 /* 2MiB mappings are confined to x86_64 by VM */
657 boolean_t superpage = flags & VM_MEM_SUPERPAGE;
658 vm_object_t delpage_pm_obj = NULL;
659 uint64_t delpage_pde_index = 0;
660 pt_entry_t old_pte;
661 kern_return_t kr = KERN_FAILURE;
662 boolean_t is_ept;
663 boolean_t is_altacct;
664 boolean_t ptelocked = FALSE;
665
666 pmap_intr_assert();
667
668 if (__improbable(pmap == PMAP_NULL)) {
669 return KERN_INVALID_ARGUMENT;
670 }
671 if (__improbable(pn == vm_page_guard_addr)) {
672 return KERN_INVALID_ARGUMENT;
673 }
674
675 is_ept = is_ept_pmap(pmap);
676
677 /* N.B. We can be supplied a zero page frame in the NOENTER case, it's an
678 * unused value for that scenario.
679 */
680 assert(pn != vm_page_fictitious_addr);
681
682
683 PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
684 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(vaddr), pn,
685 prot);
686
687 if ((prot & VM_PROT_EXECUTE) || __improbable(is_ept && (prot & VM_PROT_UEXEC))) {
688 set_NX = FALSE;
689 } else {
690 set_NX = TRUE;
691 }
692
693 #if DEVELOPMENT || DEBUG
694 if (__improbable(set_NX && (!nx_enabled || !pmap->nx_enabled))) {
695 set_NX = FALSE;
696 }
697
698 if (__improbable(set_NX && (pmap == kernel_pmap) &&
699 ((pmap_disable_kstack_nx && (flags & VM_MEM_STACK)) ||
700 (pmap_disable_kheap_nx && !(flags & VM_MEM_STACK))))) {
701 set_NX = FALSE;
702 }
703 #endif
704
705 pvh_new = PV_HASHED_ENTRY_NULL;
706 Retry:
707 pvh_e = PV_HASHED_ENTRY_NULL;
708
709 PMAP_LOCK_SHARED(pmap);
710
711 /*
712 * Expand pmap to include this pte. Assume that
713 * pmap is always expanded to include enough hardware
714 * pages to map one VM page.
715 */
716 if (__improbable(superpage)) {
717 while ((pte = pmap_pde(pmap, vaddr)) == PD_ENTRY_NULL) {
718 /* need room for another pde entry */
719 PMAP_UNLOCK_SHARED(pmap);
720 kr = pmap_expand_pdpt(pmap, vaddr, options);
721 if (kr != KERN_SUCCESS) {
722 goto done1;
723 }
724 PMAP_LOCK_SHARED(pmap);
725 }
726 } else {
727 while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
728 /*
729 * Must unlock to expand the pmap
730 * going to grow pde level page(s)
731 */
732 PMAP_UNLOCK_SHARED(pmap);
733 kr = pmap_expand(pmap, vaddr, options);
734 if (kr != KERN_SUCCESS) {
735 goto done1;
736 }
737 PMAP_LOCK_SHARED(pmap);
738 }
739 }
740
741 if (__improbable(options & PMAP_EXPAND_OPTIONS_NOENTER)) {
742 PMAP_UNLOCK_SHARED(pmap);
743 kr = KERN_SUCCESS;
744 goto done1;
745 }
746
747 if (__improbable(superpage && *pte && !(*pte & PTE_PS))) {
748 /*
749 * There is still an empty page table mapped that
750 * was used for a previous base page mapping.
751 * Remember the PDE and the PDE index, so that we
752 * can free the page at the end of this function.
753 */
754 delpage_pde_index = pdeidx(pmap, vaddr);
755 delpage_pm_obj = pmap->pm_obj;
756 pmap_store_pte(is_ept, pte, 0);
757 }
758
759 PTE_LOCK_LOCK(pte);
760 ptelocked = TRUE;
761
762 old_pa = pte_to_pa(*pte);
763 pai = pa_index(old_pa);
764 old_pa_locked = FALSE;
765
766 if (old_pa == 0 &&
767 PTE_IS_COMPRESSED(*pte, pte, pmap, vaddr)) {
768 /*
769 * "pmap" should be locked at this point, so this should
770 * not race with another pmap_enter() or pmap_remove_range().
771 */
772 assert(pmap != kernel_pmap);
773
774 /* one less "compressed" */
775 pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
776 PAGE_SIZE);
777 if (*pte & PTE_COMPRESSED_ALT) {
778 pmap_ledger_debit(
779 pmap,
780 task_ledgers.alternate_accounting_compressed,
781 PAGE_SIZE);
782 } else {
783 /* was part of the footprint */
784 pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
785 PAGE_SIZE);
786 }
787 /* marker will be cleared below */
788 }
789
790 /*
791 * if we have a previous managed page, lock the pv entry now. after
792 * we lock it, check to see if someone beat us to the lock and if so
793 * drop the lock
794 */
795 if ((0 != old_pa) && IS_MANAGED_PAGE(pai)) {
796 LOCK_PVH(pai);
797 old_pa_locked = TRUE;
798 old_pa = pte_to_pa(*pte);
799 if (0 == old_pa) {
800 UNLOCK_PVH(pai); /* another path beat us to it */
801 old_pa_locked = FALSE;
802 }
803 }
804
805 /*
806 * Special case if the incoming physical page is already mapped
807 * at this address.
808 */
809 if (old_pa == pa) {
810 pt_entry_t old_attributes =
811 *pte & ~(PTE_REF(is_ept) | PTE_MOD(is_ept) | PTE_LOCK(is_ept));
812
813 /*
814 * May be changing its wired attribute or protection
815 */
816
817 template = pa_to_pte(pa);
818
819 if (__probable(!is_ept)) {
820 template |= INTEL_PTE_VALID;
821 } else {
822 template |= INTEL_EPT_IPAT;
823 }
824
825 template |= pmap_get_cache_attributes(pa_index(pa), is_ept);
826
827 /*
828 * We don't support passing VM_MEM_NOT_CACHEABLE flags for EPT PTEs
829 */
830 if (!is_ept && (VM_MEM_NOT_CACHEABLE ==
831 (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)))) {
832 if (!(flags & VM_MEM_GUARDED)) {
833 template |= INTEL_PTE_PAT;
834 }
835 template |= INTEL_PTE_NCACHE;
836 }
837 if (pmap != kernel_pmap && !is_ept) {
838 template |= INTEL_PTE_USER;
839 }
840
841 if (prot & VM_PROT_READ) {
842 template |= PTE_READ(is_ept);
843 }
844
845 if (prot & VM_PROT_WRITE) {
846 template |= PTE_WRITE(is_ept);
847 if (is_ept && !pmap_ept_support_ad) {
848 template |= PTE_MOD(is_ept);
849 if (old_pa_locked) {
850 assert(IS_MANAGED_PAGE(pai));
851 pmap_phys_attributes[pai] |= PHYS_MODIFIED;
852 }
853 }
854 }
855
856 if (prot & VM_PROT_EXECUTE) {
857 assert(set_NX == 0);
858 template = pte_set_ex(template, is_ept);
859 }
860
861 if (__improbable(is_ept && (prot & VM_PROT_UEXEC))) {
862 assert(set_NX == 0);
863 template = pte_set_uex(template);
864 }
865
866 if (set_NX) {
867 template = pte_remove_ex(template, is_ept);
868 }
869
870 if (wired) {
871 template |= PTE_WIRED;
872 if (!iswired(old_attributes)) {
873 pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
874 }
875 } else {
876 if (iswired(old_attributes)) {
877 pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
878 }
879 }
880
881 if (superpage) { /* this path can not be used */
882 template |= PTE_PS; /* to change the page size! */
883 }
884 if (old_attributes == template) {
885 goto dont_update_pte;
886 }
887
888 /* Determine delta, PV locked */
889 need_tlbflush =
890 ((old_attributes ^ template) != PTE_WIRED);
891
892 /* Optimisation: avoid TLB flush when adding writability */
893 if (need_tlbflush == TRUE && !(old_attributes & PTE_WRITE(is_ept))) {
894 if ((old_attributes ^ template) == PTE_WRITE(is_ept)) {
895 need_tlbflush = FALSE;
896 }
897 }
898
899 /* For hardware that doesn't have EPT AD support, we always set REFMOD for EPT PTEs */
900 if (__improbable(is_ept && !pmap_ept_support_ad)) {
901 template |= PTE_REF(is_ept);
902 if (old_pa_locked) {
903 assert(IS_MANAGED_PAGE(pai));
904 pmap_phys_attributes[pai] |= PHYS_REFERENCED;
905 }
906 }
907
908 /* store modified PTE and preserve RC bits */
909 pt_entry_t npte, opte;
910
911 assert((*pte & PTE_LOCK(is_ept)) != 0);
912
913 do {
914 opte = *pte;
915 npte = template | (opte & (PTE_REF(is_ept) |
916 PTE_MOD(is_ept))) | PTE_LOCK(is_ept);
917 } while (!pmap_cmpx_pte(pte, opte, npte));
918
919 DTRACE_VM3(set_pte, uint64_t, vaddr, uint64_t, opte, uint64_t, npte);
920
921 dont_update_pte:
922 if (old_pa_locked) {
923 UNLOCK_PVH(pai);
924 old_pa_locked = FALSE;
925 }
926 goto done2;
927 }
928
929 /*
930 * Outline of code from here:
931 * 1) If va was mapped, update TLBs, remove the mapping
932 * and remove old pvlist entry.
933 * 2) Add pvlist entry for new mapping
934 * 3) Enter new mapping.
935 *
936 * If the old physical page is not managed step 1) is skipped
937 * (except for updating the TLBs), and the mapping is
938 * overwritten at step 3). If the new physical page is not
939 * managed, step 2) is skipped.
940 */
941 /* TODO: add opportunistic refmod collect */
942 if (old_pa != (pmap_paddr_t) 0) {
943 boolean_t was_altacct = FALSE;
944
945 /*
946 * Don't do anything to pages outside valid memory here.
947 * Instead convince the code that enters a new mapping
948 * to overwrite the old one.
949 */
950
951 /* invalidate the PTE */
952 pmap_update_pte(is_ept, pte, PTE_VALID_MASK(is_ept), 0, true);
953 /* propagate invalidate everywhere */
954 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
955 /* remember reference and change */
956 old_pte = *pte;
957 oattr = (char) (old_pte & (PTE_MOD(is_ept) | PTE_REF(is_ept)));
958 /* completely invalidate the PTE */
959 pmap_store_pte(is_ept, pte, PTE_LOCK(is_ept));
960
961 if (IS_MANAGED_PAGE(pai)) {
962 /*
963 * Remove the mapping from the pvlist for
964 * this physical page.
965 * We'll end up with either a rooted pv or a
966 * hashed pv
967 */
968 pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, &old_pte, &was_altacct);
969 }
970
971 if (IS_MANAGED_PAGE(pai)) {
972 pmap_assert(old_pa_locked == TRUE);
973 pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
974 if (pmap != kernel_pmap) {
975 /* update ledgers */
976 if (was_altacct) {
977 assert(IS_INTERNAL_PAGE(pai));
978 pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
979 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
980 } else if (IS_REUSABLE_PAGE(pai)) {
981 assert(!was_altacct);
982 assert(IS_INTERNAL_PAGE(pai));
983 pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
984 /* was already not in phys_footprint */
985 } else if (IS_INTERNAL_PAGE(pai)) {
986 assert(!was_altacct);
987 assert(!IS_REUSABLE_PAGE(pai));
988 pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
989 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
990 } else {
991 /* not an internal page */
992 pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
993 }
994 }
995 if (iswired(*pte)) {
996 pmap_ledger_debit(pmap, task_ledgers.wired_mem,
997 PAGE_SIZE);
998 }
999
1000 if (!is_ept) {
1001 pmap_phys_attributes[pai] |= oattr;
1002 } else {
1003 pmap_phys_attributes[pai] |= ept_refmod_to_physmap(oattr);
1004 }
1005 } else {
1006 /*
1007 * old_pa is not managed.
1008 * Do removal part of accounting.
1009 */
1010
1011 if (pmap != kernel_pmap) {
1012 #if 00
1013 assert(pmap->stats.device > 0);
1014 OSAddAtomic(-1, &pmap->stats.device);
1015 #endif
1016 }
1017 if (iswired(*pte)) {
1018 pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
1019 }
1020 }
1021 }
1022
1023 /*
1024 * if we had a previously managed paged locked, unlock it now
1025 */
1026 if (old_pa_locked) {
1027 UNLOCK_PVH(pai);
1028 old_pa_locked = FALSE;
1029 }
1030
1031 pai = pa_index(pa); /* now working with new incoming phys page */
1032 if (IS_MANAGED_PAGE(pai)) {
1033 /*
1034 * Step 2) Enter the mapping in the PV list for this
1035 * physical page.
1036 */
1037 pv_h = pai_to_pvh(pai);
1038
1039 LOCK_PVH(pai);
1040
1041 if (pv_h->pmap == PMAP_NULL) {
1042 /*
1043 * No mappings yet, use rooted pv
1044 */
1045 pv_h->va_and_flags = vaddr;
1046 pv_h->pmap = pmap;
1047 queue_init(&pv_h->qlink);
1048
1049 if (options & PMAP_OPTIONS_INTERNAL) {
1050 pmap_phys_attributes[pai] |= PHYS_INTERNAL;
1051 } else {
1052 pmap_phys_attributes[pai] &= ~PHYS_INTERNAL;
1053 }
1054 if (options & PMAP_OPTIONS_REUSABLE) {
1055 pmap_phys_attributes[pai] |= PHYS_REUSABLE;
1056 } else {
1057 pmap_phys_attributes[pai] &= ~PHYS_REUSABLE;
1058 }
1059 if ((options & PMAP_OPTIONS_ALT_ACCT) &&
1060 IS_INTERNAL_PAGE(pai)) {
1061 pv_h->va_and_flags |= PVE_IS_ALTACCT;
1062 is_altacct = TRUE;
1063 } else {
1064 pv_h->va_and_flags &= ~PVE_IS_ALTACCT;
1065 is_altacct = FALSE;
1066 }
1067 } else {
1068 /*
1069 * Add new pv_hashed_entry after header.
1070 */
1071 if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
1072 pvh_e = pvh_new;
1073 pvh_new = PV_HASHED_ENTRY_NULL;
1074 } else if (PV_HASHED_ENTRY_NULL == pvh_e) {
1075 PV_HASHED_ALLOC(&pvh_e);
1076 if (PV_HASHED_ENTRY_NULL == pvh_e) {
1077 /*
1078 * the pv list is empty. if we are on
1079 * the kernel pmap we'll use one of
1080 * the special private kernel pv_e's,
1081 * else, we need to unlock
1082 * everything, zalloc a pv_e, and
1083 * restart bringing in the pv_e with
1084 * us.
1085 */
1086 if (kernel_pmap == pmap) {
1087 PV_HASHED_KERN_ALLOC(&pvh_e);
1088 } else {
1089 UNLOCK_PVH(pai);
1090 PTE_LOCK_UNLOCK(pte);
1091 PMAP_UNLOCK_SHARED(pmap);
1092 pmap_pv_throttle(pmap);
1093 pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
1094 goto Retry;
1095 }
1096 }
1097 }
1098
1099 if (PV_HASHED_ENTRY_NULL == pvh_e) {
1100 panic("Mapping alias chain exhaustion, possibly induced by numerous kernel virtual double mappings");
1101 }
1102
1103 pvh_e->va_and_flags = vaddr;
1104 pvh_e->pmap = pmap;
1105 pvh_e->ppn = pn;
1106 if ((options & PMAP_OPTIONS_ALT_ACCT) &&
1107 IS_INTERNAL_PAGE(pai)) {
1108 pvh_e->va_and_flags |= PVE_IS_ALTACCT;
1109 is_altacct = TRUE;
1110 } else {
1111 pvh_e->va_and_flags &= ~PVE_IS_ALTACCT;
1112 is_altacct = FALSE;
1113 }
1114 pv_hash_add(pvh_e, pv_h);
1115
1116 /*
1117 * Remember that we used the pvlist entry.
1118 */
1119 pvh_e = PV_HASHED_ENTRY_NULL;
1120 }
1121
1122 /*
1123 * only count the mapping
1124 * for 'managed memory'
1125 */
1126 pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
1127 if (pmap != kernel_pmap) {
1128 /* update ledgers */
1129 if (is_altacct) {
1130 /* internal but also alternate accounting */
1131 assert(IS_INTERNAL_PAGE(pai));
1132 pmap_ledger_credit(pmap, task_ledgers.internal, PAGE_SIZE);
1133 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
1134 /* alternate accounting, so not in footprint */
1135 } else if (IS_REUSABLE_PAGE(pai)) {
1136 assert(!is_altacct);
1137 assert(IS_INTERNAL_PAGE(pai));
1138 pmap_ledger_credit(pmap, task_ledgers.reusable, PAGE_SIZE);
1139 /* internal but reusable: not in footprint */
1140 } else if (IS_INTERNAL_PAGE(pai)) {
1141 assert(!is_altacct);
1142 assert(!IS_REUSABLE_PAGE(pai));
1143 /* internal: add to footprint */
1144 pmap_ledger_credit(pmap, task_ledgers.internal, PAGE_SIZE);
1145 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
1146 } else {
1147 /* not internal: not in footprint */
1148 pmap_ledger_credit(pmap, task_ledgers.external, PAGE_SIZE);
1149 }
1150 }
1151 } else if (last_managed_page == 0) {
1152 /* Account for early mappings created before "managed pages"
1153 * are determined. Consider consulting the available DRAM map.
1154 */
1155 pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
1156 if (pmap != kernel_pmap) {
1157 #if 00
1158 OSAddAtomic(+1, &pmap->stats.device);
1159 PMAP_STATS_PEAK(pmap->stats.device);
1160 #endif
1161 }
1162 }
1163 /*
1164 * Step 3) Enter the mapping.
1165 *
1166 * Build a template to speed up entering -
1167 * only the pfn changes.
1168 */
1169 template = pa_to_pte(pa);
1170
1171 if (!is_ept) {
1172 template |= INTEL_PTE_VALID;
1173 } else {
1174 template |= INTEL_EPT_IPAT;
1175 }
1176
1177 /*
1178 * DRK: It may be worth asserting on cache attribute flags that diverge
1179 * from the existing physical page attributes.
1180 */
1181
1182 template |= pmap_get_cache_attributes(pa_index(pa), is_ept);
1183
1184 /*
1185 * We don't support passing VM_MEM_NOT_CACHEABLE flags for EPT PTEs
1186 */
1187 if (!is_ept && (flags & VM_MEM_NOT_CACHEABLE)) {
1188 if (!(flags & VM_MEM_GUARDED)) {
1189 template |= INTEL_PTE_PAT;
1190 }
1191 template |= INTEL_PTE_NCACHE;
1192 }
1193 if (pmap != kernel_pmap && !is_ept) {
1194 template |= INTEL_PTE_USER;
1195 }
1196 if (prot & VM_PROT_READ) {
1197 template |= PTE_READ(is_ept);
1198 }
1199 if (prot & VM_PROT_WRITE) {
1200 template |= PTE_WRITE(is_ept);
1201 if (is_ept && !pmap_ept_support_ad) {
1202 template |= PTE_MOD(is_ept);
1203 if (IS_MANAGED_PAGE(pai)) {
1204 pmap_phys_attributes[pai] |= PHYS_MODIFIED;
1205 }
1206 }
1207 }
1208 if (prot & VM_PROT_EXECUTE) {
1209 assert(set_NX == 0);
1210 template = pte_set_ex(template, is_ept);
1211 }
1212 if (__improbable(is_ept && (prot & VM_PROT_UEXEC))) {
1213 assert(set_NX == 0);
1214 template = pte_set_uex(template);
1215 }
1216
1217 if (set_NX) {
1218 template = pte_remove_ex(template, is_ept);
1219 }
1220 if (wired) {
1221 template |= INTEL_PTE_WIRED;
1222 pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
1223 }
1224 if (__improbable(superpage)) {
1225 template |= INTEL_PTE_PS;
1226 }
1227
1228 /* For hardware that doesn't have EPT AD support, we always set REFMOD for EPT PTEs */
1229 if (__improbable(is_ept && !pmap_ept_support_ad)) {
1230 template |= PTE_REF(is_ept);
1231 if (IS_MANAGED_PAGE(pai)) {
1232 pmap_phys_attributes[pai] |= PHYS_REFERENCED;
1233 }
1234 }
1235 template |= PTE_LOCK(is_ept);
1236 pmap_store_pte(is_ept, pte, template);
1237 DTRACE_VM3(set_pte, uint64_t, vaddr, uint64_t, 0, uint64_t, template);
1238
1239 /*
1240 * if this was a managed page we delayed unlocking the pv until here
1241 * to prevent pmap_page_protect et al from finding it until the pte
1242 * has been stored
1243 */
1244 if (IS_MANAGED_PAGE(pai)) {
1245 UNLOCK_PVH(pai);
1246 }
1247 done2:
1248 if (need_tlbflush == TRUE) {
1249 if (options & PMAP_OPTIONS_NOFLUSH) {
1250 PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
1251 } else {
1252 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1253 }
1254 }
1255 if (ptelocked) {
1256 PTE_LOCK_UNLOCK(pte);
1257 }
1258 PMAP_UNLOCK_SHARED(pmap);
1259
1260 if (pvh_e != PV_HASHED_ENTRY_NULL) {
1261 PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
1262 }
1263 if (pvh_new != PV_HASHED_ENTRY_NULL) {
1264 PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
1265 }
1266
1267 if (delpage_pm_obj) {
1268 vm_page_t m;
1269
1270 vm_object_lock(delpage_pm_obj);
1271 m = vm_page_lookup(delpage_pm_obj, (delpage_pde_index * PAGE_SIZE));
1272 if (m == VM_PAGE_NULL) {
1273 panic("pmap_enter: pte page not in object");
1274 }
1275 VM_PAGE_FREE(m);
1276 vm_object_unlock(delpage_pm_obj);
1277 OSAddAtomic(-1, &inuse_ptepages_count);
1278 PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
1279 }
1280
1281 kr = KERN_SUCCESS;
1282 done1:
1283 if (__improbable((kr == KERN_SUCCESS) && (pmap == kernel_pmap) &&
1284 zone_spans_ro_va(vaddr, vaddr + PAGE_SIZE))) {
1285 pmap_page_protect((ppnum_t)atop_kernel(kvtophys(vaddr)), VM_PROT_READ);
1286 }
1287 PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
1288 return kr;
1289 }
1290
1291 /*
1292 * Remove a range of hardware page-table entries.
1293 * The entries given are the first (inclusive)
1294 * and last (exclusive) entries for the VM pages.
1295 * The virtual address is the va for the first pte.
1296 *
1297 * The pmap must be locked.
1298 * If the pmap is not the kernel pmap, the range must lie
1299 * entirely within one pte-page. This is NOT checked.
1300 * Assumes that the pte-page exists.
1301 */
1302
void
pmap_remove_range(
	pmap_t pmap,
	vm_map_offset_t start_vaddr,
	pt_entry_t *spte,
	pt_entry_t *epte)
{
	/*
	 * Convenience wrapper: a plain removal is performed with the
	 * PMAP_OPTIONS_REMOVE option so that "compressed" PTE markers
	 * are accounted for and cleared as well.
	 */
	pmap_remove_range_options(pmap, start_vaddr, spte, epte,
	    PMAP_OPTIONS_REMOVE);
}
1313
/*
 * Worker for pmap_remove_range() / pmap_remove_options(): tear down the
 * hardware PTEs in [spte, epte) mapping virtual addresses starting at
 * start_vaddr, unlink the corresponding PV entries for managed pages,
 * and settle the pmap's ledgers.
 *
 * Two passes are made over the range:
 *   1) invalidate ("freeze") every valid PTE, then shoot down the TLBs,
 *      so no CPU can add new referenced/modified state behind our back;
 *   2) walk again under the per-page PV locks, removing PV entries,
 *      folding ref/mod bits into pmap_phys_attributes[], and zeroing
 *      the PTEs completely.
 *
 * The pmap lock must be held (exclusively) by the caller.
 */
static void
pmap_remove_range_options(
	pmap_t pmap,
	vm_map_offset_t start_vaddr,
	pt_entry_t *spte,
	pt_entry_t *epte,
	int options)
{
	pt_entry_t *cpte;
	pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;	/* tail of reclaimed pv chain */
	pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;	/* head of reclaimed pv chain */
	pv_hashed_entry_t pvh_e;
	int pvh_cnt = 0;
	int num_removed, num_unwired, num_found, num_invalid;
	int ledgers_external, ledgers_reusable, ledgers_internal, ledgers_alt_internal;
	uint64_t ledgers_compressed, ledgers_alt_compressed;
	ppnum_t pai;
	pmap_paddr_t pa;
	vm_map_offset_t vaddr;
	boolean_t is_ept = is_ept_pmap(pmap);
	boolean_t was_altacct;

	num_removed = 0;
	num_unwired = 0;
	num_found = 0;
	num_invalid = 0;
	ledgers_external = 0;
	ledgers_reusable = 0;
	ledgers_internal = 0;
	ledgers_compressed = 0;
	ledgers_alt_internal = 0;
	ledgers_alt_compressed = 0;

	/* invalidate the PTEs first to "freeze" them */
	for (cpte = spte, vaddr = start_vaddr;
	    cpte < epte;
	    cpte++, vaddr += PAGE_SIZE_64) {
		pt_entry_t p = *cpte;

		pa = pte_to_pa(p);
		if (pa == 0) {
			if ((options & PMAP_OPTIONS_REMOVE) &&
			    (PTE_IS_COMPRESSED(p, cpte, pmap, vaddr))) {
				assert(pmap != kernel_pmap);
				/* one less "compressed"... */
				ledgers_compressed++;
				if (p & PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					ledgers_alt_compressed++;
				}
				/* clear marker(s) */
				/* XXX probably does not need to be atomic! */
				pmap_update_pte(is_ept, cpte, INTEL_PTE_COMPRESSED_MASK, 0, true);
			}
			continue;
		}
		num_found++;

		if (iswired(p)) {
			num_unwired++;
		}

		pai = pa_index(pa);

		if (!IS_MANAGED_PAGE(pai)) {
			/*
			 * Outside range of managed physical memory.
			 * Just remove the mappings.
			 */
			pmap_store_pte(is_ept, cpte, 0);
			continue;
		}

		if ((p & PTE_VALID_MASK(is_ept)) == 0) {
			num_invalid++;
		}

		/* invalidate the PTE */
		pmap_update_pte(is_ept, cpte, PTE_VALID_MASK(is_ept), 0, true);
	}

	if (num_found == 0) {
		/* nothing was changed: we're done */
		goto update_counts;
	}

	/* propagate the invalidates to other CPUs */

	PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr);

	/* second pass: unlink PV entries and fully clear the frozen PTEs */
	for (cpte = spte, vaddr = start_vaddr;
	    cpte < epte;
	    cpte++, vaddr += PAGE_SIZE_64) {
		pa = pte_to_pa(*cpte);
		if (pa == 0) {
check_pte_for_compressed_marker:
			/*
			 * This PTE could have been replaced with a
			 * "compressed" marker after our first "freeze"
			 * loop above, so check again.
			 */
			if ((options & PMAP_OPTIONS_REMOVE) &&
			    (PTE_IS_COMPRESSED(*cpte, cpte, pmap, vaddr))) {
				assert(pmap != kernel_pmap);
				/* one less "compressed"... */
				ledgers_compressed++;
				if (*cpte & PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					ledgers_alt_compressed++;
				}
				pmap_store_pte(is_ept, cpte, 0);
			}
			continue;
		}

		pai = pa_index(pa);

		LOCK_PVH(pai);

		/* re-read under the PV lock: the mapping may have been
		 * replaced with a compressed marker while we waited */
		pa = pte_to_pa(*cpte);
		if (pa == 0) {
			UNLOCK_PVH(pai);
			goto check_pte_for_compressed_marker;
		}

		/*
		 * Remove the mapping from the pvlist for this physical page.
		 */
		pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, cpte, &was_altacct);

		num_removed++;
		/* update ledgers */
		if (was_altacct) {
			/* internal and alternate accounting */
			assert(IS_INTERNAL_PAGE(pai));
			ledgers_internal++;
			ledgers_alt_internal++;
		} else if (IS_REUSABLE_PAGE(pai)) {
			/* internal but reusable */
			assert(!was_altacct);
			assert(IS_INTERNAL_PAGE(pai));
			ledgers_reusable++;
		} else if (IS_INTERNAL_PAGE(pai)) {
			/* internal */
			assert(!was_altacct);
			assert(!IS_REUSABLE_PAGE(pai));
			ledgers_internal++;
		} else {
			/* not internal */
			ledgers_external++;
		}

		/*
		 * Get the modify and reference bits, then
		 * nuke the entry in the page table
		 */
		/* remember reference and change */
		if (!is_ept) {
			pmap_phys_attributes[pai] |=
			    *cpte & (PHYS_MODIFIED | PHYS_REFERENCED);
		} else {
			pmap_phys_attributes[pai] |=
			    ept_refmod_to_physmap((*cpte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
		}

		/* completely invalidate the PTE */
		pmap_store_pte(is_ept, cpte, 0);

		UNLOCK_PVH(pai);

		/* chain the reclaimed pv entry for one batched free below */
		if (pvh_e != PV_HASHED_ENTRY_NULL) {
			pvh_e->qlink.next = (queue_entry_t) pvh_eh;
			pvh_eh = pvh_e;

			if (pvh_et == PV_HASHED_ENTRY_NULL) {
				pvh_et = pvh_e;
			}
			pvh_cnt++;
		}
		/* We can encounter at most 'num_found' PTEs for this level
		 * Fewer may be encountered if some were replaced by
		 * compressed markers. No new valid PTEs can be created
		 * since the pmap lock is held exclusively.
		 */
		if (num_removed == num_found) {
			break;
		}
	} /* for loop */

	if (pvh_eh != PV_HASHED_ENTRY_NULL) {
		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
	}
update_counts:
	/*
	 * Update the counts
	 */
#if TESTING
	if (pmap->stats.resident_count < num_removed) {
		panic("pmap_remove_range: resident_count");
	}
#endif
	if (num_removed) {
		pmap_ledger_debit(pmap, task_ledgers.phys_mem, machine_ptob(num_removed));
	}

	if (pmap != kernel_pmap) {
		if (ledgers_external) {
			pmap_ledger_debit(pmap,
			    task_ledgers.external,
			    machine_ptob(ledgers_external));
		}
		if (ledgers_reusable) {
			pmap_ledger_debit(pmap,
			    task_ledgers.reusable,
			    machine_ptob(ledgers_reusable));
		}
		if (ledgers_internal) {
			pmap_ledger_debit(pmap,
			    task_ledgers.internal,
			    machine_ptob(ledgers_internal));
		}
		if (ledgers_compressed) {
			pmap_ledger_debit(pmap,
			    task_ledgers.internal_compressed,
			    machine_ptob(ledgers_compressed));
		}
		if (ledgers_alt_internal) {
			pmap_ledger_debit(pmap,
			    task_ledgers.alternate_accounting,
			    machine_ptob(ledgers_alt_internal));
		}
		if (ledgers_alt_compressed) {
			pmap_ledger_debit(pmap,
			    task_ledgers.alternate_accounting_compressed,
			    machine_ptob(ledgers_alt_compressed));
		}

		/* NOTE(review): the internal counters are int while the
		 * compressed counters are uint64_t; the accounting above
		 * should guarantee alt <= total so neither difference is
		 * negative, but confirm — a negative int difference would
		 * convert to an enormous unsigned debit here. */
		uint64_t net_debit = (ledgers_internal - ledgers_alt_internal) + (ledgers_compressed - ledgers_alt_compressed);
		if (net_debit) {
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, machine_ptob(net_debit));
		}
	}

	if (num_unwired != 0) {
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, machine_ptob(num_unwired));
	}
	return;
}
1562
1563
1564 /*
1565 * Remove the given range of addresses
1566 * from the specified map.
1567 *
1568 * It is assumed that the start and end are properly
1569 * rounded to the hardware page size.
1570 */
void
pmap_remove(
	pmap_t map,
	addr64_t s64,
	addr64_t e64)
{
	/* A plain remove implies PMAP_OPTIONS_REMOVE: compressed-marker
	 * accounting is performed along with the teardown. */
	pmap_remove_options(map, s64, e64, PMAP_OPTIONS_REMOVE);
}
1579 #define PLCHECK_THRESHOLD (2)
1580
/*
 * Remove mappings in [s64, e64) from "map", walking the paging
 * hierarchy at most one PDE's worth of address space (PDE_MAPPED_SIZE)
 * per iteration and handing each populated chunk to
 * pmap_remove_range_options().  Empty PML4/PDPT slots are skipped
 * wholesale.  The exclusive pmap lock is dropped periodically so a
 * pending preemption can be taken during long removals.
 */
void
pmap_remove_options(
	pmap_t map,
	addr64_t s64,
	addr64_t e64,
	int options)
{
	pt_entry_t *pde;
	pt_entry_t *spte, *epte;
	addr64_t l64;
	uint64_t deadline = 0;
	boolean_t is_ept;

	pmap_intr_assert();

	if (map == PMAP_NULL || s64 == e64) {
		return;
	}

	is_ept = is_ept_pmap(map);

	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(map), VM_KERNEL_ADDRHIDE(s64),
	    VM_KERNEL_ADDRHIDE(e64));

	PMAP_LOCK_EXCLUSIVE(map);
	uint32_t traverse_count = 0;

	while (s64 < e64) {
		/* skip whole PML4 slots with nothing mapped beneath them */
		pml4_entry_t *pml4e = pmap64_pml4(map, s64);
		if ((pml4e == NULL) ||
		    ((*pml4e & PTE_VALID_MASK(is_ept)) == 0)) {
			if (os_add_overflow(s64, NBPML4, &s64)) {
				/* wrap; clip s64 to e64 */
				s64 = e64;
				break;
			}
			s64 &= ~(PML4MASK);
			continue;
		}
		/* likewise skip empty PDPT slots */
		pdpt_entry_t *pdpte = pmap64_pdpt(map, s64);
		if ((pdpte == NULL) ||
		    ((*pdpte & PTE_VALID_MASK(is_ept)) == 0)) {
			if (os_add_overflow(s64, NBPDPT, &s64)) {
				/* wrap; clip s64 to e64 */
				s64 = e64;
				break;
			}
			s64 &= ~(PDPTMASK);
			continue;
		}

		/* l64 = end of the current PDE-mapped chunk, clipped to e64 */
		if (os_add_overflow(s64, PDE_MAPPED_SIZE, &l64)) {
			l64 = e64;
		} else {
			l64 &= ~(PDE_MAPPED_SIZE - 1);

			if (l64 > e64) {
				l64 = e64;
			}
		}

		pde = pmap_pde(map, s64);

		if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
			if (*pde & PTE_PS) {
				/*
				 * If we're removing a superpage, pmap_remove_range()
				 * must work on level 2 instead of level 1; and we're
				 * only passing a single level 2 entry instead of a
				 * level 1 range.
				 */
				spte = pde;
				epte = spte + 1; /* excluded */
			} else {
				spte = pmap_pte(map, (s64 & ~(PDE_MAPPED_SIZE - 1)));
				spte = &spte[ptenum(s64)];
				epte = &spte[intel_btop(l64 - s64)];
			}
			pmap_remove_range_options(map, s64, spte, epte,
			    options);
		}
		s64 = l64;

		/*
		 * After a few chunks, start watching the TSC; once we've
		 * held the lock longer than max_preemption_latency_tsc,
		 * briefly drop it so pending preemption can occur.
		 */
		if ((s64 < e64) && (traverse_count++ > PLCHECK_THRESHOLD)) {
			if (deadline == 0) {
				deadline = rdtsc64_nofence() + max_preemption_latency_tsc;
			} else {
				if (rdtsc64_nofence() > deadline) {
					PMAP_UNLOCK_EXCLUSIVE(map);
					__builtin_ia32_pause();
					PMAP_LOCK_EXCLUSIVE(map);
					deadline = rdtsc64_nofence() + max_preemption_latency_tsc;
				}
			}
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(map);

	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}
1683
void
pmap_page_protect(
	ppnum_t pn,
	vm_prot_t prot)
{
	/* Default entry point: no options, no delayed-flush context. */
	pmap_page_protect_options(pn, prot, 0, NULL);
}
1691
1692 /*
1693 * Routine: pmap_page_protect_options
1694 *
1695 * Function:
1696 * Lower the permission for all mappings to a given
1697 * page.
1698 */
1699 void
pmap_page_protect_options(ppnum_t pn,vm_prot_t prot,unsigned int options,void * arg)1700 pmap_page_protect_options(
1701 ppnum_t pn,
1702 vm_prot_t prot,
1703 unsigned int options,
1704 void *arg)
1705 {
1706 pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
1707 pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
1708 pv_hashed_entry_t nexth;
1709 int pvh_cnt = 0;
1710 pv_rooted_entry_t pv_h;
1711 pv_rooted_entry_t pv_e;
1712 pv_hashed_entry_t pvh_e;
1713 pt_entry_t *pte;
1714 int pai;
1715 pmap_t pmap;
1716 boolean_t remove;
1717 pt_entry_t new_pte_value;
1718 boolean_t is_ept;
1719
1720 pmap_intr_assert();
1721 assert(pn != vm_page_fictitious_addr);
1722 if (pn == vm_page_guard_addr) {
1723 return;
1724 }
1725
1726 pai = ppn_to_pai(pn);
1727
1728 if (!IS_MANAGED_PAGE(pai)) {
1729 /*
1730 * Not a managed page.
1731 */
1732 return;
1733 }
1734
1735 PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, pn, prot);
1736
1737 /*
1738 * Determine the new protection.
1739 */
1740 switch (prot) {
1741 case VM_PROT_READ:
1742 case VM_PROT_READ | VM_PROT_EXECUTE:
1743 remove = FALSE;
1744 break;
1745 case VM_PROT_ALL:
1746 return; /* nothing to do */
1747 default:
1748 remove = TRUE;
1749 break;
1750 }
1751
1752 pv_h = pai_to_pvh(pai);
1753
1754 LOCK_PVH(pai);
1755
1756
1757 /*
1758 * Walk down PV list, if any, changing or removing all mappings.
1759 */
1760 if (pv_h->pmap == PMAP_NULL) {
1761 goto done;
1762 }
1763
1764 pv_e = pv_h;
1765 pvh_e = (pv_hashed_entry_t) pv_e; /* cheat */
1766
1767 do {
1768 vm_map_offset_t vaddr;
1769
1770 if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED) &&
1771 (pmap_phys_attributes[pai] & PHYS_MODIFIED)) {
1772 /* page was modified, so it will be compressed */
1773 options &= ~PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1774 options |= PMAP_OPTIONS_COMPRESSOR;
1775 }
1776
1777 pmap = pv_e->pmap;
1778 is_ept = is_ept_pmap(pmap);
1779 vaddr = PVE_VA(pv_e);
1780 pte = pmap_pte(pmap, vaddr);
1781
1782 pmap_assert2((pa_index(pte_to_pa(*pte)) == pn),
1783 "pmap_page_protect: PTE mismatch, pn: 0x%x, pmap: %p, vaddr: 0x%llx, pte: 0x%llx", pn, pmap, vaddr, *pte);
1784
1785 if (0 == pte) {
1786 panic("pmap_page_protect() "
1787 "pmap=%p pn=0x%x vaddr=0x%llx\n",
1788 pmap, pn, vaddr);
1789 }
1790 nexth = (pv_hashed_entry_t) queue_next(&pvh_e->qlink);
1791
1792 /*
1793 * Remove the mapping if new protection is NONE
1794 */
1795 if (remove) {
1796 /* Remove per-pmap wired count */
1797 if (iswired(*pte)) {
1798 pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
1799 }
1800
1801 if (pmap != kernel_pmap &&
1802 (options & PMAP_OPTIONS_COMPRESSOR) &&
1803 IS_INTERNAL_PAGE(pai)) {
1804 assert(!PTE_IS_COMPRESSED(*pte, pte, pmap, vaddr));
1805 /* mark this PTE as having been "compressed" */
1806 new_pte_value = PTE_COMPRESSED;
1807 if (IS_ALTACCT_PAGE(pai, pv_e)) {
1808 new_pte_value |= PTE_COMPRESSED_ALT;
1809 }
1810 } else {
1811 new_pte_value = 0;
1812 }
1813
1814 if (options & PMAP_OPTIONS_NOREFMOD) {
1815 pmap_store_pte(is_ept, pte, new_pte_value);
1816
1817 if (options & PMAP_OPTIONS_NOFLUSH) {
1818 PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
1819 } else {
1820 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1821 }
1822 } else {
1823 /*
1824 * Remove the mapping, collecting dirty bits.
1825 */
1826 pmap_update_pte(is_ept, pte, PTE_VALID_MASK(is_ept), 0, true);
1827
1828 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1829 if (!is_ept) {
1830 pmap_phys_attributes[pai] |=
1831 *pte & (PHYS_MODIFIED | PHYS_REFERENCED);
1832 } else {
1833 pmap_phys_attributes[pai] |=
1834 ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
1835 }
1836 if ((options &
1837 PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED) &&
1838 IS_INTERNAL_PAGE(pai) &&
1839 (pmap_phys_attributes[pai] &
1840 PHYS_MODIFIED)) {
1841 /*
1842 * Page is actually "modified" and
1843 * will be compressed. Start
1844 * accounting for it as "compressed".
1845 */
1846 assert(!(options & PMAP_OPTIONS_COMPRESSOR));
1847 options &= ~PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1848 options |= PMAP_OPTIONS_COMPRESSOR;
1849 assert(new_pte_value == 0);
1850 if (pmap != kernel_pmap) {
1851 new_pte_value = PTE_COMPRESSED;
1852 if (IS_ALTACCT_PAGE(pai, pv_e)) {
1853 new_pte_value |= PTE_COMPRESSED_ALT;
1854 }
1855 }
1856 }
1857 pmap_store_pte(is_ept, pte, new_pte_value);
1858 }
1859
1860 #if TESTING
1861 if (pmap->stats.resident_count < 1) {
1862 panic("pmap_page_protect: resident_count");
1863 }
1864 #endif
1865 pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
1866
1867 /*
1868 * We only ever compress internal pages.
1869 */
1870 if (options & PMAP_OPTIONS_COMPRESSOR) {
1871 assert(IS_INTERNAL_PAGE(pai));
1872 }
1873 if (pmap != kernel_pmap) {
1874 /* update ledgers */
1875 if (IS_ALTACCT_PAGE(pai, pv_e)) {
1876 assert(IS_INTERNAL_PAGE(pai));
1877 pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
1878 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
1879 if (options & PMAP_OPTIONS_COMPRESSOR) {
1880 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
1881 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, PAGE_SIZE);
1882 }
1883 } else if (IS_REUSABLE_PAGE(pai)) {
1884 assert(!IS_ALTACCT_PAGE(pai, pv_e));
1885 assert(IS_INTERNAL_PAGE(pai));
1886 if (options & PMAP_OPTIONS_COMPRESSOR) {
1887 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
1888 /* was not in footprint, but is now */
1889 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
1890 }
1891 pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
1892 } else if (IS_INTERNAL_PAGE(pai)) {
1893 assert(!IS_ALTACCT_PAGE(pai, pv_e));
1894 assert(!IS_REUSABLE_PAGE(pai));
1895 pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
1896 /*
1897 * Update all stats related to physical
1898 * footprint, which only deals with
1899 * internal pages.
1900 */
1901 if (options & PMAP_OPTIONS_COMPRESSOR) {
1902 /*
1903 * This removal is only being
1904 * done so we can send this page
1905 * to the compressor; therefore
1906 * it mustn't affect total task
1907 * footprint.
1908 */
1909 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
1910 } else {
1911 /*
1912 * This internal page isn't
1913 * going to the compressor,
1914 * so adjust stats to keep
1915 * phys_footprint up to date.
1916 */
1917 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
1918 }
1919 } else {
1920 pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
1921 }
1922 }
1923
1924 /*
1925 * Deal with the pv_rooted_entry.
1926 */
1927
1928 if (pv_e == pv_h) {
1929 /*
1930 * Fix up head later.
1931 */
1932 pv_h->pmap = PMAP_NULL;
1933 } else {
1934 /*
1935 * Delete this entry.
1936 */
1937 pv_hash_remove(pvh_e);
1938 pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1939 pvh_eh = pvh_e;
1940
1941 if (pvh_et == PV_HASHED_ENTRY_NULL) {
1942 pvh_et = pvh_e;
1943 }
1944 pvh_cnt++;
1945 }
1946 } else {
1947 /*
1948 * Write-protect, after opportunistic refmod collect
1949 */
1950 if (!is_ept) {
1951 pmap_phys_attributes[pai] |=
1952 *pte & (PHYS_MODIFIED | PHYS_REFERENCED);
1953 } else {
1954 pmap_phys_attributes[pai] |=
1955 ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
1956 }
1957
1958 pmap_update_pte(is_ept, pte, PTE_WRITE(is_ept), 0, true);
1959 if (options & PMAP_OPTIONS_NOFLUSH) {
1960 PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
1961 } else {
1962 PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1963 }
1964 }
1965 pvh_e = nexth;
1966 } while ((pv_e = (pv_rooted_entry_t) nexth) != pv_h);
1967
1968
1969 /*
1970 * If pv_head mapping was removed, fix it up.
1971 */
1972 if (pv_h->pmap == PMAP_NULL) {
1973 pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
1974
1975 if (pvh_e != (pv_hashed_entry_t) pv_h) {
1976 pv_hash_remove(pvh_e);
1977 pv_h->pmap = pvh_e->pmap;
1978 pv_h->va_and_flags = pvh_e->va_and_flags;
1979 pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1980 pvh_eh = pvh_e;
1981
1982 if (pvh_et == PV_HASHED_ENTRY_NULL) {
1983 pvh_et = pvh_e;
1984 }
1985 pvh_cnt++;
1986 }
1987 }
1988 if (pvh_eh != PV_HASHED_ENTRY_NULL) {
1989 PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
1990 }
1991 done:
1992 UNLOCK_PVH(pai);
1993
1994 PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
1995 }
1996
1997
1998 /*
1999 * Clear specified attribute bits.
2000 */
2001 void
phys_attribute_clear(ppnum_t pn,int bits,unsigned int options,void * arg)2002 phys_attribute_clear(
2003 ppnum_t pn,
2004 int bits,
2005 unsigned int options,
2006 void *arg)
2007 {
2008 pv_rooted_entry_t pv_h;
2009 pv_hashed_entry_t pv_e;
2010 pt_entry_t *pte = NULL;
2011 int pai;
2012 pmap_t pmap;
2013 char attributes = 0;
2014 boolean_t is_internal, is_reusable, is_altacct, is_ept;
2015 int ept_bits_to_clear;
2016 boolean_t ept_keep_global_mod = FALSE;
2017
2018 if ((bits & PHYS_MODIFIED) &&
2019 (options & PMAP_OPTIONS_NOFLUSH) &&
2020 arg == NULL) {
2021 panic("phys_attribute_clear(0x%x,0x%x,0x%x,%p): "
2022 "should not clear 'modified' without flushing TLBs\n",
2023 pn, bits, options, arg);
2024 }
2025
2026 /* We only support converting MOD and REF bits for EPT PTEs in this function */
2027 assert((bits & ~(PHYS_REFERENCED | PHYS_MODIFIED)) == 0);
2028
2029 ept_bits_to_clear = (unsigned)physmap_refmod_to_ept(bits & (PHYS_MODIFIED | PHYS_REFERENCED));
2030
2031 pmap_intr_assert();
2032 assert(pn != vm_page_fictitious_addr);
2033 if (pn == vm_page_guard_addr) {
2034 return;
2035 }
2036
2037 pai = ppn_to_pai(pn);
2038
2039 if (!IS_MANAGED_PAGE(pai)) {
2040 /*
2041 * Not a managed page.
2042 */
2043 return;
2044 }
2045
2046 PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);
2047
2048 pv_h = pai_to_pvh(pai);
2049
2050 LOCK_PVH(pai);
2051
2052
2053 /*
2054 * Walk down PV list, clearing all modify or reference bits.
2055 * We do not have to lock the pv_list because we have
2056 * the per-pmap lock
2057 */
2058 if (pv_h->pmap != PMAP_NULL) {
2059 /*
2060 * There are some mappings.
2061 */
2062
2063 is_internal = IS_INTERNAL_PAGE(pai);
2064 is_reusable = IS_REUSABLE_PAGE(pai);
2065
2066 pv_e = (pv_hashed_entry_t)pv_h;
2067
2068 do {
2069 vm_map_offset_t va;
2070 char pte_bits;
2071
2072 pmap = pv_e->pmap;
2073 is_ept = is_ept_pmap(pmap);
2074 is_altacct = IS_ALTACCT_PAGE(pai, pv_e);
2075 va = PVE_VA(pv_e);
2076 pte_bits = 0;
2077
2078 if (bits) {
2079 pte = pmap_pte(pmap, va);
2080 /* grab ref/mod bits from this PTE */
2081 pte_bits = (*pte & (PTE_REF(is_ept) | PTE_MOD(is_ept)));
2082 /* propagate to page's global attributes */
2083 if (!is_ept) {
2084 attributes |= pte_bits;
2085 } else {
2086 attributes |= ept_refmod_to_physmap(pte_bits);
2087 if (!pmap_ept_support_ad && (pte_bits & INTEL_EPT_MOD)) {
2088 ept_keep_global_mod = TRUE;
2089 }
2090 }
2091 /* which bits to clear for this PTE? */
2092 if (!is_ept) {
2093 pte_bits &= bits;
2094 } else {
2095 pte_bits &= ept_bits_to_clear;
2096 }
2097 }
2098 if (options & PMAP_OPTIONS_CLEAR_WRITE) {
2099 pte_bits |= PTE_WRITE(is_ept);
2100 }
2101
2102 /*
2103 * Clear modify and/or reference bits.
2104 */
2105 if (pte_bits) {
2106 pmap_update_pte(is_ept, pte, pte_bits, 0, true);
2107
2108 /* Ensure all processors using this translation
2109 * invalidate this TLB entry. The invalidation
2110 * *must* follow the PTE update, to ensure that
2111 * the TLB shadow of the 'D' bit (in particular)
2112 * is synchronized with the updated PTE.
2113 */
2114 if (!(options & PMAP_OPTIONS_NOFLUSH)) {
2115 /* flush TLBS now */
2116 PMAP_UPDATE_TLBS(pmap,
2117 va,
2118 va + PAGE_SIZE);
2119 } else if (arg) {
2120 /* delayed TLB flush: add "pmap" info */
2121 PMAP_UPDATE_TLBS_DELAYED(
2122 pmap,
2123 va,
2124 va + PAGE_SIZE,
2125 (pmap_flush_context *)arg);
2126 } else {
2127 /* no TLB flushing at all */
2128 }
2129 }
2130
2131 /* update pmap "reusable" stats */
2132 if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
2133 is_reusable &&
2134 pmap != kernel_pmap) {
2135 /* one less "reusable" */
2136 pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
2137 if (is_internal) {
2138 /* one more "internal" */
2139 if (is_altacct) {
2140 /* no impact on ledgers */
2141 } else {
2142 pmap_ledger_credit(pmap,
2143 task_ledgers.internal,
2144 PAGE_SIZE);
2145 pmap_ledger_credit(
2146 pmap,
2147 task_ledgers.phys_footprint,
2148 PAGE_SIZE);
2149 }
2150 } else {
2151 /* one more "external" */
2152 pmap_ledger_credit(pmap, task_ledgers.external, PAGE_SIZE);
2153 }
2154 } else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
2155 !is_reusable &&
2156 pmap != kernel_pmap) {
2157 /* one more "reusable" */
2158 pmap_ledger_credit(pmap, task_ledgers.reusable, PAGE_SIZE);
2159 if (is_internal) {
2160 /* one less "internal" */
2161 if (is_altacct) {
2162 /* no impact on footprint */
2163 } else {
2164 pmap_ledger_debit(pmap,
2165 task_ledgers.internal,
2166 PAGE_SIZE);
2167 pmap_ledger_debit(
2168 pmap,
2169 task_ledgers.phys_footprint,
2170 PAGE_SIZE);
2171 }
2172 } else {
2173 /* one less "external" */
2174 pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
2175 }
2176 }
2177
2178 pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
2179 } while (pv_e != (pv_hashed_entry_t)pv_h);
2180 }
2181 /* Opportunistic refmod collection, annulled
2182 * if both REF and MOD are being cleared.
2183 */
2184
2185 pmap_phys_attributes[pai] |= attributes;
2186
2187 if (ept_keep_global_mod) {
2188 /*
2189 * If the hardware doesn't support AD bits for EPT PTEs and someone is
2190 * requesting that we clear the modified bit for a phys page, we need
2191 * to ensure that there are no EPT mappings for the page with the
2192 * modified bit set. If there are, we cannot clear the global modified bit.
2193 */
2194 bits &= ~PHYS_MODIFIED;
2195 }
2196 pmap_phys_attributes[pai] &= ~(bits);
2197
2198 /* update this page's "reusable" status */
2199 if (options & PMAP_OPTIONS_CLEAR_REUSABLE) {
2200 pmap_phys_attributes[pai] &= ~PHYS_REUSABLE;
2201 } else if (options & PMAP_OPTIONS_SET_REUSABLE) {
2202 pmap_phys_attributes[pai] |= PHYS_REUSABLE;
2203 }
2204
2205 UNLOCK_PVH(pai);
2206
2207 PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
2208 }
2209
/*
 * Check specified attribute bits.
 *
 * Returns the subset of "bits" (PHYS_MODIFIED and/or PHYS_REFERENCED)
 * currently set for physical page "pn".  The cached software attribute
 * array is consulted first; if not all requested bits are cached, the
 * page's PV list is walked to collect hardware ref/mod bits from each
 * live mapping, stopping early once all requested bits are found.
 */
int
phys_attribute_test(
	ppnum_t         pn,
	int             bits)
{
	pv_rooted_entry_t       pv_h;
	pv_hashed_entry_t       pv_e;
	pt_entry_t              *pte;
	int                     pai;
	pmap_t                  pmap;
	int                     attributes = 0;
	boolean_t               is_ept;

	pmap_intr_assert();
	assert(pn != vm_page_fictitious_addr);
	/* Only the ref/mod bits may be queried through this interface. */
	assert((bits & ~(PHYS_MODIFIED | PHYS_REFERENCED)) == 0);
	if (pn == vm_page_guard_addr) {
		return 0;
	}

	pai = ppn_to_pai(pn);

	if (!IS_MANAGED_PAGE(pai)) {
		/*
		 * Not a managed page.
		 */
		return 0;
	}

	/*
	 * Fast check... if bits already collected
	 * no need to take any locks...
	 * if not set, we need to recheck after taking
	 * the lock in case they got pulled in while
	 * we were waiting for the lock
	 */
	if ((pmap_phys_attributes[pai] & bits) == bits) {
		return bits;
	}

	pv_h = pai_to_pvh(pai);

	LOCK_PVH(pai);

	attributes = pmap_phys_attributes[pai] & bits;


	/*
	 * Walk down PV list, checking the mappings until we
	 * reach the end or we've found the desired attributes.
	 */
	if (attributes != bits &&
	    pv_h->pmap != PMAP_NULL) {
		/*
		 * There are some mappings.
		 */
		pv_e = (pv_hashed_entry_t)pv_h;
		do {
			vm_map_offset_t va;

			pmap = pv_e->pmap;
			is_ept = is_ept_pmap(pmap);
			va = PVE_VA(pv_e);
			/*
			 * pick up modify and/or reference bits from mapping
			 */

			/*
			 * NOTE(review): pte is dereferenced without a NULL
			 * check — assumes a PV entry always implies a live
			 * page-table entry; confirm no path can leave a
			 * stale PV entry behind.
			 */
			pte = pmap_pte(pmap, va);
			if (!is_ept) {
				attributes |= (int)(*pte & bits);
			} else {
				/* EPT encodes ref/mod bits differently; translate. */
				attributes |= (int)(ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED));
			}

			pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
		} while ((attributes != bits) &&
		    (pv_e != (pv_hashed_entry_t)pv_h));
	}
	/* Cache what was learned so the next query can take the fast path. */
	pmap_phys_attributes[pai] |= attributes;

	UNLOCK_PVH(pai);
	return attributes;
}
2296
2297 /*
2298 * Routine: pmap_change_wiring
2299 * Function: Change the wiring attribute for a map/virtual-address
2300 * pair.
2301 * In/out conditions:
2302 * The mapping must already exist in the pmap.
2303 */
2304 void
pmap_change_wiring(pmap_t map,vm_map_offset_t vaddr,boolean_t wired)2305 pmap_change_wiring(
2306 pmap_t map,
2307 vm_map_offset_t vaddr,
2308 boolean_t wired)
2309 {
2310 pt_entry_t *pte;
2311
2312 PMAP_LOCK_SHARED(map);
2313
2314 if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL) {
2315 panic("pmap_change_wiring(%p,0x%llx,%d): pte missing",
2316 map, vaddr, wired);
2317 }
2318
2319 if (wired && !iswired(*pte)) {
2320 /*
2321 * wiring down mapping
2322 */
2323 pmap_ledger_credit(map, task_ledgers.wired_mem, PAGE_SIZE);
2324 pmap_update_pte(is_ept_pmap(map), pte, 0, PTE_WIRED, false);
2325 } else if (!wired && iswired(*pte)) {
2326 /*
2327 * unwiring mapping
2328 */
2329 pmap_ledger_debit(map, task_ledgers.wired_mem, PAGE_SIZE);
2330 pmap_update_pte(is_ept_pmap(map), pte, PTE_WIRED, 0, false);
2331 }
2332
2333 PMAP_UNLOCK_SHARED(map);
2334 }
2335
2336 /*
2337 * "Backdoor" direct map routine for early mappings.
2338 * Useful for mapping memory outside the range
2339 * Sets A, D and NC if requested
2340 */
2341
2342 vm_offset_t
pmap_map_bd(vm_offset_t virt,vm_map_offset_t start_addr,vm_map_offset_t end_addr,vm_prot_t prot,unsigned int flags)2343 pmap_map_bd(
2344 vm_offset_t virt,
2345 vm_map_offset_t start_addr,
2346 vm_map_offset_t end_addr,
2347 vm_prot_t prot,
2348 unsigned int flags)
2349 {
2350 pt_entry_t template;
2351 pt_entry_t *ptep;
2352
2353 vm_offset_t base = virt;
2354 boolean_t doflush = FALSE;
2355
2356 template = pa_to_pte(start_addr)
2357 | INTEL_PTE_REF
2358 | INTEL_PTE_MOD
2359 | INTEL_PTE_WIRED
2360 | INTEL_PTE_VALID;
2361
2362 if ((flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)) == VM_MEM_NOT_CACHEABLE) {
2363 template |= INTEL_PTE_NCACHE;
2364 if (!(flags & (VM_MEM_GUARDED))) {
2365 template |= INTEL_PTE_PAT;
2366 }
2367 }
2368
2369 if ((prot & VM_PROT_EXECUTE) == 0) {
2370 template |= INTEL_PTE_NX;
2371 }
2372
2373 if (prot & VM_PROT_WRITE) {
2374 template |= INTEL_PTE_WRITE;
2375 }
2376 vm_map_offset_t caddr = start_addr;
2377 while (caddr < end_addr) {
2378 ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)virt);
2379 if (ptep == PT_ENTRY_NULL) {
2380 panic("pmap_map_bd: Invalid kernel address");
2381 }
2382 if (pte_to_pa(*ptep)) {
2383 doflush = TRUE;
2384 }
2385 pmap_store_pte(FALSE, ptep, template);
2386 pte_increment_pa(template);
2387 virt += PAGE_SIZE;
2388 caddr += PAGE_SIZE;
2389 }
2390 if (doflush) {
2391 pmap_tlbi_range(0, ~0ULL, true, 0);
2392 PMAP_UPDATE_TLBS(kernel_pmap, base, base + end_addr - start_addr);
2393 }
2394 return virt;
2395 }
2396
2397 /* Create a virtual alias beginning at 'ava' of the specified kernel virtual
2398 * range. The aliased pagetable range is expanded if
2399 * PMAP_EXPAND_OPTIONS_ALIASMAP is specified. Performs no synchronization,
2400 * assumes caller has stabilized the source and destination ranges. Currently
2401 * used to populate sections of the trampoline "doublemap" at CPU startup.
2402 */
2403
2404 void
pmap_alias(vm_offset_t ava,vm_map_offset_t start_addr,vm_map_offset_t end_addr,vm_prot_t prot,unsigned int eoptions)2405 pmap_alias(
2406 vm_offset_t ava,
2407 vm_map_offset_t start_addr,
2408 vm_map_offset_t end_addr,
2409 vm_prot_t prot,
2410 unsigned int eoptions)
2411 {
2412 pt_entry_t prot_template, template;
2413 pt_entry_t *aptep, *sptep;
2414
2415 prot_template = INTEL_PTE_REF | INTEL_PTE_MOD | INTEL_PTE_WIRED | INTEL_PTE_VALID;
2416 if ((prot & VM_PROT_EXECUTE) == 0) {
2417 prot_template |= INTEL_PTE_NX;
2418 }
2419
2420 if (prot & VM_PROT_WRITE) {
2421 prot_template |= INTEL_PTE_WRITE;
2422 }
2423 assert(((start_addr | end_addr) & PAGE_MASK) == 0);
2424 while (start_addr < end_addr) {
2425 aptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ava);
2426 if (aptep == PT_ENTRY_NULL) {
2427 if (eoptions & PMAP_EXPAND_OPTIONS_ALIASMAP) {
2428 pmap_expand(kernel_pmap, ava, PMAP_EXPAND_OPTIONS_ALIASMAP);
2429 aptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ava);
2430 } else {
2431 panic("pmap_alias: Invalid alias address");
2432 }
2433 }
2434 /* The aliased range should not have any active mappings */
2435 assert(pte_to_pa(*aptep) == 0);
2436
2437 sptep = pmap_pte(kernel_pmap, start_addr);
2438 assert(sptep != PT_ENTRY_NULL && (pte_to_pa(*sptep) != 0));
2439 template = pa_to_pte(pte_to_pa(*sptep)) | prot_template;
2440 pmap_store_pte(FALSE, aptep, template);
2441
2442 ava += PAGE_SIZE;
2443 start_addr += PAGE_SIZE;
2444 }
2445 }
2446
/*
 * Count the resident bytes mapped in [s64, e64) of "pmap", and optionally
 * (via compressed_bytes_p) the bytes whose pages live in the compressor.
 * Walks the page tables one PDE worth of address space at a time and
 * periodically drops the pmap lock so long traversals do not hold off
 * preemption indefinitely.
 */
mach_vm_size_t
pmap_query_resident(
	pmap_t          pmap,
	addr64_t        s64,
	addr64_t        e64,
	mach_vm_size_t  *compressed_bytes_p)
{
	pt_entry_t      *pde;
	pt_entry_t      *spte, *epte;
	addr64_t        l64;
	uint64_t        deadline = 0;
	mach_vm_size_t  resident_bytes;
	mach_vm_size_t  compressed_bytes;
	boolean_t       is_ept;

	pmap_intr_assert();

	/* Nothing to report for the kernel pmap or an empty range. */
	if (pmap == PMAP_NULL || pmap == kernel_pmap || s64 == e64) {
		if (compressed_bytes_p) {
			*compressed_bytes_p = 0;
		}
		return 0;
	}

	is_ept = is_ept_pmap(pmap);

	PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(s64),
	    VM_KERNEL_ADDRHIDE(e64));

	resident_bytes = 0;
	compressed_bytes = 0;

	PMAP_LOCK_EXCLUSIVE(pmap);
	uint32_t traverse_count = 0;

	while (s64 < e64) {
		/*
		 * Clamp this iteration's limit (l64) to the next PDE
		 * boundary, or to e64, guarding against address overflow.
		 */
		if (os_add_overflow(s64, PDE_MAPPED_SIZE, &l64)) {
			l64 = e64;
		} else {
			l64 &= ~(PDE_MAPPED_SIZE - 1);

			if (l64 > e64) {
				l64 = e64;
			}
		}

		pde = pmap_pde(pmap, s64);

		if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
			if (*pde & PTE_PS) {
				/* superpage: not supported */
			} else {
				/* Scan the PTEs covering [s64, l64). */
				spte = pmap_pte(pmap,
				    (s64 & ~(PDE_MAPPED_SIZE - 1)));
				spte = &spte[ptenum(s64)];
				epte = &spte[intel_btop(l64 - s64)];

				for (; spte < epte; spte++) {
					if (pte_to_pa(*spte) != 0) {
						resident_bytes += PAGE_SIZE;
					} else if (*spte & PTE_COMPRESSED) {
						compressed_bytes += PAGE_SIZE;
					}
				}
			}
		}
		s64 = l64;

		/*
		 * After PLCHECK_THRESHOLD iterations, start watching the
		 * TSC and briefly drop the lock whenever the preemption
		 * latency budget is exceeded.
		 */
		if ((s64 < e64) && (traverse_count++ > PLCHECK_THRESHOLD)) {
			if (deadline == 0) {
				deadline = rdtsc64() + max_preemption_latency_tsc;
			} else {
				if (rdtsc64() > deadline) {
					PMAP_UNLOCK_EXCLUSIVE(pmap);
					__builtin_ia32_pause();
					PMAP_LOCK_EXCLUSIVE(pmap);
					deadline = rdtsc64() + max_preemption_latency_tsc;
				}
			}
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(pmap);

	PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
	    resident_bytes);

	if (compressed_bytes_p) {
		*compressed_bytes_p = compressed_bytes;
	}
	return resident_bytes;
}
2540
2541 kern_return_t
pmap_query_page_info(pmap_t pmap,vm_map_offset_t va,int * disp_p)2542 pmap_query_page_info(
2543 pmap_t pmap,
2544 vm_map_offset_t va,
2545 int *disp_p)
2546 {
2547 int disp;
2548 boolean_t is_ept;
2549 pmap_paddr_t pa;
2550 ppnum_t pai;
2551 pd_entry_t *pde;
2552 pt_entry_t *pte;
2553
2554 pmap_intr_assert();
2555 if (pmap == PMAP_NULL || pmap == kernel_pmap) {
2556 *disp_p = 0;
2557 return KERN_INVALID_ARGUMENT;
2558 }
2559
2560 disp = 0;
2561 is_ept = is_ept_pmap(pmap);
2562
2563 PMAP_LOCK_EXCLUSIVE(pmap);
2564
2565 pde = pmap_pde(pmap, va);
2566 if (!pde ||
2567 !(*pde & PTE_VALID_MASK(is_ept)) ||
2568 (*pde & PTE_PS)) {
2569 goto done;
2570 }
2571
2572 pte = pmap_pte(pmap, va);
2573 if (pte == PT_ENTRY_NULL) {
2574 goto done;
2575 }
2576
2577 pa = pte_to_pa(*pte);
2578 if (pa == 0) {
2579 if (PTE_IS_COMPRESSED(*pte, pte, pmap, va)) {
2580 disp |= PMAP_QUERY_PAGE_COMPRESSED;
2581 if (*pte & PTE_COMPRESSED_ALT) {
2582 disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
2583 }
2584 }
2585 } else {
2586 disp |= PMAP_QUERY_PAGE_PRESENT;
2587 pai = pa_index(pa);
2588 if (!IS_MANAGED_PAGE(pai)) {
2589 } else if (pmap_pv_is_altacct(pmap, va, pai)) {
2590 assert(IS_INTERNAL_PAGE(pai));
2591 disp |= PMAP_QUERY_PAGE_INTERNAL;
2592 disp |= PMAP_QUERY_PAGE_ALTACCT;
2593 } else if (IS_REUSABLE_PAGE(pai)) {
2594 disp |= PMAP_QUERY_PAGE_REUSABLE;
2595 } else if (IS_INTERNAL_PAGE(pai)) {
2596 disp |= PMAP_QUERY_PAGE_INTERNAL;
2597 }
2598 }
2599
2600 done:
2601 PMAP_UNLOCK_EXCLUSIVE(pmap);
2602 *disp_p = disp;
2603 return KERN_SUCCESS;
2604 }
2605
/*
 * Record whether the VM map backed by this pmap has code-signing
 * enforcement enabled.  Taken exclusively so the flag update is
 * serialized with other pmap state changes.
 */
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
	PMAP_LOCK_EXCLUSIVE(pmap);
	pmap->pm_vm_map_cs_enforced = new_value;
	PMAP_UNLOCK_EXCLUSIVE(pmap);
}
2615 extern int cs_process_enforcement_enable;
2616 bool
pmap_get_vm_map_cs_enforced(pmap_t pmap)2617 pmap_get_vm_map_cs_enforced(
2618 pmap_t pmap)
2619 {
2620 if (cs_process_enforcement_enable) {
2621 return true;
2622 }
2623 return pmap->pm_vm_map_cs_enforced;
2624 }
2625
2626 void
pmap_set_jit_entitled(__unused pmap_t pmap)2627 pmap_set_jit_entitled(__unused pmap_t pmap)
2628 {
2629 /* The x86 pmap layer does not care if a map has a JIT entry. */
2630 return;
2631 }
2632
2633 bool
pmap_get_jit_entitled(__unused pmap_t pmap)2634 pmap_get_jit_entitled(__unused pmap_t pmap)
2635 {
2636 /* The x86 pmap layer does not care if a map is using JIT. */
2637 return false;
2638 }
2639
2640 bool
pmap_has_prot_policy(__unused pmap_t pmap,__unused bool translated_allow_execute,__unused vm_prot_t prot)2641 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
2642 {
2643 /*
2644 * The x86 pmap layer does not apply any policy to any protection
2645 * types.
2646 */
2647 return false;
2648 }
2649
2650 uint64_t
pmap_release_pages_fast(void)2651 pmap_release_pages_fast(void)
2652 {
2653 return 0;
2654 }
2655
2656 void
pmap_trim(__unused pmap_t grand,__unused pmap_t subord,__unused addr64_t vstart,__unused uint64_t size)2657 pmap_trim(__unused pmap_t grand, __unused pmap_t subord, __unused addr64_t vstart, __unused uint64_t size)
2658 {
2659 return;
2660 }
2661
2662 __dead2
2663 void
pmap_ledger_verify_size(size_t size)2664 pmap_ledger_verify_size(size_t size)
2665 {
2666 panic("%s: unsupported, "
2667 "size=%lu",
2668 __func__, size);
2669 }
2670
2671 __dead2
2672 ledger_t
pmap_ledger_alloc(void)2673 pmap_ledger_alloc(void)
2674 {
2675 panic("%s: unsupported",
2676 __func__);
2677 }
2678
2679 __dead2
2680 void
pmap_ledger_free(ledger_t ledger)2681 pmap_ledger_free(ledger_t ledger)
2682 {
2683 panic("%s: unsupported, "
2684 "ledger=%p",
2685 __func__, ledger);
2686 }
2687
2688 kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused,void * bufp __unused,void * buf_end __unused,unsigned int level_mask __unused,size_t * bytes_copied __unused)2689 pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
2690 unsigned int level_mask __unused, size_t *bytes_copied __unused)
2691 {
2692 return KERN_NOT_SUPPORTED;
2693 }
2694
2695 void *
pmap_map_compressor_page(ppnum_t pn)2696 pmap_map_compressor_page(ppnum_t pn)
2697 {
2698 assertf(IS_MANAGED_PAGE(ppn_to_pai(pn)), "%s called on non-managed page 0x%08x", __func__, pn);
2699 return PHYSMAP_PTOV((uint64_t)pn << (uint64_t)PAGE_SHIFT);
2700 }
2701
2702 void
pmap_unmap_compressor_page(ppnum_t pn __unused,void * kva __unused)2703 pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
2704 {
2705 }
2706
2707 bool
pmap_clear_refmod_range_options(pmap_t pmap __unused,vm_map_address_t start __unused,vm_map_address_t end __unused,unsigned int mask __unused,unsigned int options __unused)2708 pmap_clear_refmod_range_options(
2709 pmap_t pmap __unused,
2710 vm_map_address_t start __unused,
2711 vm_map_address_t end __unused,
2712 unsigned int mask __unused,
2713 unsigned int options __unused)
2714 {
2715 /*
2716 * x86 doesn't have ranged tlbi instructions, and we already have
2717 * the pmap_flush_context. This operation isn't implemented.
2718 */
2719 return false;
2720 }
2721
2722 bool
pmap_supported_feature(pmap_t pmap,pmap_feature_flags_t feat)2723 pmap_supported_feature(pmap_t pmap, pmap_feature_flags_t feat)
2724 {
2725 switch (feat) {
2726 case PMAP_FEAT_UEXEC:
2727 return pmap != NULL && is_ept_pmap(pmap);
2728 default:
2729 return false;
2730 }
2731 }
2732