xref: /xnu-8020.140.41/osfmk/i386/pmap_x86_common.c (revision 27b03b360a988dfd3dfdf34262bb0042026747cc)
/*
 * Copyright (c) 2000-2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
28 
#include <mach_assert.h>

#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <kern/ledger.h>
#include <kern/zalloc_internal.h>
#include <i386/pmap_internal.h>
37 
38 void            pmap_remove_range(
39 	pmap_t          pmap,
40 	vm_map_offset_t va,
41 	pt_entry_t      *spte,
42 	pt_entry_t      *epte);
43 
44 static void            pmap_remove_range_options(
45 	pmap_t          pmap,
46 	vm_map_offset_t va,
47 	pt_entry_t      *spte,
48 	pt_entry_t      *epte,
49 	int             options);
50 
51 void            pmap_reusable_range(
52 	pmap_t          pmap,
53 	vm_map_offset_t va,
54 	pt_entry_t      *spte,
55 	pt_entry_t      *epte,
56 	boolean_t       reusable);
57 
58 pt_entry_t *PTE_corrupted_ptr;
59 
60 #if DEVELOPMENT || DEBUG
61 int pmap_inject_pte_corruption;
62 uint32_t pmap_update_clear_pte_count;
63 uint32_t pmap_update_invalid_pte_count;
64 #endif

/*
 * The Intel platform can nest at the PDE level, so NBPDE (i.e. 2MB) at a time,
 * on a NBPDE boundary.
 */

71 uint64_t
pmap_shared_region_size_min(__unused pmap_t pmap)72 pmap_shared_region_size_min(__unused pmap_t pmap)
73 {
74 	return NBPDE;
75 }
76 
77 uint64_t
pmap_commpage_size_min(__unused pmap_t pmap)78 pmap_commpage_size_min(__unused pmap_t pmap)
79 {
80 	return NBPDE;
81 }
82 
/*
 *	kern_return_t pmap_nest(grand, subord, va_start, size)
 *
 *	grand  = the pmap that we will nest subord into
 *	subord = the pmap that goes into the grand
 *	va_start  = start of range in pmap to be inserted
 *	size   = Size of nest area (up to 16TB)
 *
 *	Inserts a pmap into another.  This is used to implement shared segments.
 *
 *	Note that we depend upon higher level VM locks to insure that things don't change while
 *	we are doing this.  For example, VM should not be doing any pmap enters while it is nesting
 *	or do 2 nests at once.
 */

/*
 * This routine can nest subtrees either at the PDPT level (1GiB) or at the
 * PDE level (2MiB). We currently disallow disparate offsets for the "subord"
 * container and the "grand" parent. A minor optimization to consider for the
 * future: make the "subord" truly a container rather than a full-fledged
 * pagetable hierarchy which can be unnecessarily sparse (DRK).
 */
105 
106 kern_return_t
pmap_nest(pmap_t grand,pmap_t subord,addr64_t va_start,uint64_t size)107 pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, uint64_t size)
108 {
109 	vm_map_offset_t vaddr;
110 	pd_entry_t      *pde, *npde;
111 	unsigned int    i;
112 	uint64_t        num_pde;
113 
114 	assert(!is_ept_pmap(grand));
115 	assert(!is_ept_pmap(subord));
116 
117 	if ((size & (pmap_shared_region_size_min(grand) - 1)) ||
118 	    (va_start & (pmap_shared_region_size_min(grand) - 1)) ||
119 	    ((size >> 28) > 65536)) {   /* Max size we can nest is 16TB */
120 		return KERN_INVALID_VALUE;
121 	}
122 
123 	if (size == 0) {
124 		panic("pmap_nest: size is invalid - %016llX", size);
125 	}
126 
127 	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
128 	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
129 	    VM_KERNEL_ADDRHIDE(va_start));
130 
131 	vaddr = (vm_map_offset_t)va_start;
132 	num_pde = size >> PDESHIFT;
133 
134 	PMAP_LOCK_EXCLUSIVE(subord);
135 
136 	subord->pm_shared = TRUE;
137 
138 	for (i = 0; i < num_pde;) {
139 		if (((vaddr & PDPTMASK) == 0) && (num_pde - i) >= NPDEPG) {
140 			npde = pmap64_pdpt(subord, vaddr);
141 
142 			while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
143 				PMAP_UNLOCK_EXCLUSIVE(subord);
144 				pmap_expand_pdpt(subord, vaddr, PMAP_EXPAND_OPTIONS_NONE);
145 				PMAP_LOCK_EXCLUSIVE(subord);
146 				npde = pmap64_pdpt(subord, vaddr);
147 			}
148 			*npde |= INTEL_PDPTE_NESTED;
149 			vaddr += NBPDPT;
150 			i += (uint32_t)NPDEPG;
151 		} else {
152 			npde = pmap_pde(subord, vaddr);
153 
154 			while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
155 				PMAP_UNLOCK_EXCLUSIVE(subord);
156 				pmap_expand(subord, vaddr, PMAP_EXPAND_OPTIONS_NONE);
157 				PMAP_LOCK_EXCLUSIVE(subord);
158 				npde = pmap_pde(subord, vaddr);
159 			}
160 			vaddr += NBPDE;
161 			i++;
162 		}
163 	}
164 
165 	PMAP_UNLOCK_EXCLUSIVE(subord);
166 
167 	vaddr = (vm_map_offset_t)va_start;
168 
169 	PMAP_LOCK_EXCLUSIVE(grand);
170 
171 	for (i = 0; i < num_pde;) {
172 		pd_entry_t tpde;
173 
174 		if (((vaddr & PDPTMASK) == 0) && ((num_pde - i) >= NPDEPG)) {
175 			npde = pmap64_pdpt(subord, vaddr);
176 			if (npde == 0) {
177 				panic("pmap_nest: no PDPT, subord %p nstart 0x%llx", subord, vaddr);
178 			}
179 			tpde = *npde;
180 			pde = pmap64_pdpt(grand, vaddr);
181 			if (0 == pde) {
182 				PMAP_UNLOCK_EXCLUSIVE(grand);
183 				pmap_expand_pml4(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
184 				PMAP_LOCK_EXCLUSIVE(grand);
185 				pde = pmap64_pdpt(grand, vaddr);
186 			}
187 			if (pde == 0) {
188 				panic("pmap_nest: no PDPT, grand  %p vaddr 0x%llx", grand, vaddr);
189 			}
190 			pmap_store_pte(FALSE, pde, tpde);
191 			vaddr += NBPDPT;
192 			i += (uint32_t) NPDEPG;
193 		} else {
194 			npde = pmap_pde(subord, vaddr);
195 			if (npde == 0) {
196 				panic("pmap_nest: no npde, subord %p vaddr 0x%llx", subord, vaddr);
197 			}
198 			tpde = *npde;
199 			pde = pmap_pde(grand, vaddr);
200 			if (0 == pde) {
201 				PMAP_UNLOCK_EXCLUSIVE(grand);
202 				pmap_expand_pdpt(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
203 				PMAP_LOCK_EXCLUSIVE(grand);
204 				pde = pmap_pde(grand, vaddr);
205 			}
206 
207 			if (pde == 0) {
208 				panic("pmap_nest: no pde, grand  %p vaddr 0x%llx", grand, vaddr);
209 			}
210 			vaddr += NBPDE;
211 			pmap_store_pte(FALSE, pde, tpde);
212 			i++;
213 		}
214 	}
215 
216 	PMAP_UNLOCK_EXCLUSIVE(grand);
217 
218 	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, KERN_SUCCESS);
219 
220 	return KERN_SUCCESS;
221 }
222 
/*
 *	kern_return_t pmap_unnest(grand, vaddr)
 *
 *	grand  = the pmap that we will un-nest subord from
 *	vaddr  = start of range in pmap to be unnested
 *
 *	Removes a pmap from another.  This is used to implement shared segments.
 */
231 
232 kern_return_t
pmap_unnest(pmap_t grand,addr64_t vaddr,uint64_t size)233 pmap_unnest(pmap_t grand, addr64_t vaddr, uint64_t size)
234 {
235 	pd_entry_t *pde;
236 	unsigned int i;
237 	uint64_t num_pde;
238 	addr64_t va_start, va_end;
239 	uint64_t npdpt = PMAP_INVALID_PDPTNUM;
240 
241 	PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
242 	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
243 
244 	if ((size & (pmap_shared_region_size_min(grand) - 1)) ||
245 	    (vaddr & (pmap_shared_region_size_min(grand) - 1))) {
246 		panic("pmap_unnest(%p,0x%llx,0x%llx): unaligned...",
247 		    grand, vaddr, size);
248 	}
249 
250 	assert(!is_ept_pmap(grand));
251 
252 	/* align everything to PDE boundaries */
253 	va_start = vaddr & ~(NBPDE - 1);
254 
255 	if (os_add_overflow(vaddr, size + NBPDE - 1, &va_end)) {
256 		panic("pmap_unnest: Overflow when calculating range end: s=0x%llx sz=0x%llx\n", vaddr, size);
257 	}
258 
259 	va_end &= ~(NBPDE - 1);
260 	size = va_end - va_start;
261 
262 	PMAP_LOCK_EXCLUSIVE(grand);
263 
264 	num_pde = size >> PDESHIFT;
265 	vaddr = va_start;
266 
267 	for (i = 0; i < num_pde;) {
268 		if (pdptnum(grand, vaddr) != npdpt) {
269 			npdpt = pdptnum(grand, vaddr);
270 			pde = pmap64_pdpt(grand, vaddr);
271 			if (pde && (*pde & INTEL_PDPTE_NESTED)) {
272 				pmap_store_pte(FALSE, pde, (pd_entry_t)0);
273 				i += (uint32_t) NPDEPG;
274 				vaddr += NBPDPT;
275 				continue;
276 			}
277 		}
278 		pde = pmap_pde(grand, (vm_map_offset_t)vaddr);
279 		if (pde == 0) {
280 			panic("pmap_unnest: no pde, grand %p vaddr 0x%llx", grand, vaddr);
281 		}
282 		pmap_store_pte(FALSE, pde, (pd_entry_t)0);
283 		i++;
284 		vaddr += NBPDE;
285 	}
286 
287 	PMAP_UPDATE_TLBS(grand, va_start, va_end);
288 
289 	PMAP_UNLOCK_EXCLUSIVE(grand);
290 
291 	PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
292 
293 	return KERN_SUCCESS;
294 }
295 
296 kern_return_t
pmap_unnest_options(pmap_t grand,addr64_t vaddr,__unused uint64_t size,__unused unsigned int options)297 pmap_unnest_options(
298 	pmap_t grand,
299 	addr64_t vaddr,
300 	__unused uint64_t size,
301 	__unused unsigned int options)
302 {
303 	return pmap_unnest(grand, vaddr, size);
304 }
305 
/* Invoked by the Mach VM to determine the platform specific unnest region */
307 
308 boolean_t
pmap_adjust_unnest_parameters(pmap_t p,vm_map_offset_t * s,vm_map_offset_t * e)309 pmap_adjust_unnest_parameters(pmap_t p, vm_map_offset_t *s, vm_map_offset_t *e)
310 {
311 	pd_entry_t *pdpte;
312 	boolean_t rval = FALSE;
313 
314 	PMAP_LOCK_EXCLUSIVE(p);
315 
316 	pdpte = pmap64_pdpt(p, *s);
317 	if (pdpte && (*pdpte & INTEL_PDPTE_NESTED)) {
318 		*s &= ~(NBPDPT - 1);
319 		rval = TRUE;
320 	}
321 
322 	pdpte = pmap64_pdpt(p, *e);
323 	if (pdpte && (*pdpte & INTEL_PDPTE_NESTED)) {
324 		*e = ((*e + NBPDPT) & ~(NBPDPT - 1));
325 		rval = TRUE;
326 	}
327 
328 	PMAP_UNLOCK_EXCLUSIVE(p);
329 
330 	return rval;
331 }
332 
333 pmap_paddr_t
pmap_find_pa(pmap_t pmap,addr64_t va)334 pmap_find_pa(pmap_t pmap, addr64_t va)
335 {
336 	pt_entry_t      *ptp;
337 	pd_entry_t      *pdep;
338 	pd_entry_t      pde;
339 	pt_entry_t      pte;
340 	boolean_t       is_ept, locked = FALSE;
341 	pmap_paddr_t    pa = 0;
342 
343 	is_ept = is_ept_pmap(pmap);
344 
345 	if ((pmap != kernel_pmap) && not_in_kdp) {
346 		PMAP_LOCK_EXCLUSIVE(pmap);
347 		locked = TRUE;
348 	} else {
349 		mp_disable_preemption();
350 	}
351 
352 	if (os_ref_get_count(&pmap->ref_count) == 0) {
353 		goto pfp_exit;
354 	}
355 
356 	pdep = pmap_pde(pmap, va);
357 
358 	if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & PTE_VALID_MASK(is_ept))) {
359 		if (pde & PTE_PS) {
360 			pa = pte_to_pa(pde) + (va & I386_LPGMASK);
361 		} else {
362 			ptp = pmap_pte(pmap, va);
363 			if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & PTE_VALID_MASK(is_ept)) != 0)) {
364 				pa = pte_to_pa(pte) + (va & PAGE_MASK);
365 			}
366 		}
367 	}
368 pfp_exit:
369 	if (locked) {
370 		PMAP_UNLOCK_EXCLUSIVE(pmap);
371 	} else {
372 		mp_enable_preemption();
373 	}
374 
375 	return pa;
376 }
377 
/*
 * pmap_find_phys returns the (4K) physical page number containing a
 * given virtual address in a given pmap.
 * Note that pmap_pte may return a pde if this virtual address is
 * mapped by a large page and this is taken into account in order
 * to return the correct page number in this case.
 */
385 ppnum_t
pmap_find_phys(pmap_t pmap,addr64_t va)386 pmap_find_phys(pmap_t pmap, addr64_t va)
387 {
388 	ppnum_t         ppn = 0;
389 	pmap_paddr_t    pa = 0;
390 
391 	pa = pmap_find_pa(pmap, va);
392 	ppn = (ppnum_t) i386_btop(pa);
393 
394 	return ppn;
395 }
396 
397 ppnum_t
pmap_find_phys_nofault(pmap_t pmap,addr64_t va)398 pmap_find_phys_nofault(pmap_t pmap, addr64_t va)
399 {
400 	if ((pmap == kernel_pmap) ||
401 	    ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map)))) {
402 		return pmap_find_phys(pmap, va);
403 	}
404 	return 0;
405 }
406 
/*
 *  pmap_get_prot returns the equivalent Vm page protections
 *  set on a given address, 'va'. This function is used in the
 *  ml_static_verify_page_protections() routine which is used
 *  by the kext loading code to validate that the TEXT segment
 *  of a kext is mapped executable.
 */
414 kern_return_t
pmap_get_prot(pmap_t pmap,addr64_t va,vm_prot_t * protp)415 pmap_get_prot(pmap_t pmap, addr64_t va, vm_prot_t *protp)
416 {
417 	pt_entry_t      *ptp;
418 	pd_entry_t      *pdep;
419 	pd_entry_t      pde;
420 	pt_entry_t      pte;
421 	boolean_t       is_ept, locked = FALSE;
422 	kern_return_t   retval = KERN_FAILURE;
423 	vm_prot_t       prot = 0;
424 
425 	is_ept = is_ept_pmap(pmap);
426 
427 	if ((pmap != kernel_pmap) && not_in_kdp) {
428 		PMAP_LOCK_EXCLUSIVE(pmap);
429 		locked = TRUE;
430 	} else {
431 		mp_disable_preemption();
432 	}
433 
434 	if (os_ref_get_count(&pmap->ref_count) == 0) {
435 		goto pfp_exit;
436 	}
437 
438 	pdep = pmap_pde(pmap, va);
439 
440 	if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & PTE_VALID_MASK(is_ept))) {
441 		if (pde & PTE_PS) {
442 			prot = VM_PROT_READ;
443 
444 			if (pde & PTE_WRITE(is_ept)) {
445 				prot |= VM_PROT_WRITE;
446 			}
447 			if (PTE_IS_EXECUTABLE(is_ept, pde)) {
448 				prot |= VM_PROT_EXECUTE;
449 			}
450 			retval = KERN_SUCCESS;
451 		} else {
452 			ptp = pmap_pte(pmap, va);
453 			if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & PTE_VALID_MASK(is_ept)) != 0)) {
454 				prot = VM_PROT_READ;
455 
456 				if (pte & PTE_WRITE(is_ept)) {
457 					prot |= VM_PROT_WRITE;
458 				}
459 				if (PTE_IS_EXECUTABLE(is_ept, pte)) {
460 					prot |= VM_PROT_EXECUTE;
461 				}
462 				retval = KERN_SUCCESS;
463 			}
464 		}
465 	}
466 
467 pfp_exit:
468 	if (locked) {
469 		PMAP_UNLOCK_EXCLUSIVE(pmap);
470 	} else {
471 		mp_enable_preemption();
472 	}
473 
474 	if (protp) {
475 		*protp = prot;
476 	}
477 
478 	return retval;
479 }
480 
/*
 * Update cache attributes for all extant managed mappings.
 * Assumes PV for this page is locked, and that the page
 * is managed. We assume that this physical page may be mapped in
 * both EPT and normal Intel PTEs, so we convert the attributes
 * to the corresponding format for each pmap.
 *
 * We assert that the passed set of attributes is a subset of the
 * PHYS_CACHEABILITY_MASK.
 */
491 void
pmap_update_cache_attributes_locked(ppnum_t pn,unsigned attributes)492 pmap_update_cache_attributes_locked(ppnum_t pn, unsigned attributes)
493 {
494 	pv_rooted_entry_t       pv_h, pv_e;
495 	pv_hashed_entry_t       pvh_e, nexth;
496 	vm_map_offset_t vaddr;
497 	pmap_t  pmap;
498 	pt_entry_t      *ptep;
499 	boolean_t       is_ept;
500 	unsigned        ept_attributes;
501 
502 	assert(IS_MANAGED_PAGE(pn));
503 	assert(((~PHYS_CACHEABILITY_MASK) & attributes) == 0);
504 
505 	/* We don't support the PAT bit for EPT PTEs */
506 	if (attributes & INTEL_PTE_NCACHE) {
507 		ept_attributes = INTEL_EPT_NCACHE;
508 	} else {
509 		ept_attributes = INTEL_EPT_WB;
510 	}
511 
512 	pv_h = pai_to_pvh(pn);
513 	/* TODO: translate the PHYS_* bits to PTE bits, while they're
514 	 * currently identical, they may not remain so
515 	 * Potential optimization (here and in page_protect),
516 	 * parallel shootdowns, check for redundant
517 	 * attribute modifications.
518 	 */
519 
520 	/*
521 	 * Alter attributes on all mappings
522 	 */
523 	if (pv_h->pmap != PMAP_NULL) {
524 		pv_e = pv_h;
525 		pvh_e = (pv_hashed_entry_t)pv_e;
526 
527 		do {
528 			pmap = pv_e->pmap;
529 			vaddr = PVE_VA(pv_e);
530 			ptep = pmap_pte(pmap, vaddr);
531 
532 			if (0 == ptep) {
533 				panic("pmap_update_cache_attributes_locked: Missing PTE, pmap: %p, pn: 0x%x vaddr: 0x%llx kernel_pmap: %p", pmap, pn, vaddr, kernel_pmap);
534 			}
535 
536 			is_ept = is_ept_pmap(pmap);
537 
538 			nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink);
539 			if (!is_ept) {
540 				pmap_update_pte(is_ept, ptep, PHYS_CACHEABILITY_MASK, attributes, true);
541 			} else {
542 				pmap_update_pte(is_ept, ptep, INTEL_EPT_CACHE_MASK, ept_attributes, true);
543 			}
544 			PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
545 			pvh_e = nexth;
546 		} while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h);
547 	}
548 }
549 
550 void
x86_filter_TLB_coherency_interrupts(boolean_t dofilter)551 x86_filter_TLB_coherency_interrupts(boolean_t dofilter)
552 {
553 	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
554 
555 	if (dofilter) {
556 		CPU_CR3_MARK_INACTIVE();
557 	} else {
558 		CPU_CR3_MARK_ACTIVE();
559 		mfence();
560 		pmap_update_interrupt();
561 	}
562 }
563 
564 
/*
 *	Insert the given physical page (p) at
 *	the specified virtual address (v) in the
 *	target physical map with the protection requested.
 *
 *	If specified, the page will be wired down, meaning
 *	that the related pte cannot be reclaimed.
 *
 *	NB:  This is the only routine which MAY NOT lazy-evaluate
 *	or lose information.  That is, this routine must actually
 *	insert this page into the given map NOW.
 */
577 
578 kern_return_t
pmap_enter(pmap_t pmap,vm_map_offset_t vaddr,ppnum_t pn,vm_prot_t prot,vm_prot_t fault_type,unsigned int flags,boolean_t wired)579 pmap_enter(
580 	pmap_t          pmap,
581 	vm_map_offset_t         vaddr,
582 	ppnum_t                 pn,
583 	vm_prot_t               prot,
584 	vm_prot_t               fault_type,
585 	unsigned int            flags,
586 	boolean_t               wired)
587 {
588 	return pmap_enter_options(pmap, vaddr, pn, prot, fault_type, flags, wired, PMAP_EXPAND_OPTIONS_NONE, NULL);
589 }
590 
591 #define PTE_LOCK(EPT) INTEL_PTE_SWLOCK
592 
593 static inline void PTE_LOCK_LOCK(pt_entry_t *);
594 static inline void PTE_LOCK_UNLOCK(pt_entry_t *);
595 
596 void
PTE_LOCK_LOCK(pt_entry_t * lpte)597 PTE_LOCK_LOCK(pt_entry_t *lpte)
598 {
599 	pt_entry_t pte;
600 plretry:
601 	while ((pte = __c11_atomic_load((_Atomic pt_entry_t *)lpte, memory_order_relaxed)) & PTE_LOCK(0)) {
602 		__builtin_ia32_pause();
603 	}
604 	if (__c11_atomic_compare_exchange_strong((_Atomic pt_entry_t *)lpte, &pte, pte | PTE_LOCK(0), memory_order_acquire_smp, TRUE)) {
605 		return;
606 	}
607 
608 	goto plretry;
609 }
610 
611 void
PTE_LOCK_UNLOCK(pt_entry_t * lpte)612 PTE_LOCK_UNLOCK(pt_entry_t *lpte)
613 {
614 	__c11_atomic_fetch_and((_Atomic pt_entry_t *)lpte, ~PTE_LOCK(0), memory_order_release_smp);
615 }
616 
617 kern_return_t
pmap_enter_options_addr(pmap_t pmap,vm_map_address_t v,pmap_paddr_t pa,vm_prot_t prot,vm_prot_t fault_type,unsigned int flags,boolean_t wired,unsigned int options,__unused void * arg)618 pmap_enter_options_addr(
619 	pmap_t pmap,
620 	vm_map_address_t v,
621 	pmap_paddr_t pa,
622 	vm_prot_t prot,
623 	vm_prot_t fault_type,
624 	unsigned int flags,
625 	boolean_t wired,
626 	unsigned int options,
627 	__unused void   *arg)
628 {
629 	return pmap_enter_options(pmap, v, intel_btop(pa), prot, fault_type, flags, wired, options, arg);
630 }
631 
632 kern_return_t
pmap_enter_options(pmap_t pmap,vm_map_offset_t vaddr,ppnum_t pn,vm_prot_t prot,__unused vm_prot_t fault_type,unsigned int flags,boolean_t wired,unsigned int options,void * arg)633 pmap_enter_options(
634 	pmap_t          pmap,
635 	vm_map_offset_t         vaddr,
636 	ppnum_t                 pn,
637 	vm_prot_t               prot,
638 	__unused vm_prot_t      fault_type,
639 	unsigned int            flags,
640 	boolean_t               wired,
641 	unsigned int            options,
642 	void                    *arg)
643 {
644 	pt_entry_t              *pte = NULL;
645 	pv_rooted_entry_t       pv_h;
646 	ppnum_t                 pai;
647 	pv_hashed_entry_t       pvh_e;
648 	pv_hashed_entry_t       pvh_new;
649 	pt_entry_t              template;
650 	pmap_paddr_t            old_pa;
651 	pmap_paddr_t            pa = (pmap_paddr_t) i386_ptob(pn);
652 	boolean_t               need_tlbflush = FALSE;
653 	boolean_t               set_NX;
654 	char                    oattr;
655 	boolean_t               old_pa_locked;
656 	/* 2MiB mappings are confined to x86_64 by VM */
657 	boolean_t               superpage = flags & VM_MEM_SUPERPAGE;
658 	vm_object_t             delpage_pm_obj = NULL;
659 	uint64_t                delpage_pde_index = 0;
660 	pt_entry_t              old_pte;
661 	kern_return_t           kr = KERN_FAILURE;
662 	boolean_t               is_ept;
663 	boolean_t               is_altacct;
664 	boolean_t               ptelocked = FALSE;
665 
666 	pmap_intr_assert();
667 
668 	if (__improbable(pmap == PMAP_NULL)) {
669 		return KERN_INVALID_ARGUMENT;
670 	}
671 	if (__improbable(pn == vm_page_guard_addr)) {
672 		return KERN_INVALID_ARGUMENT;
673 	}
674 
675 	is_ept = is_ept_pmap(pmap);
676 
677 	/* N.B. We can be supplied a zero page frame in the NOENTER case, it's an
678 	 * unused value for that scenario.
679 	 */
680 	assert(pn != vm_page_fictitious_addr);
681 
682 
683 	PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
684 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(vaddr), pn,
685 	    prot);
686 
687 	if ((prot & VM_PROT_EXECUTE) || __improbable(is_ept && (prot & VM_PROT_UEXEC))) {
688 		set_NX = FALSE;
689 	} else {
690 		set_NX = TRUE;
691 	}
692 
693 #if DEVELOPMENT || DEBUG
694 	if (__improbable(set_NX && (!nx_enabled || !pmap->nx_enabled))) {
695 		set_NX = FALSE;
696 	}
697 
698 	if (__improbable(set_NX && (pmap == kernel_pmap) &&
699 	    ((pmap_disable_kstack_nx && (flags & VM_MEM_STACK)) ||
700 	    (pmap_disable_kheap_nx && !(flags & VM_MEM_STACK))))) {
701 		set_NX = FALSE;
702 	}
703 #endif
704 
705 	pvh_new = PV_HASHED_ENTRY_NULL;
706 Retry:
707 	pvh_e = PV_HASHED_ENTRY_NULL;
708 
709 	PMAP_LOCK_SHARED(pmap);
710 
711 	/*
712 	 *	Expand pmap to include this pte.  Assume that
713 	 *	pmap is always expanded to include enough hardware
714 	 *	pages to map one VM page.
715 	 */
716 	if (__improbable(superpage)) {
717 		while ((pte = pmap_pde(pmap, vaddr)) == PD_ENTRY_NULL) {
718 			/* need room for another pde entry */
719 			PMAP_UNLOCK_SHARED(pmap);
720 			kr = pmap_expand_pdpt(pmap, vaddr, options);
721 			if (kr != KERN_SUCCESS) {
722 				goto done1;
723 			}
724 			PMAP_LOCK_SHARED(pmap);
725 		}
726 	} else {
727 		while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
728 			/*
729 			 * Must unlock to expand the pmap
730 			 * going to grow pde level page(s)
731 			 */
732 			PMAP_UNLOCK_SHARED(pmap);
733 			kr = pmap_expand(pmap, vaddr, options);
734 			if (kr != KERN_SUCCESS) {
735 				goto done1;
736 			}
737 			PMAP_LOCK_SHARED(pmap);
738 		}
739 	}
740 
741 	if (__improbable(options & PMAP_EXPAND_OPTIONS_NOENTER)) {
742 		PMAP_UNLOCK_SHARED(pmap);
743 		kr = KERN_SUCCESS;
744 		goto done1;
745 	}
746 
747 	if (__improbable(superpage && *pte && !(*pte & PTE_PS))) {
748 		/*
749 		 * There is still an empty page table mapped that
750 		 * was used for a previous base page mapping.
751 		 * Remember the PDE and the PDE index, so that we
752 		 * can free the page at the end of this function.
753 		 */
754 		delpage_pde_index = pdeidx(pmap, vaddr);
755 		delpage_pm_obj = pmap->pm_obj;
756 		pmap_store_pte(is_ept, pte, 0);
757 	}
758 
759 	PTE_LOCK_LOCK(pte);
760 	ptelocked = TRUE;
761 
762 	old_pa = pte_to_pa(*pte);
763 	pai = pa_index(old_pa);
764 	old_pa_locked = FALSE;
765 
766 	if (old_pa == 0 &&
767 	    PTE_IS_COMPRESSED(*pte, pte, pmap, vaddr)) {
768 		/*
769 		 * "pmap" should be locked at this point, so this should
770 		 * not race with another pmap_enter() or pmap_remove_range().
771 		 */
772 		assert(pmap != kernel_pmap);
773 
774 		/* one less "compressed" */
775 		pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
776 		    PAGE_SIZE);
777 		if (*pte & PTE_COMPRESSED_ALT) {
778 			pmap_ledger_debit(
779 				pmap,
780 				task_ledgers.alternate_accounting_compressed,
781 				PAGE_SIZE);
782 		} else {
783 			/* was part of the footprint */
784 			pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
785 			    PAGE_SIZE);
786 		}
787 		/* marker will be cleared below */
788 	}
789 
790 	/*
791 	 * if we have a previous managed page, lock the pv entry now. after
792 	 * we lock it, check to see if someone beat us to the lock and if so
793 	 * drop the lock
794 	 */
795 	if ((0 != old_pa) && IS_MANAGED_PAGE(pai)) {
796 		LOCK_PVH(pai);
797 		old_pa_locked = TRUE;
798 		old_pa = pte_to_pa(*pte);
799 		if (0 == old_pa) {
800 			UNLOCK_PVH(pai);        /* another path beat us to it */
801 			old_pa_locked = FALSE;
802 		}
803 	}
804 
805 	/*
806 	 *	Special case if the incoming physical page is already mapped
807 	 *	at this address.
808 	 */
809 	if (old_pa == pa) {
810 		pt_entry_t old_attributes =
811 		    *pte & ~(PTE_REF(is_ept) | PTE_MOD(is_ept) | PTE_LOCK(is_ept));
812 
813 		/*
814 		 *	May be changing its wired attribute or protection
815 		 */
816 
817 		template = pa_to_pte(pa);
818 
819 		if (__probable(!is_ept)) {
820 			template |= INTEL_PTE_VALID;
821 		} else {
822 			template |= INTEL_EPT_IPAT;
823 		}
824 
825 		template |= pmap_get_cache_attributes(pa_index(pa), is_ept);
826 
827 		/*
828 		 * We don't support passing VM_MEM_NOT_CACHEABLE flags for EPT PTEs
829 		 */
830 		if (!is_ept && (VM_MEM_NOT_CACHEABLE ==
831 		    (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)))) {
832 			if (!(flags & VM_MEM_GUARDED)) {
833 				template |= INTEL_PTE_PAT;
834 			}
835 			template |= INTEL_PTE_NCACHE;
836 		}
837 		if (pmap != kernel_pmap && !is_ept) {
838 			template |= INTEL_PTE_USER;
839 		}
840 
841 		if (prot & VM_PROT_READ) {
842 			template |= PTE_READ(is_ept);
843 		}
844 
845 		if (prot & VM_PROT_WRITE) {
846 			template |= PTE_WRITE(is_ept);
847 			if (is_ept && !pmap_ept_support_ad) {
848 				template |= PTE_MOD(is_ept);
849 				if (old_pa_locked) {
850 					assert(IS_MANAGED_PAGE(pai));
851 					pmap_phys_attributes[pai] |= PHYS_MODIFIED;
852 				}
853 			}
854 		}
855 
856 		if (prot & VM_PROT_EXECUTE) {
857 			assert(set_NX == 0);
858 			template = pte_set_ex(template, is_ept);
859 		}
860 
861 		if (__improbable(is_ept && (prot & VM_PROT_UEXEC))) {
862 			assert(set_NX == 0);
863 			template = pte_set_uex(template);
864 		}
865 
866 		if (set_NX) {
867 			template = pte_remove_ex(template, is_ept);
868 		}
869 
870 		if (wired) {
871 			template |= PTE_WIRED;
872 			if (!iswired(old_attributes)) {
873 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
874 			}
875 		} else {
876 			if (iswired(old_attributes)) {
877 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
878 			}
879 		}
880 
881 		if (superpage) {        /* this path can not be used */
882 			template |= PTE_PS;     /* to change the page size! */
883 		}
884 		if (old_attributes == template) {
885 			goto dont_update_pte;
886 		}
887 
888 		/* Determine delta, PV locked */
889 		need_tlbflush =
890 		    ((old_attributes ^ template) != PTE_WIRED);
891 
892 		/* Optimisation: avoid TLB flush when adding writability */
893 		if (need_tlbflush == TRUE && !(old_attributes & PTE_WRITE(is_ept))) {
894 			if ((old_attributes ^ template) == PTE_WRITE(is_ept)) {
895 				need_tlbflush = FALSE;
896 			}
897 		}
898 
899 		/* For hardware that doesn't have EPT AD support, we always set REFMOD for EPT PTEs */
900 		if (__improbable(is_ept && !pmap_ept_support_ad)) {
901 			template |= PTE_REF(is_ept);
902 			if (old_pa_locked) {
903 				assert(IS_MANAGED_PAGE(pai));
904 				pmap_phys_attributes[pai] |= PHYS_REFERENCED;
905 			}
906 		}
907 
908 		/* store modified PTE and preserve RC bits */
909 		pt_entry_t npte, opte;
910 
911 		assert((*pte & PTE_LOCK(is_ept)) != 0);
912 
913 		do {
914 			opte = *pte;
915 			npte = template | (opte & (PTE_REF(is_ept) |
916 			    PTE_MOD(is_ept))) | PTE_LOCK(is_ept);
917 		} while (!pmap_cmpx_pte(pte, opte, npte));
918 
919 		DTRACE_VM3(set_pte, uint64_t, vaddr, uint64_t, opte, uint64_t, npte);
920 
921 dont_update_pte:
922 		if (old_pa_locked) {
923 			UNLOCK_PVH(pai);
924 			old_pa_locked = FALSE;
925 		}
926 		goto done2;
927 	}
928 
929 	/*
930 	 *	Outline of code from here:
931 	 *	   1) If va was mapped, update TLBs, remove the mapping
932 	 *	      and remove old pvlist entry.
933 	 *	   2) Add pvlist entry for new mapping
934 	 *	   3) Enter new mapping.
935 	 *
936 	 *	If the old physical page is not managed step 1) is skipped
937 	 *	(except for updating the TLBs), and the mapping is
938 	 *	overwritten at step 3).  If the new physical page is not
939 	 *	managed, step 2) is skipped.
940 	 */
941 	/* TODO: add opportunistic refmod collect */
942 	if (old_pa != (pmap_paddr_t) 0) {
943 		boolean_t       was_altacct = FALSE;
944 
945 		/*
946 		 *	Don't do anything to pages outside valid memory here.
947 		 *	Instead convince the code that enters a new mapping
948 		 *	to overwrite the old one.
949 		 */
950 
951 		/* invalidate the PTE */
952 		pmap_update_pte(is_ept, pte, PTE_VALID_MASK(is_ept), 0, true);
953 		/* propagate invalidate everywhere */
954 		PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
955 		/* remember reference and change */
956 		old_pte = *pte;
957 		oattr = (char) (old_pte & (PTE_MOD(is_ept) | PTE_REF(is_ept)));
958 		/* completely invalidate the PTE */
959 		pmap_store_pte(is_ept, pte, PTE_LOCK(is_ept));
960 
961 		if (IS_MANAGED_PAGE(pai)) {
962 			/*
963 			 *	Remove the mapping from the pvlist for
964 			 *	this physical page.
965 			 *      We'll end up with either a rooted pv or a
966 			 *      hashed pv
967 			 */
968 			pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, &old_pte, &was_altacct);
969 		}
970 
971 		if (IS_MANAGED_PAGE(pai)) {
972 			pmap_assert(old_pa_locked == TRUE);
973 			pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
974 			if (pmap != kernel_pmap) {
975 				/* update ledgers */
976 				if (was_altacct) {
977 					assert(IS_INTERNAL_PAGE(pai));
978 					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
979 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
980 				} else if (IS_REUSABLE_PAGE(pai)) {
981 					assert(!was_altacct);
982 					assert(IS_INTERNAL_PAGE(pai));
983 					pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
984 					/* was already not in phys_footprint */
985 				} else if (IS_INTERNAL_PAGE(pai)) {
986 					assert(!was_altacct);
987 					assert(!IS_REUSABLE_PAGE(pai));
988 					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
989 					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
990 				} else {
991 					/* not an internal page */
992 					pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
993 				}
994 			}
995 			if (iswired(*pte)) {
996 				pmap_ledger_debit(pmap, task_ledgers.wired_mem,
997 				    PAGE_SIZE);
998 			}
999 
1000 			if (!is_ept) {
1001 				pmap_phys_attributes[pai] |= oattr;
1002 			} else {
1003 				pmap_phys_attributes[pai] |= ept_refmod_to_physmap(oattr);
1004 			}
1005 		} else {
1006 			/*
1007 			 *	old_pa is not managed.
1008 			 *	Do removal part of accounting.
1009 			 */
1010 
1011 			if (pmap != kernel_pmap) {
1012 #if 00
1013 				assert(pmap->stats.device > 0);
1014 				OSAddAtomic(-1, &pmap->stats.device);
1015 #endif
1016 			}
1017 			if (iswired(*pte)) {
1018 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
1019 			}
1020 		}
1021 	}
1022 
1023 	/*
1024 	 * if we had a previously managed paged locked, unlock it now
1025 	 */
1026 	if (old_pa_locked) {
1027 		UNLOCK_PVH(pai);
1028 		old_pa_locked = FALSE;
1029 	}
1030 
1031 	pai = pa_index(pa);     /* now working with new incoming phys page */
1032 	if (IS_MANAGED_PAGE(pai)) {
1033 		/*
1034 		 *	Step 2) Enter the mapping in the PV list for this
1035 		 *	physical page.
1036 		 */
1037 		pv_h = pai_to_pvh(pai);
1038 
1039 		LOCK_PVH(pai);
1040 
1041 		if (pv_h->pmap == PMAP_NULL) {
1042 			/*
1043 			 *	No mappings yet, use rooted pv
1044 			 */
1045 			pv_h->va_and_flags = vaddr;
1046 			pv_h->pmap = pmap;
1047 			queue_init(&pv_h->qlink);
1048 
1049 			if (options & PMAP_OPTIONS_INTERNAL) {
1050 				pmap_phys_attributes[pai] |= PHYS_INTERNAL;
1051 			} else {
1052 				pmap_phys_attributes[pai] &= ~PHYS_INTERNAL;
1053 			}
1054 			if (options & PMAP_OPTIONS_REUSABLE) {
1055 				pmap_phys_attributes[pai] |= PHYS_REUSABLE;
1056 			} else {
1057 				pmap_phys_attributes[pai] &= ~PHYS_REUSABLE;
1058 			}
1059 			if ((options & PMAP_OPTIONS_ALT_ACCT) &&
1060 			    IS_INTERNAL_PAGE(pai)) {
1061 				pv_h->va_and_flags |= PVE_IS_ALTACCT;
1062 				is_altacct = TRUE;
1063 			} else {
1064 				pv_h->va_and_flags &= ~PVE_IS_ALTACCT;
1065 				is_altacct = FALSE;
1066 			}
1067 		} else {
1068 			/*
1069 			 *	Add new pv_hashed_entry after header.
1070 			 */
1071 			if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
1072 				pvh_e = pvh_new;
1073 				pvh_new = PV_HASHED_ENTRY_NULL;
1074 			} else if (PV_HASHED_ENTRY_NULL == pvh_e) {
1075 				PV_HASHED_ALLOC(&pvh_e);
1076 				if (PV_HASHED_ENTRY_NULL == pvh_e) {
1077 					/*
1078 					 * the pv list is empty. if we are on
1079 					 * the kernel pmap we'll use one of
1080 					 * the special private kernel pv_e's,
1081 					 * else, we need to unlock
1082 					 * everything, zalloc a pv_e, and
1083 					 * restart bringing in the pv_e with
1084 					 * us.
1085 					 */
1086 					if (kernel_pmap == pmap) {
1087 						PV_HASHED_KERN_ALLOC(&pvh_e);
1088 					} else {
1089 						UNLOCK_PVH(pai);
1090 						PTE_LOCK_UNLOCK(pte);
1091 						PMAP_UNLOCK_SHARED(pmap);
1092 						pmap_pv_throttle(pmap);
1093 						pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
1094 						goto Retry;
1095 					}
1096 				}
1097 			}
1098 
1099 			if (PV_HASHED_ENTRY_NULL == pvh_e) {
1100 				panic("Mapping alias chain exhaustion, possibly induced by numerous kernel virtual double mappings");
1101 			}
1102 
1103 			pvh_e->va_and_flags = vaddr;
1104 			pvh_e->pmap = pmap;
1105 			pvh_e->ppn = pn;
1106 			if ((options & PMAP_OPTIONS_ALT_ACCT) &&
1107 			    IS_INTERNAL_PAGE(pai)) {
1108 				pvh_e->va_and_flags |= PVE_IS_ALTACCT;
1109 				is_altacct = TRUE;
1110 			} else {
1111 				pvh_e->va_and_flags &= ~PVE_IS_ALTACCT;
1112 				is_altacct = FALSE;
1113 			}
1114 			pv_hash_add(pvh_e, pv_h);
1115 
1116 			/*
1117 			 *	Remember that we used the pvlist entry.
1118 			 */
1119 			pvh_e = PV_HASHED_ENTRY_NULL;
1120 		}
1121 
1122 		/*
1123 		 * only count the mapping
1124 		 * for 'managed memory'
1125 		 */
1126 		pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
1127 		if (pmap != kernel_pmap) {
1128 			/* update ledgers */
1129 			if (is_altacct) {
1130 				/* internal but also alternate accounting */
1131 				assert(IS_INTERNAL_PAGE(pai));
1132 				pmap_ledger_credit(pmap, task_ledgers.internal, PAGE_SIZE);
1133 				pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
1134 				/* alternate accounting, so not in footprint */
1135 			} else if (IS_REUSABLE_PAGE(pai)) {
1136 				assert(!is_altacct);
1137 				assert(IS_INTERNAL_PAGE(pai));
1138 				pmap_ledger_credit(pmap, task_ledgers.reusable, PAGE_SIZE);
1139 				/* internal but reusable: not in footprint */
1140 			} else if (IS_INTERNAL_PAGE(pai)) {
1141 				assert(!is_altacct);
1142 				assert(!IS_REUSABLE_PAGE(pai));
1143 				/* internal: add to footprint */
1144 				pmap_ledger_credit(pmap, task_ledgers.internal, PAGE_SIZE);
1145 				pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
1146 			} else {
1147 				/* not internal: not in footprint */
1148 				pmap_ledger_credit(pmap, task_ledgers.external, PAGE_SIZE);
1149 			}
1150 		}
1151 	} else if (last_managed_page == 0) {
1152 		/* Account for early mappings created before "managed pages"
1153 		 * are determined. Consider consulting the available DRAM map.
1154 		 */
1155 		pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
1156 		if (pmap != kernel_pmap) {
1157 #if 00
1158 			OSAddAtomic(+1, &pmap->stats.device);
1159 			PMAP_STATS_PEAK(pmap->stats.device);
1160 #endif
1161 		}
1162 	}
1163 	/*
1164 	 * Step 3) Enter the mapping.
1165 	 *
1166 	 *	Build a template to speed up entering -
1167 	 *	only the pfn changes.
1168 	 */
1169 	template = pa_to_pte(pa);
1170 
1171 	if (!is_ept) {
1172 		template |= INTEL_PTE_VALID;
1173 	} else {
1174 		template |= INTEL_EPT_IPAT;
1175 	}
1176 
1177 	/*
1178 	 * DRK: It may be worth asserting on cache attribute flags that diverge
1179 	 * from the existing physical page attributes.
1180 	 */
1181 
1182 	template |= pmap_get_cache_attributes(pa_index(pa), is_ept);
1183 
1184 	/*
1185 	 * We don't support passing VM_MEM_NOT_CACHEABLE flags for EPT PTEs
1186 	 */
1187 	if (!is_ept && (flags & VM_MEM_NOT_CACHEABLE)) {
1188 		if (!(flags & VM_MEM_GUARDED)) {
1189 			template |= INTEL_PTE_PAT;
1190 		}
1191 		template |= INTEL_PTE_NCACHE;
1192 	}
1193 	if (pmap != kernel_pmap && !is_ept) {
1194 		template |= INTEL_PTE_USER;
1195 	}
1196 	if (prot & VM_PROT_READ) {
1197 		template |= PTE_READ(is_ept);
1198 	}
1199 	if (prot & VM_PROT_WRITE) {
1200 		template |= PTE_WRITE(is_ept);
1201 		if (is_ept && !pmap_ept_support_ad) {
1202 			template |= PTE_MOD(is_ept);
1203 			if (IS_MANAGED_PAGE(pai)) {
1204 				pmap_phys_attributes[pai] |= PHYS_MODIFIED;
1205 			}
1206 		}
1207 	}
1208 	if (prot & VM_PROT_EXECUTE) {
1209 		assert(set_NX == 0);
1210 		template = pte_set_ex(template, is_ept);
1211 	}
1212 	if (__improbable(is_ept && (prot & VM_PROT_UEXEC))) {
1213 		assert(set_NX == 0);
1214 		template = pte_set_uex(template);
1215 	}
1216 
1217 	if (set_NX) {
1218 		template = pte_remove_ex(template, is_ept);
1219 	}
1220 	if (wired) {
1221 		template |= INTEL_PTE_WIRED;
1222 		pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
1223 	}
1224 	if (__improbable(superpage)) {
1225 		template |= INTEL_PTE_PS;
1226 	}
1227 
1228 	/* For hardware that doesn't have EPT AD support, we always set REFMOD for EPT PTEs */
1229 	if (__improbable(is_ept && !pmap_ept_support_ad)) {
1230 		template |= PTE_REF(is_ept);
1231 		if (IS_MANAGED_PAGE(pai)) {
1232 			pmap_phys_attributes[pai] |= PHYS_REFERENCED;
1233 		}
1234 	}
1235 	template |= PTE_LOCK(is_ept);
1236 	pmap_store_pte(is_ept, pte, template);
1237 	DTRACE_VM3(set_pte, uint64_t, vaddr, uint64_t, 0, uint64_t, template);
1238 
1239 	/*
1240 	 * if this was a managed page we delayed unlocking the pv until here
1241 	 * to prevent pmap_page_protect et al from finding it until the pte
1242 	 * has been stored
1243 	 */
1244 	if (IS_MANAGED_PAGE(pai)) {
1245 		UNLOCK_PVH(pai);
1246 	}
1247 done2:
1248 	if (need_tlbflush == TRUE) {
1249 		if (options & PMAP_OPTIONS_NOFLUSH) {
1250 			PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
1251 		} else {
1252 			PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1253 		}
1254 	}
1255 	if (ptelocked) {
1256 		PTE_LOCK_UNLOCK(pte);
1257 	}
1258 	PMAP_UNLOCK_SHARED(pmap);
1259 
1260 	if (pvh_e != PV_HASHED_ENTRY_NULL) {
1261 		PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
1262 	}
1263 	if (pvh_new != PV_HASHED_ENTRY_NULL) {
1264 		PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
1265 	}
1266 
1267 	if (delpage_pm_obj) {
1268 		vm_page_t m;
1269 
1270 		vm_object_lock(delpage_pm_obj);
1271 		m = vm_page_lookup(delpage_pm_obj, (delpage_pde_index * PAGE_SIZE));
1272 		if (m == VM_PAGE_NULL) {
1273 			panic("pmap_enter: pte page not in object");
1274 		}
1275 		VM_PAGE_FREE(m);
1276 		vm_object_unlock(delpage_pm_obj);
1277 		OSAddAtomic(-1, &inuse_ptepages_count);
1278 		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
1279 	}
1280 
1281 	kr = KERN_SUCCESS;
1282 done1:
1283 	if (__improbable((kr == KERN_SUCCESS) && (pmap == kernel_pmap) &&
1284 	    zone_spans_ro_va(vaddr, vaddr + PAGE_SIZE))) {
1285 		pmap_page_protect((ppnum_t)atop_kernel(kvtophys(vaddr)), VM_PROT_READ);
1286 	}
1287 	PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
1288 	return kr;
1289 }
1290 
1291 /*
1292  *	Remove a range of hardware page-table entries.
1293  *	The entries given are the first (inclusive)
1294  *	and last (exclusive) entries for the VM pages.
1295  *	The virtual address is the va for the first pte.
1296  *
1297  *	The pmap must be locked.
1298  *	If the pmap is not the kernel pmap, the range must lie
1299  *	entirely within one pte-page.  This is NOT checked.
1300  *	Assumes that the pte-page exists.
1301  */
1302 
1303 void
pmap_remove_range(pmap_t pmap,vm_map_offset_t start_vaddr,pt_entry_t * spte,pt_entry_t * epte)1304 pmap_remove_range(
1305 	pmap_t                  pmap,
1306 	vm_map_offset_t         start_vaddr,
1307 	pt_entry_t              *spte,
1308 	pt_entry_t              *epte)
1309 {
1310 	pmap_remove_range_options(pmap, start_vaddr, spte, epte,
1311 	    PMAP_OPTIONS_REMOVE);
1312 }
1313 
/*
 * pmap_remove_range_options:
 *	Tear down the hardware PTE range [spte, epte) mapping virtual
 *	addresses starting at start_vaddr, honoring 'options'
 *	(PMAP_OPTIONS_REMOVE also accounts for and clears "compressed"
 *	markers), then settle all ledger state for the removed mappings.
 *
 *	Two-pass algorithm:
 *	 1) "freeze" pass: clear the valid bit of every live PTE and
 *	    shoot down TLBs, so no CPU can keep using or re-dirtying
 *	    the translations while we unlink PV state.
 *	 2) removal pass: under each page's PV lock, unlink the PV
 *	    entry, harvest ref/mod bits into pmap_phys_attributes[],
 *	    and zero the PTE completely.
 *
 *	Ledger debits are batched and applied once at the end.
 *	The pmap lock must be held exclusively by the caller (see the
 *	block comment above pmap_remove_range()).
 */
static void
pmap_remove_range_options(
	pmap_t                  pmap,
	vm_map_offset_t         start_vaddr,
	pt_entry_t              *spte,
	pt_entry_t              *epte,
	int                     options)
{
	pt_entry_t              *cpte;
	pv_hashed_entry_t       pvh_et = PV_HASHED_ENTRY_NULL;
	pv_hashed_entry_t       pvh_eh = PV_HASHED_ENTRY_NULL;
	pv_hashed_entry_t       pvh_e;
	int                     pvh_cnt = 0;
	int                     num_removed, num_unwired, num_found, num_invalid;
	int                     ledgers_external, ledgers_reusable, ledgers_internal, ledgers_alt_internal;
	uint64_t                ledgers_compressed, ledgers_alt_compressed;
	ppnum_t                 pai;
	pmap_paddr_t            pa;
	vm_map_offset_t         vaddr;
	boolean_t               is_ept = is_ept_pmap(pmap);
	boolean_t               was_altacct;

	num_removed = 0;
	num_unwired = 0;
	num_found   = 0;
	num_invalid = 0;
	ledgers_external = 0;
	ledgers_reusable = 0;
	ledgers_internal = 0;
	ledgers_compressed = 0;
	ledgers_alt_internal = 0;
	ledgers_alt_compressed = 0;

	/* invalidate the PTEs first to "freeze" them */
	for (cpte = spte, vaddr = start_vaddr;
	    cpte < epte;
	    cpte++, vaddr += PAGE_SIZE_64) {
		pt_entry_t p = *cpte;

		pa = pte_to_pa(p);
		if (pa == 0) {
			/* No physical page: may still carry a compressed marker. */
			if ((options & PMAP_OPTIONS_REMOVE) &&
			    (PTE_IS_COMPRESSED(p, cpte, pmap, vaddr))) {
				assert(pmap != kernel_pmap);
				/* one less "compressed"... */
				ledgers_compressed++;
				if (p & PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					ledgers_alt_compressed++;
				}
				/* clear marker(s) */
				/* XXX probably does not need to be atomic! */
				pmap_update_pte(is_ept, cpte, INTEL_PTE_COMPRESSED_MASK, 0, true);
			}
			continue;
		}
		num_found++;

		if (iswired(p)) {
			num_unwired++;
		}

		pai = pa_index(pa);

		if (!IS_MANAGED_PAGE(pai)) {
			/*
			 *	Outside range of managed physical memory.
			 *	Just remove the mappings.
			 */
			pmap_store_pte(is_ept, cpte, 0);
			continue;
		}

		if ((p & PTE_VALID_MASK(is_ept)) == 0) {
			num_invalid++;
		}

		/* invalidate the PTE */
		pmap_update_pte(is_ept, cpte, PTE_VALID_MASK(is_ept), 0, true);
	}

	if (num_found == 0) {
		/* nothing was changed: we're done */
		goto update_counts;
	}

	/* propagate the invalidates to other CPUs */

	PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr);

	/* Second pass: unlink PV entries and fully clear the PTEs. */
	for (cpte = spte, vaddr = start_vaddr;
	    cpte < epte;
	    cpte++, vaddr += PAGE_SIZE_64) {
		pa = pte_to_pa(*cpte);
		if (pa == 0) {
check_pte_for_compressed_marker:
			/*
			 * This PTE could have been replaced with a
			 * "compressed" marker after our first "freeze"
			 * loop above, so check again.
			 */
			if ((options & PMAP_OPTIONS_REMOVE) &&
			    (PTE_IS_COMPRESSED(*cpte, cpte, pmap, vaddr))) {
				assert(pmap != kernel_pmap);
				/* one less "compressed"... */
				ledgers_compressed++;
				if (*cpte & PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					ledgers_alt_compressed++;
				}
				pmap_store_pte(is_ept, cpte, 0);
			}
			continue;
		}

		pai = pa_index(pa);

		LOCK_PVH(pai);

		/* re-read under the PV lock: the PTE may have changed */
		pa = pte_to_pa(*cpte);
		if (pa == 0) {
			UNLOCK_PVH(pai);
			goto check_pte_for_compressed_marker;
		}

		/*
		 * Remove the mapping from the pvlist for this physical page.
		 */
		pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, cpte, &was_altacct);

		num_removed++;
		/* update ledgers */
		if (was_altacct) {
			/* internal and alternate accounting */
			assert(IS_INTERNAL_PAGE(pai));
			ledgers_internal++;
			ledgers_alt_internal++;
		} else if (IS_REUSABLE_PAGE(pai)) {
			/* internal but reusable */
			assert(!was_altacct);
			assert(IS_INTERNAL_PAGE(pai));
			ledgers_reusable++;
		} else if (IS_INTERNAL_PAGE(pai)) {
			/* internal */
			assert(!was_altacct);
			assert(!IS_REUSABLE_PAGE(pai));
			ledgers_internal++;
		} else {
			/* not internal */
			ledgers_external++;
		}

		/*
		 * Get the modify and reference bits, then
		 * nuke the entry in the page table
		 */
		/* remember reference and change */
		if (!is_ept) {
			pmap_phys_attributes[pai] |=
			    *cpte & (PHYS_MODIFIED | PHYS_REFERENCED);
		} else {
			pmap_phys_attributes[pai] |=
			    ept_refmod_to_physmap((*cpte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
		}

		/* completely invalidate the PTE */
		pmap_store_pte(is_ept, cpte, 0);

		UNLOCK_PVH(pai);

		/* chain the freed PV entry for bulk release below */
		if (pvh_e != PV_HASHED_ENTRY_NULL) {
			pvh_e->qlink.next = (queue_entry_t) pvh_eh;
			pvh_eh = pvh_e;

			if (pvh_et == PV_HASHED_ENTRY_NULL) {
				pvh_et = pvh_e;
			}
			pvh_cnt++;
		}
		/* We can encounter at most 'num_found' PTEs for this level
		 * Fewer may be encountered if some were replaced by
		 * compressed markers. No new valid PTEs can be created
		 * since the pmap lock is held exclusively.
		 */
		if (num_removed == num_found) {
			break;
		}
	} /* for loop */

	if (pvh_eh != PV_HASHED_ENTRY_NULL) {
		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
	}
update_counts:
	/*
	 *	Update the counts
	 */
#if TESTING
	if (pmap->stats.resident_count < num_removed) {
		panic("pmap_remove_range: resident_count");
	}
#endif
	if (num_removed) {
		pmap_ledger_debit(pmap, task_ledgers.phys_mem, machine_ptob(num_removed));
	}

	if (pmap != kernel_pmap) {
		if (ledgers_external) {
			pmap_ledger_debit(pmap,
			    task_ledgers.external,
			    machine_ptob(ledgers_external));
		}
		if (ledgers_reusable) {
			pmap_ledger_debit(pmap,
			    task_ledgers.reusable,
			    machine_ptob(ledgers_reusable));
		}
		if (ledgers_internal) {
			pmap_ledger_debit(pmap,
			    task_ledgers.internal,
			    machine_ptob(ledgers_internal));
		}
		if (ledgers_compressed) {
			pmap_ledger_debit(pmap,
			    task_ledgers.internal_compressed,
			    machine_ptob(ledgers_compressed));
		}
		if (ledgers_alt_internal) {
			pmap_ledger_debit(pmap,
			    task_ledgers.alternate_accounting,
			    machine_ptob(ledgers_alt_internal));
		}
		if (ledgers_alt_compressed) {
			pmap_ledger_debit(pmap,
			    task_ledgers.alternate_accounting_compressed,
			    machine_ptob(ledgers_alt_compressed));
		}

		/*
		 * ALTACCT pages were never counted in phys_footprint, so
		 * only the non-ALTACCT internal and compressed pages come
		 * back out of the footprint ledger here.
		 */
		uint64_t net_debit = (ledgers_internal - ledgers_alt_internal) + (ledgers_compressed - ledgers_alt_compressed);
		if (net_debit) {
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, machine_ptob(net_debit));
		}
	}

	if (num_unwired != 0) {
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, machine_ptob(num_unwired));
	}
	return;
}
1562 
1563 
1564 /*
1565  *	Remove the given range of addresses
1566  *	from the specified map.
1567  *
1568  *	It is assumed that the start and end are properly
1569  *	rounded to the hardware page size.
1570  */
1571 void
pmap_remove(pmap_t map,addr64_t s64,addr64_t e64)1572 pmap_remove(
1573 	pmap_t          map,
1574 	addr64_t        s64,
1575 	addr64_t        e64)
1576 {
1577 	pmap_remove_options(map, s64, e64, PMAP_OPTIONS_REMOVE);
1578 }
/*
 * Number of PDE-sized chunks pmap_remove_options() traverses before it
 * starts checking the preemption-latency deadline (and, if exceeded,
 * briefly dropping the pmap lock).
 */
#define PLCHECK_THRESHOLD (2)
1580 
/*
 * pmap_remove_options:
 *	Remove all mappings in the virtual range [s64, e64) from 'map',
 *	honoring 'options'.  Walks the paging hierarchy top-down,
 *	skipping whole PML4/PDPT holes, and hands each present
 *	PDE-sized chunk (or single superpage PDE) to
 *	pmap_remove_range_options().  Periodically drops the exclusive
 *	pmap lock to bound preemption latency for other waiters.
 */
void
pmap_remove_options(
	pmap_t          map,
	addr64_t        s64,
	addr64_t        e64,
	int             options)
{
	pt_entry_t     *pde;
	pt_entry_t     *spte, *epte;
	addr64_t        l64;
	uint64_t        deadline = 0;
	boolean_t       is_ept;

	pmap_intr_assert();

	if (map == PMAP_NULL || s64 == e64) {
		return;
	}

	is_ept = is_ept_pmap(map);

	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(map), VM_KERNEL_ADDRHIDE(s64),
	    VM_KERNEL_ADDRHIDE(e64));

	PMAP_LOCK_EXCLUSIVE(map);
	uint32_t traverse_count = 0;

	while (s64 < e64) {
		/* Skip over an entire absent PML4 slot in one step. */
		pml4_entry_t *pml4e = pmap64_pml4(map, s64);
		if ((pml4e == NULL) ||
		    ((*pml4e & PTE_VALID_MASK(is_ept)) == 0)) {
			if (os_add_overflow(s64, NBPML4, &s64)) {
				/* wrap; clip s64 to e64 */
				s64 = e64;
				break;
			}
			s64 &= ~(PML4MASK);
			continue;
		}
		/* Likewise skip an absent PDPT slot. */
		pdpt_entry_t *pdpte = pmap64_pdpt(map, s64);
		if ((pdpte == NULL) ||
		    ((*pdpte & PTE_VALID_MASK(is_ept)) == 0)) {
			if (os_add_overflow(s64, NBPDPT, &s64)) {
				/* wrap; clip s64 to e64 */
				s64 = e64;
				break;
			}
			s64 &= ~(PDPTMASK);
			continue;
		}

		/* l64 = end of this chunk: next PDE boundary, clipped to e64. */
		if (os_add_overflow(s64, PDE_MAPPED_SIZE, &l64)) {
			l64 = e64;
		} else {
			l64 &= ~(PDE_MAPPED_SIZE - 1);

			if (l64 > e64) {
				l64 = e64;
			}
		}

		pde = pmap_pde(map, s64);

		if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
			if (*pde & PTE_PS) {
				/*
				 * If we're removing a superpage, pmap_remove_range()
				 * must work on level 2 instead of level 1; and we're
				 * only passing a single level 2 entry instead of a
				 * level 1 range.
				 */
				spte = pde;
				epte = spte + 1; /* excluded */
			} else {
				spte = pmap_pte(map, (s64 & ~(PDE_MAPPED_SIZE - 1)));
				spte = &spte[ptenum(s64)];
				epte = &spte[intel_btop(l64 - s64)];
			}
			pmap_remove_range_options(map, s64, spte, epte,
			    options);
		}
		s64 = l64;

		/*
		 * After PLCHECK_THRESHOLD chunks, start watching the TSC;
		 * once past the latency deadline, briefly drop and retake
		 * the lock so other threads can make progress.
		 */
		if ((s64 < e64) && (traverse_count++ > PLCHECK_THRESHOLD)) {
			if (deadline == 0) {
				deadline = rdtsc64_nofence() + max_preemption_latency_tsc;
			} else {
				if (rdtsc64_nofence() > deadline) {
					PMAP_UNLOCK_EXCLUSIVE(map);
					__builtin_ia32_pause();
					PMAP_LOCK_EXCLUSIVE(map);
					deadline = rdtsc64_nofence() + max_preemption_latency_tsc;
				}
			}
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(map);

	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}
1683 
1684 void
pmap_page_protect(ppnum_t pn,vm_prot_t prot)1685 pmap_page_protect(
1686 	ppnum_t         pn,
1687 	vm_prot_t       prot)
1688 {
1689 	pmap_page_protect_options(pn, prot, 0, NULL);
1690 }
1691 
1692 /*
1693  *	Routine:	pmap_page_protect_options
1694  *
1695  *	Function:
1696  *		Lower the permission for all mappings to a given
1697  *		page.
1698  */
1699 void
pmap_page_protect_options(ppnum_t pn,vm_prot_t prot,unsigned int options,void * arg)1700 pmap_page_protect_options(
1701 	ppnum_t         pn,
1702 	vm_prot_t       prot,
1703 	unsigned int    options,
1704 	void            *arg)
1705 {
1706 	pv_hashed_entry_t       pvh_eh = PV_HASHED_ENTRY_NULL;
1707 	pv_hashed_entry_t       pvh_et = PV_HASHED_ENTRY_NULL;
1708 	pv_hashed_entry_t       nexth;
1709 	int                     pvh_cnt = 0;
1710 	pv_rooted_entry_t       pv_h;
1711 	pv_rooted_entry_t       pv_e;
1712 	pv_hashed_entry_t       pvh_e;
1713 	pt_entry_t              *pte;
1714 	int                     pai;
1715 	pmap_t                  pmap;
1716 	boolean_t               remove;
1717 	pt_entry_t              new_pte_value;
1718 	boolean_t               is_ept;
1719 
1720 	pmap_intr_assert();
1721 	assert(pn != vm_page_fictitious_addr);
1722 	if (pn == vm_page_guard_addr) {
1723 		return;
1724 	}
1725 
1726 	pai = ppn_to_pai(pn);
1727 
1728 	if (!IS_MANAGED_PAGE(pai)) {
1729 		/*
1730 		 *	Not a managed page.
1731 		 */
1732 		return;
1733 	}
1734 
1735 	PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, pn, prot);
1736 
1737 	/*
1738 	 * Determine the new protection.
1739 	 */
1740 	switch (prot) {
1741 	case VM_PROT_READ:
1742 	case VM_PROT_READ | VM_PROT_EXECUTE:
1743 		remove = FALSE;
1744 		break;
1745 	case VM_PROT_ALL:
1746 		return;         /* nothing to do */
1747 	default:
1748 		remove = TRUE;
1749 		break;
1750 	}
1751 
1752 	pv_h = pai_to_pvh(pai);
1753 
1754 	LOCK_PVH(pai);
1755 
1756 
1757 	/*
1758 	 * Walk down PV list, if any, changing or removing all mappings.
1759 	 */
1760 	if (pv_h->pmap == PMAP_NULL) {
1761 		goto done;
1762 	}
1763 
1764 	pv_e = pv_h;
1765 	pvh_e = (pv_hashed_entry_t) pv_e;       /* cheat */
1766 
1767 	do {
1768 		vm_map_offset_t vaddr;
1769 
1770 		if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED) &&
1771 		    (pmap_phys_attributes[pai] & PHYS_MODIFIED)) {
1772 			/* page was modified, so it will be compressed */
1773 			options &= ~PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1774 			options |= PMAP_OPTIONS_COMPRESSOR;
1775 		}
1776 
1777 		pmap = pv_e->pmap;
1778 		is_ept = is_ept_pmap(pmap);
1779 		vaddr = PVE_VA(pv_e);
1780 		pte = pmap_pte(pmap, vaddr);
1781 
1782 		pmap_assert2((pa_index(pte_to_pa(*pte)) == pn),
1783 		    "pmap_page_protect: PTE mismatch, pn: 0x%x, pmap: %p, vaddr: 0x%llx, pte: 0x%llx", pn, pmap, vaddr, *pte);
1784 
1785 		if (0 == pte) {
1786 			panic("pmap_page_protect() "
1787 			    "pmap=%p pn=0x%x vaddr=0x%llx\n",
1788 			    pmap, pn, vaddr);
1789 		}
1790 		nexth = (pv_hashed_entry_t) queue_next(&pvh_e->qlink);
1791 
1792 		/*
1793 		 * Remove the mapping if new protection is NONE
1794 		 */
1795 		if (remove) {
1796 			/* Remove per-pmap wired count */
1797 			if (iswired(*pte)) {
1798 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
1799 			}
1800 
1801 			if (pmap != kernel_pmap &&
1802 			    (options & PMAP_OPTIONS_COMPRESSOR) &&
1803 			    IS_INTERNAL_PAGE(pai)) {
1804 				assert(!PTE_IS_COMPRESSED(*pte, pte, pmap, vaddr));
1805 				/* mark this PTE as having been "compressed" */
1806 				new_pte_value = PTE_COMPRESSED;
1807 				if (IS_ALTACCT_PAGE(pai, pv_e)) {
1808 					new_pte_value |= PTE_COMPRESSED_ALT;
1809 				}
1810 			} else {
1811 				new_pte_value = 0;
1812 			}
1813 
1814 			if (options & PMAP_OPTIONS_NOREFMOD) {
1815 				pmap_store_pte(is_ept, pte, new_pte_value);
1816 
1817 				if (options & PMAP_OPTIONS_NOFLUSH) {
1818 					PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
1819 				} else {
1820 					PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1821 				}
1822 			} else {
1823 				/*
1824 				 * Remove the mapping, collecting dirty bits.
1825 				 */
1826 				pmap_update_pte(is_ept, pte, PTE_VALID_MASK(is_ept), 0, true);
1827 
1828 				PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1829 				if (!is_ept) {
1830 					pmap_phys_attributes[pai] |=
1831 					    *pte & (PHYS_MODIFIED | PHYS_REFERENCED);
1832 				} else {
1833 					pmap_phys_attributes[pai] |=
1834 					    ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
1835 				}
1836 				if ((options &
1837 				    PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED) &&
1838 				    IS_INTERNAL_PAGE(pai) &&
1839 				    (pmap_phys_attributes[pai] &
1840 				    PHYS_MODIFIED)) {
1841 					/*
1842 					 * Page is actually "modified" and
1843 					 * will be compressed.  Start
1844 					 * accounting for it as "compressed".
1845 					 */
1846 					assert(!(options & PMAP_OPTIONS_COMPRESSOR));
1847 					options &= ~PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1848 					options |= PMAP_OPTIONS_COMPRESSOR;
1849 					assert(new_pte_value == 0);
1850 					if (pmap != kernel_pmap) {
1851 						new_pte_value = PTE_COMPRESSED;
1852 						if (IS_ALTACCT_PAGE(pai, pv_e)) {
1853 							new_pte_value |= PTE_COMPRESSED_ALT;
1854 						}
1855 					}
1856 				}
1857 				pmap_store_pte(is_ept, pte, new_pte_value);
1858 			}
1859 
1860 #if TESTING
1861 			if (pmap->stats.resident_count < 1) {
1862 				panic("pmap_page_protect: resident_count");
1863 			}
1864 #endif
1865 			pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
1866 
1867 			/*
1868 			 * We only ever compress internal pages.
1869 			 */
1870 			if (options & PMAP_OPTIONS_COMPRESSOR) {
1871 				assert(IS_INTERNAL_PAGE(pai));
1872 			}
1873 			if (pmap != kernel_pmap) {
1874 				/* update ledgers */
1875 				if (IS_ALTACCT_PAGE(pai, pv_e)) {
1876 					assert(IS_INTERNAL_PAGE(pai));
1877 					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
1878 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
1879 					if (options & PMAP_OPTIONS_COMPRESSOR) {
1880 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
1881 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, PAGE_SIZE);
1882 					}
1883 				} else if (IS_REUSABLE_PAGE(pai)) {
1884 					assert(!IS_ALTACCT_PAGE(pai, pv_e));
1885 					assert(IS_INTERNAL_PAGE(pai));
1886 					if (options & PMAP_OPTIONS_COMPRESSOR) {
1887 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
1888 						/* was not in footprint, but is now */
1889 						pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
1890 					}
1891 					pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
1892 				} else if (IS_INTERNAL_PAGE(pai)) {
1893 					assert(!IS_ALTACCT_PAGE(pai, pv_e));
1894 					assert(!IS_REUSABLE_PAGE(pai));
1895 					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
1896 					/*
1897 					 * Update all stats related to physical
1898 					 * footprint, which only deals with
1899 					 * internal pages.
1900 					 */
1901 					if (options & PMAP_OPTIONS_COMPRESSOR) {
1902 						/*
1903 						 * This removal is only being
1904 						 * done so we can send this page
1905 						 * to the compressor;  therefore
1906 						 * it mustn't affect total task
1907 						 * footprint.
1908 						 */
1909 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
1910 					} else {
1911 						/*
1912 						 * This internal page isn't
1913 						 * going to the compressor,
1914 						 * so adjust stats to keep
1915 						 * phys_footprint up to date.
1916 						 */
1917 						pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
1918 					}
1919 				} else {
1920 					pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
1921 				}
1922 			}
1923 
1924 			/*
1925 			 * Deal with the pv_rooted_entry.
1926 			 */
1927 
1928 			if (pv_e == pv_h) {
1929 				/*
1930 				 * Fix up head later.
1931 				 */
1932 				pv_h->pmap = PMAP_NULL;
1933 			} else {
1934 				/*
1935 				 * Delete this entry.
1936 				 */
1937 				pv_hash_remove(pvh_e);
1938 				pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1939 				pvh_eh = pvh_e;
1940 
1941 				if (pvh_et == PV_HASHED_ENTRY_NULL) {
1942 					pvh_et = pvh_e;
1943 				}
1944 				pvh_cnt++;
1945 			}
1946 		} else {
1947 			/*
1948 			 * Write-protect, after opportunistic refmod collect
1949 			 */
1950 			if (!is_ept) {
1951 				pmap_phys_attributes[pai] |=
1952 				    *pte & (PHYS_MODIFIED | PHYS_REFERENCED);
1953 			} else {
1954 				pmap_phys_attributes[pai] |=
1955 				    ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
1956 			}
1957 
1958 			pmap_update_pte(is_ept, pte, PTE_WRITE(is_ept), 0, true);
1959 			if (options & PMAP_OPTIONS_NOFLUSH) {
1960 				PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
1961 			} else {
1962 				PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1963 			}
1964 		}
1965 		pvh_e = nexth;
1966 	} while ((pv_e = (pv_rooted_entry_t) nexth) != pv_h);
1967 
1968 
1969 	/*
1970 	 * If pv_head mapping was removed, fix it up.
1971 	 */
1972 	if (pv_h->pmap == PMAP_NULL) {
1973 		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
1974 
1975 		if (pvh_e != (pv_hashed_entry_t) pv_h) {
1976 			pv_hash_remove(pvh_e);
1977 			pv_h->pmap = pvh_e->pmap;
1978 			pv_h->va_and_flags = pvh_e->va_and_flags;
1979 			pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1980 			pvh_eh = pvh_e;
1981 
1982 			if (pvh_et == PV_HASHED_ENTRY_NULL) {
1983 				pvh_et = pvh_e;
1984 			}
1985 			pvh_cnt++;
1986 		}
1987 	}
1988 	if (pvh_eh != PV_HASHED_ENTRY_NULL) {
1989 		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
1990 	}
1991 done:
1992 	UNLOCK_PVH(pai);
1993 
1994 	PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
1995 }
1996 
1997 
1998 /*
1999  *	Clear specified attribute bits.
2000  */
/*
 * phys_attribute_clear: clear the requested PHYS_REFERENCED/PHYS_MODIFIED
 * attribute bits for physical page "pn", both in the page's cached global
 * attribute byte and in every PTE currently mapping the page (found by
 * walking its PV list).  Also handles PMAP_OPTIONS_CLEAR_WRITE and the
 * CLEAR/SET_REUSABLE options, adjusting per-task ledgers for the
 * internal/external/reusable transitions.
 *
 *	pn:	physical page number; guard and unmanaged pages are ignored.
 *	bits:	mask restricted to PHYS_REFERENCED | PHYS_MODIFIED.
 *	options: PMAP_OPTIONS_* flags controlling flushing and reusability.
 *	arg:	pmap_flush_context for delayed TLB invalidation when
 *		PMAP_OPTIONS_NOFLUSH is set; must be non-NULL whenever the
 *		modified bit is cleared with NOFLUSH (enforced by panic below).
 */
void
phys_attribute_clear(
	ppnum_t         pn,
	int             bits,
	unsigned int    options,
	void            *arg)
{
	pv_rooted_entry_t       pv_h;
	pv_hashed_entry_t       pv_e;
	pt_entry_t              *pte = NULL;
	int                     pai;
	pmap_t                  pmap;
	char                    attributes = 0;
	boolean_t               is_internal, is_reusable, is_altacct, is_ept;
	int                     ept_bits_to_clear;
	boolean_t               ept_keep_global_mod = FALSE;

	if ((bits & PHYS_MODIFIED) &&
	    (options & PMAP_OPTIONS_NOFLUSH) &&
	    arg == NULL) {
		panic("phys_attribute_clear(0x%x,0x%x,0x%x,%p): "
		    "should not clear 'modified' without flushing TLBs\n",
		    pn, bits, options, arg);
	}

	/* We only support converting MOD and REF bits for EPT PTEs in this function */
	assert((bits & ~(PHYS_REFERENCED | PHYS_MODIFIED)) == 0);

	/* Translate the physmap-format ref/mod mask to its EPT PTE encoding. */
	ept_bits_to_clear = (unsigned)physmap_refmod_to_ept(bits & (PHYS_MODIFIED | PHYS_REFERENCED));

	pmap_intr_assert();
	assert(pn != vm_page_fictitious_addr);
	if (pn == vm_page_guard_addr) {
		return;
	}

	pai = ppn_to_pai(pn);

	if (!IS_MANAGED_PAGE(pai)) {
		/*
		 *	Not a managed page.
		 */
		return;
	}

	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

	pv_h = pai_to_pvh(pai);

	LOCK_PVH(pai);


	/*
	 * Walk down PV list, clearing all modify or reference bits.
	 * We do not have to lock the pv_list because we have
	 * the per-pmap lock
	 */
	if (pv_h->pmap != PMAP_NULL) {
		/*
		 * There are some mappings.
		 */

		is_internal = IS_INTERNAL_PAGE(pai);
		is_reusable = IS_REUSABLE_PAGE(pai);

		pv_e = (pv_hashed_entry_t)pv_h;

		do {
			vm_map_offset_t va;
			char pte_bits;

			pmap = pv_e->pmap;
			is_ept = is_ept_pmap(pmap);
			is_altacct = IS_ALTACCT_PAGE(pai, pv_e);
			va = PVE_VA(pv_e);
			pte_bits = 0;

			if (bits) {
				pte = pmap_pte(pmap, va);
				/* grab ref/mod bits from this PTE */
				pte_bits = (*pte & (PTE_REF(is_ept) | PTE_MOD(is_ept)));
				/* propagate to page's global attributes */
				if (!is_ept) {
					attributes |= pte_bits;
				} else {
					attributes |= ept_refmod_to_physmap(pte_bits);
					if (!pmap_ept_support_ad && (pte_bits & INTEL_EPT_MOD)) {
						/*
						 * No hardware A/D support for EPT: a
						 * set MOD bit here must survive in the
						 * global attributes (see the
						 * ept_keep_global_mod handling below).
						 */
						ept_keep_global_mod = TRUE;
					}
				}
				/* which bits to clear for this PTE? */
				if (!is_ept) {
					pte_bits &= bits;
				} else {
					pte_bits &= ept_bits_to_clear;
				}
			}
			if (options & PMAP_OPTIONS_CLEAR_WRITE) {
				pte_bits |= PTE_WRITE(is_ept);
			}

			/*
			 * Clear modify and/or reference bits.
			 */
			if (pte_bits) {
				pmap_update_pte(is_ept, pte, pte_bits, 0, true);

				/* Ensure all processors using this translation
				 * invalidate this TLB entry. The invalidation
				 * *must* follow the PTE update, to ensure that
				 * the TLB shadow of the 'D' bit (in particular)
				 * is synchronized with the updated PTE.
				 */
				if (!(options & PMAP_OPTIONS_NOFLUSH)) {
					/* flush TLBS now */
					PMAP_UPDATE_TLBS(pmap,
					    va,
					    va + PAGE_SIZE);
				} else if (arg) {
					/* delayed TLB flush: add "pmap" info */
					PMAP_UPDATE_TLBS_DELAYED(
						pmap,
						va,
						va + PAGE_SIZE,
						(pmap_flush_context *)arg);
				} else {
					/* no TLB flushing at all */
				}
			}

			/* update pmap "reusable" stats */
			if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
			    is_reusable &&
			    pmap != kernel_pmap) {
				/* one less "reusable" */
				pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
				if (is_internal) {
					/* one more "internal" */
					if (is_altacct) {
						/* no impact on ledgers */
					} else {
						pmap_ledger_credit(pmap,
						    task_ledgers.internal,
						    PAGE_SIZE);
						pmap_ledger_credit(
							pmap,
							task_ledgers.phys_footprint,
							PAGE_SIZE);
					}
				} else {
					/* one more "external" */
					pmap_ledger_credit(pmap, task_ledgers.external, PAGE_SIZE);
				}
			} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
			    !is_reusable &&
			    pmap != kernel_pmap) {
				/* one more "reusable" */
				pmap_ledger_credit(pmap, task_ledgers.reusable, PAGE_SIZE);
				if (is_internal) {
					/* one less "internal" */
					if (is_altacct) {
						/* no impact on footprint */
					} else {
						pmap_ledger_debit(pmap,
						    task_ledgers.internal,
						    PAGE_SIZE);
						pmap_ledger_debit(
							pmap,
							task_ledgers.phys_footprint,
							PAGE_SIZE);
					}
				} else {
					/* one less "external" */
					pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
				}
			}

			pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
		} while (pv_e != (pv_hashed_entry_t)pv_h);
	}
	/* Opportunistic refmod collection, annulled
	 * if both REF and MOD are being cleared.
	 */

	pmap_phys_attributes[pai] |= attributes;

	if (ept_keep_global_mod) {
		/*
		 * If the hardware doesn't support AD bits for EPT PTEs and someone is
		 * requesting that we clear the modified bit for a phys page, we need
		 * to ensure that there are no EPT mappings for the page with the
		 * modified bit set. If there are, we cannot clear the global modified bit.
		 */
		bits &= ~PHYS_MODIFIED;
	}
	pmap_phys_attributes[pai] &= ~(bits);

	/* update this page's "reusable" status */
	if (options & PMAP_OPTIONS_CLEAR_REUSABLE) {
		pmap_phys_attributes[pai] &= ~PHYS_REUSABLE;
	} else if (options & PMAP_OPTIONS_SET_REUSABLE) {
		pmap_phys_attributes[pai] |= PHYS_REUSABLE;
	}

	UNLOCK_PVH(pai);

	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
2209 
2210 /*
2211  *	Check specified attribute bits.
2212  */
/*
 * phys_attribute_test: report which of the requested attribute bits
 * (PHYS_MODIFIED | PHYS_REFERENCED) are set for physical page "pn".
 * Consults the cached global attributes first; only if they do not
 * already satisfy "bits" does it take the PV lock and gather ref/mod
 * state from each mapping's PTE.  Any bits found are folded back into
 * pmap_phys_attributes[] so future fast checks succeed.
 * Returns the subset of "bits" found set (0 for guard/unmanaged pages).
 */
int
phys_attribute_test(
	ppnum_t         pn,
	int             bits)
{
	pv_rooted_entry_t       pv_h;
	pv_hashed_entry_t       pv_e;
	pt_entry_t              *pte;
	int                     pai;
	pmap_t                  pmap;
	int                     attributes = 0;
	boolean_t               is_ept;

	pmap_intr_assert();
	assert(pn != vm_page_fictitious_addr);
	assert((bits & ~(PHYS_MODIFIED | PHYS_REFERENCED)) == 0);
	if (pn == vm_page_guard_addr) {
		return 0;
	}

	pai = ppn_to_pai(pn);

	if (!IS_MANAGED_PAGE(pai)) {
		/*
		 *	Not a managed page.
		 */
		return 0;
	}

	/*
	 * Fast check...  if bits already collected
	 * no need to take any locks...
	 * if not set, we need to recheck after taking
	 * the lock in case they got pulled in while
	 * we were waiting for the lock
	 */
	if ((pmap_phys_attributes[pai] & bits) == bits) {
		return bits;
	}

	pv_h = pai_to_pvh(pai);

	LOCK_PVH(pai);

	attributes = pmap_phys_attributes[pai] & bits;


	/*
	 * Walk down PV list, checking the mappings until we
	 * reach the end or we've found the desired attributes.
	 */
	if (attributes != bits &&
	    pv_h->pmap != PMAP_NULL) {
		/*
		 * There are some mappings.
		 */
		pv_e = (pv_hashed_entry_t)pv_h;
		do {
			vm_map_offset_t va;

			pmap = pv_e->pmap;
			is_ept = is_ept_pmap(pmap);
			va = PVE_VA(pv_e);
			/*
			 * pick up modify and/or reference bits from mapping
			 */

			pte = pmap_pte(pmap, va);
			if (!is_ept) {
				attributes |= (int)(*pte & bits);
			} else {
				/* EPT encodes ref/mod differently; translate to physmap form. */
				attributes |= (int)(ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED));
			}

			pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
		} while ((attributes != bits) &&
		    (pv_e != (pv_hashed_entry_t)pv_h));
	}
	/* Cache the collected bits so the lock-free fast path can hit. */
	pmap_phys_attributes[pai] |= attributes;

	UNLOCK_PVH(pai);
	return attributes;
}
2296 
2297 /*
2298  *	Routine:	pmap_change_wiring
2299  *	Function:	Change the wiring attribute for a map/virtual-address
2300  *			pair.
2301  *	In/out conditions:
2302  *			The mapping must already exist in the pmap.
2303  */
2304 void
pmap_change_wiring(pmap_t map,vm_map_offset_t vaddr,boolean_t wired)2305 pmap_change_wiring(
2306 	pmap_t          map,
2307 	vm_map_offset_t vaddr,
2308 	boolean_t       wired)
2309 {
2310 	pt_entry_t      *pte;
2311 
2312 	PMAP_LOCK_SHARED(map);
2313 
2314 	if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL) {
2315 		panic("pmap_change_wiring(%p,0x%llx,%d): pte missing",
2316 		    map, vaddr, wired);
2317 	}
2318 
2319 	if (wired && !iswired(*pte)) {
2320 		/*
2321 		 * wiring down mapping
2322 		 */
2323 		pmap_ledger_credit(map, task_ledgers.wired_mem, PAGE_SIZE);
2324 		pmap_update_pte(is_ept_pmap(map), pte, 0, PTE_WIRED, false);
2325 	} else if (!wired && iswired(*pte)) {
2326 		/*
2327 		 * unwiring mapping
2328 		 */
2329 		pmap_ledger_debit(map, task_ledgers.wired_mem, PAGE_SIZE);
2330 		pmap_update_pte(is_ept_pmap(map), pte, PTE_WIRED, 0, false);
2331 	}
2332 
2333 	PMAP_UNLOCK_SHARED(map);
2334 }
2335 
2336 /*
2337  *	"Backdoor" direct map routine for early mappings.
 *      Useful for mapping physical memory outside the managed range (e.g., devices).
2339  *      Sets A, D and NC if requested
2340  */
2341 
2342 vm_offset_t
pmap_map_bd(vm_offset_t virt,vm_map_offset_t start_addr,vm_map_offset_t end_addr,vm_prot_t prot,unsigned int flags)2343 pmap_map_bd(
2344 	vm_offset_t     virt,
2345 	vm_map_offset_t start_addr,
2346 	vm_map_offset_t end_addr,
2347 	vm_prot_t       prot,
2348 	unsigned int    flags)
2349 {
2350 	pt_entry_t      template;
2351 	pt_entry_t      *ptep;
2352 
2353 	vm_offset_t     base = virt;
2354 	boolean_t       doflush = FALSE;
2355 
2356 	template = pa_to_pte(start_addr)
2357 	    | INTEL_PTE_REF
2358 	    | INTEL_PTE_MOD
2359 	    | INTEL_PTE_WIRED
2360 	    | INTEL_PTE_VALID;
2361 
2362 	if ((flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)) == VM_MEM_NOT_CACHEABLE) {
2363 		template |= INTEL_PTE_NCACHE;
2364 		if (!(flags & (VM_MEM_GUARDED))) {
2365 			template |= INTEL_PTE_PAT;
2366 		}
2367 	}
2368 
2369 	if ((prot & VM_PROT_EXECUTE) == 0) {
2370 		template |= INTEL_PTE_NX;
2371 	}
2372 
2373 	if (prot & VM_PROT_WRITE) {
2374 		template |= INTEL_PTE_WRITE;
2375 	}
2376 	vm_map_offset_t caddr = start_addr;
2377 	while (caddr < end_addr) {
2378 		ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)virt);
2379 		if (ptep == PT_ENTRY_NULL) {
2380 			panic("pmap_map_bd: Invalid kernel address");
2381 		}
2382 		if (pte_to_pa(*ptep)) {
2383 			doflush = TRUE;
2384 		}
2385 		pmap_store_pte(FALSE, ptep, template);
2386 		pte_increment_pa(template);
2387 		virt += PAGE_SIZE;
2388 		caddr += PAGE_SIZE;
2389 	}
2390 	if (doflush) {
2391 		pmap_tlbi_range(0, ~0ULL, true, 0);
2392 		PMAP_UPDATE_TLBS(kernel_pmap, base, base + end_addr - start_addr);
2393 	}
2394 	return virt;
2395 }
2396 
2397 /* Create a virtual alias beginning at 'ava' of the specified kernel virtual
2398  * range. The aliased pagetable range is expanded if
2399  * PMAP_EXPAND_OPTIONS_ALIASMAP is specified. Performs no synchronization,
2400  * assumes caller has stabilized the source and destination ranges. Currently
2401  * used to populate sections of the trampoline "doublemap" at CPU startup.
2402  */
2403 
2404 void
pmap_alias(vm_offset_t ava,vm_map_offset_t start_addr,vm_map_offset_t end_addr,vm_prot_t prot,unsigned int eoptions)2405 pmap_alias(
2406 	vm_offset_t     ava,
2407 	vm_map_offset_t start_addr,
2408 	vm_map_offset_t end_addr,
2409 	vm_prot_t       prot,
2410 	unsigned int    eoptions)
2411 {
2412 	pt_entry_t      prot_template, template;
2413 	pt_entry_t      *aptep, *sptep;
2414 
2415 	prot_template =  INTEL_PTE_REF | INTEL_PTE_MOD | INTEL_PTE_WIRED | INTEL_PTE_VALID;
2416 	if ((prot & VM_PROT_EXECUTE) == 0) {
2417 		prot_template |= INTEL_PTE_NX;
2418 	}
2419 
2420 	if (prot & VM_PROT_WRITE) {
2421 		prot_template |= INTEL_PTE_WRITE;
2422 	}
2423 	assert(((start_addr | end_addr) & PAGE_MASK) == 0);
2424 	while (start_addr < end_addr) {
2425 		aptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ava);
2426 		if (aptep == PT_ENTRY_NULL) {
2427 			if (eoptions & PMAP_EXPAND_OPTIONS_ALIASMAP) {
2428 				pmap_expand(kernel_pmap, ava, PMAP_EXPAND_OPTIONS_ALIASMAP);
2429 				aptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ava);
2430 			} else {
2431 				panic("pmap_alias: Invalid alias address");
2432 			}
2433 		}
2434 		/* The aliased range should not have any active mappings */
2435 		assert(pte_to_pa(*aptep) == 0);
2436 
2437 		sptep = pmap_pte(kernel_pmap, start_addr);
2438 		assert(sptep != PT_ENTRY_NULL && (pte_to_pa(*sptep) != 0));
2439 		template = pa_to_pte(pte_to_pa(*sptep)) | prot_template;
2440 		pmap_store_pte(FALSE, aptep, template);
2441 
2442 		ava += PAGE_SIZE;
2443 		start_addr += PAGE_SIZE;
2444 	}
2445 }
2446 
/*
 * pmap_query_resident: count resident (and optionally compressed) bytes
 * in the virtual range [s64, e64) of "pmap".  Returns the resident byte
 * count; stores the compressed byte count through compressed_bytes_p
 * when it is non-NULL.  The kernel pmap, a NULL pmap, or an empty range
 * reports zero.  The pmap lock is periodically dropped so a long scan
 * does not hold off preemption indefinitely.
 */
mach_vm_size_t
pmap_query_resident(
	pmap_t          pmap,
	addr64_t        s64,
	addr64_t        e64,
	mach_vm_size_t  *compressed_bytes_p)
{
	pt_entry_t     *pde;
	pt_entry_t     *spte, *epte;
	addr64_t        l64;
	uint64_t        deadline = 0;
	mach_vm_size_t  resident_bytes;
	mach_vm_size_t  compressed_bytes;
	boolean_t       is_ept;

	pmap_intr_assert();

	if (pmap == PMAP_NULL || pmap == kernel_pmap || s64 == e64) {
		if (compressed_bytes_p) {
			*compressed_bytes_p = 0;
		}
		return 0;
	}

	is_ept = is_ept_pmap(pmap);

	PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(s64),
	    VM_KERNEL_ADDRHIDE(e64));

	resident_bytes = 0;
	compressed_bytes = 0;

	PMAP_LOCK_EXCLUSIVE(pmap);
	uint32_t traverse_count = 0;

	while (s64 < e64) {
		/*
		 * l64 = end of the current PDE-mapped chunk, clipped to e64;
		 * the overflow check guards address arithmetic at the top of
		 * the address space.
		 */
		if (os_add_overflow(s64, PDE_MAPPED_SIZE, &l64)) {
			l64 = e64;
		} else {
			l64 &= ~(PDE_MAPPED_SIZE - 1);

			if (l64 > e64) {
				l64 = e64;
			}
		}

		pde = pmap_pde(pmap, s64);

		if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
			if (*pde & PTE_PS) {
				/* superpage: not supported */
			} else {
				/* Scan the PTEs covering [s64, l64). */
				spte = pmap_pte(pmap,
				    (s64 & ~(PDE_MAPPED_SIZE - 1)));
				spte = &spte[ptenum(s64)];
				epte = &spte[intel_btop(l64 - s64)];

				for (; spte < epte; spte++) {
					if (pte_to_pa(*spte) != 0) {
						resident_bytes += PAGE_SIZE;
					} else if (*spte & PTE_COMPRESSED) {
						compressed_bytes += PAGE_SIZE;
					}
				}
			}
		}
		s64 = l64;

		/*
		 * After enough chunks, start checking a TSC deadline and
		 * briefly drop the lock when it expires, bounding the time
		 * the pmap lock is held continuously.
		 */
		if ((s64 < e64) && (traverse_count++ > PLCHECK_THRESHOLD)) {
			if (deadline == 0) {
				deadline = rdtsc64() + max_preemption_latency_tsc;
			} else {
				if (rdtsc64() > deadline) {
					PMAP_UNLOCK_EXCLUSIVE(pmap);
					__builtin_ia32_pause();
					PMAP_LOCK_EXCLUSIVE(pmap);
					deadline = rdtsc64() + max_preemption_latency_tsc;
				}
			}
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(pmap);

	PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
	    resident_bytes);

	if (compressed_bytes_p) {
		*compressed_bytes_p = compressed_bytes;
	}
	return resident_bytes;
}
2540 
2541 kern_return_t
pmap_query_page_info(pmap_t pmap,vm_map_offset_t va,int * disp_p)2542 pmap_query_page_info(
2543 	pmap_t          pmap,
2544 	vm_map_offset_t va,
2545 	int             *disp_p)
2546 {
2547 	int             disp;
2548 	boolean_t       is_ept;
2549 	pmap_paddr_t    pa;
2550 	ppnum_t         pai;
2551 	pd_entry_t      *pde;
2552 	pt_entry_t      *pte;
2553 
2554 	pmap_intr_assert();
2555 	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
2556 		*disp_p = 0;
2557 		return KERN_INVALID_ARGUMENT;
2558 	}
2559 
2560 	disp = 0;
2561 	is_ept = is_ept_pmap(pmap);
2562 
2563 	PMAP_LOCK_EXCLUSIVE(pmap);
2564 
2565 	pde = pmap_pde(pmap, va);
2566 	if (!pde ||
2567 	    !(*pde & PTE_VALID_MASK(is_ept)) ||
2568 	    (*pde & PTE_PS)) {
2569 		goto done;
2570 	}
2571 
2572 	pte = pmap_pte(pmap, va);
2573 	if (pte == PT_ENTRY_NULL) {
2574 		goto done;
2575 	}
2576 
2577 	pa = pte_to_pa(*pte);
2578 	if (pa == 0) {
2579 		if (PTE_IS_COMPRESSED(*pte, pte, pmap, va)) {
2580 			disp |= PMAP_QUERY_PAGE_COMPRESSED;
2581 			if (*pte & PTE_COMPRESSED_ALT) {
2582 				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
2583 			}
2584 		}
2585 	} else {
2586 		disp |= PMAP_QUERY_PAGE_PRESENT;
2587 		pai = pa_index(pa);
2588 		if (!IS_MANAGED_PAGE(pai)) {
2589 		} else if (pmap_pv_is_altacct(pmap, va, pai)) {
2590 			assert(IS_INTERNAL_PAGE(pai));
2591 			disp |= PMAP_QUERY_PAGE_INTERNAL;
2592 			disp |= PMAP_QUERY_PAGE_ALTACCT;
2593 		} else if (IS_REUSABLE_PAGE(pai)) {
2594 			disp |= PMAP_QUERY_PAGE_REUSABLE;
2595 		} else if (IS_INTERNAL_PAGE(pai)) {
2596 			disp |= PMAP_QUERY_PAGE_INTERNAL;
2597 		}
2598 	}
2599 
2600 done:
2601 	PMAP_UNLOCK_EXCLUSIVE(pmap);
2602 	*disp_p = disp;
2603 	return KERN_SUCCESS;
2604 }
2605 
/*
 * Record whether this pmap's VM map has code-signing enforcement
 * enabled, under the exclusive pmap lock.
 */
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
	PMAP_LOCK_EXCLUSIVE(pmap);
	pmap->pm_vm_map_cs_enforced = new_value;
	PMAP_UNLOCK_EXCLUSIVE(pmap);
}
2615 extern int cs_process_enforcement_enable;
2616 bool
pmap_get_vm_map_cs_enforced(pmap_t pmap)2617 pmap_get_vm_map_cs_enforced(
2618 	pmap_t pmap)
2619 {
2620 	if (cs_process_enforcement_enable) {
2621 		return true;
2622 	}
2623 	return pmap->pm_vm_map_cs_enforced;
2624 }
2625 
2626 void
pmap_set_jit_entitled(__unused pmap_t pmap)2627 pmap_set_jit_entitled(__unused pmap_t pmap)
2628 {
2629 	/* The x86 pmap layer does not care if a map has a JIT entry. */
2630 	return;
2631 }
2632 
/* Always false: the x86 pmap layer keeps no JIT state to report. */
bool
pmap_get_jit_entitled(__unused pmap_t pmap)
{
	/* The x86 pmap layer does not care if a map is using JIT. */
	return false;
}
2639 
/*
 * Always false: no protection value is subject to a pmap-level policy
 * on x86, so callers may apply any protection unchanged.
 */
bool
pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
{
	/*
	 * The x86 pmap layer does not apply any policy to any protection
	 * types.
	 */
	return false;
}
2649 
/*
 * Fast-path page reclamation is not implemented on x86; always reports
 * that zero pages were released.
 */
uint64_t
pmap_release_pages_fast(void)
{
	return 0;
}
2655 
2656 void
pmap_trim(__unused pmap_t grand,__unused pmap_t subord,__unused addr64_t vstart,__unused uint64_t size)2657 pmap_trim(__unused pmap_t grand, __unused pmap_t subord, __unused addr64_t vstart, __unused uint64_t size)
2658 {
2659 	return;
2660 }
2661 
/*
 * Pmap-managed ledgers are unsupported on x86; any call is a
 * programming error and panics (never returns).
 */
__dead2
void
pmap_ledger_verify_size(size_t size)
{
	panic("%s: unsupported, "
	    "size=%lu",
	    __func__, size);
}
2670 
/*
 * Pmap-managed ledger allocation is unsupported on x86; panics (never
 * returns).
 */
__dead2
ledger_t
pmap_ledger_alloc(void)
{
	panic("%s: unsupported",
	    __func__);
}
2678 
/*
 * Pmap-managed ledger release is unsupported on x86; panics (never
 * returns).
 */
__dead2
void
pmap_ledger_free(ledger_t ledger)
{
	panic("%s: unsupported, "
	    "ledger=%p",
	    __func__, ledger);
}
2687 
/*
 * Page-table dumping is not implemented for x86; always returns
 * KERN_NOT_SUPPORTED without touching the output buffer.
 */
kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}
2694 
/*
 * Return a kernel virtual address through which the compressor can
 * access managed page "pn".  Uses the physmap direct mapping, so no new
 * mapping is created.
 */
void *
pmap_map_compressor_page(ppnum_t pn)
{
	assertf(IS_MANAGED_PAGE(ppn_to_pai(pn)), "%s called on non-managed page 0x%08x", __func__, pn);
	return PHYSMAP_PTOV((uint64_t)pn << (uint64_t)PAGE_SHIFT);
}
2701 
/*
 * Counterpart of pmap_map_compressor_page: the physmap mapping is
 * permanent, so there is nothing to tear down.
 */
void
pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
{
}
2706 
/*
 * Ranged ref/mod clearing is not implemented on x86; always returns
 * false (i.e. the range was not handled here).
 */
bool
pmap_clear_refmod_range_options(
	pmap_t pmap __unused,
	vm_map_address_t start __unused,
	vm_map_address_t end __unused,
	unsigned int mask __unused,
	unsigned int options __unused)
{
	/*
	 * x86 doesn't have ranged tlbi instructions, and we already have
	 * the pmap_flush_context. This operation isn't implemented.
	 */
	return false;
}
2721 
2722 bool
pmap_supported_feature(pmap_t pmap,pmap_feature_flags_t feat)2723 pmap_supported_feature(pmap_t pmap, pmap_feature_flags_t feat)
2724 {
2725 	switch (feat) {
2726 	case PMAP_FEAT_UEXEC:
2727 		return pmap != NULL && is_ept_pmap(pmap);
2728 	default:
2729 		return false;
2730 	}
2731 }
2732