/*
 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <mach_assert.h>

#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <kern/ledger.h>
#include <kern/zalloc_internal.h>
#include <i386/pmap_internal.h>

void            pmap_remove_range(
	pmap_t          pmap,
	vm_map_offset_t va,
	pt_entry_t      *spte,
	pt_entry_t      *epte);

static void            pmap_remove_range_options(
	pmap_t          pmap,
	vm_map_offset_t va,
	pt_entry_t      *spte,
	pt_entry_t      *epte,
	int             options);

void            pmap_reusable_range(
	pmap_t          pmap,
	vm_map_offset_t va,
	pt_entry_t      *spte,
	pt_entry_t      *epte,
	boolean_t       reusable);

pt_entry_t *PTE_corrupted_ptr;

#if DEVELOPMENT || DEBUG
int pmap_inject_pte_corruption;
uint32_t pmap_update_clear_pte_count;
uint32_t pmap_update_invalid_pte_count;
#endif

/*
 * The Intel platform can nest at the PDE level, i.e. NBPDE (2MB) at a time,
 * on an NBPDE boundary.
 */

uint64_t
pmap_shared_region_size_min(__unused pmap_t pmap)
{
	return NBPDE;
}

uint64_t
pmap_commpage_size_min(__unused pmap_t pmap)
{
	return NBPDE;
}

/*
 *	kern_return_t pmap_nest(grand, subord, va_start, size)
 *
 *	grand  = the pmap that we will nest subord into
 *	subord = the pmap that goes into the grand
 *	va_start  = start of range in pmap to be inserted
 *	size   = Size of nest area (up to 16TB)
 *
 *	Inserts a pmap into another.  This is used to implement shared segments.
 *
 *	Note that we depend upon higher-level VM locks to ensure that things don't change while
 *	we are doing this.  For example, the VM should not be doing any pmap enters while it is
 *	nesting, nor performing two nests at once.
 */

/*
 * This routine can nest subtrees either at the PDPT level (1GiB) or at the
 * PDE level (2MiB). We currently disallow disparate offsets for the "subord"
 * container and the "grand" parent. A minor optimization to consider for the
 * future: make the "subord" truly a container rather than a full-fledged
 * pagetable hierarchy which can be unnecessarily sparse (DRK).
 */

kern_return_t
pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, uint64_t size)
{
	vm_map_offset_t vaddr;
	pd_entry_t      *pde, *npde;
	unsigned int    i;
	uint64_t        num_pde;

	assert(!is_ept_pmap(grand));
	assert(!is_ept_pmap(subord));

	if ((size & (pmap_shared_region_size_min(grand) - 1)) ||
	    (va_start & (pmap_shared_region_size_min(grand) - 1)) ||
	    ((size >> 28) > 65536)) {   /* Max size we can nest is 16TB */
		return KERN_INVALID_VALUE;
	}

	if (size == 0) {
		panic("pmap_nest: size is invalid - %016llX", size);
	}

	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(va_start));

	vaddr = (vm_map_offset_t)va_start;
	num_pde = size >> PDESHIFT;

	PMAP_LOCK_EXCLUSIVE(subord);

	subord->pm_shared = TRUE;

	for (i = 0; i < num_pde;) {
		if (((vaddr & PDPTMASK) == 0) && (num_pde - i) >= NPDEPG) {
			npde = pmap64_pdpt(subord, vaddr);

			while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
				PMAP_UNLOCK_EXCLUSIVE(subord);
				pmap_expand_pdpt(subord, vaddr, PMAP_EXPAND_OPTIONS_NONE);
				PMAP_LOCK_EXCLUSIVE(subord);
				npde = pmap64_pdpt(subord, vaddr);
			}
			*npde |= INTEL_PDPTE_NESTED;
			vaddr += NBPDPT;
			i += (uint32_t)NPDEPG;
		} else {
			npde = pmap_pde(subord, vaddr);

			while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
				PMAP_UNLOCK_EXCLUSIVE(subord);
				pmap_expand(subord, vaddr, PMAP_EXPAND_OPTIONS_NONE);
				PMAP_LOCK_EXCLUSIVE(subord);
				npde = pmap_pde(subord, vaddr);
			}
			vaddr += NBPDE;
			i++;
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(subord);

	vaddr = (vm_map_offset_t)va_start;

	PMAP_LOCK_EXCLUSIVE(grand);

	for (i = 0; i < num_pde;) {
		pd_entry_t tpde;

		if (((vaddr & PDPTMASK) == 0) && ((num_pde - i) >= NPDEPG)) {
			npde = pmap64_pdpt(subord, vaddr);
			if (npde == 0) {
				panic("pmap_nest: no PDPT, subord %p nstart 0x%llx", subord, vaddr);
			}
			tpde = *npde;
			pde = pmap64_pdpt(grand, vaddr);
			if (0 == pde) {
				PMAP_UNLOCK_EXCLUSIVE(grand);
				pmap_expand_pml4(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
				PMAP_LOCK_EXCLUSIVE(grand);
				pde = pmap64_pdpt(grand, vaddr);
			}
			if (pde == 0) {
				panic("pmap_nest: no PDPT, grand  %p vaddr 0x%llx", grand, vaddr);
			}
			pmap_store_pte(FALSE, pde, tpde);
			vaddr += NBPDPT;
			i += (uint32_t) NPDEPG;
		} else {
			npde = pmap_pde(subord, vaddr);
			if (npde == 0) {
				panic("pmap_nest: no npde, subord %p vaddr 0x%llx", subord, vaddr);
			}
			tpde = *npde;
			pde = pmap_pde(grand, vaddr);
			if (0 == pde) {
				PMAP_UNLOCK_EXCLUSIVE(grand);
				pmap_expand_pdpt(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
				PMAP_LOCK_EXCLUSIVE(grand);
				pde = pmap_pde(grand, vaddr);
			}

			if (pde == 0) {
				panic("pmap_nest: no pde, grand  %p vaddr 0x%llx", grand, vaddr);
			}
			vaddr += NBPDE;
			pmap_store_pte(FALSE, pde, tpde);
			i++;
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(grand);

	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, KERN_SUCCESS);

	return KERN_SUCCESS;
}
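
/*
 * Illustrative sketch (not compiled): how a VM-layer caller might nest a
 * shared-region pmap into a task's pmap. `task_pmap', `shared_pmap',
 * `NEST_BASE' and `NEST_SIZE' are hypothetical; both values must be
 * aligned to pmap_shared_region_size_min(), i.e. NBPDE (2MiB) here.
 */
#if 0
	assert((NEST_BASE & (NBPDE - 1)) == 0);
	assert((NEST_SIZE & (NBPDE - 1)) == 0);
	kern_return_t kr = pmap_nest(task_pmap, shared_pmap,
	    (addr64_t)NEST_BASE, (uint64_t)NEST_SIZE);
	assert(kr == KERN_SUCCESS);
#endif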

/*
 *	kern_return_t pmap_unnest(grand, vaddr, size)
 *
 *	grand  = the pmap that we will un-nest subord from
 *	vaddr  = start of range in pmap to be unnested
 *	size   = size of the range to be unnested
 *
 *	Removes a pmap from another.  This is used to implement shared segments.
 */

kern_return_t
pmap_unnest(pmap_t grand, addr64_t vaddr, uint64_t size)
{
	pd_entry_t *pde;
	unsigned int i;
	uint64_t num_pde;
	addr64_t va_start, va_end;
	uint64_t npdpt = PMAP_INVALID_PDPTNUM;

	PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));

	if ((size & (pmap_shared_region_size_min(grand) - 1)) ||
	    (vaddr & (pmap_shared_region_size_min(grand) - 1))) {
		panic("pmap_unnest(%p,0x%llx,0x%llx): unaligned...",
		    grand, vaddr, size);
	}

	assert(!is_ept_pmap(grand));

	/* align everything to PDE boundaries */
	va_start = vaddr & ~(NBPDE - 1);
	va_end = (vaddr + size + NBPDE - 1) & ~(NBPDE - 1);
	size = va_end - va_start;

	PMAP_LOCK_EXCLUSIVE(grand);

	num_pde = size >> PDESHIFT;
	vaddr = va_start;

	for (i = 0; i < num_pde;) {
		if (pdptnum(grand, vaddr) != npdpt) {
			npdpt = pdptnum(grand, vaddr);
			pde = pmap64_pdpt(grand, vaddr);
			if (pde && (*pde & INTEL_PDPTE_NESTED)) {
				pmap_store_pte(FALSE, pde, (pd_entry_t)0);
				i += (uint32_t) NPDEPG;
				vaddr += NBPDPT;
				continue;
			}
		}
		pde = pmap_pde(grand, (vm_map_offset_t)vaddr);
		if (pde == 0) {
			panic("pmap_unnest: no pde, grand %p vaddr 0x%llx", grand, vaddr);
		}
		pmap_store_pte(FALSE, pde, (pd_entry_t)0);
		i++;
		vaddr += NBPDE;
	}

	PMAP_UPDATE_TLBS(grand, va_start, va_end);

	PMAP_UNLOCK_EXCLUSIVE(grand);

	PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);

	return KERN_SUCCESS;
}
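
/*
 * Illustrative counterpart to the pmap_nest() sketch above (not
 * compiled). Note that pmap_unnest() panics on unaligned input rather
 * than failing, so the caller passes back the same aligned base and size
 * it nested; `task_pmap', `NEST_BASE' and `NEST_SIZE' remain hypothetical.
 */
#if 0
	kern_return_t kr = pmap_unnest(task_pmap,
	    (addr64_t)NEST_BASE, (uint64_t)NEST_SIZE);
	assert(kr == KERN_SUCCESS);
#endif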

kern_return_t
pmap_unnest_options(
	pmap_t grand,
	addr64_t vaddr,
	__unused uint64_t size,
	__unused unsigned int options)
{
	return pmap_unnest(grand, vaddr, size);
}

/* Invoked by the Mach VM to determine the platform-specific unnest region */

boolean_t
pmap_adjust_unnest_parameters(pmap_t p, vm_map_offset_t *s, vm_map_offset_t *e)
{
	pd_entry_t *pdpte;
	boolean_t rval = FALSE;

	PMAP_LOCK_EXCLUSIVE(p);

	pdpte = pmap64_pdpt(p, *s);
	if (pdpte && (*pdpte & INTEL_PDPTE_NESTED)) {
		*s &= ~(NBPDPT - 1);
		rval = TRUE;
	}

	pdpte = pmap64_pdpt(p, *e);
	if (pdpte && (*pdpte & INTEL_PDPTE_NESTED)) {
		*e = ((*e + NBPDPT) & ~(NBPDPT - 1));
		rval = TRUE;
	}

	PMAP_UNLOCK_EXCLUSIVE(p);

	return rval;
}
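
/*
 * Worked example: with NBPDPT == 1GiB, an unnest request for
 * [0x7fff52300000, 0x7fff52500000) that lands in a PDPT-nested region is
 * widened to the enclosing 1GiB boundaries:
 *	*s = 0x7fff52300000 & ~(NBPDPT - 1)            = 0x7fff40000000
 *	*e = (0x7fff52500000 + NBPDPT) & ~(NBPDPT - 1) = 0x7fff80000000
 */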

pmap_paddr_t
pmap_find_pa(pmap_t pmap, addr64_t va)
{
	pt_entry_t      *ptp;
	pd_entry_t      *pdep;
	pd_entry_t      pde;
	pt_entry_t      pte;
	boolean_t       is_ept, locked = FALSE;
	pmap_paddr_t    pa = 0;

	is_ept = is_ept_pmap(pmap);

	if ((pmap != kernel_pmap) && not_in_kdp) {
		PMAP_LOCK_EXCLUSIVE(pmap);
		locked = TRUE;
	} else {
		mp_disable_preemption();
	}

	if (os_ref_get_count(&pmap->ref_count) == 0) {
		goto pfp_exit;
	}

	pdep = pmap_pde(pmap, va);

	if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & PTE_VALID_MASK(is_ept))) {
		if (pde & PTE_PS) {
			pa = pte_to_pa(pde) + (va & I386_LPGMASK);
		} else {
			ptp = pmap_pte(pmap, va);
			if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & PTE_VALID_MASK(is_ept)) != 0)) {
				pa = pte_to_pa(pte) + (va & PAGE_MASK);
			}
		}
	}
pfp_exit:
	if (locked) {
		PMAP_UNLOCK_EXCLUSIVE(pmap);
	} else {
		mp_enable_preemption();
	}

	return pa;
}

/*
 * pmap_find_phys returns the (4K) physical page number containing a
 * given virtual address in a given pmap.
 * Note that pmap_pte may return a pde if this virtual address is
 * mapped by a large page, and this is taken into account in order
 * to return the correct page number in this case.
 */
ppnum_t
pmap_find_phys(pmap_t pmap, addr64_t va)
{
	ppnum_t         ppn = 0;
	pmap_paddr_t    pa = 0;

	pa = pmap_find_pa(pmap, va);
	ppn = (ppnum_t) i386_btop(pa);

	return ppn;
}
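
/*
 * Illustrative sketch (not compiled): translating a kernel virtual
 * address to its physical page number; a return of 0 means no valid
 * mapping was found. `some_kva' is hypothetical.
 */
#if 0
	ppnum_t ppn = pmap_find_phys(kernel_pmap, (addr64_t)some_kva);
	if (ppn == 0) {
		/* no valid mapping at this address */
	}
#endif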

ppnum_t
pmap_find_phys_nofault(pmap_t pmap, addr64_t va)
{
	if ((pmap == kernel_pmap) ||
	    ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map)))) {
		return pmap_find_phys(pmap, va);
	}
	return 0;
}

/*
 *  pmap_get_prot returns the equivalent VM page protections
 *  set on a given address, 'va'. This function is used in the
 *  ml_static_verify_page_protections() routine, which is used
 *  by the kext loading code to validate that the TEXT segment
 *  of a kext is mapped executable.
 */
kern_return_t
pmap_get_prot(pmap_t pmap, addr64_t va, vm_prot_t *protp)
{
	pt_entry_t      *ptp;
	pd_entry_t      *pdep;
	pd_entry_t      pde;
	pt_entry_t      pte;
	boolean_t       is_ept, locked = FALSE;
	kern_return_t   retval = KERN_FAILURE;
	vm_prot_t       prot = 0;

	is_ept = is_ept_pmap(pmap);

	if ((pmap != kernel_pmap) && not_in_kdp) {
		PMAP_LOCK_EXCLUSIVE(pmap);
		locked = TRUE;
	} else {
		mp_disable_preemption();
	}

	if (os_ref_get_count(&pmap->ref_count) == 0) {
		goto pfp_exit;
	}

	pdep = pmap_pde(pmap, va);

	if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & PTE_VALID_MASK(is_ept))) {
		if (pde & PTE_PS) {
			prot = VM_PROT_READ;

			if (pde & PTE_WRITE(is_ept)) {
				prot |= VM_PROT_WRITE;
			}
			if (PTE_IS_EXECUTABLE(is_ept, pde)) {
				prot |= VM_PROT_EXECUTE;
			}
			retval = KERN_SUCCESS;
		} else {
			ptp = pmap_pte(pmap, va);
			if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & PTE_VALID_MASK(is_ept)) != 0)) {
				prot = VM_PROT_READ;

				if (pte & PTE_WRITE(is_ept)) {
					prot |= VM_PROT_WRITE;
				}
				if (PTE_IS_EXECUTABLE(is_ept, pte)) {
					prot |= VM_PROT_EXECUTE;
				}
				retval = KERN_SUCCESS;
			}
		}
	}

pfp_exit:
	if (locked) {
		PMAP_UNLOCK_EXCLUSIVE(pmap);
	} else {
		mp_enable_preemption();
	}

	if (protp) {
		*protp = prot;
	}

	return retval;
}
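
/*
 * Illustrative sketch (not compiled), in the spirit of
 * ml_static_verify_page_protections(): confirming that a kext TEXT page
 * is mapped executable. `text_addr' is hypothetical.
 */
#if 0
	vm_prot_t prot;

	if (pmap_get_prot(kernel_pmap, (addr64_t)text_addr, &prot) != KERN_SUCCESS ||
	    !(prot & VM_PROT_EXECUTE)) {
		panic("kext TEXT page not mapped executable");
	}
#endif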

/*
 * Update cache attributes for all extant managed mappings.
 * Assumes PV for this page is locked, and that the page
 * is managed. We assume that this physical page may be mapped in
 * both EPT and normal Intel PTEs, so we convert the attributes
 * to the corresponding format for each pmap.
 *
 * We assert that the passed set of attributes is a subset of the
 * PHYS_CACHEABILITY_MASK.
 */
void
pmap_update_cache_attributes_locked(ppnum_t pn, unsigned attributes)
{
	pv_rooted_entry_t       pv_h, pv_e;
	pv_hashed_entry_t       pvh_e, nexth;
	vm_map_offset_t vaddr;
	pmap_t  pmap;
	pt_entry_t      *ptep;
	boolean_t       is_ept;
	unsigned        ept_attributes;

	assert(IS_MANAGED_PAGE(pn));
	assert(((~PHYS_CACHEABILITY_MASK) & attributes) == 0);

	/* We don't support the PAT bit for EPT PTEs */
	if (attributes & INTEL_PTE_NCACHE) {
		ept_attributes = INTEL_EPT_NCACHE;
	} else {
		ept_attributes = INTEL_EPT_WB;
	}

	pv_h = pai_to_pvh(pn);
	/* TODO: translate the PHYS_* bits to PTE bits; while they're
	 * currently identical, they may not remain so.
	 * Potential optimization (here and in page_protect):
	 * parallel shootdowns, check for redundant
	 * attribute modifications.
	 */

	/*
	 * Alter attributes on all mappings
	 */
	if (pv_h->pmap != PMAP_NULL) {
		pv_e = pv_h;
		pvh_e = (pv_hashed_entry_t)pv_e;

		do {
			pmap = pv_e->pmap;
			vaddr = PVE_VA(pv_e);
			ptep = pmap_pte(pmap, vaddr);

			if (0 == ptep) {
				panic("pmap_update_cache_attributes_locked: Missing PTE, pmap: %p, pn: 0x%x vaddr: 0x%llx kernel_pmap: %p", pmap, pn, vaddr, kernel_pmap);
			}

			is_ept = is_ept_pmap(pmap);

			nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink);
			if (!is_ept) {
				pmap_update_pte(is_ept, ptep, PHYS_CACHEABILITY_MASK, attributes, true);
			} else {
				pmap_update_pte(is_ept, ptep, INTEL_EPT_CACHE_MASK, ept_attributes, true);
			}
			PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
			pvh_e = nexth;
		} while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h);
	}
}

void
x86_filter_TLB_coherency_interrupts(boolean_t dofilter)
{
	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);

	if (dofilter) {
		CPU_CR3_MARK_INACTIVE();
	} else {
		CPU_CR3_MARK_ACTIVE();
		mfence();
		pmap_update_interrupt();
	}
}


/*
 *	Insert the given physical page (p) at
 *	the specified virtual address (v) in the
 *	target physical map with the protection requested.
 *
 *	If specified, the page will be wired down, meaning
 *	that the related pte cannot be reclaimed.
 *
 *	NB:  This is the only routine which MAY NOT lazy-evaluate
 *	or lose information.  That is, this routine must actually
 *	insert this page into the given map NOW.
 */

kern_return_t
pmap_enter(
	pmap_t          pmap,
	vm_map_offset_t         vaddr,
	ppnum_t                 pn,
	vm_prot_t               prot,
	vm_prot_t               fault_type,
	unsigned int            flags,
	boolean_t               wired)
{
	return pmap_enter_options(pmap, vaddr, pn, prot, fault_type, flags, wired, PMAP_EXPAND_OPTIONS_NONE, NULL);
}
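
/*
 * Illustrative sketch (not compiled): establishing a wired, read/write
 * kernel mapping for physical page `pn' at `va'. Per the note above,
 * pmap_enter() must take effect immediately; it may not be deferred.
 * `va' and `pn' are hypothetical.
 */
#if 0
	kern_return_t kr = pmap_enter(kernel_pmap, va, pn,
	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, TRUE);
	assert(kr == KERN_SUCCESS);
#endif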

#define PTE_LOCK(EPT) INTEL_PTE_SWLOCK

static inline void PTE_LOCK_LOCK(pt_entry_t *);
static inline void PTE_LOCK_UNLOCK(pt_entry_t *);

void
PTE_LOCK_LOCK(pt_entry_t *lpte)
{
	pt_entry_t pte;
plretry:
	while ((pte = __c11_atomic_load((_Atomic pt_entry_t *)lpte, memory_order_relaxed)) & PTE_LOCK(0)) {
		__builtin_ia32_pause();
	}
	if (__c11_atomic_compare_exchange_strong((_Atomic pt_entry_t *)lpte, &pte, pte | PTE_LOCK(0), memory_order_acquire_smp, TRUE)) {
		return;
	}

	goto plretry;
}

void
PTE_LOCK_UNLOCK(pt_entry_t *lpte)
{
	__c11_atomic_fetch_and((_Atomic pt_entry_t *)lpte, ~PTE_LOCK(0), memory_order_release_smp);
}
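
/*
 * The pair above implements a per-PTE spinlock on the INTEL_PTE_SWLOCK
 * software bit: PTE_LOCK_LOCK() spins until the bit is clear, then sets
 * it with an acquire compare-exchange; PTE_LOCK_UNLOCK() clears it with
 * a release fetch-and. Minimal usage sketch (not compiled; `pte' is a
 * hypothetical pointer to a valid PTE):
 */
#if 0
	PTE_LOCK_LOCK(pte);
	/* ... examine or update *pte, preserving the PTE_LOCK bit ... */
	PTE_LOCK_UNLOCK(pte);
#endif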

kern_return_t
pmap_enter_options_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	__unused void   *arg)
{
	return pmap_enter_options(pmap, v, intel_btop(pa), prot, fault_type, flags, wired, options, arg);
}

kern_return_t
pmap_enter_options(
	pmap_t          pmap,
	vm_map_offset_t         vaddr,
	ppnum_t                 pn,
	vm_prot_t               prot,
	__unused vm_prot_t      fault_type,
	unsigned int            flags,
	boolean_t               wired,
	unsigned int            options,
	void                    *arg)
{
	pt_entry_t              *pte = NULL;
	pv_rooted_entry_t       pv_h;
	ppnum_t                 pai;
	pv_hashed_entry_t       pvh_e;
	pv_hashed_entry_t       pvh_new;
	pt_entry_t              template;
	pmap_paddr_t            old_pa;
	pmap_paddr_t            pa = (pmap_paddr_t) i386_ptob(pn);
	boolean_t               need_tlbflush = FALSE;
	boolean_t               set_NX;
	char                    oattr;
	boolean_t               old_pa_locked;
	/* 2MiB mappings are confined to x86_64 by VM */
	boolean_t               superpage = flags & VM_MEM_SUPERPAGE;
	vm_object_t             delpage_pm_obj = NULL;
	uint64_t                delpage_pde_index = 0;
	pt_entry_t              old_pte;
	kern_return_t           kr = KERN_FAILURE;
	boolean_t               is_ept;
	boolean_t               is_altacct;
	boolean_t               ptelocked = FALSE;

	pmap_intr_assert();

	if (__improbable(pmap == PMAP_NULL)) {
		return KERN_INVALID_ARGUMENT;
	}
	if (__improbable(pn == vm_page_guard_addr)) {
		return KERN_INVALID_ARGUMENT;
	}

	is_ept = is_ept_pmap(pmap);

	/* N.B. We can be supplied a zero page frame in the NOENTER case; it's an
	 * unused value for that scenario.
	 */
	assert(pn != vm_page_fictitious_addr);


	PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(vaddr), pn,
	    prot);

	if ((prot & VM_PROT_EXECUTE) || __improbable(is_ept && (prot & VM_PROT_UEXEC))) {
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

#if DEVELOPMENT || DEBUG
	if (__improbable(set_NX && (!nx_enabled || !pmap->nx_enabled))) {
		set_NX = FALSE;
	}

	if (__improbable(set_NX && (pmap == kernel_pmap) &&
	    ((pmap_disable_kstack_nx && (flags & VM_MEM_STACK)) ||
	    (pmap_disable_kheap_nx && !(flags & VM_MEM_STACK))))) {
		set_NX = FALSE;
	}
#endif

	pvh_new = PV_HASHED_ENTRY_NULL;
Retry:
	pvh_e = PV_HASHED_ENTRY_NULL;

	PMAP_LOCK_SHARED(pmap);

	/*
	 *	Expand pmap to include this pte.  Assume that
	 *	pmap is always expanded to include enough hardware
	 *	pages to map one VM page.
	 */
	if (__improbable(superpage)) {
		while ((pte = pmap_pde(pmap, vaddr)) == PD_ENTRY_NULL) {
			/* need room for another pde entry */
			PMAP_UNLOCK_SHARED(pmap);
			kr = pmap_expand_pdpt(pmap, vaddr, options);
			if (kr != KERN_SUCCESS) {
				goto done1;
			}
			PMAP_LOCK_SHARED(pmap);
		}
	} else {
		while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
			/*
			 * Must unlock to expand the pmap;
			 * going to grow pde level page(s)
			 */
			PMAP_UNLOCK_SHARED(pmap);
			kr = pmap_expand(pmap, vaddr, options);
			if (kr != KERN_SUCCESS) {
				goto done1;
			}
			PMAP_LOCK_SHARED(pmap);
		}
	}

	if (__improbable(options & PMAP_EXPAND_OPTIONS_NOENTER)) {
		PMAP_UNLOCK_SHARED(pmap);
		kr = KERN_SUCCESS;
		goto done1;
	}

	if (__improbable(superpage && *pte && !(*pte & PTE_PS))) {
		/*
		 * There is still an empty page table mapped that
		 * was used for a previous base page mapping.
		 * Remember the PDE and the PDE index, so that we
		 * can free the page at the end of this function.
		 */
		delpage_pde_index = pdeidx(pmap, vaddr);
		delpage_pm_obj = pmap->pm_obj;
		pmap_store_pte(is_ept, pte, 0);
	}

	PTE_LOCK_LOCK(pte);
	ptelocked = TRUE;

	old_pa = pte_to_pa(*pte);
	pai = pa_index(old_pa);
	old_pa_locked = FALSE;

	if (old_pa == 0 &&
	    PTE_IS_COMPRESSED(*pte, pte, pmap, vaddr)) {
		/*
		 * "pmap" should be locked at this point, so this should
		 * not race with another pmap_enter() or pmap_remove_range().
		 */
		assert(pmap != kernel_pmap);

		/* one less "compressed" */
		pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
		    PAGE_SIZE);
		if (*pte & PTE_COMPRESSED_ALT) {
			pmap_ledger_debit(
				pmap,
				task_ledgers.alternate_accounting_compressed,
				PAGE_SIZE);
		} else {
			/* was part of the footprint */
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
			    PAGE_SIZE);
		}
		/* marker will be cleared below */
	}

	/*
	 * If we have a previous managed page, lock the pv entry now. After
	 * we lock it, check to see if someone beat us to the lock and, if so,
	 * drop the lock.
	 */
	if ((0 != old_pa) && IS_MANAGED_PAGE(pai)) {
		LOCK_PVH(pai);
		old_pa_locked = TRUE;
		old_pa = pte_to_pa(*pte);
		if (0 == old_pa) {
			UNLOCK_PVH(pai);        /* another path beat us to it */
			old_pa_locked = FALSE;
		}
	}

	/*
	 *	Special case if the incoming physical page is already mapped
	 *	at this address.
	 */
	if (old_pa == pa) {
		pt_entry_t old_attributes =
		    *pte & ~(PTE_REF(is_ept) | PTE_MOD(is_ept) | PTE_LOCK(is_ept));

		/*
		 *	May be changing its wired attribute or protection
		 */

		template = pa_to_pte(pa);

		if (__probable(!is_ept)) {
			template |= INTEL_PTE_VALID;
		} else {
			template |= INTEL_EPT_IPAT;
		}

		template |= pmap_get_cache_attributes(pa_index(pa), is_ept);

		/*
		 * We don't support passing VM_MEM_NOT_CACHEABLE flags for EPT PTEs
		 */
		if (!is_ept && (VM_MEM_NOT_CACHEABLE ==
		    (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)))) {
			if (!(flags & VM_MEM_GUARDED)) {
				template |= INTEL_PTE_PAT;
			}
			template |= INTEL_PTE_NCACHE;
		}
		if (pmap != kernel_pmap && !is_ept) {
			template |= INTEL_PTE_USER;
		}

		if (prot & VM_PROT_READ) {
			template |= PTE_READ(is_ept);
		}

		if (prot & VM_PROT_WRITE) {
			template |= PTE_WRITE(is_ept);
			if (is_ept && !pmap_ept_support_ad) {
				template |= PTE_MOD(is_ept);
				if (old_pa_locked) {
					assert(IS_MANAGED_PAGE(pai));
					pmap_phys_attributes[pai] |= PHYS_MODIFIED;
				}
			}
		}

		if (prot & VM_PROT_EXECUTE) {
			assert(set_NX == 0);
			template = pte_set_ex(template, is_ept);
		}

		if (__improbable(is_ept && (prot & VM_PROT_UEXEC))) {
			assert(set_NX == 0);
			template = pte_set_uex(template);
		}

		if (set_NX) {
			template = pte_remove_ex(template, is_ept);
		}

		if (wired) {
			template |= PTE_WIRED;
			if (!iswired(old_attributes)) {
				pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
			}
		} else {
			if (iswired(old_attributes)) {
				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
			}
		}

		if (superpage) {        /* this path cannot be used */
			template |= PTE_PS;     /* to change the page size! */
		}
		if (old_attributes == template) {
			goto dont_update_pte;
		}

		/* Determine delta, PV locked */
		need_tlbflush =
		    ((old_attributes ^ template) != PTE_WIRED);

		/* Optimization: avoid TLB flush when adding writability */
		if (need_tlbflush == TRUE && !(old_attributes & PTE_WRITE(is_ept))) {
			if ((old_attributes ^ template) == PTE_WRITE(is_ept)) {
				need_tlbflush = FALSE;
			}
		}

		/* For hardware that doesn't have EPT AD support, we always set REFMOD for EPT PTEs */
		if (__improbable(is_ept && !pmap_ept_support_ad)) {
			template |= PTE_REF(is_ept);
			if (old_pa_locked) {
				assert(IS_MANAGED_PAGE(pai));
				pmap_phys_attributes[pai] |= PHYS_REFERENCED;
			}
		}

		/* store modified PTE and preserve RC bits */
		pt_entry_t npte, opte;

		assert((*pte & PTE_LOCK(is_ept)) != 0);

		do {
			opte = *pte;
			npte = template | (opte & (PTE_REF(is_ept) |
			    PTE_MOD(is_ept))) | PTE_LOCK(is_ept);
		} while (!pmap_cmpx_pte(pte, opte, npte));

		DTRACE_VM3(set_pte, uint64_t, vaddr, uint64_t, opte, uint64_t, npte);

dont_update_pte:
		if (old_pa_locked) {
			UNLOCK_PVH(pai);
			old_pa_locked = FALSE;
		}
		goto done2;
	}

	/*
	 *	Outline of code from here:
	 *	   1) If va was mapped, update TLBs, remove the mapping
	 *	      and remove old pvlist entry.
	 *	   2) Add pvlist entry for new mapping
	 *	   3) Enter new mapping.
	 *
	 *	If the old physical page is not managed step 1) is skipped
	 *	(except for updating the TLBs), and the mapping is
	 *	overwritten at step 3).  If the new physical page is not
	 *	managed, step 2) is skipped.
	 */
	/* TODO: add opportunistic refmod collect */
	if (old_pa != (pmap_paddr_t) 0) {
		boolean_t       was_altacct = FALSE;

		/*
		 *	Don't do anything to pages outside valid memory here.
		 *	Instead convince the code that enters a new mapping
		 *	to overwrite the old one.
		 */

		/* invalidate the PTE */
		pmap_update_pte(is_ept, pte, PTE_VALID_MASK(is_ept), 0, true);
		/* propagate invalidate everywhere */
		PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
		/* remember reference and change */
		old_pte = *pte;
		oattr = (char) (old_pte & (PTE_MOD(is_ept) | PTE_REF(is_ept)));
		/* completely invalidate the PTE */
		pmap_store_pte(is_ept, pte, PTE_LOCK(is_ept));

		if (IS_MANAGED_PAGE(pai)) {
			/*
			 *	Remove the mapping from the pvlist for
			 *	this physical page.
			 *      We'll end up with either a rooted pv or a
			 *      hashed pv
			 */
			pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, &old_pte, &was_altacct);
		}

		if (IS_MANAGED_PAGE(pai)) {
			pmap_assert(old_pa_locked == TRUE);
			pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
			if (pmap != kernel_pmap) {
				/* update ledgers */
				if (was_altacct) {
					assert(IS_INTERNAL_PAGE(pai));
					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
				} else if (IS_REUSABLE_PAGE(pai)) {
					assert(!was_altacct);
					assert(IS_INTERNAL_PAGE(pai));
					pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
					/* was already not in phys_footprint */
				} else if (IS_INTERNAL_PAGE(pai)) {
					assert(!was_altacct);
					assert(!IS_REUSABLE_PAGE(pai));
					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
				} else {
					/* not an internal page */
					pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
				}
			}
			if (iswired(*pte)) {
				pmap_ledger_debit(pmap, task_ledgers.wired_mem,
				    PAGE_SIZE);
			}

			if (!is_ept) {
				pmap_phys_attributes[pai] |= oattr;
			} else {
				pmap_phys_attributes[pai] |= ept_refmod_to_physmap(oattr);
			}
		} else {
			/*
			 *	old_pa is not managed.
			 *	Do removal part of accounting.
			 */

			if (pmap != kernel_pmap) {
#if 00
				assert(pmap->stats.device > 0);
				OSAddAtomic(-1, &pmap->stats.device);
#endif
			}
			if (iswired(*pte)) {
				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
			}
		}
	}

	/*
	 * If we had a previously managed page locked, unlock it now.
	 */
	if (old_pa_locked) {
		UNLOCK_PVH(pai);
		old_pa_locked = FALSE;
	}

	pai = pa_index(pa);     /* now working with new incoming phys page */
	if (IS_MANAGED_PAGE(pai)) {
		/*
		 *	Step 2) Enter the mapping in the PV list for this
		 *	physical page.
		 */
		pv_h = pai_to_pvh(pai);

		LOCK_PVH(pai);

		if (pv_h->pmap == PMAP_NULL) {
			/*
			 *	No mappings yet, use rooted pv
			 */
			pv_h->va_and_flags = vaddr;
			pv_h->pmap = pmap;
			queue_init(&pv_h->qlink);

			if (options & PMAP_OPTIONS_INTERNAL) {
				pmap_phys_attributes[pai] |= PHYS_INTERNAL;
			} else {
				pmap_phys_attributes[pai] &= ~PHYS_INTERNAL;
			}
			if (options & PMAP_OPTIONS_REUSABLE) {
				pmap_phys_attributes[pai] |= PHYS_REUSABLE;
			} else {
				pmap_phys_attributes[pai] &= ~PHYS_REUSABLE;
			}
			if ((options & PMAP_OPTIONS_ALT_ACCT) &&
			    IS_INTERNAL_PAGE(pai)) {
				pv_h->va_and_flags |= PVE_IS_ALTACCT;
				is_altacct = TRUE;
			} else {
				pv_h->va_and_flags &= ~PVE_IS_ALTACCT;
				is_altacct = FALSE;
			}
		} else {
			/*
			 *	Add new pv_hashed_entry after header.
			 */
			if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
				pvh_e = pvh_new;
				pvh_new = PV_HASHED_ENTRY_NULL;
			} else if (PV_HASHED_ENTRY_NULL == pvh_e) {
				PV_HASHED_ALLOC(&pvh_e);
				if (PV_HASHED_ENTRY_NULL == pvh_e) {
					/*
					 * the pv list is empty. if we are on
					 * the kernel pmap we'll use one of
					 * the special private kernel pv_e's,
					 * else, we need to unlock
					 * everything, zalloc a pv_e, and
					 * restart bringing in the pv_e with
					 * us.
					 */
					if (kernel_pmap == pmap) {
						PV_HASHED_KERN_ALLOC(&pvh_e);
					} else {
						UNLOCK_PVH(pai);
						PTE_LOCK_UNLOCK(pte);
						PMAP_UNLOCK_SHARED(pmap);
						pmap_pv_throttle(pmap);
						pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
						goto Retry;
					}
				}
			}

			if (PV_HASHED_ENTRY_NULL == pvh_e) {
				panic("Mapping alias chain exhaustion, possibly induced by numerous kernel virtual double mappings");
			}

			pvh_e->va_and_flags = vaddr;
			pvh_e->pmap = pmap;
			pvh_e->ppn = pn;
			if ((options & PMAP_OPTIONS_ALT_ACCT) &&
			    IS_INTERNAL_PAGE(pai)) {
				pvh_e->va_and_flags |= PVE_IS_ALTACCT;
				is_altacct = TRUE;
			} else {
				pvh_e->va_and_flags &= ~PVE_IS_ALTACCT;
				is_altacct = FALSE;
			}
			pv_hash_add(pvh_e, pv_h);

			/*
			 *	Remember that we used the pvlist entry.
			 */
			pvh_e = PV_HASHED_ENTRY_NULL;
		}

		/*
		 * only count the mapping
		 * for 'managed memory'
		 */
		pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
		if (pmap != kernel_pmap) {
			/* update ledgers */
			if (is_altacct) {
				/* internal but also alternate accounting */
				assert(IS_INTERNAL_PAGE(pai));
				pmap_ledger_credit(pmap, task_ledgers.internal, PAGE_SIZE);
				pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
				/* alternate accounting, so not in footprint */
			} else if (IS_REUSABLE_PAGE(pai)) {
				assert(!is_altacct);
				assert(IS_INTERNAL_PAGE(pai));
				pmap_ledger_credit(pmap, task_ledgers.reusable, PAGE_SIZE);
				/* internal but reusable: not in footprint */
			} else if (IS_INTERNAL_PAGE(pai)) {
				assert(!is_altacct);
				assert(!IS_REUSABLE_PAGE(pai));
				/* internal: add to footprint */
				pmap_ledger_credit(pmap, task_ledgers.internal, PAGE_SIZE);
				pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
			} else {
				/* not internal: not in footprint */
				pmap_ledger_credit(pmap, task_ledgers.external, PAGE_SIZE);
			}
		}
	} else if (last_managed_page == 0) {
		/* Account for early mappings created before "managed pages"
		 * are determined. Consider consulting the available DRAM map.
		 */
		pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
		if (pmap != kernel_pmap) {
#if 00
			OSAddAtomic(+1, &pmap->stats.device);
			PMAP_STATS_PEAK(pmap->stats.device);
#endif
		}
	}
	/*
	 * Step 3) Enter the mapping.
	 *
	 *	Build a template to speed up entering -
	 *	only the pfn changes.
	 */
	template = pa_to_pte(pa);

	if (!is_ept) {
		template |= INTEL_PTE_VALID;
	} else {
		template |= INTEL_EPT_IPAT;
	}

	/*
	 * DRK: It may be worth asserting on cache attribute flags that diverge
	 * from the existing physical page attributes.
	 */

	template |= pmap_get_cache_attributes(pa_index(pa), is_ept);

	/*
	 * We don't support passing VM_MEM_NOT_CACHEABLE flags for EPT PTEs
	 */
	if (!is_ept && (flags & VM_MEM_NOT_CACHEABLE)) {
		if (!(flags & VM_MEM_GUARDED)) {
			template |= INTEL_PTE_PAT;
		}
		template |= INTEL_PTE_NCACHE;
	}
	if (pmap != kernel_pmap && !is_ept) {
		template |= INTEL_PTE_USER;
	}
	if (prot & VM_PROT_READ) {
		template |= PTE_READ(is_ept);
	}
	if (prot & VM_PROT_WRITE) {
		template |= PTE_WRITE(is_ept);
		if (is_ept && !pmap_ept_support_ad) {
			template |= PTE_MOD(is_ept);
			if (IS_MANAGED_PAGE(pai)) {
				pmap_phys_attributes[pai] |= PHYS_MODIFIED;
			}
		}
	}
	if (prot & VM_PROT_EXECUTE) {
		assert(set_NX == 0);
		template = pte_set_ex(template, is_ept);
	}
	if (__improbable(is_ept && (prot & VM_PROT_UEXEC))) {
		assert(set_NX == 0);
		template = pte_set_uex(template);
	}

	if (set_NX) {
		template = pte_remove_ex(template, is_ept);
	}
	if (wired) {
		template |= INTEL_PTE_WIRED;
		pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
	}
	if (__improbable(superpage)) {
		template |= INTEL_PTE_PS;
	}

	/* For hardware that doesn't have EPT AD support, we always set REFMOD for EPT PTEs */
	if (__improbable(is_ept && !pmap_ept_support_ad)) {
		template |= PTE_REF(is_ept);
		if (IS_MANAGED_PAGE(pai)) {
			pmap_phys_attributes[pai] |= PHYS_REFERENCED;
		}
	}
	template |= PTE_LOCK(is_ept);
	pmap_store_pte(is_ept, pte, template);
	DTRACE_VM3(set_pte, uint64_t, vaddr, uint64_t, 0, uint64_t, template);

	/*
	 * If this was a managed page, we delayed unlocking the pv until here
	 * to prevent pmap_page_protect et al. from finding it until the pte
	 * has been stored.
	 */
	if (IS_MANAGED_PAGE(pai)) {
		UNLOCK_PVH(pai);
	}
done2:
	if (need_tlbflush == TRUE) {
		if (options & PMAP_OPTIONS_NOFLUSH) {
			PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
		} else {
			PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
		}
	}
	if (ptelocked) {
		PTE_LOCK_UNLOCK(pte);
	}
	PMAP_UNLOCK_SHARED(pmap);

	if (pvh_e != PV_HASHED_ENTRY_NULL) {
		PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
	}
	if (pvh_new != PV_HASHED_ENTRY_NULL) {
		PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
	}

	if (delpage_pm_obj) {
		vm_page_t m;

		vm_object_lock(delpage_pm_obj);
		m = vm_page_lookup(delpage_pm_obj, (delpage_pde_index * PAGE_SIZE));
		if (m == VM_PAGE_NULL) {
			panic("pmap_enter: pte page not in object");
		}
		VM_PAGE_FREE(m);
		vm_object_unlock(delpage_pm_obj);
		OSAddAtomic(-1, &inuse_ptepages_count);
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
	}

	kr = KERN_SUCCESS;
done1:
	if (__improbable((kr == KERN_SUCCESS) && (pmap == kernel_pmap) &&
	    zone_spans_ro_va(vaddr, vaddr + PAGE_SIZE))) {
		pmap_page_protect((ppnum_t)atop_kernel(kvtophys(vaddr)), VM_PROT_READ);
	}
	PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
	return kr;
}

/*
 *	Remove a range of hardware page-table entries.
 *	The entries given are the first (inclusive)
 *	and last (exclusive) entries for the VM pages.
 *	The virtual address is the va for the first pte.
 *
 *	The pmap must be locked.
 *	If the pmap is not the kernel pmap, the range must lie
 *	entirely within one pte-page.  This is NOT checked.
 *	Assumes that the pte-page exists.
 */

void
pmap_remove_range(
	pmap_t                  pmap,
	vm_map_offset_t         start_vaddr,
	pt_entry_t              *spte,
	pt_entry_t              *epte)
{
	pmap_remove_range_options(pmap, start_vaddr, spte, epte,
	    PMAP_OPTIONS_REMOVE);
}
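
/*
 * pmap_remove_range_options() below proceeds in two passes: a "freeze"
 * loop clears the valid bit of each live PTE so no new TLB entries can
 * be formed, a single PMAP_UPDATE_TLBS() shoots down the whole range,
 * and a second loop then (with the PV lock held per page) removes PV
 * list entries, folds the referenced/modified bits into
 * pmap_phys_attributes[], zeroes the PTEs, and tallies the ledger
 * adjustments that are settled at update_counts.
 */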

static void
pmap_remove_range_options(
	pmap_t                  pmap,
	vm_map_offset_t         start_vaddr,
	pt_entry_t              *spte,
	pt_entry_t              *epte,
	int                     options)
{
	pt_entry_t              *cpte;
	pv_hashed_entry_t       pvh_et = PV_HASHED_ENTRY_NULL;
	pv_hashed_entry_t       pvh_eh = PV_HASHED_ENTRY_NULL;
	pv_hashed_entry_t       pvh_e;
	int                     pvh_cnt = 0;
	int                     num_removed, num_unwired, num_found, num_invalid;
	int                     ledgers_external, ledgers_reusable, ledgers_internal, ledgers_alt_internal;
	uint64_t                ledgers_compressed, ledgers_alt_compressed;
	ppnum_t                 pai;
	pmap_paddr_t            pa;
	vm_map_offset_t         vaddr;
	boolean_t               is_ept = is_ept_pmap(pmap);
	boolean_t               was_altacct;

	num_removed = 0;
	num_unwired = 0;
	num_found   = 0;
	num_invalid = 0;
	ledgers_external = 0;
	ledgers_reusable = 0;
	ledgers_internal = 0;
	ledgers_compressed = 0;
	ledgers_alt_internal = 0;
	ledgers_alt_compressed = 0;

	/* invalidate the PTEs first to "freeze" them */
	for (cpte = spte, vaddr = start_vaddr;
	    cpte < epte;
	    cpte++, vaddr += PAGE_SIZE_64) {
		pt_entry_t p = *cpte;

		pa = pte_to_pa(p);
		if (pa == 0) {
			if ((options & PMAP_OPTIONS_REMOVE) &&
			    (PTE_IS_COMPRESSED(p, cpte, pmap, vaddr))) {
				assert(pmap != kernel_pmap);
				/* one less "compressed"... */
				ledgers_compressed++;
				if (p & PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					ledgers_alt_compressed++;
				}
				/* clear marker(s) */
				/* XXX probably does not need to be atomic! */
				pmap_update_pte(is_ept, cpte, INTEL_PTE_COMPRESSED_MASK, 0, true);
			}
			continue;
		}
		num_found++;

		if (iswired(p)) {
			num_unwired++;
		}

		pai = pa_index(pa);

		if (!IS_MANAGED_PAGE(pai)) {
			/*
			 *	Outside range of managed physical memory.
			 *	Just remove the mappings.
			 */
			pmap_store_pte(is_ept, cpte, 0);
			continue;
		}

		if ((p & PTE_VALID_MASK(is_ept)) == 0) {
			num_invalid++;
		}

		/* invalidate the PTE */
		pmap_update_pte(is_ept, cpte, PTE_VALID_MASK(is_ept), 0, true);
	}

	if (num_found == 0) {
		/* nothing was changed: we're done */
		goto update_counts;
	}

	/* propagate the invalidates to other CPUs */

	PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr);

	for (cpte = spte, vaddr = start_vaddr;
	    cpte < epte;
	    cpte++, vaddr += PAGE_SIZE_64) {
		pa = pte_to_pa(*cpte);
		if (pa == 0) {
check_pte_for_compressed_marker:
			/*
			 * This PTE could have been replaced with a
			 * "compressed" marker after our first "freeze"
			 * loop above, so check again.
			 */
			if ((options & PMAP_OPTIONS_REMOVE) &&
			    (PTE_IS_COMPRESSED(*cpte, cpte, pmap, vaddr))) {
				assert(pmap != kernel_pmap);
				/* one less "compressed"... */
				ledgers_compressed++;
				if (*cpte & PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					ledgers_alt_compressed++;
				}
				pmap_store_pte(is_ept, cpte, 0);
			}
			continue;
		}

		pai = pa_index(pa);

		LOCK_PVH(pai);

		pa = pte_to_pa(*cpte);
		if (pa == 0) {
			UNLOCK_PVH(pai);
			goto check_pte_for_compressed_marker;
		}

		/*
		 * Remove the mapping from the pvlist for this physical page.
		 */
		pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, cpte, &was_altacct);

		num_removed++;
		/* update ledgers */
		if (was_altacct) {
			/* internal and alternate accounting */
			assert(IS_INTERNAL_PAGE(pai));
			ledgers_internal++;
			ledgers_alt_internal++;
		} else if (IS_REUSABLE_PAGE(pai)) {
			/* internal but reusable */
			assert(!was_altacct);
			assert(IS_INTERNAL_PAGE(pai));
			ledgers_reusable++;
		} else if (IS_INTERNAL_PAGE(pai)) {
			/* internal */
			assert(!was_altacct);
			assert(!IS_REUSABLE_PAGE(pai));
			ledgers_internal++;
		} else {
			/* not internal */
			ledgers_external++;
		}

		/*
		 * Get the modify and reference bits, then
		 * nuke the entry in the page table
		 */
		/* remember reference and change */
		if (!is_ept) {
			pmap_phys_attributes[pai] |=
			    *cpte & (PHYS_MODIFIED | PHYS_REFERENCED);
		} else {
			pmap_phys_attributes[pai] |=
			    ept_refmod_to_physmap((*cpte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
		}

		/* completely invalidate the PTE */
		pmap_store_pte(is_ept, cpte, 0);

		UNLOCK_PVH(pai);

		if (pvh_e != PV_HASHED_ENTRY_NULL) {
			pvh_e->qlink.next = (queue_entry_t) pvh_eh;
			pvh_eh = pvh_e;

			if (pvh_et == PV_HASHED_ENTRY_NULL) {
				pvh_et = pvh_e;
			}
			pvh_cnt++;
		}
		/* We can encounter at most 'num_found' PTEs for this level.
		 * Fewer may be encountered if some were replaced by
		 * compressed markers. No new valid PTEs can be created
		 * since the pmap lock is held exclusively.
		 */
		if (num_removed == num_found) {
			break;
		}
	} /* for loop */

	if (pvh_eh != PV_HASHED_ENTRY_NULL) {
		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
	}
update_counts:
	/*
	 *	Update the counts
	 */
#if TESTING
	if (pmap->stats.resident_count < num_removed) {
		panic("pmap_remove_range: resident_count");
	}
#endif
	if (num_removed) {
		pmap_ledger_debit(pmap, task_ledgers.phys_mem, machine_ptob(num_removed));
	}

	if (pmap != kernel_pmap) {
		if (ledgers_external) {
			pmap_ledger_debit(pmap,
			    task_ledgers.external,
			    machine_ptob(ledgers_external));
		}
		if (ledgers_reusable) {
			pmap_ledger_debit(pmap,
			    task_ledgers.reusable,
			    machine_ptob(ledgers_reusable));
		}
		if (ledgers_internal) {
			pmap_ledger_debit(pmap,
			    task_ledgers.internal,
			    machine_ptob(ledgers_internal));
		}
		if (ledgers_compressed) {
			pmap_ledger_debit(pmap,
			    task_ledgers.internal_compressed,
			    machine_ptob(ledgers_compressed));
		}
		if (ledgers_alt_internal) {
			pmap_ledger_debit(pmap,
			    task_ledgers.alternate_accounting,
			    machine_ptob(ledgers_alt_internal));
		}
		if (ledgers_alt_compressed) {
			pmap_ledger_debit(pmap,
			    task_ledgers.alternate_accounting_compressed,
			    machine_ptob(ledgers_alt_compressed));
		}

		uint64_t net_debit = (ledgers_internal - ledgers_alt_internal) + (ledgers_compressed - ledgers_alt_compressed);
		if (net_debit) {
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, machine_ptob(net_debit));
		}
	}

	if (num_unwired != 0) {
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, machine_ptob(num_unwired));
	}
	return;
}


/*
 *	Remove the given range of addresses
 *	from the specified map.
 *
 *	It is assumed that the start and end are properly
 *	rounded to the hardware page size.
 */
void
pmap_remove(
	pmap_t          map,
	addr64_t        s64,
	addr64_t        e64)
{
	pmap_remove_options(map, s64, e64, PMAP_OPTIONS_REMOVE);
}
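
/*
 * Illustrative sketch (not compiled): unmapping a page-aligned range
 * from a hypothetical task pmap `map'; both bounds must already be
 * rounded to the hardware page size.
 */
#if 0
	pmap_remove(map, (addr64_t)start, (addr64_t)end);
#endif
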
#define PLCHECK_THRESHOLD (2)

void
pmap_remove_options(
	pmap_t          map,
	addr64_t        s64,
	addr64_t        e64,
	int             options)
{
	pt_entry_t     *pde;
	pt_entry_t     *spte, *epte;
	addr64_t        l64;
	uint64_t        deadline = 0;
	boolean_t       is_ept;

	pmap_intr_assert();

	if (map == PMAP_NULL || s64 == e64) {
		return;
	}

	is_ept = is_ept_pmap(map);

	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(map), VM_KERNEL_ADDRHIDE(s64),
	    VM_KERNEL_ADDRHIDE(e64));

	PMAP_LOCK_EXCLUSIVE(map);
	uint32_t traverse_count = 0;

	while (s64 < e64) {
		pml4_entry_t *pml4e = pmap64_pml4(map, s64);
		if ((pml4e == NULL) ||
		    ((*pml4e & PTE_VALID_MASK(is_ept)) == 0)) {
			s64 = (s64 + NBPML4) & ~(PML4MASK);
			continue;
		}
		pdpt_entry_t *pdpte = pmap64_pdpt(map, s64);
		if ((pdpte == NULL) ||
		    ((*pdpte & PTE_VALID_MASK(is_ept)) == 0)) {
			s64 = (s64 + NBPDPT) & ~(PDPTMASK);
			continue;
		}

		l64 = (s64 + PDE_MAPPED_SIZE) & ~(PDE_MAPPED_SIZE - 1);

		if (l64 > e64) {
			l64 = e64;
		}

		pde = pmap_pde(map, s64);

		if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
			if (*pde & PTE_PS) {
				/*
				 * If we're removing a superpage, pmap_remove_range()
				 * must work on level 2 instead of level 1; and we're
				 * only passing a single level 2 entry instead of a
				 * level 1 range.
				 */
				spte = pde;
				epte = spte + 1; /* excluded */
			} else {
				spte = pmap_pte(map, (s64 & ~(PDE_MAPPED_SIZE - 1)));
				spte = &spte[ptenum(s64)];
				epte = &spte[intel_btop(l64 - s64)];
			}
			pmap_remove_range_options(map, s64, spte, epte,
			    options);
		}
		s64 = l64;

		if ((s64 < e64) && (traverse_count++ > PLCHECK_THRESHOLD)) {
			if (deadline == 0) {
				deadline = rdtsc64_nofence() + max_preemption_latency_tsc;
			} else {
				if (rdtsc64_nofence() > deadline) {
					PMAP_UNLOCK_EXCLUSIVE(map);
					__builtin_ia32_pause();
					PMAP_LOCK_EXCLUSIVE(map);
					deadline = rdtsc64_nofence() + max_preemption_latency_tsc;
				}
			}
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(map);

	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}

void
pmap_page_protect(
	ppnum_t         pn,
	vm_prot_t       prot)
{
	pmap_page_protect_options(pn, prot, 0, NULL);
}
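
/*
 * Illustrative sketch (not compiled): downgrading or removing every
 * mapping of a physical page `pn'. A protection lacking VM_PROT_WRITE
 * leaves read-only mappings in place; VM_PROT_NONE removes the mappings
 * entirely (the `remove' case in the routine below).
 */
#if 0
	pmap_page_protect(pn, VM_PROT_READ);    /* write-protect all mappings */
	pmap_page_protect(pn, VM_PROT_NONE);    /* remove all mappings */
#endif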

/*
 *	Routine:	pmap_page_protect_options
 *
 *	Function:
 *		Lower the permission for all mappings to a given
 *		page.
 */
void
pmap_page_protect_options(
	ppnum_t         pn,
	vm_prot_t       prot,
	unsigned int    options,
	void            *arg)
{
	pv_hashed_entry_t       pvh_eh = PV_HASHED_ENTRY_NULL;
	pv_hashed_entry_t       pvh_et = PV_HASHED_ENTRY_NULL;
	pv_hashed_entry_t       nexth;
	int                     pvh_cnt = 0;
	pv_rooted_entry_t       pv_h;
	pv_rooted_entry_t       pv_e;
	pv_hashed_entry_t       pvh_e;
	pt_entry_t              *pte;
	int                     pai;
	pmap_t                  pmap;
	boolean_t               remove;
	pt_entry_t              new_pte_value;
	boolean_t               is_ept;

	pmap_intr_assert();
	assert(pn != vm_page_fictitious_addr);
	if (pn == vm_page_guard_addr) {
		return;
	}

	pai = ppn_to_pai(pn);

	if (!IS_MANAGED_PAGE(pai)) {
		/*
		 *	Not a managed page.
		 */
		return;
	}

	PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, pn, prot);

	/*
	 * Determine the new protection.
	 */
	switch (prot) {
	case VM_PROT_READ:
	case VM_PROT_READ | VM_PROT_EXECUTE:
		remove = FALSE;
		break;
	case VM_PROT_ALL:
		return;         /* nothing to do */
	default:
		remove = TRUE;
		break;
	}

	pv_h = pai_to_pvh(pai);

	LOCK_PVH(pai);


	/*
	 * Walk down PV list, if any, changing or removing all mappings.
	 */
	if (pv_h->pmap == PMAP_NULL) {
		goto done;
	}

	pv_e = pv_h;
	pvh_e = (pv_hashed_entry_t) pv_e;       /* cheat */

	do {
		vm_map_offset_t vaddr;

		if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED) &&
		    (pmap_phys_attributes[pai] & PHYS_MODIFIED)) {
			/* page was modified, so it will be compressed */
			options &= ~PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
			options |= PMAP_OPTIONS_COMPRESSOR;
		}

		pmap = pv_e->pmap;
		is_ept = is_ept_pmap(pmap);
		vaddr = PVE_VA(pv_e);
		pte = pmap_pte(pmap, vaddr);

		pmap_assert2((pa_index(pte_to_pa(*pte)) == pn),
		    "pmap_page_protect: PTE mismatch, pn: 0x%x, pmap: %p, vaddr: 0x%llx, pte: 0x%llx", pn, pmap, vaddr, *pte);

		if (0 == pte) {
			panic("pmap_page_protect() "
			    "pmap=%p pn=0x%x vaddr=0x%llx\n",
			    pmap, pn, vaddr);
		}
		nexth = (pv_hashed_entry_t) queue_next(&pvh_e->qlink);

		/*
		 * Remove the mapping if new protection is NONE
		 */
		if (remove) {
			/* Remove per-pmap wired count */
			if (iswired(*pte)) {
				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
			}

			if (pmap != kernel_pmap &&
			    (options & PMAP_OPTIONS_COMPRESSOR) &&
			    IS_INTERNAL_PAGE(pai)) {
				assert(!PTE_IS_COMPRESSED(*pte, pte, pmap, vaddr));
				/* mark this PTE as having been "compressed" */
				new_pte_value = PTE_COMPRESSED;
				if (IS_ALTACCT_PAGE(pai, pv_e)) {
					new_pte_value |= PTE_COMPRESSED_ALT;
				}
			} else {
				new_pte_value = 0;
			}

			if (options & PMAP_OPTIONS_NOREFMOD) {
				pmap_store_pte(is_ept, pte, new_pte_value);

				if (options & PMAP_OPTIONS_NOFLUSH) {
					PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
				} else {
					PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
				}
			} else {
				/*
				 * Remove the mapping, collecting dirty bits.
				 */
				pmap_update_pte(is_ept, pte, PTE_VALID_MASK(is_ept), 0, true);

				PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
				if (!is_ept) {
					pmap_phys_attributes[pai] |=
					    *pte & (PHYS_MODIFIED | PHYS_REFERENCED);
				} else {
					pmap_phys_attributes[pai] |=
					    ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
				}
				if ((options &
				    PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED) &&
				    IS_INTERNAL_PAGE(pai) &&
				    (pmap_phys_attributes[pai] &
				    PHYS_MODIFIED)) {
					/*
					 * Page is actually "modified" and
					 * will be compressed.  Start
					 * accounting for it as "compressed".
					 */
					assert(!(options & PMAP_OPTIONS_COMPRESSOR));
					options &= ~PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
					options |= PMAP_OPTIONS_COMPRESSOR;
					assert(new_pte_value == 0);
					if (pmap != kernel_pmap) {
						new_pte_value = PTE_COMPRESSED;
						if (IS_ALTACCT_PAGE(pai, pv_e)) {
							new_pte_value |= PTE_COMPRESSED_ALT;
						}
					}
				}
				pmap_store_pte(is_ept, pte, new_pte_value);
			}

#if TESTING
			if (pmap->stats.resident_count < 1) {
				panic("pmap_page_protect: resident_count");
			}
#endif
			pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE);

			/*
			 * We only ever compress internal pages.
			 */
			if (options & PMAP_OPTIONS_COMPRESSOR) {
				assert(IS_INTERNAL_PAGE(pai));
			}
			if (pmap != kernel_pmap) {
				/* update ledgers */
				if (IS_ALTACCT_PAGE(pai, pv_e)) {
					assert(IS_INTERNAL_PAGE(pai));
					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, PAGE_SIZE);
					}
				} else if (IS_REUSABLE_PAGE(pai)) {
					assert(!IS_ALTACCT_PAGE(pai, pv_e));
					assert(IS_INTERNAL_PAGE(pai));
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
						/* was not in footprint, but is now */
						pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
					}
					pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
				} else if (IS_INTERNAL_PAGE(pai)) {
					assert(!IS_ALTACCT_PAGE(pai, pv_e));
					assert(!IS_REUSABLE_PAGE(pai));
					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
					/*
					 * Update all stats related to physical
					 * footprint, which only deals with
					 * internal pages.
					 */
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						/*
						 * This removal is only being
						 * done so we can send this page
						 * to the compressor;  therefore
						 * it mustn't affect total task
						 * footprint.
						 */
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
					} else {
						/*
						 * This internal page isn't
						 * going to the compressor,
						 * so adjust stats to keep
						 * phys_footprint up to date.
						 */
						pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
					}
				} else {
					pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
				}
			}

			/*
			 * Deal with the pv_rooted_entry.
			 */

			if (pv_e == pv_h) {
				/*
				 * Fix up head later.
				 */
				pv_h->pmap = PMAP_NULL;
			} else {
				/*
				 * Delete this entry.
				 */
				pv_hash_remove(pvh_e);
				pvh_e->qlink.next = (queue_entry_t) pvh_eh;
				pvh_eh = pvh_e;

				if (pvh_et == PV_HASHED_ENTRY_NULL) {
					pvh_et = pvh_e;
				}
1925 				pvh_cnt++;
1926 			}
1927 		} else {
1928 			/*
1929 			 * Write-protect, after opportunistic refmod collect
1930 			 */
1931 			if (!is_ept) {
1932 				pmap_phys_attributes[pai] |=
1933 				    *pte & (PHYS_MODIFIED | PHYS_REFERENCED);
1934 			} else {
1935 				pmap_phys_attributes[pai] |=
1936 				    ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
1937 			}
1938 
1939 			pmap_update_pte(is_ept, pte, PTE_WRITE(is_ept), 0, true);
1940 			if (options & PMAP_OPTIONS_NOFLUSH) {
1941 				PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
1942 			} else {
1943 				PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1944 			}
1945 		}
1946 		pvh_e = nexth;
1947 	} while ((pv_e = (pv_rooted_entry_t) nexth) != pv_h);
1948 
1949 
1950 	/*
1951 	 * If pv_head mapping was removed, fix it up.
1952 	 */
1953 	if (pv_h->pmap == PMAP_NULL) {
1954 		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
1955 
1956 		if (pvh_e != (pv_hashed_entry_t) pv_h) {
1957 			pv_hash_remove(pvh_e);
1958 			pv_h->pmap = pvh_e->pmap;
1959 			pv_h->va_and_flags = pvh_e->va_and_flags;
1960 			pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1961 			pvh_eh = pvh_e;
1962 
1963 			if (pvh_et == PV_HASHED_ENTRY_NULL) {
1964 				pvh_et = pvh_e;
1965 			}
1966 			pvh_cnt++;
1967 		}
1968 	}
1969 	if (pvh_eh != PV_HASHED_ENTRY_NULL) {
1970 		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
1971 	}
1972 done:
1973 	UNLOCK_PVH(pai);
1974 
1975 	PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
1976 }
1977 
1978 
1979 /*
1980  *	Clear specified attribute bits.
1981  */
1982 void
phys_attribute_clear(
1984 	ppnum_t         pn,
1985 	int             bits,
1986 	unsigned int    options,
1987 	void            *arg)
1988 {
1989 	pv_rooted_entry_t       pv_h;
1990 	pv_hashed_entry_t       pv_e;
1991 	pt_entry_t              *pte = NULL;
1992 	int                     pai;
1993 	pmap_t                  pmap;
1994 	char                    attributes = 0;
1995 	boolean_t               is_internal, is_reusable, is_altacct, is_ept;
1996 	int                     ept_bits_to_clear;
1997 	boolean_t               ept_keep_global_mod = FALSE;
1998 
1999 	if ((bits & PHYS_MODIFIED) &&
2000 	    (options & PMAP_OPTIONS_NOFLUSH) &&
2001 	    arg == NULL) {
2002 		panic("phys_attribute_clear(0x%x,0x%x,0x%x,%p): "
2003 		    "should not clear 'modified' without flushing TLBs\n",
2004 		    pn, bits, options, arg);
2005 	}
2006 
2007 	/* We only support converting MOD and REF bits for EPT PTEs in this function */
2008 	assert((bits & ~(PHYS_REFERENCED | PHYS_MODIFIED)) == 0);
2009 
2010 	ept_bits_to_clear = (unsigned)physmap_refmod_to_ept(bits & (PHYS_MODIFIED | PHYS_REFERENCED));
2011 
2012 	pmap_intr_assert();
2013 	assert(pn != vm_page_fictitious_addr);
2014 	if (pn == vm_page_guard_addr) {
2015 		return;
2016 	}
2017 
2018 	pai = ppn_to_pai(pn);
2019 
2020 	if (!IS_MANAGED_PAGE(pai)) {
2021 		/*
2022 		 *	Not a managed page.
2023 		 */
2024 		return;
2025 	}
2026 
2027 	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);
2028 
2029 	pv_h = pai_to_pvh(pai);
2030 
2031 	LOCK_PVH(pai);
2032 
2033 
2034 	/*
2035 	 * Walk down PV list, clearing all modify or reference bits.
	 * We do not have to take any pmap lock because the walk is
	 * serialized by the PV-head lock (LOCK_PVH) taken above.
2038 	 */
2039 	if (pv_h->pmap != PMAP_NULL) {
2040 		/*
2041 		 * There are some mappings.
2042 		 */
2043 
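		/*
		 * Latch the page's internal/reusable state once; it cannot
		 * change while the PV-head lock is held.
		 */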
2044 		is_internal = IS_INTERNAL_PAGE(pai);
2045 		is_reusable = IS_REUSABLE_PAGE(pai);
2046 
2047 		pv_e = (pv_hashed_entry_t)pv_h;
2048 
2049 		do {
2050 			vm_map_offset_t va;
2051 			char pte_bits;
2052 
2053 			pmap = pv_e->pmap;
2054 			is_ept = is_ept_pmap(pmap);
2055 			is_altacct = IS_ALTACCT_PAGE(pai, pv_e);
2056 			va = PVE_VA(pv_e);
2057 			pte_bits = 0;
2058 
			/*
			 * Look up the PTE up front: it is dereferenced below
			 * to collect ref/mod bits, and pmap_update_pte() also
			 * needs it when PMAP_OPTIONS_CLEAR_WRITE is set even
			 * if "bits" is zero.
			 */
			pte = pmap_pte(pmap, va);
			if (bits) {
2061 				/* grab ref/mod bits from this PTE */
2062 				pte_bits = (*pte & (PTE_REF(is_ept) | PTE_MOD(is_ept)));
2063 				/* propagate to page's global attributes */
2064 				if (!is_ept) {
2065 					attributes |= pte_bits;
2066 				} else {
2067 					attributes |= ept_refmod_to_physmap(pte_bits);
2068 					if (!pmap_ept_support_ad && (pte_bits & INTEL_EPT_MOD)) {
2069 						ept_keep_global_mod = TRUE;
2070 					}
2071 				}
2072 				/* which bits to clear for this PTE? */
2073 				if (!is_ept) {
2074 					pte_bits &= bits;
2075 				} else {
2076 					pte_bits &= ept_bits_to_clear;
2077 				}
2078 			}
2079 			if (options & PMAP_OPTIONS_CLEAR_WRITE) {
2080 				pte_bits |= PTE_WRITE(is_ept);
2081 			}
2082 
2083 			/*
2084 			 * Clear modify and/or reference bits.
2085 			 */
2086 			if (pte_bits) {
2087 				pmap_update_pte(is_ept, pte, pte_bits, 0, true);
2088 
2089 				/* Ensure all processors using this translation
2090 				 * invalidate this TLB entry. The invalidation
2091 				 * *must* follow the PTE update, to ensure that
2092 				 * the TLB shadow of the 'D' bit (in particular)
2093 				 * is synchronized with the updated PTE.
2094 				 */
2095 				if (!(options & PMAP_OPTIONS_NOFLUSH)) {
2096 					/* flush TLBS now */
2097 					PMAP_UPDATE_TLBS(pmap,
2098 					    va,
2099 					    va + PAGE_SIZE);
2100 				} else if (arg) {
2101 					/* delayed TLB flush: add "pmap" info */
2102 					PMAP_UPDATE_TLBS_DELAYED(
2103 						pmap,
2104 						va,
2105 						va + PAGE_SIZE,
2106 						(pmap_flush_context *)arg);
2107 				} else {
2108 					/* no TLB flushing at all */
2109 				}
2110 			}
2111 
2112 			/* update pmap "reusable" stats */
2113 			if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
2114 			    is_reusable &&
2115 			    pmap != kernel_pmap) {
2116 				/* one less "reusable" */
2117 				pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
2118 				if (is_internal) {
2119 					/* one more "internal" */
2120 					if (is_altacct) {
2121 						/* no impact on ledgers */
2122 					} else {
2123 						pmap_ledger_credit(pmap,
2124 						    task_ledgers.internal,
2125 						    PAGE_SIZE);
2126 						pmap_ledger_credit(
2127 							pmap,
2128 							task_ledgers.phys_footprint,
2129 							PAGE_SIZE);
2130 					}
2131 				} else {
2132 					/* one more "external" */
2133 					pmap_ledger_credit(pmap, task_ledgers.external, PAGE_SIZE);
2134 				}
2135 			} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
2136 			    !is_reusable &&
2137 			    pmap != kernel_pmap) {
2138 				/* one more "reusable" */
2139 				pmap_ledger_credit(pmap, task_ledgers.reusable, PAGE_SIZE);
2140 				if (is_internal) {
2141 					/* one less "internal" */
2142 					if (is_altacct) {
2143 						/* no impact on footprint */
2144 					} else {
2145 						pmap_ledger_debit(pmap,
2146 						    task_ledgers.internal,
2147 						    PAGE_SIZE);
2148 						pmap_ledger_debit(
2149 							pmap,
2150 							task_ledgers.phys_footprint,
2151 							PAGE_SIZE);
2152 					}
2153 				} else {
2154 					/* one less "external" */
2155 					pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
2156 				}
2157 			}
2158 
2159 			pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
2160 		} while (pv_e != (pv_hashed_entry_t)pv_h);
2161 	}
	/*
	 * Opportunistic refmod collection: fold the bits gathered during
	 * the walk into the global attributes.  The effect is annulled
	 * below when both REF and MOD are being cleared.
	 */
2165 
2166 	pmap_phys_attributes[pai] |= attributes;
2167 
2168 	if (ept_keep_global_mod) {
2169 		/*
2170 		 * If the hardware doesn't support AD bits for EPT PTEs and someone is
2171 		 * requesting that we clear the modified bit for a phys page, we need
2172 		 * to ensure that there are no EPT mappings for the page with the
2173 		 * modified bit set. If there are, we cannot clear the global modified bit.
2174 		 */
2175 		bits &= ~PHYS_MODIFIED;
2176 	}
2177 	pmap_phys_attributes[pai] &= ~(bits);
2178 
2179 	/* update this page's "reusable" status */
2180 	if (options & PMAP_OPTIONS_CLEAR_REUSABLE) {
2181 		pmap_phys_attributes[pai] &= ~PHYS_REUSABLE;
2182 	} else if (options & PMAP_OPTIONS_SET_REUSABLE) {
2183 		pmap_phys_attributes[pai] |= PHYS_REUSABLE;
2184 	}
2185 
2186 	UNLOCK_PVH(pai);
2187 
2188 	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
2189 }
2190 
2191 /*
2192  *	Check specified attribute bits.
2193  */
2194 int
phys_attribute_test(
2196 	ppnum_t         pn,
2197 	int             bits)
2198 {
2199 	pv_rooted_entry_t       pv_h;
2200 	pv_hashed_entry_t       pv_e;
2201 	pt_entry_t              *pte;
2202 	int                     pai;
2203 	pmap_t                  pmap;
2204 	int                     attributes = 0;
2205 	boolean_t               is_ept;
2206 
2207 	pmap_intr_assert();
2208 	assert(pn != vm_page_fictitious_addr);
2209 	assert((bits & ~(PHYS_MODIFIED | PHYS_REFERENCED)) == 0);
2210 	if (pn == vm_page_guard_addr) {
2211 		return 0;
2212 	}
2213 
2214 	pai = ppn_to_pai(pn);
2215 
2216 	if (!IS_MANAGED_PAGE(pai)) {
2217 		/*
2218 		 *	Not a managed page.
2219 		 */
2220 		return 0;
2221 	}
2222 
	/*
	 * Fast check: if the requested bits have already been collected,
	 * there is no need to take any locks.  If they are not all set,
	 * recheck after taking the lock, in case they were pulled in
	 * while we were waiting for it.
	 */
2230 	if ((pmap_phys_attributes[pai] & bits) == bits) {
2231 		return bits;
2232 	}
2233 
2234 	pv_h = pai_to_pvh(pai);
2235 
2236 	LOCK_PVH(pai);
2237 
2238 	attributes = pmap_phys_attributes[pai] & bits;
2239 
2240 
2241 	/*
2242 	 * Walk down PV list, checking the mappings until we
2243 	 * reach the end or we've found the desired attributes.
2244 	 */
2245 	if (attributes != bits &&
2246 	    pv_h->pmap != PMAP_NULL) {
2247 		/*
2248 		 * There are some mappings.
2249 		 */
2250 		pv_e = (pv_hashed_entry_t)pv_h;
2251 		do {
2252 			vm_map_offset_t va;
2253 
2254 			pmap = pv_e->pmap;
2255 			is_ept = is_ept_pmap(pmap);
2256 			va = PVE_VA(pv_e);
2257 			/*
2258 			 * pick up modify and/or reference bits from mapping
2259 			 */
2260 
2261 			pte = pmap_pte(pmap, va);
2262 			if (!is_ept) {
2263 				attributes |= (int)(*pte & bits);
2264 			} else {
2265 				attributes |= (int)(ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED));
2266 			}
2267 
2268 			pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
2269 		} while ((attributes != bits) &&
2270 		    (pv_e != (pv_hashed_entry_t)pv_h));
2271 	}
2272 	pmap_phys_attributes[pai] |= attributes;
2273 
2274 	UNLOCK_PVH(pai);
2275 	return attributes;
2276 }
2277 
2278 /*
2279  *	Routine:	pmap_change_wiring
2280  *	Function:	Change the wiring attribute for a map/virtual-address
2281  *			pair.
2282  *	In/out conditions:
2283  *			The mapping must already exist in the pmap.
2284  */
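/*
 * Illustrative sketch (not from this file): a vm_fault-style caller,
 * with "map" and "va" assumed valid, would wire a mapping via
 *
 *	pmap_change_wiring(vm_map_pmap(map), trunc_page(va), TRUE);
 *
 * and pass FALSE later to unwire it.
 */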
2285 void
pmap_change_wiring(
2287 	pmap_t          map,
2288 	vm_map_offset_t vaddr,
2289 	boolean_t       wired)
2290 {
2291 	pt_entry_t      *pte;
2292 
2293 	PMAP_LOCK_SHARED(map);
2294 
2295 	if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL) {
2296 		panic("pmap_change_wiring(%p,0x%llx,%d): pte missing",
2297 		    map, vaddr, wired);
2298 	}
2299 
2300 	if (wired && !iswired(*pte)) {
2301 		/*
2302 		 * wiring down mapping
2303 		 */
2304 		pmap_ledger_credit(map, task_ledgers.wired_mem, PAGE_SIZE);
2305 		pmap_update_pte(is_ept_pmap(map), pte, 0, PTE_WIRED, false);
2306 	} else if (!wired && iswired(*pte)) {
2307 		/*
2308 		 * unwiring mapping
2309 		 */
2310 		pmap_ledger_debit(map, task_ledgers.wired_mem, PAGE_SIZE);
2311 		pmap_update_pte(is_ept_pmap(map), pte, PTE_WIRED, 0, false);
2312 	}
2313 
2314 	PMAP_UNLOCK_SHARED(map);
2315 }
2316 
2317 /*
 *	"Backdoor" direct map routine for early mappings.
 *	Useful for mapping memory outside the managed physical range
 *	(e.g., early device mappings).  Sets A, D and NC if requested.
2321  */
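/*
 * Illustrative sketch (hypothetical early-boot device mapping; "vaddr",
 * "phys_start" and "phys_end" are assumed page-aligned):
 *
 *	vm_offset_t next = pmap_map_bd(vaddr, phys_start, phys_end,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_MEM_NOT_CACHEABLE);
 *
 * The return value is the first virtual address past the new mapping.
 */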
2322 
2323 vm_offset_t
pmap_map_bd(
2325 	vm_offset_t     virt,
2326 	vm_map_offset_t start_addr,
2327 	vm_map_offset_t end_addr,
2328 	vm_prot_t       prot,
2329 	unsigned int    flags)
2330 {
2331 	pt_entry_t      template;
2332 	pt_entry_t      *ptep;
2333 
2334 	vm_offset_t     base = virt;
2335 	boolean_t       doflush = FALSE;
2336 
2337 	template = pa_to_pte(start_addr)
2338 	    | INTEL_PTE_REF
2339 	    | INTEL_PTE_MOD
2340 	    | INTEL_PTE_WIRED
2341 	    | INTEL_PTE_VALID;
2342 
2343 	if ((flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)) == VM_MEM_NOT_CACHEABLE) {
2344 		template |= INTEL_PTE_NCACHE;
2345 		if (!(flags & (VM_MEM_GUARDED))) {
2346 			template |= INTEL_PTE_PAT;
2347 		}
2348 	}
2349 
2350 	if ((prot & VM_PROT_EXECUTE) == 0) {
2351 		template |= INTEL_PTE_NX;
2352 	}
2353 
2354 	if (prot & VM_PROT_WRITE) {
2355 		template |= INTEL_PTE_WRITE;
2356 	}
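	/*
	 * Overwrite the kernel PTEs covering the range; flush the TLB only
	 * if one of the slots already held a valid translation.
	 */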
2357 	vm_map_offset_t caddr = start_addr;
2358 	while (caddr < end_addr) {
2359 		ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)virt);
2360 		if (ptep == PT_ENTRY_NULL) {
2361 			panic("pmap_map_bd: Invalid kernel address");
2362 		}
2363 		if (pte_to_pa(*ptep)) {
2364 			doflush = TRUE;
2365 		}
2366 		pmap_store_pte(FALSE, ptep, template);
2367 		pte_increment_pa(template);
2368 		virt += PAGE_SIZE;
2369 		caddr += PAGE_SIZE;
2370 	}
2371 	if (doflush) {
2372 		pmap_tlbi_range(0, ~0ULL, true, 0);
2373 		PMAP_UPDATE_TLBS(kernel_pmap, base, base + end_addr - start_addr);
2374 	}
2375 	return virt;
2376 }
2377 
2378 /* Create a virtual alias beginning at 'ava' of the specified kernel virtual
2379  * range. The aliased pagetable range is expanded if
2380  * PMAP_EXPAND_OPTIONS_ALIASMAP is specified. Performs no synchronization,
2381  * assumes caller has stabilized the source and destination ranges. Currently
2382  * used to populate sections of the trampoline "doublemap" at CPU startup.
2383  */
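/*
 * Illustrative sketch (hypothetical addresses): aliasing one page of
 * kernel text into the doublemap region might look like
 *
 *	pmap_alias(dblmap_va, text_start, text_start + PAGE_SIZE,
 *	    VM_PROT_READ | VM_PROT_EXECUTE, PMAP_EXPAND_OPTIONS_ALIASMAP);
 */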
2384 
2385 void
pmap_alias(
2387 	vm_offset_t     ava,
2388 	vm_map_offset_t start_addr,
2389 	vm_map_offset_t end_addr,
2390 	vm_prot_t       prot,
2391 	unsigned int    eoptions)
2392 {
2393 	pt_entry_t      prot_template, template;
2394 	pt_entry_t      *aptep, *sptep;
2395 
2396 	prot_template =  INTEL_PTE_REF | INTEL_PTE_MOD | INTEL_PTE_WIRED | INTEL_PTE_VALID;
2397 	if ((prot & VM_PROT_EXECUTE) == 0) {
2398 		prot_template |= INTEL_PTE_NX;
2399 	}
2400 
2401 	if (prot & VM_PROT_WRITE) {
2402 		prot_template |= INTEL_PTE_WRITE;
2403 	}
2404 	assert(((start_addr | end_addr) & PAGE_MASK) == 0);
2405 	while (start_addr < end_addr) {
2406 		aptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ava);
2407 		if (aptep == PT_ENTRY_NULL) {
2408 			if (eoptions & PMAP_EXPAND_OPTIONS_ALIASMAP) {
2409 				pmap_expand(kernel_pmap, ava, PMAP_EXPAND_OPTIONS_ALIASMAP);
2410 				aptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ava);
2411 			} else {
2412 				panic("pmap_alias: Invalid alias address");
2413 			}
2414 		}
2415 		/* The aliased range should not have any active mappings */
2416 		assert(pte_to_pa(*aptep) == 0);
2417 
2418 		sptep = pmap_pte(kernel_pmap, start_addr);
2419 		assert(sptep != PT_ENTRY_NULL && (pte_to_pa(*sptep) != 0));
2420 		template = pa_to_pte(pte_to_pa(*sptep)) | prot_template;
2421 		pmap_store_pte(FALSE, aptep, template);
2422 
2423 		ava += PAGE_SIZE;
2424 		start_addr += PAGE_SIZE;
2425 	}
2426 }
2427 
2428 mach_vm_size_t
pmap_query_resident(
2430 	pmap_t          pmap,
2431 	addr64_t        s64,
2432 	addr64_t        e64,
2433 	mach_vm_size_t  *compressed_bytes_p)
2434 {
2435 	pt_entry_t     *pde;
2436 	pt_entry_t     *spte, *epte;
2437 	addr64_t        l64;
2438 	uint64_t        deadline = 0;
2439 	mach_vm_size_t  resident_bytes;
2440 	mach_vm_size_t  compressed_bytes;
2441 	boolean_t       is_ept;
2442 
2443 	pmap_intr_assert();
2444 
2445 	if (pmap == PMAP_NULL || pmap == kernel_pmap || s64 == e64) {
2446 		if (compressed_bytes_p) {
2447 			*compressed_bytes_p = 0;
2448 		}
2449 		return 0;
2450 	}
2451 
2452 	is_ept = is_ept_pmap(pmap);
2453 
2454 	PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
2455 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(s64),
2456 	    VM_KERNEL_ADDRHIDE(e64));
2457 
2458 	resident_bytes = 0;
2459 	compressed_bytes = 0;
2460 
2461 	PMAP_LOCK_EXCLUSIVE(pmap);
2462 	uint32_t traverse_count = 0;
2463 
2464 	while (s64 < e64) {
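		/* Advance at most to the next PDE (2MB) boundary. */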
2465 		l64 = (s64 + PDE_MAPPED_SIZE) & ~(PDE_MAPPED_SIZE - 1);
2466 		if (l64 > e64) {
2467 			l64 = e64;
2468 		}
2469 		pde = pmap_pde(pmap, s64);
2470 
2471 		if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
2472 			if (*pde & PTE_PS) {
2473 				/* superpage: not supported */
2474 			} else {
2475 				spte = pmap_pte(pmap,
2476 				    (s64 & ~(PDE_MAPPED_SIZE - 1)));
2477 				spte = &spte[ptenum(s64)];
2478 				epte = &spte[intel_btop(l64 - s64)];
2479 
2480 				for (; spte < epte; spte++) {
2481 					if (pte_to_pa(*spte) != 0) {
2482 						resident_bytes += PAGE_SIZE;
2483 					} else if (*spte & PTE_COMPRESSED) {
2484 						compressed_bytes += PAGE_SIZE;
2485 					}
2486 				}
2487 			}
2488 		}
2489 		s64 = l64;
2490 
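		/*
		 * After PLCHECK_THRESHOLD chunks, briefly drop and retake the
		 * pmap lock whenever the TSC deadline passes, so a long scan
		 * does not hold off preemption.
		 */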
2491 		if ((s64 < e64) && (traverse_count++ > PLCHECK_THRESHOLD)) {
2492 			if (deadline == 0) {
2493 				deadline = rdtsc64() + max_preemption_latency_tsc;
2494 			} else {
2495 				if (rdtsc64() > deadline) {
2496 					PMAP_UNLOCK_EXCLUSIVE(pmap);
2497 					__builtin_ia32_pause();
2498 					PMAP_LOCK_EXCLUSIVE(pmap);
2499 					deadline = rdtsc64() + max_preemption_latency_tsc;
2500 				}
2501 			}
2502 		}
2503 	}
2504 
2505 	PMAP_UNLOCK_EXCLUSIVE(pmap);
2506 
2507 	PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
2508 	    resident_bytes);
2509 
2510 	if (compressed_bytes_p) {
2511 		*compressed_bytes_p = compressed_bytes;
2512 	}
2513 	return resident_bytes;
2514 }
2515 
2516 kern_return_t
pmap_query_page_info(
2518 	pmap_t          pmap,
2519 	vm_map_offset_t va,
2520 	int             *disp_p)
2521 {
2522 	int             disp;
2523 	boolean_t       is_ept;
2524 	pmap_paddr_t    pa;
2525 	ppnum_t         pai;
2526 	pd_entry_t      *pde;
2527 	pt_entry_t      *pte;
2528 
2529 	pmap_intr_assert();
2530 	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
2531 		*disp_p = 0;
2532 		return KERN_INVALID_ARGUMENT;
2533 	}
2534 
2535 	disp = 0;
2536 	is_ept = is_ept_pmap(pmap);
2537 
2538 	PMAP_LOCK_EXCLUSIVE(pmap);
2539 
2540 	pde = pmap_pde(pmap, va);
2541 	if (!pde ||
2542 	    !(*pde & PTE_VALID_MASK(is_ept)) ||
2543 	    (*pde & PTE_PS)) {
2544 		goto done;
2545 	}
2546 
2547 	pte = pmap_pte(pmap, va);
2548 	if (pte == PT_ENTRY_NULL) {
2549 		goto done;
2550 	}
2551 
2552 	pa = pte_to_pa(*pte);
2553 	if (pa == 0) {
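		/* No physical page: check for a compressor marker in the PTE. */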
2554 		if (PTE_IS_COMPRESSED(*pte, pte, pmap, va)) {
2555 			disp |= PMAP_QUERY_PAGE_COMPRESSED;
2556 			if (*pte & PTE_COMPRESSED_ALT) {
2557 				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
2558 			}
2559 		}
2560 	} else {
2561 		disp |= PMAP_QUERY_PAGE_PRESENT;
2562 		pai = pa_index(pa);
2563 		if (!IS_MANAGED_PAGE(pai)) {
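			/* unmanaged page: report only "present" */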
2564 		} else if (pmap_pv_is_altacct(pmap, va, pai)) {
2565 			assert(IS_INTERNAL_PAGE(pai));
2566 			disp |= PMAP_QUERY_PAGE_INTERNAL;
2567 			disp |= PMAP_QUERY_PAGE_ALTACCT;
2568 		} else if (IS_REUSABLE_PAGE(pai)) {
2569 			disp |= PMAP_QUERY_PAGE_REUSABLE;
2570 		} else if (IS_INTERNAL_PAGE(pai)) {
2571 			disp |= PMAP_QUERY_PAGE_INTERNAL;
2572 		}
2573 	}
2574 
2575 done:
2576 	PMAP_UNLOCK_EXCLUSIVE(pmap);
2577 	*disp_p = disp;
2578 	return KERN_SUCCESS;
2579 }
2580 
2581 void
pmap_set_vm_map_cs_enforced(
2583 	pmap_t pmap,
2584 	bool new_value)
2585 {
2586 	PMAP_LOCK_EXCLUSIVE(pmap);
2587 	pmap->pm_vm_map_cs_enforced = new_value;
2588 	PMAP_UNLOCK_EXCLUSIVE(pmap);
2589 }
2590 extern int cs_process_enforcement_enable;
2591 bool
pmap_get_vm_map_cs_enforced(
2593 	pmap_t pmap)
2594 {
2595 	if (cs_process_enforcement_enable) {
2596 		return true;
2597 	}
2598 	return pmap->pm_vm_map_cs_enforced;
2599 }
2600 
2601 void
pmap_set_jit_entitled(__unused pmap_t pmap)
2603 {
2604 	/* The x86 pmap layer does not care if a map has a JIT entry. */
2605 	return;
2606 }
2607 
2608 bool
pmap_get_jit_entitled(__unused pmap_t pmap)
2610 {
2611 	/* The x86 pmap layer does not care if a map is using JIT. */
2612 	return false;
2613 }
2614 
2615 bool
pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
2617 {
2618 	/*
2619 	 * The x86 pmap layer does not apply any policy to any protection
2620 	 * types.
2621 	 */
2622 	return false;
2623 }
2624 
2625 uint64_t
pmap_release_pages_fast(void)
2627 {
2628 	return 0;
2629 }
2630 
2631 void
pmap_trim(__unused pmap_t grand, __unused pmap_t subord, __unused addr64_t vstart, __unused uint64_t size)
2633 {
2634 	return;
2635 }
2636 
2637 __dead2
2638 void
pmap_ledger_verify_size(size_t size)
2640 {
2641 	panic("%s: unsupported, "
2642 	    "size=%lu",
2643 	    __func__, size);
2644 }
2645 
2646 __dead2
2647 ledger_t
pmap_ledger_alloc(void)
2649 {
2650 	panic("%s: unsupported",
2651 	    __func__);
2652 }
2653 
2654 __dead2
2655 void
pmap_ledger_free(ledger_t ledger)
2657 {
2658 	panic("%s: unsupported, "
2659 	    "ledger=%p",
2660 	    __func__, ledger);
2661 }
2662 
2663 kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
2665     unsigned int level_mask __unused, size_t *bytes_copied __unused)
2666 {
2667 	return KERN_NOT_SUPPORTED;
2668 }
2669 
2670 void *
pmap_map_compressor_page(ppnum_t pn)
2672 {
2673 	assertf(IS_MANAGED_PAGE(ppn_to_pai(pn)), "%s called on non-managed page 0x%08x", __func__, pn);
2674 	return PHYSMAP_PTOV((uint64_t)pn << (uint64_t)PAGE_SHIFT);
2675 }
2676 
2677 void
pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
2679 {
2680 }
2681 
2682 bool
pmap_clear_refmod_range_options(
2684 	pmap_t pmap __unused,
2685 	vm_map_address_t start __unused,
2686 	vm_map_address_t end __unused,
2687 	unsigned int mask __unused,
2688 	unsigned int options __unused)
2689 {
2690 	/*
2691 	 * x86 doesn't have ranged tlbi instructions, and we already have
2692 	 * the pmap_flush_context. This operation isn't implemented.
2693 	 */
2694 	return false;
2695 }
2696 
2697 bool
pmap_supported_feature(pmap_t pmap, pmap_feature_flags_t feat)
2699 {
2700 	switch (feat) {
2701 	case PMAP_FEAT_UEXEC:
2702 		return pmap != NULL && is_ept_pmap(pmap);
2703 	default:
2704 		return false;
2705 	}
2706 }
2707