/*
 * Copyright (c) 2000-2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <mach_assert.h>

#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <kern/ledger.h>
#include <kern/zalloc_internal.h>
#include <i386/pmap_internal.h>

void            pmap_remove_range(
	pmap_t          pmap,
	vm_map_offset_t va,
	pt_entry_t      *spte,
	pt_entry_t      *epte);

static void            pmap_remove_range_options(
	pmap_t          pmap,
	vm_map_offset_t va,
	pt_entry_t      *spte,
	pt_entry_t      *epte,
	int             options);

void            pmap_reusable_range(
	pmap_t          pmap,
	vm_map_offset_t va,
	pt_entry_t      *spte,
	pt_entry_t      *epte,
	boolean_t       reusable);

pt_entry_t *PTE_corrupted_ptr;

#if DEVELOPMENT || DEBUG
int pmap_inject_pte_corruption;
uint32_t pmap_update_clear_pte_count;
uint32_t pmap_update_invalid_pte_count;
#endif

/*
 * The Intel platform can nest at the PDE level, so NBPDE (i.e. 2MB) at a time,
 * on a NBPDE boundary.
 */

uint64_t
pmap_shared_region_size_min(__unused pmap_t pmap)
{
	return NBPDE;
}

uint64_t
pmap_commpage_size_min(__unused pmap_t pmap)
{
	return NBPDE;
}
/*
 *	kern_return_t pmap_nest(grand, subord, va_start, size)
 *
 *	grand  = the pmap that we will nest subord into
 *	subord = the pmap that goes into the grand
 *	va_start  = start of range in pmap to be inserted
 *	size   = Size of nest area (up to 16TB)
 *
 *	Inserts a pmap into another.  This is used to implement shared segments.
 *
 *	Note that we depend upon higher level VM locks to ensure that things don't change while
 *	we are doing this.  For example, VM should not be doing any pmap enters while it is nesting,
 *	nor performing two nests at once.
 */

/*
 * This routine can nest subtrees either at the PDPT level (1GiB) or at the
 * PDE level (2MiB). We currently disallow disparate offsets for the "subord"
 * container and the "grand" parent. A minor optimization to consider for the
 * future: make the "subord" truly a container rather than a full-fledged
 * pagetable hierarchy which can be unnecessarily sparse (DRK).
 */
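
/*
 * Illustrative sketch (added commentary, not part of the original source):
 * a Mach VM caller nests a shared-region pmap roughly as follows, where
 * 'task_pmap', 'shared_pmap', 'base' and 'len' are hypothetical. Both
 * 'base' and 'len' must be multiples of pmap_shared_region_size_min()
 * (NBPDE, i.e. 2MiB here), otherwise KERN_INVALID_VALUE is returned:
 *
 *	kern_return_t kr = pmap_nest(task_pmap, shared_pmap, base, len);
 *	if (kr != KERN_SUCCESS) {
 *		panic("shared region nesting failed: %d", kr);
 *	}
 */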

kern_return_t
pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, uint64_t size)
{
	vm_map_offset_t vaddr;
	pd_entry_t      *pde, *npde;
	unsigned int    i;
	uint64_t        num_pde;

	assert(!is_ept_pmap(grand));
	assert(!is_ept_pmap(subord));

	if ((size & (pmap_shared_region_size_min(grand) - 1)) ||
	    (va_start & (pmap_shared_region_size_min(grand) - 1)) ||
	    ((size >> 28) > 65536)) {   /* Max size we can nest is 16TB */
		return KERN_INVALID_VALUE;
	}
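
	/*
	 * Added note: the size check above admits sizes of roughly up to
	 * 65536 << 28 bytes, i.e. 2^16 * 2^28 = 2^44 bytes = 16TB, matching
	 * the stated maximum nesting size.
	 */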

	if (size == 0) {
		panic("pmap_nest: size is invalid - %016llX", size);
	}

	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(va_start));

	vaddr = (vm_map_offset_t)va_start;
	num_pde = size >> PDESHIFT;

	PMAP_LOCK_EXCLUSIVE(subord);

	subord->pm_shared = TRUE;

	for (i = 0; i < num_pde;) {
		if (((vaddr & PDPTMASK) == 0) && (num_pde - i) >= NPDEPG) {
			npde = pmap64_pdpt(subord, vaddr);

			while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
				PMAP_UNLOCK_EXCLUSIVE(subord);
				pmap_expand_pdpt(subord, vaddr, PMAP_EXPAND_OPTIONS_NONE);
				PMAP_LOCK_EXCLUSIVE(subord);
				npde = pmap64_pdpt(subord, vaddr);
			}
			*npde |= INTEL_PDPTE_NESTED;
			vaddr += NBPDPT;
			i += (uint32_t)NPDEPG;
		} else {
			npde = pmap_pde(subord, vaddr);

			while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
				PMAP_UNLOCK_EXCLUSIVE(subord);
				pmap_expand(subord, vaddr, PMAP_EXPAND_OPTIONS_NONE);
				PMAP_LOCK_EXCLUSIVE(subord);
				npde = pmap_pde(subord, vaddr);
			}
			vaddr += NBPDE;
			i++;
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(subord);

	vaddr = (vm_map_offset_t)va_start;

	PMAP_LOCK_EXCLUSIVE(grand);

	for (i = 0; i < num_pde;) {
		pd_entry_t tpde;

		if (((vaddr & PDPTMASK) == 0) && ((num_pde - i) >= NPDEPG)) {
			npde = pmap64_pdpt(subord, vaddr);
			if (npde == 0) {
				panic("pmap_nest: no PDPT, subord %p nstart 0x%llx", subord, vaddr);
			}
			tpde = *npde;
			pde = pmap64_pdpt(grand, vaddr);
			if (0 == pde) {
				PMAP_UNLOCK_EXCLUSIVE(grand);
				pmap_expand_pml4(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
				PMAP_LOCK_EXCLUSIVE(grand);
				pde = pmap64_pdpt(grand, vaddr);
			}
			if (pde == 0) {
				panic("pmap_nest: no PDPT, grand  %p vaddr 0x%llx", grand, vaddr);
			}
			pmap_store_pte(FALSE, pde, tpde);
			vaddr += NBPDPT;
			i += (uint32_t) NPDEPG;
		} else {
			npde = pmap_pde(subord, vaddr);
			if (npde == 0) {
				panic("pmap_nest: no npde, subord %p vaddr 0x%llx", subord, vaddr);
			}
			tpde = *npde;
			pde = pmap_pde(grand, vaddr);
			if (0 == pde) {
				PMAP_UNLOCK_EXCLUSIVE(grand);
				pmap_expand_pdpt(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
				PMAP_LOCK_EXCLUSIVE(grand);
				pde = pmap_pde(grand, vaddr);
			}

			if (pde == 0) {
				panic("pmap_nest: no pde, grand  %p vaddr 0x%llx", grand, vaddr);
			}
			vaddr += NBPDE;
			pmap_store_pte(FALSE, pde, tpde);
			i++;
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(grand);

	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, KERN_SUCCESS);

	return KERN_SUCCESS;
}

/*
 *	kern_return_t pmap_unnest(grand, vaddr, size)
 *
 *	grand  = the pmap that we will un-nest subord from
 *	vaddr  = start of range in pmap to be unnested
 *	size   = size of the range
 *
 *	Removes a pmap from another.  This is used to implement shared segments.
 */
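
/*
 * Worked example of the PDE rounding below (added for clarity; values are
 * hypothetical): with NBPDE = 2MiB, vaddr = 0x7fff000000 and size = 0x200000
 * (both NBPDE-aligned, as the alignment check in the function requires),
 * va_start stays 0x7fff000000 and va_end becomes
 * (vaddr + size + NBPDE - 1) & ~(NBPDE - 1) = 0x7fff200000, so exactly one
 * 2MiB PDE is cleared.
 */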

kern_return_t
pmap_unnest(pmap_t grand, addr64_t vaddr, uint64_t size)
{
	pd_entry_t *pde;
	unsigned int i;
	uint64_t num_pde;
	addr64_t va_start, va_end;
	uint64_t npdpt = PMAP_INVALID_PDPTNUM;

	PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));

	if ((size & (pmap_shared_region_size_min(grand) - 1)) ||
	    (vaddr & (pmap_shared_region_size_min(grand) - 1))) {
		panic("pmap_unnest(%p,0x%llx,0x%llx): unaligned...",
		    grand, vaddr, size);
	}

	assert(!is_ept_pmap(grand));

	/* align everything to PDE boundaries */
	va_start = vaddr & ~(NBPDE - 1);

	if (os_add_overflow(vaddr, size + NBPDE - 1, &va_end)) {
		panic("pmap_unnest: Overflow when calculating range end: s=0x%llx sz=0x%llx\n", vaddr, size);
	}

	va_end &= ~(NBPDE - 1);
	size = va_end - va_start;

	PMAP_LOCK_EXCLUSIVE(grand);

	num_pde = size >> PDESHIFT;
	vaddr = va_start;

	for (i = 0; i < num_pde;) {
		if (pdptnum(grand, vaddr) != npdpt) {
			npdpt = pdptnum(grand, vaddr);
			pde = pmap64_pdpt(grand, vaddr);
			if (pde && (*pde & INTEL_PDPTE_NESTED)) {
				pmap_store_pte(FALSE, pde, (pd_entry_t)0);
				i += (uint32_t) NPDEPG;
				vaddr += NBPDPT;
				continue;
			}
		}
		pde = pmap_pde(grand, (vm_map_offset_t)vaddr);
		if (pde == 0) {
			panic("pmap_unnest: no pde, grand %p vaddr 0x%llx", grand, vaddr);
		}
		pmap_store_pte(FALSE, pde, (pd_entry_t)0);
		i++;
		vaddr += NBPDE;
	}

	PMAP_UPDATE_TLBS(grand, va_start, va_end);

	PMAP_UNLOCK_EXCLUSIVE(grand);

	PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);

	return KERN_SUCCESS;
}

kern_return_t
pmap_unnest_options(
	pmap_t grand,
	addr64_t vaddr,
	__unused uint64_t size,
	__unused unsigned int options)
{
	return pmap_unnest(grand, vaddr, size);
}

/* Invoked by the Mach VM to determine the platform specific unnest region */

boolean_t
pmap_adjust_unnest_parameters(pmap_t p, vm_map_offset_t *s, vm_map_offset_t *e)
{
	pd_entry_t *pdpte;
	boolean_t rval = FALSE;

	PMAP_LOCK_EXCLUSIVE(p);

	pdpte = pmap64_pdpt(p, *s);
	if (pdpte && (*pdpte & INTEL_PDPTE_NESTED)) {
		*s &= ~(NBPDPT - 1);
		rval = TRUE;
	}

	pdpte = pmap64_pdpt(p, *e);
	if (pdpte && (*pdpte & INTEL_PDPTE_NESTED)) {
		*e = ((*e + NBPDPT) & ~(NBPDPT - 1));
		rval = TRUE;
	}

	PMAP_UNLOCK_EXCLUSIVE(p);

	return rval;
}

pmap_paddr_t
pmap_find_pa(pmap_t pmap, addr64_t va)
{
	pt_entry_t      *ptp;
	pd_entry_t      *pdep;
	pd_entry_t      pde;
	pt_entry_t      pte;
	boolean_t       is_ept, locked = FALSE;
	pmap_paddr_t    pa = 0;

	is_ept = is_ept_pmap(pmap);

	if ((pmap != kernel_pmap) && not_in_kdp) {
		PMAP_LOCK_EXCLUSIVE(pmap);
		locked = TRUE;
	} else {
		mp_disable_preemption();
	}

	if (os_ref_get_count(&pmap->ref_count) == 0) {
		goto pfp_exit;
	}

	pdep = pmap_pde(pmap, va);

	if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & PTE_VALID_MASK(is_ept))) {
		if (pde & PTE_PS) {
			pa = pte_to_pa(pde) + (va & I386_LPGMASK);
		} else {
			ptp = pmap_pte(pmap, va);
			if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & PTE_VALID_MASK(is_ept)) != 0)) {
				pa = pte_to_pa(pte) + (va & PAGE_MASK);
			}
		}
	}
pfp_exit:
	if (locked) {
		PMAP_UNLOCK_EXCLUSIVE(pmap);
	} else {
		mp_enable_preemption();
	}

	return pa;
}

/*
 * pmap_find_phys returns the (4K) physical page number containing a
 * given virtual address in a given pmap.
 * Note that pmap_pte may return a PDE if this virtual address is
 * mapped by a large page; this is taken into account so that the
 * correct page number is returned in that case as well.
 */
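/*
 * Worked example (added for clarity; addresses are hypothetical): if
 * va 0x140201234 is covered by a valid 2MiB PDE (PTE_PS set) mapping
 * physical 0x80000000, pmap_find_pa() returns
 * 0x80000000 + (va & I386_LPGMASK) = 0x80001234, and pmap_find_phys()
 * reports the 4K page number i386_btop(0x80001234) = 0x80001.
 */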
ppnum_t
pmap_find_phys(pmap_t pmap, addr64_t va)
{
	ppnum_t         ppn = 0;
	pmap_paddr_t    pa = 0;

	pa = pmap_find_pa(pmap, va);
	ppn = (ppnum_t) i386_btop(pa);

	return ppn;
}

ppnum_t
pmap_find_phys_nofault(pmap_t pmap, addr64_t va)
{
	if ((pmap == kernel_pmap) ||
	    ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map)))) {
		return pmap_find_phys(pmap, va);
	}
	return 0;
}

/*
 *  pmap_get_prot returns the equivalent VM page protections
 *  set on a given address, 'va'. This function is used in the
 *  ml_static_verify_page_protections() routine which is used
 *  by the kext loading code to validate that the TEXT segment
 *  of a kext is mapped executable.
 */
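/*
 * Illustrative sketch (added commentary, not part of the original source):
 * a caller such as ml_static_verify_page_protections() can confirm that a
 * page is mapped executable roughly like this ('va' is hypothetical):
 *
 *	vm_prot_t prot;
 *	if (pmap_get_prot(kernel_pmap, va, &prot) == KERN_SUCCESS &&
 *	    (prot & VM_PROT_EXECUTE)) {
 *		... TEXT page is mapped executable ...
 *	}
 */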
kern_return_t
pmap_get_prot(pmap_t pmap, addr64_t va, vm_prot_t *protp)
{
	pt_entry_t      *ptp;
	pd_entry_t      *pdep;
	pd_entry_t      pde;
	pt_entry_t      pte;
	boolean_t       is_ept, locked = FALSE;
	kern_return_t   retval = KERN_FAILURE;
	vm_prot_t       prot = 0;

	is_ept = is_ept_pmap(pmap);

	if ((pmap != kernel_pmap) && not_in_kdp) {
		PMAP_LOCK_EXCLUSIVE(pmap);
		locked = TRUE;
	} else {
		mp_disable_preemption();
	}

	if (os_ref_get_count(&pmap->ref_count) == 0) {
		goto pfp_exit;
	}

	pdep = pmap_pde(pmap, va);

	if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & PTE_VALID_MASK(is_ept))) {
		if (pde & PTE_PS) {
			prot = VM_PROT_READ;

			if (pde & PTE_WRITE(is_ept)) {
				prot |= VM_PROT_WRITE;
			}
			if (PTE_IS_EXECUTABLE(is_ept, pde)) {
				prot |= VM_PROT_EXECUTE;
			}
			retval = KERN_SUCCESS;
		} else {
			ptp = pmap_pte(pmap, va);
			if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & PTE_VALID_MASK(is_ept)) != 0)) {
				prot = VM_PROT_READ;

				if (pte & PTE_WRITE(is_ept)) {
					prot |= VM_PROT_WRITE;
				}
				if (PTE_IS_EXECUTABLE(is_ept, pte)) {
					prot |= VM_PROT_EXECUTE;
				}
				retval = KERN_SUCCESS;
			}
		}
	}

pfp_exit:
	if (locked) {
		PMAP_UNLOCK_EXCLUSIVE(pmap);
	} else {
		mp_enable_preemption();
	}

	if (protp) {
		*protp = prot;
	}

	return retval;
}

/*
 * Update cache attributes for all extant managed mappings.
 * Assumes PV for this page is locked, and that the page
 * is managed. We assume that this physical page may be mapped in
 * both EPT and normal Intel PTEs, so we convert the attributes
 * to the corresponding format for each pmap.
 *
 * We assert that the passed set of attributes is a subset of the
 * PHYS_CACHEABILITY_MASK.
 */
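/*
 * Illustrative invocation (added commentary, not part of the original
 * source): to make every mapping of managed page 'pn' uncacheable, a
 * caller holding the PV lock would do roughly:
 *
 *	LOCK_PVH(pn);
 *	pmap_update_cache_attributes_locked(pn, INTEL_PTE_NCACHE);
 *	UNLOCK_PVH(pn);
 */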
void
pmap_update_cache_attributes_locked(ppnum_t pn, unsigned attributes)
{
	pv_rooted_entry_t       pv_h, pv_e;
	pv_hashed_entry_t       pvh_e, nexth;
	vm_map_offset_t vaddr;
	pmap_t  pmap;
	pt_entry_t      *ptep;
	boolean_t       is_ept;
	unsigned        ept_attributes;

	assert(IS_MANAGED_PAGE(pn));
	assert(((~PHYS_CACHEABILITY_MASK) & attributes) == 0);

	/* We don't support the PAT bit for EPT PTEs */
	if (attributes & INTEL_PTE_NCACHE) {
		ept_attributes = INTEL_EPT_NCACHE;
	} else {
		ept_attributes = INTEL_EPT_WB;
	}

	pv_h = pai_to_pvh(pn);
	/* TODO: translate the PHYS_* bits to PTE bits; while they're
	 * currently identical, they may not remain so.
	 * Potential optimization (here and in page_protect):
	 * parallel shootdowns, and checking for redundant
	 * attribute modifications.
	 */

	/*
	 * Alter attributes on all mappings
	 */
	if (pv_h->pmap != PMAP_NULL) {
		pv_e = pv_h;
		pvh_e = (pv_hashed_entry_t)pv_e;

		do {
			pmap = pv_e->pmap;
			vaddr = PVE_VA(pv_e);
			ptep = pmap_pte(pmap, vaddr);

			if (0 == ptep) {
				panic("pmap_update_cache_attributes_locked: Missing PTE, pmap: %p, pn: 0x%x vaddr: 0x%llx kernel_pmap: %p", pmap, pn, vaddr, kernel_pmap);
			}

			is_ept = is_ept_pmap(pmap);

			nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink);
			if (!is_ept) {
				pmap_update_pte(is_ept, ptep, PHYS_CACHEABILITY_MASK, attributes, true);
			} else {
				pmap_update_pte(is_ept, ptep, INTEL_EPT_CACHE_MASK, ept_attributes, true);
			}
			PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
			pvh_e = nexth;
		} while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h);
	}
}

void
x86_filter_TLB_coherency_interrupts(boolean_t dofilter)
{
	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);

	if (dofilter) {
		CPU_CR3_MARK_INACTIVE();
	} else {
		CPU_CR3_MARK_ACTIVE();
		mfence();
		pmap_update_interrupt();
	}
}


/*
 *	Insert the given physical page (p) at
 *	the specified virtual address (v) in the
 *	target physical map with the protection requested.
 *
 *	If specified, the page will be wired down, meaning
 *	that the related pte cannot be reclaimed.
 *
 *	NB:  This is the only routine which MAY NOT lazy-evaluate
 *	or lose information.  That is, this routine must actually
 *	insert this page into the given map NOW.
 */
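/*
 * Illustrative call (added commentary, not part of the original source;
 * 'va' and 'pn' are hypothetical, and the mapping-type constant is an
 * assumption): entering a wired, read/write kernel mapping:
 *
 *	kern_return_t kr = pmap_enter(kernel_pmap, va, pn,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, TRUE,
 *	    PMAP_MAPPING_TYPE_INFER);
 */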

kern_return_t
pmap_enter(
	pmap_t          pmap,
	vm_map_offset_t         vaddr,
	ppnum_t                 pn,
	vm_prot_t               prot,
	vm_prot_t               fault_type,
	unsigned int            flags,
	boolean_t               wired,
	pmap_mapping_type_t     mapping_type)
{
	return pmap_enter_options(pmap, vaddr, pn, prot, fault_type, flags, wired, PMAP_EXPAND_OPTIONS_NONE, NULL, mapping_type);
}

#define PTE_LOCK(EPT) INTEL_PTE_SWLOCK

static inline void PTE_LOCK_LOCK(pt_entry_t *);
static inline void PTE_LOCK_UNLOCK(pt_entry_t *);

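/*
 * PTE_LOCK_LOCK/PTE_LOCK_UNLOCK spin on the software-available
 * INTEL_PTE_SWLOCK bit of a PTE: the lock path waits for the bit to read
 * clear, then sets it with an acquire-ordered compare-exchange; the unlock
 * path clears it with a release-ordered fetch-and. A minimal usage sketch
 * (added for clarity; 'pte' is hypothetical):
 *
 *	PTE_LOCK_LOCK(pte);
 *	... read or rewrite *pte, preserving the PTE_LOCK(is_ept) bit ...
 *	PTE_LOCK_UNLOCK(pte);
 */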
void
PTE_LOCK_LOCK(pt_entry_t *lpte)
{
	pt_entry_t pte;
plretry:
	while ((pte = __c11_atomic_load((_Atomic pt_entry_t *)lpte, memory_order_relaxed)) & PTE_LOCK(0)) {
		__builtin_ia32_pause();
	}
	if (__c11_atomic_compare_exchange_strong((_Atomic pt_entry_t *)lpte, &pte, pte | PTE_LOCK(0), memory_order_acquire_smp, TRUE)) {
		return;
	}

	goto plretry;
}

void
PTE_LOCK_UNLOCK(pt_entry_t *lpte)
{
	__c11_atomic_fetch_and((_Atomic pt_entry_t *)lpte, ~PTE_LOCK(0), memory_order_release_smp);
}

kern_return_t
pmap_enter_options_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	__unused void   *arg,
	pmap_mapping_type_t mapping_type)
{
	return pmap_enter_options(pmap, v, intel_btop(pa), prot, fault_type, flags, wired, options, arg, mapping_type);
}

kern_return_t
pmap_enter_options(
	pmap_t          pmap,
	vm_map_offset_t         vaddr,
	ppnum_t                 pn,
	vm_prot_t               prot,
	__unused vm_prot_t      fault_type,
	unsigned int            flags,
	boolean_t               wired,
	unsigned int            options,
	void                    *arg,
	__unused pmap_mapping_type_t mapping_type)
{
	pt_entry_t              *pte = NULL;
	pv_rooted_entry_t       pv_h;
	ppnum_t                 pai;
	pv_hashed_entry_t       pvh_e;
	pv_hashed_entry_t       pvh_new;
	pt_entry_t              template;
	pmap_paddr_t            old_pa;
	pmap_paddr_t            pa = (pmap_paddr_t) i386_ptob(pn);
	boolean_t               need_tlbflush = FALSE;
	boolean_t               set_NX;
	char                    oattr;
	boolean_t               old_pa_locked;
	/* 2MiB mappings are confined to x86_64 by VM */
	boolean_t               superpage = flags & VM_MEM_SUPERPAGE;
	vm_object_t             delpage_pm_obj = NULL;
	uint64_t                delpage_pde_index = 0;
	pt_entry_t              old_pte;
	kern_return_t           kr = KERN_FAILURE;
	boolean_t               is_ept;
	boolean_t               is_altacct;
	boolean_t               ptelocked = FALSE;

	pmap_intr_assert();

	if (__improbable(pmap == PMAP_NULL)) {
		return KERN_INVALID_ARGUMENT;
	}
	if (__improbable(pn == vm_page_guard_addr)) {
		return KERN_INVALID_ARGUMENT;
	}

	is_ept = is_ept_pmap(pmap);

	/* N.B. We can be supplied a zero page frame in the NOENTER case;
	 * it's an unused value in that scenario.
	 */
	assert(pn != vm_page_fictitious_addr);


	PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(vaddr), pn,
	    prot);

	if ((prot & VM_PROT_EXECUTE) || __improbable(is_ept && (prot & VM_PROT_UEXEC))) {
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

#if DEVELOPMENT || DEBUG
	if (__improbable(set_NX && (!nx_enabled || !pmap->nx_enabled))) {
		set_NX = FALSE;
	}

	if (__improbable(set_NX && (pmap == kernel_pmap) &&
	    ((pmap_disable_kstack_nx && (flags & VM_MEM_STACK)) ||
	    (pmap_disable_kheap_nx && !(flags & VM_MEM_STACK))))) {
		set_NX = FALSE;
	}
#endif

	pvh_new = PV_HASHED_ENTRY_NULL;
Retry:
	pvh_e = PV_HASHED_ENTRY_NULL;

	PMAP_LOCK_SHARED(pmap);

	/*
	 *	Expand pmap to include this pte.  Assume that
	 *	pmap is always expanded to include enough hardware
	 *	pages to map one VM page.
	 */
	if (__improbable(superpage)) {
		while ((pte = pmap_pde(pmap, vaddr)) == PD_ENTRY_NULL) {
			/* need room for another pde entry */
			PMAP_UNLOCK_SHARED(pmap);
			kr = pmap_expand_pdpt(pmap, vaddr, options);
			if (kr != KERN_SUCCESS) {
				goto done1;
			}
			PMAP_LOCK_SHARED(pmap);
		}
	} else {
		while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
			/*
			 * Must unlock to expand the pmap
			 * going to grow pde level page(s)
			 */
			PMAP_UNLOCK_SHARED(pmap);
			kr = pmap_expand(pmap, vaddr, options);
			if (kr != KERN_SUCCESS) {
				goto done1;
			}
			PMAP_LOCK_SHARED(pmap);
		}
	}

	if (__improbable(options & PMAP_EXPAND_OPTIONS_NOENTER)) {
		PMAP_UNLOCK_SHARED(pmap);
		kr = KERN_SUCCESS;
		goto done1;
	}

	if (__improbable(superpage && *pte && !(*pte & PTE_PS))) {
		/*
		 * There is still an empty page table mapped that
		 * was used for a previous base page mapping.
		 * Remember the PDE and the PDE index, so that we
		 * can free the page at the end of this function.
		 */
		delpage_pde_index = pdeidx(pmap, vaddr);
		delpage_pm_obj = pmap->pm_obj;
		pmap_store_pte(is_ept, pte, 0);
	}

	PTE_LOCK_LOCK(pte);
	ptelocked = TRUE;

	old_pa = pte_to_pa(*pte);
	pai = pa_index(old_pa);
	old_pa_locked = FALSE;

	if (old_pa == 0 &&
	    PTE_IS_COMPRESSED(*pte, pte, pmap, vaddr)) {
		/*
		 * "pmap" should be locked at this point, so this should
		 * not race with another pmap_enter() or pmap_remove_range().
		 */
		assert(pmap != kernel_pmap);

		/* one less "compressed" */
		pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
		    PAGE_SIZE);
		if (*pte & PTE_COMPRESSED_ALT) {
			pmap_ledger_debit(
				pmap,
				task_ledgers.alternate_accounting_compressed,
				PAGE_SIZE);
		} else {
			/* was part of the footprint */
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
			    PAGE_SIZE);
		}
		/* marker will be cleared below */
	}

	/*
	 * if we have a previous managed page, lock the pv entry now. after
	 * we lock it, check to see if someone beat us to the lock and if so
	 * drop the lock
	 */
	if ((0 != old_pa) && IS_MANAGED_PAGE(pai)) {
		LOCK_PVH(pai);
		old_pa_locked = TRUE;
		old_pa = pte_to_pa(*pte);
		if (0 == old_pa) {
			UNLOCK_PVH(pai);        /* another path beat us to it */
			old_pa_locked = FALSE;
		}
	}

	/*
	 *	Special case if the incoming physical page is already mapped
	 *	at this address.
	 */
	if (old_pa == pa) {
		pt_entry_t old_attributes =
		    *pte & ~(PTE_REF(is_ept) | PTE_MOD(is_ept) | PTE_LOCK(is_ept));

		/*
		 *	May be changing its wired attribute or protection
		 */

		template = pa_to_pte(pa);

		if (__probable(!is_ept)) {
			template |= INTEL_PTE_VALID;
		} else {
			template |= INTEL_EPT_IPAT;
		}

		template |= pmap_get_cache_attributes(pa_index(pa), is_ept);

		/*
		 * We don't support passing VM_MEM_NOT_CACHEABLE flags for EPT PTEs
		 */
		if (!is_ept && (VM_MEM_NOT_CACHEABLE ==
		    (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)))) {
			if (!(flags & VM_MEM_GUARDED)) {
				template |= INTEL_PTE_PAT;
			}
			template |= INTEL_PTE_NCACHE;
		}
		if (pmap != kernel_pmap && !is_ept) {
			template |= INTEL_PTE_USER;
		}

		if (prot & VM_PROT_READ) {
			template |= PTE_READ(is_ept);
		}

		if (prot & VM_PROT_WRITE) {
			template |= PTE_WRITE(is_ept);
			if (is_ept && !pmap_ept_support_ad) {
				template |= PTE_MOD(is_ept);
				if (old_pa_locked) {
					assert(IS_MANAGED_PAGE(pai));
					pmap_phys_attributes[pai] |= PHYS_MODIFIED;
				}
			}
		}

		if (prot & VM_PROT_EXECUTE) {
			assert(set_NX == 0);
			template = pte_set_ex(template, is_ept);
		}

		if (__improbable(is_ept && (prot & VM_PROT_UEXEC))) {
			assert(set_NX == 0);
			template = pte_set_uex(template);
		}

		if (set_NX) {
			template = pte_remove_ex(template, is_ept);
		}

		if (wired) {
			template |= PTE_WIRED;
			if (!iswired(old_attributes)) {
				pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
			}
		} else {
			if (iswired(old_attributes)) {
				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
			}
		}

		if (superpage) {        /* this path can not be used */
			template |= PTE_PS;     /* to change the page size! */
		}
		if (old_attributes == template) {
			goto dont_update_pte;
		}

		/* Determine delta, PV locked */
		need_tlbflush =
		    ((old_attributes ^ template) != PTE_WIRED);

		/* Optimisation: avoid TLB flush when adding writability */
		if (need_tlbflush == TRUE && !(old_attributes & PTE_WRITE(is_ept))) {
			if ((old_attributes ^ template) == PTE_WRITE(is_ept)) {
				need_tlbflush = FALSE;
			}
		}

		/* For hardware that doesn't have EPT AD support, we always set REFMOD for EPT PTEs */
		if (__improbable(is_ept && !pmap_ept_support_ad)) {
			template |= PTE_REF(is_ept);
			if (old_pa_locked) {
				assert(IS_MANAGED_PAGE(pai));
				pmap_phys_attributes[pai] |= PHYS_REFERENCED;
			}
		}

		/* store modified PTE and preserve RC bits */
		pt_entry_t npte, opte;

		assert((*pte & PTE_LOCK(is_ept)) != 0);

		do {
			opte = *pte;
			npte = template | (opte & (PTE_REF(is_ept) |
			    PTE_MOD(is_ept))) | PTE_LOCK(is_ept);
		} while (!pmap_cmpx_pte(pte, opte, npte));

		DTRACE_VM3(set_pte, uint64_t, vaddr, uint64_t, opte, uint64_t, npte);

dont_update_pte:
		if (old_pa_locked) {
			UNLOCK_PVH(pai);
			old_pa_locked = FALSE;
		}
		goto done2;
	}

	/*
	 *	Outline of code from here:
	 *	   1) If va was mapped, update TLBs, remove the mapping
	 *	      and remove old pvlist entry.
	 *	   2) Add pvlist entry for new mapping
	 *	   3) Enter new mapping.
	 *
	 *	If the old physical page is not managed step 1) is skipped
	 *	(except for updating the TLBs), and the mapping is
	 *	overwritten at step 3).  If the new physical page is not
	 *	managed, step 2) is skipped.
	 */
	/* TODO: add opportunistic refmod collect */
	if (old_pa != (pmap_paddr_t) 0) {
		boolean_t       was_altacct = FALSE;

		/*
		 *	Don't do anything to pages outside valid memory here.
		 *	Instead convince the code that enters a new mapping
		 *	to overwrite the old one.
		 */

		/* invalidate the PTE */
		pmap_update_pte(is_ept, pte, PTE_VALID_MASK(is_ept), 0, true);
		/* propagate invalidate everywhere */
		PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
		/* remember reference and change */
		old_pte = *pte;
		oattr = (char) (old_pte & (PTE_MOD(is_ept) | PTE_REF(is_ept)));
		/* completely invalidate the PTE */
		pmap_store_pte(is_ept, pte, PTE_LOCK(is_ept));

		if (IS_MANAGED_PAGE(pai)) {
			/*
			 *	Remove the mapping from the pvlist for
			 *	this physical page.
			 *      We'll end up with either a rooted pv or a
			 *      hashed pv
			 */
			pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, &old_pte, &was_altacct);
		}

		if (IS_MANAGED_PAGE(pai)) {
			pmap_assert(old_pa_locked == TRUE);
			pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
			if (pmap != kernel_pmap) {
				/* update ledgers */
				if (was_altacct) {
					assert(IS_INTERNAL_PAGE(pai));
					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
				} else if (IS_REUSABLE_PAGE(pai)) {
					assert(!was_altacct);
					assert(IS_INTERNAL_PAGE(pai));
					pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
					/* was already not in phys_footprint */
				} else if (IS_INTERNAL_PAGE(pai)) {
					assert(!was_altacct);
					assert(!IS_REUSABLE_PAGE(pai));
					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
				} else {
					/* not an internal page */
					pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
				}
			}
			if (iswired(*pte)) {
				pmap_ledger_debit(pmap, task_ledgers.wired_mem,
				    PAGE_SIZE);
			}

			if (!is_ept) {
				pmap_phys_attributes[pai] |= oattr;
			} else {
				pmap_phys_attributes[pai] |= ept_refmod_to_physmap(oattr);
			}
		} else {
			/*
			 *	old_pa is not managed.
			 *	Do removal part of accounting.
			 */

			if (pmap != kernel_pmap) {
#if 00
				assert(pmap->stats.device > 0);
				OSAddAtomic(-1, &pmap->stats.device);
#endif
			}
			if (iswired(*pte)) {
				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
			}
		}
	}

	/*
	 * if we had a previously managed page locked, unlock it now
	 */
	if (old_pa_locked) {
		UNLOCK_PVH(pai);
		old_pa_locked = FALSE;
	}

	pai = pa_index(pa);     /* now working with new incoming phys page */
	if (IS_MANAGED_PAGE(pai)) {
		/*
		 *	Step 2) Enter the mapping in the PV list for this
		 *	physical page.
		 */
		pv_h = pai_to_pvh(pai);

		LOCK_PVH(pai);

		if (pv_h->pmap == PMAP_NULL) {
			/*
			 *	No mappings yet, use rooted pv
			 */
			pv_h->va_and_flags = vaddr;
			pv_h->pmap = pmap;
			queue_init(&pv_h->qlink);

			if (options & PMAP_OPTIONS_INTERNAL) {
				pmap_phys_attributes[pai] |= PHYS_INTERNAL;
			} else {
				pmap_phys_attributes[pai] &= ~PHYS_INTERNAL;
			}
			if (options & PMAP_OPTIONS_REUSABLE) {
				pmap_phys_attributes[pai] |= PHYS_REUSABLE;
			} else {
				pmap_phys_attributes[pai] &= ~PHYS_REUSABLE;
			}
			if ((options & PMAP_OPTIONS_ALT_ACCT) &&
			    IS_INTERNAL_PAGE(pai)) {
				pv_h->va_and_flags |= PVE_IS_ALTACCT;
				is_altacct = TRUE;
			} else {
				pv_h->va_and_flags &= ~PVE_IS_ALTACCT;
				is_altacct = FALSE;
			}
		} else {
			/*
			 *	Add new pv_hashed_entry after header.
			 */
			if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
				pvh_e = pvh_new;
				pvh_new = PV_HASHED_ENTRY_NULL;
			} else if (PV_HASHED_ENTRY_NULL == pvh_e) {
				PV_HASHED_ALLOC(&pvh_e);
				if (PV_HASHED_ENTRY_NULL == pvh_e) {
					/*
					 * the pv list is empty. if we are on
					 * the kernel pmap we'll use one of
					 * the special private kernel pv_e's,
					 * else, we need to unlock
					 * everything, zalloc a pv_e, and
					 * restart bringing in the pv_e with
					 * us.
					 */
					if (kernel_pmap == pmap) {
						PV_HASHED_KERN_ALLOC(&pvh_e);
					} else {
						UNLOCK_PVH(pai);
						PTE_LOCK_UNLOCK(pte);
						PMAP_UNLOCK_SHARED(pmap);
						pmap_pv_throttle(pmap);
						pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
						goto Retry;
					}
				}
			}

			if (PV_HASHED_ENTRY_NULL == pvh_e) {
				panic("Mapping alias chain exhaustion, possibly induced by numerous kernel virtual double mappings");
			}

			pvh_e->va_and_flags = vaddr;
			pvh_e->pmap = pmap;
			pvh_e->ppn = pn;
			if ((options & PMAP_OPTIONS_ALT_ACCT) &&
			    IS_INTERNAL_PAGE(pai)) {
				pvh_e->va_and_flags |= PVE_IS_ALTACCT;
				is_altacct = TRUE;
			} else {
				pvh_e->va_and_flags &= ~PVE_IS_ALTACCT;
				is_altacct = FALSE;
			}
			pv_hash_add(pvh_e, pv_h);

			/*
			 *	Remember that we used the pvlist entry.
			 */
			pvh_e = PV_HASHED_ENTRY_NULL;
		}

		/*
		 * only count the mapping
		 * for 'managed memory'
		 */
		pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
		if (pmap != kernel_pmap) {
			/* update ledgers */
			if (is_altacct) {
				/* internal but also alternate accounting */
				assert(IS_INTERNAL_PAGE(pai));
				pmap_ledger_credit(pmap, task_ledgers.internal, PAGE_SIZE);
				pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
				/* alternate accounting, so not in footprint */
			} else if (IS_REUSABLE_PAGE(pai)) {
				assert(!is_altacct);
				assert(IS_INTERNAL_PAGE(pai));
				pmap_ledger_credit(pmap, task_ledgers.reusable, PAGE_SIZE);
				/* internal but reusable: not in footprint */
			} else if (IS_INTERNAL_PAGE(pai)) {
				assert(!is_altacct);
				assert(!IS_REUSABLE_PAGE(pai));
				/* internal: add to footprint */
				pmap_ledger_credit(pmap, task_ledgers.internal, PAGE_SIZE);
				pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
			} else {
				/* not internal: not in footprint */
				pmap_ledger_credit(pmap, task_ledgers.external, PAGE_SIZE);
			}
		}
	} else if (last_managed_page == 0) {
		/* Account for early mappings created before "managed pages"
		 * are determined. Consider consulting the available DRAM map.
		 */
		pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
		if (pmap != kernel_pmap) {
#if 00
			OSAddAtomic(+1, &pmap->stats.device);
			PMAP_STATS_PEAK(pmap->stats.device);
#endif
		}
	}
	/*
	 * Step 3) Enter the mapping.
	 *
	 *	Build a template to speed up entering -
	 *	only the pfn changes.
	 */
	template = pa_to_pte(pa);

	if (!is_ept) {
		template |= INTEL_PTE_VALID;
	} else {
		template |= INTEL_EPT_IPAT;
	}

	/*
	 * DRK: It may be worth asserting on cache attribute flags that diverge
	 * from the existing physical page attributes.
	 */

	template |= pmap_get_cache_attributes(pa_index(pa), is_ept);

	/*
	 * We don't support passing VM_MEM_NOT_CACHEABLE flags for EPT PTEs
	 */
	if (!is_ept && (flags & VM_MEM_NOT_CACHEABLE)) {
		if (!(flags & VM_MEM_GUARDED)) {
			template |= INTEL_PTE_PAT;
		}
		template |= INTEL_PTE_NCACHE;
	}
	if (pmap != kernel_pmap && !is_ept) {
		template |= INTEL_PTE_USER;
	}
	if (prot & VM_PROT_READ) {
		template |= PTE_READ(is_ept);
	}
	if (prot & VM_PROT_WRITE) {
		template |= PTE_WRITE(is_ept);
		if (is_ept && !pmap_ept_support_ad) {
			template |= PTE_MOD(is_ept);
			if (IS_MANAGED_PAGE(pai)) {
				pmap_phys_attributes[pai] |= PHYS_MODIFIED;
			}
		}
	}
	if (prot & VM_PROT_EXECUTE) {
		assert(set_NX == 0);
		template = pte_set_ex(template, is_ept);
	}
	if (__improbable(is_ept && (prot & VM_PROT_UEXEC))) {
		assert(set_NX == 0);
		template = pte_set_uex(template);
	}

	if (set_NX) {
		template = pte_remove_ex(template, is_ept);
	}
	if (wired) {
		template |= INTEL_PTE_WIRED;
		pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
	}
	if (__improbable(superpage)) {
		template |= INTEL_PTE_PS;
	}

	/* For hardware that doesn't have EPT AD support, we always set REFMOD for EPT PTEs */
	if (__improbable(is_ept && !pmap_ept_support_ad)) {
		template |= PTE_REF(is_ept);
		if (IS_MANAGED_PAGE(pai)) {
			pmap_phys_attributes[pai] |= PHYS_REFERENCED;
		}
	}
	template |= PTE_LOCK(is_ept);
	pmap_store_pte(is_ept, pte, template);
	DTRACE_VM3(set_pte, uint64_t, vaddr, uint64_t, 0, uint64_t, template);

	/*
	 * if this was a managed page we delayed unlocking the pv until here
	 * to prevent pmap_page_protect et al from finding it until the pte
	 * has been stored
	 */
	if (IS_MANAGED_PAGE(pai)) {
		UNLOCK_PVH(pai);
	}
done2:
	if (need_tlbflush == TRUE) {
		if (options & PMAP_OPTIONS_NOFLUSH) {
			PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
		} else {
			PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
		}
	}
	if (ptelocked) {
		PTE_LOCK_UNLOCK(pte);
	}
	PMAP_UNLOCK_SHARED(pmap);

	if (pvh_e != PV_HASHED_ENTRY_NULL) {
		PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
	}
	if (pvh_new != PV_HASHED_ENTRY_NULL) {
		PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
	}

	if (delpage_pm_obj) {
		vm_page_t m;

		vm_object_lock(delpage_pm_obj);
		m = vm_page_lookup(delpage_pm_obj, (delpage_pde_index * PAGE_SIZE));
		if (m == VM_PAGE_NULL) {
			panic("pmap_enter: pte page not in object");
		}
		VM_PAGE_FREE(m);
		vm_object_unlock(delpage_pm_obj);
		OSAddAtomic(-1, &inuse_ptepages_count);
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
	}

	kr = KERN_SUCCESS;
done1:
	if (__improbable((kr == KERN_SUCCESS) && (pmap == kernel_pmap) &&
	    zone_spans_ro_va(vaddr, vaddr + PAGE_SIZE))) {
		pmap_page_protect((ppnum_t)atop_kernel(kvtophys(vaddr)), VM_PROT_READ);
	}
	PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
	return kr;
}

/*
 *	Remove a range of hardware page-table entries.
 *	The entries given are the first (inclusive)
 *	and last (exclusive) entries for the VM pages.
 *	The virtual address is the va for the first pte.
 *
 *	The pmap must be locked.
 *	If the pmap is not the kernel pmap, the range must lie
 *	entirely within one pte-page.  This is NOT checked.
 *	Assumes that the pte-page exists.
 */
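/*
 * Illustrative sketch (added commentary, not part of the original source):
 * removing 'n' page mappings starting at page-aligned 'va', all within one
 * pte-page ('va' and 'n' are hypothetical):
 *
 *	pt_entry_t *spte = pmap_pte(pmap, va);
 *	pmap_remove_range(pmap, va, spte, spte + n);
 */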

void
pmap_remove_range(
	pmap_t                  pmap,
	vm_map_offset_t         start_vaddr,
	pt_entry_t              *spte,
	pt_entry_t              *epte)
{
	pmap_remove_range_options(pmap, start_vaddr, spte, epte,
	    PMAP_OPTIONS_REMOVE);
}

static void
pmap_remove_range_options(
	pmap_t                  pmap,
	vm_map_offset_t         start_vaddr,
	pt_entry_t              *spte,
	pt_entry_t              *epte,
	int                     options)
{
	pt_entry_t              *cpte;
	pv_hashed_entry_t       pvh_et = PV_HASHED_ENTRY_NULL;
	pv_hashed_entry_t       pvh_eh = PV_HASHED_ENTRY_NULL;
	pv_hashed_entry_t       pvh_e;
	int                     pvh_cnt = 0;
	int                     num_removed, num_unwired, num_found, num_invalid;
	int                     ledgers_external, ledgers_reusable, ledgers_internal, ledgers_alt_internal;
	uint64_t                ledgers_compressed, ledgers_alt_compressed;
	ppnum_t                 pai;
	pmap_paddr_t            pa;
	vm_map_offset_t         vaddr;
	boolean_t               is_ept = is_ept_pmap(pmap);
	boolean_t               was_altacct;

	num_removed = 0;
	num_unwired = 0;
	num_found   = 0;
	num_invalid = 0;
	ledgers_external = 0;
	ledgers_reusable = 0;
	ledgers_internal = 0;
	ledgers_compressed = 0;
	ledgers_alt_internal = 0;
	ledgers_alt_compressed = 0;

	/* invalidate the PTEs first to "freeze" them */
	for (cpte = spte, vaddr = start_vaddr;
	    cpte < epte;
	    cpte++, vaddr += PAGE_SIZE_64) {
		pt_entry_t p = *cpte;

		pa = pte_to_pa(p);
		if (pa == 0) {
			if ((options & PMAP_OPTIONS_REMOVE) &&
			    (PTE_IS_COMPRESSED(p, cpte, pmap, vaddr))) {
				assert(pmap != kernel_pmap);
				/* one less "compressed"... */
				ledgers_compressed++;
				if (p & PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					ledgers_alt_compressed++;
				}
				/* clear marker(s) */
				/* XXX probably does not need to be atomic! */
				pmap_update_pte(is_ept, cpte, INTEL_PTE_COMPRESSED_MASK, 0, true);
			}
			continue;
		}
		num_found++;

		if (iswired(p)) {
			num_unwired++;
		}

		pai = pa_index(pa);

		if (!IS_MANAGED_PAGE(pai)) {
			/*
			 *	Outside range of managed physical memory.
			 *	Just remove the mappings.
			 */
			pmap_store_pte(is_ept, cpte, 0);
			continue;
		}

		if ((p & PTE_VALID_MASK(is_ept)) == 0) {
			num_invalid++;
		}

		/* invalidate the PTE */
		pmap_update_pte(is_ept, cpte, PTE_VALID_MASK(is_ept), 0, true);
	}

	if (num_found == 0) {
		/* nothing was changed: we're done */
		goto update_counts;
	}

	/* propagate the invalidates to other CPUs */

	PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr);

	for (cpte = spte, vaddr = start_vaddr;
	    cpte < epte;
	    cpte++, vaddr += PAGE_SIZE_64) {
		pa = pte_to_pa(*cpte);
		if (pa == 0) {
check_pte_for_compressed_marker:
			/*
			 * This PTE could have been replaced with a
			 * "compressed" marker after our first "freeze"
			 * loop above, so check again.
			 */
			if ((options & PMAP_OPTIONS_REMOVE) &&
			    (PTE_IS_COMPRESSED(*cpte, cpte, pmap, vaddr))) {
				assert(pmap != kernel_pmap);
				/* one less "compressed"... */
				ledgers_compressed++;
				if (*cpte & PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					ledgers_alt_compressed++;
				}
				pmap_store_pte(is_ept, cpte, 0);
			}
			continue;
		}

		pai = pa_index(pa);

		LOCK_PVH(pai);

		pa = pte_to_pa(*cpte);
		if (pa == 0) {
			UNLOCK_PVH(pai);
			goto check_pte_for_compressed_marker;
		}

		/*
		 * Remove the mapping from the pvlist for this physical page.
		 */
		pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, cpte, &was_altacct);

		num_removed++;
		/* update ledgers */
		if (was_altacct) {
			/* internal and alternate accounting */
			assert(IS_INTERNAL_PAGE(pai));
			ledgers_internal++;
			ledgers_alt_internal++;
		} else if (IS_REUSABLE_PAGE(pai)) {
			/* internal but reusable */
			assert(!was_altacct);
			assert(IS_INTERNAL_PAGE(pai));
			ledgers_reusable++;
		} else if (IS_INTERNAL_PAGE(pai)) {
			/* internal */
			assert(!was_altacct);
			assert(!IS_REUSABLE_PAGE(pai));
			ledgers_internal++;
		} else {
			/* not internal */
			ledgers_external++;
		}

		/*
		 * Get the modify and reference bits, then
		 * nuke the entry in the page table
		 */
		/* remember reference and change */
		if (!is_ept) {
			pmap_phys_attributes[pai] |=
			    *cpte & (PHYS_MODIFIED | PHYS_REFERENCED);
		} else {
			pmap_phys_attributes[pai] |=
			    ept_refmod_to_physmap((*cpte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
		}

		/* completely invalidate the PTE */
		pmap_store_pte(is_ept, cpte, 0);

		UNLOCK_PVH(pai);

		if (pvh_e != PV_HASHED_ENTRY_NULL) {
			pvh_e->qlink.next = (queue_entry_t) pvh_eh;
			pvh_eh = pvh_e;

			if (pvh_et == PV_HASHED_ENTRY_NULL) {
				pvh_et = pvh_e;
			}
			pvh_cnt++;
		}
		/* We can encounter at most 'num_found' PTEs for this level.
		 * Fewer may be encountered if some were replaced by
		 * compressed markers. No new valid PTEs can be created
		 * since the pmap lock is held exclusively.
		 */
		if (num_removed == num_found) {
			break;
		}
	} /* for loop */

	if (pvh_eh != PV_HASHED_ENTRY_NULL) {
		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
	}
update_counts:
	/*
	 *	Update the counts
	 */
#if TESTING
	if (pmap->stats.resident_count < num_removed) {
		panic("pmap_remove_range: resident_count");
	}
#endif
	if (num_removed) {
		pmap_ledger_debit(pmap, task_ledgers.phys_mem, machine_ptob(num_removed));
	}

	if (pmap != kernel_pmap) {
		if (ledgers_external) {
			pmap_ledger_debit(pmap,
			    task_ledgers.external,
			    machine_ptob(ledgers_external));
		}
		if (ledgers_reusable) {
			pmap_ledger_debit(pmap,
			    task_ledgers.reusable,
			    machine_ptob(ledgers_reusable));
		}
		if (ledgers_internal) {
			pmap_ledger_debit(pmap,
			    task_ledgers.internal,
			    machine_ptob(ledgers_internal));
		}
		if (ledgers_compressed) {
			pmap_ledger_debit(pmap,
			    task_ledgers.internal_compressed,
			    machine_ptob(ledgers_compressed));
		}
		if (ledgers_alt_internal) {
			pmap_ledger_debit(pmap,
			    task_ledgers.alternate_accounting,
			    machine_ptob(ledgers_alt_internal));
		}
		if (ledgers_alt_compressed) {
			pmap_ledger_debit(pmap,
			    task_ledgers.alternate_accounting_compressed,
			    machine_ptob(ledgers_alt_compressed));
		}

		uint64_t net_debit = (ledgers_internal - ledgers_alt_internal) + (ledgers_compressed - ledgers_alt_compressed);
		if (net_debit) {
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, machine_ptob(net_debit));
		}
	}

	if (num_unwired != 0) {
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, machine_ptob(num_unwired));
	}
	return;
}


/*
 *	Remove the given range of addresses
 *	from the specified map.
 *
 *	It is assumed that the start and end are properly
 *	rounded to the hardware page size.
 */
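/*
 * Example (added for clarity; the range is hypothetical): removing one
 * 2MiB-aligned region from the kernel pmap:
 *
 *	pmap_remove(kernel_pmap, va, va + NBPDE);
 */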
void
pmap_remove(
	pmap_t          map,
	addr64_t        s64,
	addr64_t        e64)
{
	pmap_remove_options(map, s64, e64, PMAP_OPTIONS_REMOVE);
}
#define PLCHECK_THRESHOLD (2)
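/*
 * Added note: once PLCHECK_THRESHOLD PDE-sized chunks have been traversed,
 * pmap_remove_options() starts comparing the TSC against a preemption-
 * latency deadline and briefly drops the pmap lock when it expires, so a
 * long removal does not hold the lock for an unbounded time.
 */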

void
pmap_remove_options(
	pmap_t          map,
	addr64_t        s64,
	addr64_t        e64,
	int             options)
{
	pt_entry_t     *pde;
	pt_entry_t     *spte, *epte;
	addr64_t        l64;
	uint64_t        deadline = 0;
	boolean_t       is_ept;

	pmap_intr_assert();

	if (map == PMAP_NULL || s64 == e64) {
		return;
	}

	is_ept = is_ept_pmap(map);

	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(map), VM_KERNEL_ADDRHIDE(s64),
	    VM_KERNEL_ADDRHIDE(e64));

	PMAP_LOCK_EXCLUSIVE(map);
	uint32_t traverse_count = 0;

	while (s64 < e64) {
		pml4_entry_t *pml4e = pmap64_pml4(map, s64);
		if ((pml4e == NULL) ||
		    ((*pml4e & PTE_VALID_MASK(is_ept)) == 0)) {
			if (os_add_overflow(s64, NBPML4, &s64)) {
				/* wrap; clip s64 to e64 */
				s64 = e64;
				break;
			}
			s64 &= ~(PML4MASK);
			continue;
		}
		pdpt_entry_t *pdpte = pmap64_pdpt(map, s64);
		if ((pdpte == NULL) ||
		    ((*pdpte & PTE_VALID_MASK(is_ept)) == 0)) {
			if (os_add_overflow(s64, NBPDPT, &s64)) {
				/* wrap; clip s64 to e64 */
				s64 = e64;
				break;
			}
			s64 &= ~(PDPTMASK);
			continue;
		}

		if (os_add_overflow(s64, PDE_MAPPED_SIZE, &l64)) {
			l64 = e64;
		} else {
			l64 &= ~(PDE_MAPPED_SIZE - 1);

			if (l64 > e64) {
				l64 = e64;
			}
		}

		pde = pmap_pde(map, s64);

		if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
			if (*pde & PTE_PS) {
				/*
				 * If we're removing a superpage, pmap_remove_range()
				 * must work on level 2 instead of level 1; and we're
				 * only passing a single level 2 entry instead of a
				 * level 1 range.
				 */
				spte = pde;
				epte = spte + 1; /* excluded */
			} else {
				spte = pmap_pte(map, (s64 & ~(PDE_MAPPED_SIZE - 1)));
				spte = &spte[ptenum(s64)];
				epte = &spte[intel_btop(l64 - s64)];
			}
			pmap_remove_range_options(map, s64, spte, epte,
			    options);
		}
		s64 = l64;

		if ((s64 < e64) && (traverse_count++ > PLCHECK_THRESHOLD)) {
			if (deadline == 0) {
				deadline = rdtsc64_nofence() + max_preemption_latency_tsc;
			} else {
				if (rdtsc64_nofence() > deadline) {
					PMAP_UNLOCK_EXCLUSIVE(map);
					__builtin_ia32_pause();
					PMAP_LOCK_EXCLUSIVE(map);
					deadline = rdtsc64_nofence() + max_preemption_latency_tsc;
				}
			}
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(map);

	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}

void
pmap_page_protect(
	ppnum_t         pn,
	vm_prot_t       prot)
{
	pmap_page_protect_options(pn, prot, 0, NULL);
}

/*
 *	Routine:	pmap_page_protect_options
 *
 *	Function:
 *		Lower the permission for all mappings to a given
 *		page.
 */
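/*
 * Illustrative calls (added commentary, not part of the original source):
 *
 *	pmap_page_protect(pn, VM_PROT_READ);	// write-protect all mappings
 *	pmap_page_protect(pn, VM_PROT_NONE);	// remove all mappings
 */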
void
pmap_page_protect_options(
	ppnum_t         pn,
	vm_prot_t       prot,
	unsigned int    options,
	void            *arg)
{
	pv_hashed_entry_t       pvh_eh = PV_HASHED_ENTRY_NULL;
	pv_hashed_entry_t       pvh_et = PV_HASHED_ENTRY_NULL;
	pv_hashed_entry_t       nexth;
	int                     pvh_cnt = 0;
	pv_rooted_entry_t       pv_h;
	pv_rooted_entry_t       pv_e;
	pv_hashed_entry_t       pvh_e;
	pt_entry_t              *pte;
	int                     pai;
	pmap_t                  pmap;
	boolean_t               remove;
	pt_entry_t              new_pte_value;
	boolean_t               is_ept;

	pmap_intr_assert();
	assert(pn != vm_page_fictitious_addr);
	if (pn == vm_page_guard_addr) {
		return;
	}

	pai = ppn_to_pai(pn);

	if (!IS_MANAGED_PAGE(pai)) {
		/*
		 *	Not a managed page.
		 */
		return;
	}

	PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, pn, prot);

	/*
	 * Determine the new protection.
	 */
	switch (prot) {
	case VM_PROT_READ:
	case VM_PROT_READ | VM_PROT_EXECUTE:
		remove = FALSE;
		break;
	case VM_PROT_ALL:
		return;         /* nothing to do */
	default:
		remove = TRUE;
		break;
	}

	pv_h = pai_to_pvh(pai);

	LOCK_PVH(pai);


	/*
	 * Walk down PV list, if any, changing or removing all mappings.
	 */
	if (pv_h->pmap == PMAP_NULL) {
		goto done;
	}

	pv_e = pv_h;
	pvh_e = (pv_hashed_entry_t) pv_e;       /* cheat */

	do {
		vm_map_offset_t vaddr;

		if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED) &&
		    (pmap_phys_attributes[pai] & PHYS_MODIFIED)) {
			/* page was modified, so it will be compressed */
			options &= ~PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
			options |= PMAP_OPTIONS_COMPRESSOR;
		}

		pmap = pv_e->pmap;
		is_ept = is_ept_pmap(pmap);
		vaddr = PVE_VA(pv_e);
		pte = pmap_pte(pmap, vaddr);

		pmap_assert2((pa_index(pte_to_pa(*pte)) == pn),
		    "pmap_page_protect: PTE mismatch, pn: 0x%x, pmap: %p, vaddr: 0x%llx, pte: 0x%llx", pn, pmap, vaddr, *pte);

		if (0 == pte) {
			panic("pmap_page_protect() "
			    "pmap=%p pn=0x%x vaddr=0x%llx\n",
			    pmap, pn, vaddr);
		}
		nexth = (pv_hashed_entry_t) queue_next(&pvh_e->qlink);

		/*
		 * Remove the mapping if new protection is NONE
		 */
		if (remove) {
			/* Remove per-pmap wired count */
			if (iswired(*pte)) {
				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
			}

			if (pmap != kernel_pmap &&
			    (options & PMAP_OPTIONS_COMPRESSOR) &&
			    IS_INTERNAL_PAGE(pai)) {
				assert(!PTE_IS_COMPRESSED(*pte, pte, pmap, vaddr));
				/* mark this PTE as having been "compressed" */
				new_pte_value = PTE_COMPRESSED;
				if (IS_ALTACCT_PAGE(pai, pv_e)) {
					new_pte_value |= PTE_COMPRESSED_ALT;
				}
			} else {
				new_pte_value = 0;
			}

			if (options & PMAP_OPTIONS_NOREFMOD) {
				pmap_store_pte(is_ept, pte, new_pte_value);

				if (options & PMAP_OPTIONS_NOFLUSH) {
					PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
				} else {
					PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
				}
			} else {
				/*
				 * Remove the mapping, collecting dirty bits.
				 */
				pmap_update_pte(is_ept, pte, PTE_VALID_MASK(is_ept), 0, true);

				PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
				if (!is_ept) {
					pmap_phys_attributes[pai] |=
					    *pte & (PHYS_MODIFIED | PHYS_REFERENCED);
				} else {
					pmap_phys_attributes[pai] |=
					    ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
				}
				if ((options &
				    PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED) &&
				    IS_INTERNAL_PAGE(pai) &&
				    (pmap_phys_attributes[pai] &
				    PHYS_MODIFIED)) {
					/*
					 * Page is actually "modified" and
					 * will be compressed.  Start
					 * accounting for it as "compressed".
					 */
					assert(!(options & PMAP_OPTIONS_COMPRESSOR));
					options &= ~PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
					options |= PMAP_OPTIONS_COMPRESSOR;
					assert(new_pte_value == 0);
					if (pmap != kernel_pmap) {
						new_pte_value = PTE_COMPRESSED;
						if (IS_ALTACCT_PAGE(pai, pv_e)) {
							new_pte_value |= PTE_COMPRESSED_ALT;
						}
					}
				}
				pmap_store_pte(is_ept, pte, new_pte_value);
			}

#if TESTING
			if (pmap->stats.resident_count < 1) {
				panic("pmap_page_protect: resident_count");
			}
#endif
			pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE);

			/*
			 * We only ever compress internal pages.
			 */
			if (options & PMAP_OPTIONS_COMPRESSOR) {
				assert(IS_INTERNAL_PAGE(pai));
			}
			if (pmap != kernel_pmap) {
				/* update ledgers */
				if (IS_ALTACCT_PAGE(pai, pv_e)) {
					assert(IS_INTERNAL_PAGE(pai));
					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, PAGE_SIZE);
					}
				} else if (IS_REUSABLE_PAGE(pai)) {
					assert(!IS_ALTACCT_PAGE(pai, pv_e));
					assert(IS_INTERNAL_PAGE(pai));
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
						/* was not in footprint, but is now */
						pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
					}
					pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
				} else if (IS_INTERNAL_PAGE(pai)) {
					assert(!IS_ALTACCT_PAGE(pai, pv_e));
					assert(!IS_REUSABLE_PAGE(pai));
					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
					/*
					 * Update all stats related to physical
					 * footprint, which only deals with
					 * internal pages.
					 */
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						/*
						 * This removal is only being
						 * done so we can send this page
						 * to the compressor;  therefore
						 * it mustn't affect total task
						 * footprint.
						 */
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
					} else {
						/*
						 * This internal page isn't
						 * going to the compressor,
1917 						 * so adjust stats to keep
1918 						 * phys_footprint up to date.
1919 						 */
1920 						pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
1921 					}
1922 				} else {
1923 					pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
1924 				}
1925 			}
1926 
1927 			/*
1928 			 * Deal with the pv_rooted_entry.
1929 			 */
1930 
1931 			if (pv_e == pv_h) {
1932 				/*
1933 				 * Fix up head later.
1934 				 */
1935 				pv_h->pmap = PMAP_NULL;
1936 			} else {
1937 				/*
1938 				 * Delete this entry.
1939 				 */
1940 				pv_hash_remove(pvh_e);
1941 				pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1942 				pvh_eh = pvh_e;
1943 
1944 				if (pvh_et == PV_HASHED_ENTRY_NULL) {
1945 					pvh_et = pvh_e;
1946 				}
1947 				pvh_cnt++;
1948 			}
1949 		} else {
1950 			/*
1951 			 * Write-protect, after an opportunistic ref/mod collection.
1952 			 */
1953 			if (!is_ept) {
1954 				pmap_phys_attributes[pai] |=
1955 				    *pte & (PHYS_MODIFIED | PHYS_REFERENCED);
1956 			} else {
1957 				pmap_phys_attributes[pai] |=
1958 				    ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
1959 			}
1960 
1961 			pmap_update_pte(is_ept, pte, PTE_WRITE(is_ept), 0, true);
1962 			if (options & PMAP_OPTIONS_NOFLUSH) {
1963 				PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
1964 			} else {
1965 				PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1966 			}
1967 		}
1968 		pvh_e = nexth;
1969 	} while ((pv_e = (pv_rooted_entry_t) nexth) != pv_h);
1970 
1971 
1972 	/*
1973 	 * If pv_head mapping was removed, fix it up.
1974 	 */
1975 	if (pv_h->pmap == PMAP_NULL) {
1976 		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
1977 
1978 		if (pvh_e != (pv_hashed_entry_t) pv_h) {
1979 			pv_hash_remove(pvh_e);
1980 			pv_h->pmap = pvh_e->pmap;
1981 			pv_h->va_and_flags = pvh_e->va_and_flags;
1982 			pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1983 			pvh_eh = pvh_e;
1984 
1985 			if (pvh_et == PV_HASHED_ENTRY_NULL) {
1986 				pvh_et = pvh_e;
1987 			}
1988 			pvh_cnt++;
1989 		}
1990 	}
1991 	if (pvh_eh != PV_HASHED_ENTRY_NULL) {
1992 		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
1993 	}
1994 done:
1995 	UNLOCK_PVH(pai);
1996 
1997 	PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
1998 }
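
/*
 * Usage sketch (illustrative, not part of this file): the VM layer
 * normally reaches this routine through the pmap_page_protect() wrapper.
 * VM_PROT_NONE takes the removal path above; any read-only protection
 * takes the write-protect path:
 *
 *	pmap_page_protect(pn, VM_PROT_READ);    // write-protect all mappings
 *	pmap_page_protect(pn, VM_PROT_NONE);    // remove all mappings
 */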
1999 
2000 
2001 /*
2002  *	Clear specified attribute bits.
2003  */
2004 void
2005 phys_attribute_clear(
2006 	ppnum_t         pn,
2007 	int             bits,
2008 	unsigned int    options,
2009 	void            *arg)
2010 {
2011 	pv_rooted_entry_t       pv_h;
2012 	pv_hashed_entry_t       pv_e;
2013 	pt_entry_t              *pte = NULL;
2014 	int                     pai;
2015 	pmap_t                  pmap;
2016 	char                    attributes = 0;
2017 	boolean_t               is_internal, is_reusable, is_altacct, is_ept;
2018 	int                     ept_bits_to_clear;
2019 	boolean_t               ept_keep_global_mod = FALSE;
2020 
2021 	if ((bits & PHYS_MODIFIED) &&
2022 	    (options & PMAP_OPTIONS_NOFLUSH) &&
2023 	    arg == NULL) {
2024 		panic("phys_attribute_clear(0x%x,0x%x,0x%x,%p): "
2025 		    "should not clear 'modified' without flushing TLBs\n",
2026 		    pn, bits, options, arg);
2027 	}
2028 
2029 	/* We only support converting MOD and REF bits for EPT PTEs in this function */
2030 	assert((bits & ~(PHYS_REFERENCED | PHYS_MODIFIED)) == 0);
2031 
2032 	ept_bits_to_clear = (unsigned)physmap_refmod_to_ept(bits & (PHYS_MODIFIED | PHYS_REFERENCED));
2033 
2034 	pmap_intr_assert();
2035 	assert(pn != vm_page_fictitious_addr);
2036 	if (pn == vm_page_guard_addr) {
2037 		return;
2038 	}
2039 
2040 	pai = ppn_to_pai(pn);
2041 
2042 	if (!IS_MANAGED_PAGE(pai)) {
2043 		/*
2044 		 *	Not a managed page.
2045 		 */
2046 		return;
2047 	}
2048 
2049 	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);
2050 
2051 	pv_h = pai_to_pvh(pai);
2052 
2053 	LOCK_PVH(pai);
2054 
2055 
2056 	/*
2057 	 * Walk down PV list, clearing all modify or reference bits.
2058 	 * We do not have to lock the pv_list itself because we hold
2059 	 * this page's PV lock (taken via LOCK_PVH above).
2060 	 */
2061 	if (pv_h->pmap != PMAP_NULL) {
2062 		/*
2063 		 * There are some mappings.
2064 		 */
2065 
2066 		is_internal = IS_INTERNAL_PAGE(pai);
2067 		is_reusable = IS_REUSABLE_PAGE(pai);
2068 
2069 		pv_e = (pv_hashed_entry_t)pv_h;
2070 
2071 		do {
2072 			vm_map_offset_t va;
2073 			char pte_bits;
2074 
2075 			pmap = pv_e->pmap;
2076 			is_ept = is_ept_pmap(pmap);
2077 			is_altacct = IS_ALTACCT_PAGE(pai, pv_e);
2078 			va = PVE_VA(pv_e);
2079 			pte_bits = 0;
2080 
2081 			if (bits) {
2082 				pte = pmap_pte(pmap, va);
2083 				/* grab ref/mod bits from this PTE */
2084 				pte_bits = (*pte & (PTE_REF(is_ept) | PTE_MOD(is_ept)));
2085 				/* propagate to page's global attributes */
2086 				if (!is_ept) {
2087 					attributes |= pte_bits;
2088 				} else {
2089 					attributes |= ept_refmod_to_physmap(pte_bits);
2090 					if (!pmap_ept_support_ad && (pte_bits & INTEL_EPT_MOD)) {
2091 						ept_keep_global_mod = TRUE;
2092 					}
2093 				}
2094 				/* which bits to clear for this PTE? */
2095 				if (!is_ept) {
2096 					pte_bits &= bits;
2097 				} else {
2098 					pte_bits &= ept_bits_to_clear;
2099 				}
2100 			}
2101 			if (options & PMAP_OPTIONS_CLEAR_WRITE) {
2102 				pte_bits |= PTE_WRITE(is_ept);
2103 			}
2104 
2105 			/*
2106 			 * Clear modify and/or reference bits.
2107 			 */
2108 			if (pte_bits) {
2109 				pmap_update_pte(is_ept, pte, pte_bits, 0, true);
2110 
2111 				/* Ensure all processors using this translation
2112 				 * invalidate this TLB entry. The invalidation
2113 				 * *must* follow the PTE update, to ensure that
2114 				 * the TLB shadow of the 'D' bit (in particular)
2115 				 * is synchronized with the updated PTE.
2116 				 */
2117 				if (!(options & PMAP_OPTIONS_NOFLUSH)) {
2118 					/* flush TLBS now */
2119 					PMAP_UPDATE_TLBS(pmap,
2120 					    va,
2121 					    va + PAGE_SIZE);
2122 				} else if (arg) {
2123 					/* delayed TLB flush: add "pmap" info */
2124 					PMAP_UPDATE_TLBS_DELAYED(
2125 						pmap,
2126 						va,
2127 						va + PAGE_SIZE,
2128 						(pmap_flush_context *)arg);
2129 				} else {
2130 					/* no TLB flushing at all */
2131 				}
2132 			}
2133 
2134 			/* update pmap "reusable" stats */
2135 			if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
2136 			    is_reusable &&
2137 			    pmap != kernel_pmap) {
2138 				/* one less "reusable" */
2139 				pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
2140 				if (is_internal) {
2141 					/* one more "internal" */
2142 					if (is_altacct) {
2143 						/* no impact on ledgers */
2144 					} else {
2145 						pmap_ledger_credit(pmap,
2146 						    task_ledgers.internal,
2147 						    PAGE_SIZE);
2148 						pmap_ledger_credit(
2149 							pmap,
2150 							task_ledgers.phys_footprint,
2151 							PAGE_SIZE);
2152 					}
2153 				} else {
2154 					/* one more "external" */
2155 					pmap_ledger_credit(pmap, task_ledgers.external, PAGE_SIZE);
2156 				}
2157 			} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
2158 			    !is_reusable &&
2159 			    pmap != kernel_pmap) {
2160 				/* one more "reusable" */
2161 				pmap_ledger_credit(pmap, task_ledgers.reusable, PAGE_SIZE);
2162 				if (is_internal) {
2163 					/* one less "internal" */
2164 					if (is_altacct) {
2165 						/* no impact on footprint */
2166 					} else {
2167 						pmap_ledger_debit(pmap,
2168 						    task_ledgers.internal,
2169 						    PAGE_SIZE);
2170 						pmap_ledger_debit(
2171 							pmap,
2172 							task_ledgers.phys_footprint,
2173 							PAGE_SIZE);
2174 					}
2175 				} else {
2176 					/* one less "external" */
2177 					pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
2178 				}
2179 			}
2180 
2181 			pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
2182 		} while (pv_e != (pv_hashed_entry_t)pv_h);
2183 	}
2184 	/* Opportunistic refmod collection, annulled
2185 	 * if both REF and MOD are being cleared.
2186 	 */
2187 
2188 	pmap_phys_attributes[pai] |= attributes;
2189 
2190 	if (ept_keep_global_mod) {
2191 		/*
2192 		 * If the hardware doesn't support AD bits for EPT PTEs and someone is
2193 		 * requesting that we clear the modified bit for a phys page, we need
2194 		 * to ensure that there are no EPT mappings for the page with the
2195 		 * modified bit set. If there are, we cannot clear the global modified bit.
2196 		 */
2197 		bits &= ~PHYS_MODIFIED;
2198 	}
2199 	pmap_phys_attributes[pai] &= ~(bits);
2200 
2201 	/* update this page's "reusable" status */
2202 	if (options & PMAP_OPTIONS_CLEAR_REUSABLE) {
2203 		pmap_phys_attributes[pai] &= ~PHYS_REUSABLE;
2204 	} else if (options & PMAP_OPTIONS_SET_REUSABLE) {
2205 		pmap_phys_attributes[pai] |= PHYS_REUSABLE;
2206 	}
2207 
2208 	UNLOCK_PVH(pai);
2209 
2210 	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
2211 }
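
/*
 * Illustrative sketch: the pmap layer's thin attribute wrappers are
 * expected to funnel into phys_attribute_clear(); e.g. clearing the
 * modified bit with no flush context looks roughly like:
 *
 *	void
 *	pmap_clear_modify(ppnum_t pn)
 *	{
 *		phys_attribute_clear(pn, PHYS_MODIFIED, 0, NULL);
 *	}
 */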
2212 
2213 /*
2214  *	Check specified attribute bits.
2215  */
2216 int
2217 phys_attribute_test(
2218 	ppnum_t         pn,
2219 	int             bits)
2220 {
2221 	pv_rooted_entry_t       pv_h;
2222 	pv_hashed_entry_t       pv_e;
2223 	pt_entry_t              *pte;
2224 	int                     pai;
2225 	pmap_t                  pmap;
2226 	int                     attributes = 0;
2227 	boolean_t               is_ept;
2228 
2229 	pmap_intr_assert();
2230 	assert(pn != vm_page_fictitious_addr);
2231 	assert((bits & ~(PHYS_MODIFIED | PHYS_REFERENCED)) == 0);
2232 	if (pn == vm_page_guard_addr) {
2233 		return 0;
2234 	}
2235 
2236 	pai = ppn_to_pai(pn);
2237 
2238 	if (!IS_MANAGED_PAGE(pai)) {
2239 		/*
2240 		 *	Not a managed page.
2241 		 */
2242 		return 0;
2243 	}
2244 
2245 	/*
2246 	 * Fast check: if the requested bits have already been
2247 	 * collected, there is no need to take any locks.
2248 	 * If they are not all set, recheck after taking the
2249 	 * lock, in case the bits were pulled in while we
2250 	 * were waiting for it.
2251 	 */
2252 	if ((pmap_phys_attributes[pai] & bits) == bits) {
2253 		return bits;
2254 	}
2255 
2256 	pv_h = pai_to_pvh(pai);
2257 
2258 	LOCK_PVH(pai);
2259 
2260 	attributes = pmap_phys_attributes[pai] & bits;
2261 
2262 
2263 	/*
2264 	 * Walk down PV list, checking the mappings until we
2265 	 * reach the end or we've found the desired attributes.
2266 	 */
2267 	if (attributes != bits &&
2268 	    pv_h->pmap != PMAP_NULL) {
2269 		/*
2270 		 * There are some mappings.
2271 		 */
2272 		pv_e = (pv_hashed_entry_t)pv_h;
2273 		do {
2274 			vm_map_offset_t va;
2275 
2276 			pmap = pv_e->pmap;
2277 			is_ept = is_ept_pmap(pmap);
2278 			va = PVE_VA(pv_e);
2279 			/*
2280 			 * pick up modify and/or reference bits from mapping
2281 			 */
2282 
2283 			pte = pmap_pte(pmap, va);
2284 			if (!is_ept) {
2285 				attributes |= (int)(*pte & bits);
2286 			} else {
2287 				attributes |= (int)(ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED));
2288 			}
2289 
2290 			pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
2291 		} while ((attributes != bits) &&
2292 		    (pv_e != (pv_hashed_entry_t)pv_h));
2293 	}
2294 	pmap_phys_attributes[pai] |= attributes;
2295 
2296 	UNLOCK_PVH(pai);
2297 	return attributes;
2298 }
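
/*
 * Illustrative sketch: boolean predicates can be layered directly on
 * phys_attribute_test(); e.g. a pmap_is_modified()-style query:
 *
 *	boolean_t
 *	pmap_is_modified(ppnum_t pn)
 *	{
 *		return phys_attribute_test(pn, PHYS_MODIFIED) ? TRUE : FALSE;
 *	}
 */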
2299 
2300 /*
2301  *	Routine:	pmap_change_wiring
2302  *	Function:	Change the wiring attribute for a map/virtual-address
2303  *			pair.
2304  *	In/out conditions:
2305  *			The mapping must already exist in the pmap.
2306  */
2307 void
2308 pmap_change_wiring(
2309 	pmap_t          map,
2310 	vm_map_offset_t vaddr,
2311 	boolean_t       wired)
2312 {
2313 	pt_entry_t      *pte;
2314 
2315 	PMAP_LOCK_SHARED(map);
2316 
2317 	if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL) {
2318 		panic("pmap_change_wiring(%p,0x%llx,%d): pte missing",
2319 		    map, vaddr, wired);
2320 	}
2321 
2322 	if (wired && !iswired(*pte)) {
2323 		/*
2324 		 * wiring down mapping
2325 		 */
2326 		pmap_ledger_credit(map, task_ledgers.wired_mem, PAGE_SIZE);
2327 		pmap_update_pte(is_ept_pmap(map), pte, 0, PTE_WIRED, false);
2328 	} else if (!wired && iswired(*pte)) {
2329 		/*
2330 		 * unwiring mapping
2331 		 */
2332 		pmap_ledger_debit(map, task_ledgers.wired_mem, PAGE_SIZE);
2333 		pmap_update_pte(is_ept_pmap(map), pte, PTE_WIRED, 0, false);
2334 	}
2335 
2336 	PMAP_UNLOCK_SHARED(map);
2337 }
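
/*
 * Usage sketch (illustrative): a caller wiring a mapping down and later
 * releasing it simply toggles the attribute; the wired_mem ledger is
 * credited and debited accordingly:
 *
 *	pmap_change_wiring(map->pmap, vaddr, TRUE);     // wire
 *	...
 *	pmap_change_wiring(map->pmap, vaddr, FALSE);    // unwire
 */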
2338 
2339 /*
2340  *	"Backdoor" direct map routine for early mappings.
2341  *      Useful for mapping memory outside the range of managed physical
2342  *      memory (e.g. device registers). Sets the A, D and NC bits if requested.
2343  */
2344 
2345 vm_offset_t
2346 pmap_map_bd(
2347 	vm_offset_t     virt,
2348 	vm_map_offset_t start_addr,
2349 	vm_map_offset_t end_addr,
2350 	vm_prot_t       prot,
2351 	unsigned int    flags)
2352 {
2353 	pt_entry_t      template;
2354 	pt_entry_t      *ptep;
2355 
2356 	vm_offset_t     base = virt;
2357 	boolean_t       doflush = FALSE;
2358 
2359 	template = pa_to_pte(start_addr)
2360 	    | INTEL_PTE_REF
2361 	    | INTEL_PTE_MOD
2362 	    | INTEL_PTE_WIRED
2363 	    | INTEL_PTE_VALID;
2364 
2365 	if ((flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)) == VM_MEM_NOT_CACHEABLE) {
2366 		template |= INTEL_PTE_NCACHE;
2367 		if (!(flags & (VM_MEM_GUARDED))) {
2368 			template |= INTEL_PTE_PAT;
2369 		}
2370 	}
2371 
2372 	if ((prot & VM_PROT_EXECUTE) == 0) {
2373 		template |= INTEL_PTE_NX;
2374 	}
2375 
2376 	if (prot & VM_PROT_WRITE) {
2377 		template |= INTEL_PTE_WRITE;
2378 	}
2379 	vm_map_offset_t caddr = start_addr;
2380 	while (caddr < end_addr) {
2381 		ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)virt);
2382 		if (ptep == PT_ENTRY_NULL) {
2383 			panic("pmap_map_bd: Invalid kernel address");
2384 		}
2385 		if (pte_to_pa(*ptep)) {
2386 			doflush = TRUE;
2387 		}
2388 		pmap_store_pte(FALSE, ptep, template);
2389 		pte_increment_pa(template);
2390 		virt += PAGE_SIZE;
2391 		caddr += PAGE_SIZE;
2392 	}
2393 	if (doflush) {
2394 		pmap_tlbi_range(0, ~0ULL, true, 0);
2395 		PMAP_UPDATE_TLBS(kernel_pmap, base, base + end_addr - start_addr);
2396 	}
2397 	return virt;
2398 }
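
/*
 * Usage sketch (illustrative; regs_base and regs_size are placeholders):
 * early-boot code mapping a device register window uncached advances its
 * virtual cursor with the return value:
 *
 *	vaddr = pmap_map_bd(vaddr, regs_base, regs_base + regs_size,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_MEM_NOT_CACHEABLE);
 */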
2399 
2400 /* Create a virtual alias beginning at 'ava' of the specified kernel virtual
2401  * range. The aliased pagetable range is expanded if
2402  * PMAP_EXPAND_OPTIONS_ALIASMAP is specified. Performs no synchronization,
2403  * assumes caller has stabilized the source and destination ranges. Currently
2404  * used to populate sections of the trampoline "doublemap" at CPU startup.
2405  */
2406 
2407 void
2408 pmap_alias(
2409 	vm_offset_t     ava,
2410 	vm_map_offset_t start_addr,
2411 	vm_map_offset_t end_addr,
2412 	vm_prot_t       prot,
2413 	unsigned int    eoptions)
2414 {
2415 	pt_entry_t      prot_template, template;
2416 	pt_entry_t      *aptep, *sptep;
2417 
2418 	prot_template =  INTEL_PTE_REF | INTEL_PTE_MOD | INTEL_PTE_WIRED | INTEL_PTE_VALID;
2419 	if ((prot & VM_PROT_EXECUTE) == 0) {
2420 		prot_template |= INTEL_PTE_NX;
2421 	}
2422 
2423 	if (prot & VM_PROT_WRITE) {
2424 		prot_template |= INTEL_PTE_WRITE;
2425 	}
2426 	assert(((start_addr | end_addr) & PAGE_MASK) == 0);
2427 	while (start_addr < end_addr) {
2428 		aptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ava);
2429 		if (aptep == PT_ENTRY_NULL) {
2430 			if (eoptions & PMAP_EXPAND_OPTIONS_ALIASMAP) {
2431 				pmap_expand(kernel_pmap, ava, PMAP_EXPAND_OPTIONS_ALIASMAP);
2432 				aptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ava);
2433 			} else {
2434 				panic("pmap_alias: Invalid alias address");
2435 			}
2436 		}
2437 		/* The aliased range should not have any active mappings */
2438 		assert(pte_to_pa(*aptep) == 0);
2439 
2440 		sptep = pmap_pte(kernel_pmap, start_addr);
2441 		assert(sptep != PT_ENTRY_NULL && (pte_to_pa(*sptep) != 0));
2442 		template = pa_to_pte(pte_to_pa(*sptep)) | prot_template;
2443 		pmap_store_pte(FALSE, aptep, template);
2444 
2445 		ava += PAGE_SIZE;
2446 		start_addr += PAGE_SIZE;
2447 	}
2448 }
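
/*
 * Usage sketch (illustrative; trampoline_va is a placeholder): CPU startup
 * code aliasing a stabilized kernel range into the doublemap might call:
 *
 *	pmap_alias(trampoline_va, kva_start, kva_end,
 *	    VM_PROT_READ | VM_PROT_EXECUTE, PMAP_EXPAND_OPTIONS_ALIASMAP);
 */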
2449 
2450 mach_vm_size_t
2451 pmap_query_resident(
2452 	pmap_t          pmap,
2453 	addr64_t        s64,
2454 	addr64_t        e64,
2455 	mach_vm_size_t  *compressed_bytes_p)
2456 {
2457 	pt_entry_t     *pde;
2458 	pt_entry_t     *spte, *epte;
2459 	addr64_t        l64;
2460 	uint64_t        deadline = 0;
2461 	mach_vm_size_t  resident_bytes;
2462 	mach_vm_size_t  compressed_bytes;
2463 	boolean_t       is_ept;
2464 
2465 	pmap_intr_assert();
2466 
2467 	if (pmap == PMAP_NULL || pmap == kernel_pmap || s64 == e64) {
2468 		if (compressed_bytes_p) {
2469 			*compressed_bytes_p = 0;
2470 		}
2471 		return 0;
2472 	}
2473 
2474 	is_ept = is_ept_pmap(pmap);
2475 
2476 	PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
2477 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(s64),
2478 	    VM_KERNEL_ADDRHIDE(e64));
2479 
2480 	resident_bytes = 0;
2481 	compressed_bytes = 0;
2482 
2483 	PMAP_LOCK_EXCLUSIVE(pmap);
2484 	uint32_t traverse_count = 0;
2485 
2486 	while (s64 < e64) {
2487 		if (os_add_overflow(s64, PDE_MAPPED_SIZE, &l64)) {
2488 			l64 = e64;
2489 		} else {
2490 			l64 &= ~(PDE_MAPPED_SIZE - 1);
2491 
2492 			if (l64 > e64) {
2493 				l64 = e64;
2494 			}
2495 		}
2496 
2497 		pde = pmap_pde(pmap, s64);
2498 
2499 		if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
2500 			if (*pde & PTE_PS) {
2501 				/* superpage: not supported */
2502 			} else {
2503 				spte = pmap_pte(pmap,
2504 				    (s64 & ~(PDE_MAPPED_SIZE - 1)));
2505 				spte = &spte[ptenum(s64)];
2506 				epte = &spte[intel_btop(l64 - s64)];
2507 
2508 				for (; spte < epte; spte++) {
2509 					if (pte_to_pa(*spte) != 0) {
2510 						resident_bytes += PAGE_SIZE;
2511 					} else if (*spte & PTE_COMPRESSED) {
2512 						compressed_bytes += PAGE_SIZE;
2513 					}
2514 				}
2515 			}
2516 		}
2517 		s64 = l64;
2518 
2519 		if ((s64 < e64) && (traverse_count++ > PLCHECK_THRESHOLD)) {
2520 			if (deadline == 0) {
2521 				deadline = rdtsc64() + max_preemption_latency_tsc;
2522 			} else {
2523 				if (rdtsc64() > deadline) {
2524 					PMAP_UNLOCK_EXCLUSIVE(pmap);
2525 					__builtin_ia32_pause();
2526 					PMAP_LOCK_EXCLUSIVE(pmap);
2527 					deadline = rdtsc64() + max_preemption_latency_tsc;
2528 				}
2529 			}
2530 		}
2531 	}
2532 
2533 	PMAP_UNLOCK_EXCLUSIVE(pmap);
2534 
2535 	PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
2536 	    resident_bytes);
2537 
2538 	if (compressed_bytes_p) {
2539 		*compressed_bytes_p = compressed_bytes;
2540 	}
2541 	return resident_bytes;
2542 }
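
/*
 * Usage sketch (illustrative): callers wanting both tallies pass a
 * compressed-bytes out-parameter; NULL skips the compressed count:
 *
 *	mach_vm_size_t compressed = 0;
 *	mach_vm_size_t resident = pmap_query_resident(map->pmap,
 *	    start, end, &compressed);
 */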
2543 
2544 uint64_t pmap_query_page_info_retries;
2545 
2546 kern_return_t
2547 pmap_query_page_info(
2548 	pmap_t          pmap,
2549 	vm_map_offset_t va,
2550 	int             *disp_p)
2551 {
2552 	int             disp;
2553 	boolean_t       is_ept;
2554 	pmap_paddr_t    pa;
2555 	ppnum_t         pai;
2556 	pd_entry_t      *pde_p;
2557 	pt_entry_t      *pte_p, pte;
2558 
2559 	pmap_intr_assert();
2560 	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
2561 		*disp_p = 0;
2562 		return KERN_INVALID_ARGUMENT;
2563 	}
2564 
2565 	disp = 0;
2566 	is_ept = is_ept_pmap(pmap);
2567 
2568 	PMAP_LOCK_EXCLUSIVE(pmap);
2569 
2570 	pde_p = pmap_pde(pmap, va);
2571 	if (!pde_p ||
2572 	    !(*pde_p & PTE_VALID_MASK(is_ept)) ||
2573 	    (*pde_p & PTE_PS)) {
2574 		goto done;
2575 	}
2576 
2577 try_again:
2578 	disp = 0;
2579 
2580 	pte_p = pmap_pte(pmap, va);
2581 	if (pte_p == PT_ENTRY_NULL) {
2582 		goto done;
2583 	}
2584 
2585 	pte = *pte_p;
2586 	pa = pte_to_pa(pte);
2587 	if (pa == 0) {
2588 		if (PTE_IS_COMPRESSED(pte, pte_p, pmap, va)) {
2589 			disp |= PMAP_QUERY_PAGE_COMPRESSED;
2590 			if (pte & PTE_COMPRESSED_ALT) {
2591 				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
2592 			}
2593 		}
2594 	} else {
2595 		disp |= PMAP_QUERY_PAGE_PRESENT;
2596 		pai = pa_index(pa);
2597 		if (!IS_MANAGED_PAGE(pai)) {    /* not managed: no disposition bits to add */
2598 		} else if (pmap_pv_is_altacct(pmap, va, pai)) {
2599 			assert(IS_INTERNAL_PAGE(pai));
2600 			disp |= PMAP_QUERY_PAGE_INTERNAL;
2601 			disp |= PMAP_QUERY_PAGE_ALTACCT;
2602 		} else if (IS_REUSABLE_PAGE(pai)) {
2603 			disp |= PMAP_QUERY_PAGE_REUSABLE;
2604 		} else if (IS_INTERNAL_PAGE(pai)) {
2605 			disp |= PMAP_QUERY_PAGE_INTERNAL;
2606 		}
2607 	}
2608 	if (__improbable(pte_p != pmap_pte(pmap, va) || pte != *pte_p)) {
2609 		/* something changed: try again */
2610 		pmap_query_page_info_retries++;
2611 		goto try_again;
2612 	}
2613 done:
2614 	PMAP_UNLOCK_EXCLUSIVE(pmap);
2615 	*disp_p = disp;
2616 	return KERN_SUCCESS;
2617 }
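
/*
 * Usage sketch (illustrative): decoding the disposition bits returned
 * through disp_p:
 *
 *	int disp;
 *	if (pmap_query_page_info(map->pmap, va, &disp) == KERN_SUCCESS) {
 *		if (disp & PMAP_QUERY_PAGE_PRESENT) {
 *			// resident; INTERNAL/REUSABLE/ALTACCT may also be set
 *		} else if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
 *			// paged out to the compressor
 *		}
 *	}
 */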
2618 
2619 void
2620 pmap_set_vm_map_cs_enforced(
2621 	pmap_t pmap,
2622 	bool new_value)
2623 {
2624 	PMAP_LOCK_EXCLUSIVE(pmap);
2625 	pmap->pm_vm_map_cs_enforced = new_value;
2626 	PMAP_UNLOCK_EXCLUSIVE(pmap);
2627 }
2628 extern int cs_process_enforcement_enable;
2629 bool
2630 pmap_get_vm_map_cs_enforced(
2631 	pmap_t pmap)
2632 {
2633 	if (cs_process_enforcement_enable) {
2634 		return true;
2635 	}
2636 	return pmap->pm_vm_map_cs_enforced;
2637 }
2638 
2639 void
2640 pmap_set_jit_entitled(__unused pmap_t pmap)
2641 {
2642 	/* The x86 pmap layer does not care if a map has a JIT entry. */
2643 	return;
2644 }
2645 
2646 bool
2647 pmap_get_jit_entitled(__unused pmap_t pmap)
2648 {
2649 	/* The x86 pmap layer does not care if a map is using JIT. */
2650 	return false;
2651 }
2652 
2653 void
2654 pmap_set_tpro(__unused pmap_t pmap)
2655 {
2656 	/* The x86 pmap layer does not care if a map is using TPRO */
2657 	return;
2658 }
2659 
2660 bool
2661 pmap_get_tpro(__unused pmap_t pmap)
2662 {
2663 	/* The x86 pmap layer does not care if a map is using TPRO */
2664 	return false;
2665 }
2666 
2667 bool
2668 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
2669 {
2670 	/*
2671 	 * The x86 pmap layer does not apply any policy to any protection
2672 	 * types.
2673 	 */
2674 	return false;
2675 }
2676 
2677 uint64_t
2678 pmap_release_pages_fast(void)
2679 {
2680 	return 0;
2681 }
2682 
2683 void
2684 pmap_trim(__unused pmap_t grand, __unused pmap_t subord, __unused addr64_t vstart, __unused uint64_t size)
2685 {
2686 	return;
2687 }
2688 
2689 __dead2
2690 void
2691 pmap_ledger_verify_size(size_t size)
2692 {
2693 	panic("%s: unsupported, "
2694 	    "size=%lu",
2695 	    __func__, size);
2696 }
2697 
2698 __dead2
2699 ledger_t
2700 pmap_ledger_alloc(void)
2701 {
2702 	panic("%s: unsupported",
2703 	    __func__);
2704 }
2705 
2706 __dead2
2707 void
2708 pmap_ledger_free(ledger_t ledger)
2709 {
2710 	panic("%s: unsupported, "
2711 	    "ledger=%p",
2712 	    __func__, ledger);
2713 }
2714 
2715 kern_return_t
2716 pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
2717     unsigned int level_mask __unused, size_t *bytes_copied __unused)
2718 {
2719 	return KERN_NOT_SUPPORTED;
2720 }
2721 
2722 void *
2723 pmap_map_compressor_page(ppnum_t pn)
2724 {
2725 	assertf(IS_MANAGED_PAGE(ppn_to_pai(pn)), "%s called on non-managed page 0x%08x", __func__, pn);
2726 	return PHYSMAP_PTOV((uint64_t)pn << (uint64_t)PAGE_SHIFT);
2727 }
2728 
2729 void
2730 pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
2731 {
2732 }
2733 
2734 bool
2735 pmap_clear_refmod_range_options(
2736 	pmap_t pmap __unused,
2737 	vm_map_address_t start __unused,
2738 	vm_map_address_t end __unused,
2739 	unsigned int mask __unused,
2740 	unsigned int options __unused)
2741 {
2742 	/*
2743 	 * x86 doesn't have ranged tlbi instructions, and we already have
2744 	 * the pmap_flush_context. This operation isn't implemented.
2745 	 */
2746 	return false;
2747 }
2748 
2749 bool
2750 pmap_supported_feature(pmap_t pmap, pmap_feature_flags_t feat)
2751 {
2752 	switch (feat) {
2753 	case PMAP_FEAT_UEXEC:
2754 		return pmap != NULL && is_ept_pmap(pmap);
2755 	default:
2756 		return false;
2757 	}
2758 }
2759