xref: /xnu-12377.81.4/osfmk/i386/pmap_x86_common.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796)
1 /*
2  * Copyright (c) 2000-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <mach_assert.h>
30 
31 #include <vm/pmap.h>
32 #include <vm/vm_map_xnu.h>
33 #include <vm/vm_kern_xnu.h>
34 #include <vm/vm_page_internal.h>
35 #include <kern/ledger.h>
36 #include <kern/zalloc_internal.h>
37 #include <i386/pmap_internal.h>
38 
39 void            pmap_remove_range(
40 	pmap_t          pmap,
41 	vm_map_offset_t va,
42 	pt_entry_t      *spte,
43 	pt_entry_t      *epte);
44 
45 static void            pmap_remove_range_options(
46 	pmap_t          pmap,
47 	vm_map_offset_t va,
48 	pt_entry_t      *spte,
49 	pt_entry_t      *epte,
50 	int             options);
51 
52 void            pmap_reusable_range(
53 	pmap_t          pmap,
54 	vm_map_offset_t va,
55 	pt_entry_t      *spte,
56 	pt_entry_t      *epte,
57 	boolean_t       reusable);
58 
59 pt_entry_t *PTE_corrupted_ptr;
60 
61 #if DEVELOPMENT || DEBUG
62 int pmap_inject_pte_corruption;
63 uint32_t pmap_update_clear_pte_count;
64 uint32_t pmap_update_invalid_pte_count;
65 #endif
66 
67 /*
68  * The Intel platform can nest at the PDE level, i.e. NBPDE (2MB) at a time,
69  * on an NBPDE boundary.
70  */
71 
72 uint64_t
73 pmap_shared_region_size_min(__unused pmap_t pmap)
74 {
75 	return NBPDE;
76 }
77 
78 uint64_t
79 pmap_commpage_size_min(__unused pmap_t pmap)
80 {
81 	return NBPDE;
82 }
83 
84 void
85 pmap_set_shared_region(
86 	pmap_t grand __unused,
87 	pmap_t subord __unused,
88 	addr64_t vstart __unused,
89 	uint64_t size __unused)
90 {
91 }
92 
93 kern_return_t
94 pmap_fork_nest(
95 	pmap_t old_pmap __unused,
96 	pmap_t new_pmap __unused)
97 {
98 	return KERN_SUCCESS;
99 }
100 
101 /*
102  *	kern_return_t pmap_nest(grand, subord, va_start, size)
103  *
104  *	grand  = the pmap that we will nest subord into
105  *	subord = the pmap that goes into the grand
106  *	va_start  = start of range in pmap to be inserted
107  *	size   = Size of nest area (up to 16TB)
108  *
109  *	Inserts a pmap into another.  This is used to implement shared segments.
110  *
111  *	Note that we depend upon higher level VM locks to ensure that things don't change while
112  *	we are doing this.  For example, VM should not be doing any pmap enters while it is nesting,
113  *	nor performing two nests at once.
114  */
115 
116 /*
117  * This routine can nest subtrees either at the PDPT level (1GiB) or at the
118  * PDE level (2MiB). We currently disallow disparate offsets for the "subord"
119  * container and the "grand" parent. A minor optimization to consider for the
120  * future: make the "subord" truly a container rather than a full-fledged
121  * pagetable hierarchy which can be unnecessarily sparse (DRK).
122  */
123 
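/*
 * Illustrative sketch only (not from this file): a VM-layer caller would
 * nest a shared region roughly as
 *
 *     kr = pmap_nest(task_pmap, shared_region_pmap, base, size);
 *
 * where "task_pmap", "shared_region_pmap", "base" and "size" are placeholder
 * names; the base and size must be aligned to pmap_shared_region_size_min()
 * and the size is capped at 16TB, as checked below.
 */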
124 kern_return_t
125 pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, uint64_t size)
126 {
127 	vm_map_offset_t vaddr;
128 	pd_entry_t      *pde, *npde;
129 	unsigned int    i;
130 	uint64_t        num_pde;
131 
132 	assert(!is_ept_pmap(grand));
133 	assert(!is_ept_pmap(subord));
134 
135 	if ((size & (pmap_shared_region_size_min(grand) - 1)) ||
136 	    (va_start & (pmap_shared_region_size_min(grand) - 1)) ||
137 	    ((size >> 28) > 65536)) {   /* Max size we can nest is 16TB */
138 		return KERN_INVALID_VALUE;
139 	}
140 
141 	if (size == 0) {
142 		panic("pmap_nest: size is invalid - %016llX", size);
143 	}
144 
145 	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
146 	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
147 	    VM_KERNEL_ADDRHIDE(va_start));
148 
149 	vaddr = (vm_map_offset_t)va_start;
150 	num_pde = size >> PDESHIFT;
151 
152 	PMAP_LOCK_EXCLUSIVE(subord);
153 
154 	subord->pm_shared = TRUE;
155 
156 	for (i = 0; i < num_pde;) {
157 		if (((vaddr & PDPTMASK) == 0) && (num_pde - i) >= NPDEPG) {
158 			npde = pmap64_pdpt(subord, vaddr);
159 
160 			while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
161 				PMAP_UNLOCK_EXCLUSIVE(subord);
162 				pmap_expand_pdpt(subord, vaddr, PMAP_EXPAND_OPTIONS_NONE);
163 				PMAP_LOCK_EXCLUSIVE(subord);
164 				npde = pmap64_pdpt(subord, vaddr);
165 			}
166 			*npde |= INTEL_PDPTE_NESTED;
167 			vaddr += NBPDPT;
168 			i += (uint32_t)NPDEPG;
169 		} else {
170 			npde = pmap_pde(subord, vaddr);
171 
172 			while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
173 				PMAP_UNLOCK_EXCLUSIVE(subord);
174 				pmap_expand(subord, vaddr, PMAP_EXPAND_OPTIONS_NONE);
175 				PMAP_LOCK_EXCLUSIVE(subord);
176 				npde = pmap_pde(subord, vaddr);
177 			}
178 			vaddr += NBPDE;
179 			i++;
180 		}
181 	}
182 
183 	PMAP_UNLOCK_EXCLUSIVE(subord);
184 
185 	vaddr = (vm_map_offset_t)va_start;
186 
187 	PMAP_LOCK_EXCLUSIVE(grand);
188 
189 	for (i = 0; i < num_pde;) {
190 		pd_entry_t tpde;
191 
192 		if (((vaddr & PDPTMASK) == 0) && ((num_pde - i) >= NPDEPG)) {
193 			npde = pmap64_pdpt(subord, vaddr);
194 			if (npde == 0) {
195 				panic("pmap_nest: no PDPT, subord %p nstart 0x%llx", subord, vaddr);
196 			}
197 			tpde = *npde;
198 			pde = pmap64_pdpt(grand, vaddr);
199 			if (0 == pde) {
200 				PMAP_UNLOCK_EXCLUSIVE(grand);
201 				pmap_expand_pml4(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
202 				PMAP_LOCK_EXCLUSIVE(grand);
203 				pde = pmap64_pdpt(grand, vaddr);
204 			}
205 			if (pde == 0) {
206 				panic("pmap_nest: no PDPT, grand  %p vaddr 0x%llx", grand, vaddr);
207 			}
208 			pmap_store_pte(FALSE, pde, tpde);
209 			vaddr += NBPDPT;
210 			i += (uint32_t) NPDEPG;
211 		} else {
212 			npde = pmap_pde(subord, vaddr);
213 			if (npde == 0) {
214 				panic("pmap_nest: no npde, subord %p vaddr 0x%llx", subord, vaddr);
215 			}
216 			tpde = *npde;
217 			pde = pmap_pde(grand, vaddr);
218 			if (0 == pde) {
219 				PMAP_UNLOCK_EXCLUSIVE(grand);
220 				pmap_expand_pdpt(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
221 				PMAP_LOCK_EXCLUSIVE(grand);
222 				pde = pmap_pde(grand, vaddr);
223 			}
224 
225 			if (pde == 0) {
226 				panic("pmap_nest: no pde, grand  %p vaddr 0x%llx", grand, vaddr);
227 			}
228 			vaddr += NBPDE;
229 			pmap_store_pte(FALSE, pde, tpde);
230 			i++;
231 		}
232 	}
233 
234 	PMAP_UNLOCK_EXCLUSIVE(grand);
235 
236 	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, KERN_SUCCESS);
237 
238 	return KERN_SUCCESS;
239 }
240 
241 /*
242  *	kern_return_t pmap_unnest(grand, vaddr)
243  *
244  *	grand  = the pmap that we will un-nest subord from
245  *	vaddr  = start of range in pmap to be unnested
246  *
247  *	Removes a pmap from another.  This is used to implement shared segments.
248  */
249 
250 kern_return_t
251 pmap_unnest(pmap_t grand, addr64_t vaddr, uint64_t size)
252 {
253 	pd_entry_t *pde;
254 	unsigned int i;
255 	uint64_t num_pde;
256 	addr64_t va_start, va_end;
257 	uint64_t npdpt = PMAP_INVALID_PDPTNUM;
258 
259 	PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
260 	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
261 
262 	if ((size & (pmap_shared_region_size_min(grand) - 1)) ||
263 	    (vaddr & (pmap_shared_region_size_min(grand) - 1))) {
264 		panic("pmap_unnest(%p,0x%llx,0x%llx): unaligned...",
265 		    grand, vaddr, size);
266 	}
267 
268 	assert(!is_ept_pmap(grand));
269 
270 	/* align everything to PDE boundaries */
271 	va_start = vaddr & ~(NBPDE - 1);
272 
273 	if (os_add_overflow(vaddr, size + NBPDE - 1, &va_end)) {
274 		panic("pmap_unnest: Overflow when calculating range end: s=0x%llx sz=0x%llx\n", vaddr, size);
275 	}
276 
277 	va_end &= ~(NBPDE - 1);
278 	size = va_end - va_start;
279 
280 	PMAP_LOCK_EXCLUSIVE(grand);
281 
282 	num_pde = size >> PDESHIFT;
283 	vaddr = va_start;
284 
285 	for (i = 0; i < num_pde;) {
286 		if (pdptnum(grand, vaddr) != npdpt) {
287 			npdpt = pdptnum(grand, vaddr);
288 			pde = pmap64_pdpt(grand, vaddr);
289 			if (pde && (*pde & INTEL_PDPTE_NESTED)) {
290 				pmap_store_pte(FALSE, pde, (pd_entry_t)0);
291 				i += (uint32_t) NPDEPG;
292 				vaddr += NBPDPT;
293 				continue;
294 			}
295 		}
296 		pde = pmap_pde(grand, (vm_map_offset_t)vaddr);
297 		if (pde == 0) {
298 			panic("pmap_unnest: no pde, grand %p vaddr 0x%llx", grand, vaddr);
299 		}
300 		pmap_store_pte(FALSE, pde, (pd_entry_t)0);
301 		i++;
302 		vaddr += NBPDE;
303 	}
304 
305 	PMAP_UPDATE_TLBS(grand, va_start, va_end);
306 
307 	PMAP_UNLOCK_EXCLUSIVE(grand);
308 
309 	PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
310 
311 	return KERN_SUCCESS;
312 }
313 
314 kern_return_t
315 pmap_unnest_options(
316 	pmap_t grand,
317 	addr64_t vaddr,
318 	__unused uint64_t size,
319 	__unused unsigned int options)
320 {
321 	return pmap_unnest(grand, vaddr, size);
322 }
323 
324 /* Invoked by the Mach VM to determine the platform-specific unnest region */
325 
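/*
 * If either end of the [*s, *e) range lands in a PDPT entry marked
 * INTEL_PDPTE_NESTED, the start is rounded down to an NBPDPT (1GiB)
 * boundary and/or the end is pushed up past the next NBPDPT boundary,
 * and TRUE is returned so the caller unnests whole nested entries.
 */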
326 boolean_t
327 pmap_adjust_unnest_parameters(pmap_t p, vm_map_offset_t *s, vm_map_offset_t *e)
328 {
329 	pd_entry_t *pdpte;
330 	boolean_t rval = FALSE;
331 
332 	PMAP_LOCK_EXCLUSIVE(p);
333 
334 	pdpte = pmap64_pdpt(p, *s);
335 	if (pdpte && (*pdpte & INTEL_PDPTE_NESTED)) {
336 		*s &= ~(NBPDPT - 1);
337 		rval = TRUE;
338 	}
339 
340 	pdpte = pmap64_pdpt(p, *e);
341 	if (pdpte && (*pdpte & INTEL_PDPTE_NESTED)) {
342 		*e = ((*e + NBPDPT) & ~(NBPDPT - 1));
343 		rval = TRUE;
344 	}
345 
346 	PMAP_UNLOCK_EXCLUSIVE(p);
347 
348 	return rval;
349 }
350 
351 pmap_paddr_t
352 pmap_find_pa(pmap_t pmap, addr64_t va)
353 {
354 	pt_entry_t      *ptp;
355 	pd_entry_t      *pdep;
356 	pd_entry_t      pde;
357 	pt_entry_t      pte;
358 	boolean_t       is_ept, locked = FALSE;
359 	pmap_paddr_t    pa = 0;
360 
361 	is_ept = is_ept_pmap(pmap);
362 
363 	if ((pmap != kernel_pmap) && not_in_kdp) {
364 		PMAP_LOCK_EXCLUSIVE(pmap);
365 		locked = TRUE;
366 	} else {
367 		mp_disable_preemption();
368 	}
369 
370 	if (os_ref_get_count(&pmap->ref_count) == 0) {
371 		goto pfp_exit;
372 	}
373 
374 	pdep = pmap_pde(pmap, va);
375 
376 	if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & PTE_VALID_MASK(is_ept))) {
377 		if (pde & PTE_PS) {
378 			pa = pte_to_pa(pde) + (va & I386_LPGMASK);
379 		} else {
380 			ptp = pmap_pte(pmap, va);
381 			if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & PTE_VALID_MASK(is_ept)) != 0)) {
382 				pa = pte_to_pa(pte) + (va & PAGE_MASK);
383 			}
384 		}
385 	}
386 pfp_exit:
387 	if (locked) {
388 		PMAP_UNLOCK_EXCLUSIVE(pmap);
389 	} else {
390 		mp_enable_preemption();
391 	}
392 
393 	return pa;
394 }
395 
396 /*
397  * pmap_find_phys returns the (4K) physical page number containing a
398  * given virtual address in a given pmap.
399  * Note that pmap_pte may return a pde if this virtual address is
400  * mapped by a large page; this is taken into account so that the
401  * correct page number is returned in that case.
402  */
403 ppnum_t
404 pmap_find_phys(pmap_t pmap, addr64_t va)
405 {
406 	ppnum_t         ppn = 0;
407 	pmap_paddr_t    pa = 0;
408 
409 	pa = pmap_find_pa(pmap, va);
410 	ppn = (ppnum_t) i386_btop(pa);
411 
412 	return ppn;
413 }
414 
415 ppnum_t
416 pmap_find_phys_nofault(pmap_t pmap, addr64_t va)
417 {
418 	if ((pmap == kernel_pmap) ||
419 	    ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map)))) {
420 		return pmap_find_phys(pmap, va);
421 	}
422 	return 0;
423 }
424 
425 /*
426  *  pmap_get_prot returns the equivalent VM page protections
427  *  set on a given address, 'va'. This function is used in the
428  *  ml_static_verify_page_protections() routine which is used
429  *  by the kext loading code to validate that the TEXT segment
430  *  of a kext is mapped executable.
431  */
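/*
 * Illustrative usage (placeholder variable names):
 *
 *     vm_prot_t prot;
 *     if (pmap_get_prot(kernel_pmap, vaddr, &prot) == KERN_SUCCESS &&
 *         (prot & VM_PROT_EXECUTE)) {
 *             ... the mapping at vaddr is executable ...
 *     }
 */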
432 kern_return_t
433 pmap_get_prot(pmap_t pmap, addr64_t va, vm_prot_t *protp)
434 {
435 	pt_entry_t      *ptp;
436 	pd_entry_t      *pdep;
437 	pd_entry_t      pde;
438 	pt_entry_t      pte;
439 	boolean_t       is_ept, locked = FALSE;
440 	kern_return_t   retval = KERN_FAILURE;
441 	vm_prot_t       prot = 0;
442 
443 	is_ept = is_ept_pmap(pmap);
444 
445 	if ((pmap != kernel_pmap) && not_in_kdp) {
446 		PMAP_LOCK_EXCLUSIVE(pmap);
447 		locked = TRUE;
448 	} else {
449 		mp_disable_preemption();
450 	}
451 
452 	if (os_ref_get_count(&pmap->ref_count) == 0) {
453 		goto pfp_exit;
454 	}
455 
456 	pdep = pmap_pde(pmap, va);
457 
458 	if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & PTE_VALID_MASK(is_ept))) {
459 		if (pde & PTE_PS) {
460 			prot = VM_PROT_READ;
461 
462 			if (pde & PTE_WRITE(is_ept)) {
463 				prot |= VM_PROT_WRITE;
464 			}
465 			if (PTE_IS_EXECUTABLE(is_ept, pde)) {
466 				prot |= VM_PROT_EXECUTE;
467 			}
468 			retval = KERN_SUCCESS;
469 		} else {
470 			ptp = pmap_pte(pmap, va);
471 			if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & PTE_VALID_MASK(is_ept)) != 0)) {
472 				prot = VM_PROT_READ;
473 
474 				if (pte & PTE_WRITE(is_ept)) {
475 					prot |= VM_PROT_WRITE;
476 				}
477 				if (PTE_IS_EXECUTABLE(is_ept, pte)) {
478 					prot |= VM_PROT_EXECUTE;
479 				}
480 				retval = KERN_SUCCESS;
481 			}
482 		}
483 	}
484 
485 pfp_exit:
486 	if (locked) {
487 		PMAP_UNLOCK_EXCLUSIVE(pmap);
488 	} else {
489 		mp_enable_preemption();
490 	}
491 
492 	if (protp) {
493 		*protp = prot;
494 	}
495 
496 	return retval;
497 }
498 
499 /*
500  * Update cache attributes for all extant managed mappings.
501  * Assumes PV for this page is locked, and that the page
502  * is managed. We assume that this physical page may be mapped in
503  * both EPT and normal Intel PTEs, so we convert the attributes
504  * to the corresponding format for each pmap.
505  *
506  * We assert that the passed set of attributes is a subset of the
507  * PHYS_CACHEABILITY_MASK.
508  */
509 void
510 pmap_update_cache_attributes_locked(ppnum_t pn, unsigned attributes)
511 {
512 	pv_rooted_entry_t       pv_h, pv_e;
513 	pv_hashed_entry_t       pvh_e, nexth;
514 	vm_map_offset_t vaddr;
515 	pmap_t  pmap;
516 	pt_entry_t      *ptep;
517 	boolean_t       is_ept;
518 	unsigned        ept_attributes;
519 
520 	assert(IS_MANAGED_PAGE(pn));
521 	assert(((~PHYS_CACHEABILITY_MASK) & attributes) == 0);
522 
523 	/* We don't support the PAT bit for EPT PTEs */
524 	if (attributes & INTEL_PTE_NCACHE) {
525 		ept_attributes = INTEL_EPT_NCACHE;
526 	} else {
527 		ept_attributes = INTEL_EPT_WB;
528 	}
529 
530 	pv_h = pai_to_pvh(pn);
531 	/* TODO: translate the PHYS_* bits to PTE bits; while they're
532 	 * currently identical, they may not remain so.
533 	 * Potential optimization (here and in page_protect),
534 	 * parallel shootdowns, check for redundant
535 	 * attribute modifications.
536 	 */
537 
538 	/*
539 	 * Alter attributes on all mappings
540 	 */
541 	if (pv_h->pmap != PMAP_NULL) {
542 		pv_e = pv_h;
543 		pvh_e = (pv_hashed_entry_t)pv_e;
544 
545 		do {
546 			pmap = pv_e->pmap;
547 			vaddr = PVE_VA(pv_e);
548 			ptep = pmap_pte(pmap, vaddr);
549 
550 			if (0 == ptep) {
551 				panic("pmap_update_cache_attributes_locked: Missing PTE, pmap: %p, pn: 0x%x vaddr: 0x%llx kernel_pmap: %p", pmap, pn, vaddr, kernel_pmap);
552 			}
553 
554 			is_ept = is_ept_pmap(pmap);
555 
556 			nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink);
557 			if (!is_ept) {
558 				pmap_update_pte(is_ept, ptep, PHYS_CACHEABILITY_MASK, attributes, true);
559 			} else {
560 				pmap_update_pte(is_ept, ptep, INTEL_EPT_CACHE_MASK, ept_attributes, true);
561 			}
562 			PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
563 			pvh_e = nexth;
564 		} while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h);
565 	}
566 }
567 
568 void
569 x86_filter_TLB_coherency_interrupts(boolean_t dofilter)
570 {
571 	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
572 
573 	if (dofilter) {
574 		CPU_CR3_MARK_INACTIVE();
575 	} else {
576 		CPU_CR3_MARK_ACTIVE();
577 		mfence();
578 		pmap_update_interrupt();
579 	}
580 }
581 
582 
583 /*
584  *	Insert the given physical page (p) at
585  *	the specified virtual address (v) in the
586  *	target physical map with the protection requested.
587  *
588  *	If specified, the page will be wired down, meaning
589  *	that the related pte cannot be reclaimed.
590  *
591  *	NB:  This is the only routine which MAY NOT lazy-evaluate
592  *	or lose information.  That is, this routine must actually
593  *	insert this page into the given map NOW.
594  */
595 
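/*
 * Illustrative call (placeholder arguments; "mapping_type" stands in for
 * whatever pmap_mapping_type_t value the caller selects):
 *
 *     kr = pmap_enter(pmap, vaddr, pn, VM_PROT_READ | VM_PROT_WRITE,
 *                     VM_PROT_NONE, 0, FALSE, mapping_type);
 */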
596 kern_return_t
597 pmap_enter(
598 	pmap_t          pmap,
599 	vm_map_offset_t         vaddr,
600 	ppnum_t                 pn,
601 	vm_prot_t               prot,
602 	vm_prot_t               fault_type,
603 	unsigned int            flags,
604 	boolean_t               wired,
605 	pmap_mapping_type_t     mapping_type)
606 {
607 	return pmap_enter_options(pmap, vaddr, pn, prot, fault_type, flags, wired, PMAP_EXPAND_OPTIONS_NONE, NULL, mapping_type);
608 }
609 
610 #define PTE_LOCK(EPT) INTEL_PTE_SWLOCK
611 
612 static inline void PTE_LOCK_LOCK(pt_entry_t *);
613 static inline void PTE_LOCK_UNLOCK(pt_entry_t *);
614 
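/*
 * PTE_LOCK_LOCK/PTE_LOCK_UNLOCK implement a per-PTE spinlock on the
 * INTEL_PTE_SWLOCK software bit: the lock path spins with relaxed loads
 * and PAUSE until the bit reads clear, then tries an acquire
 * compare-and-swap to set it; the unlock path clears the bit with a
 * release atomic AND.
 */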
615 void
616 PTE_LOCK_LOCK(pt_entry_t *lpte)
617 {
618 	pt_entry_t pte;
619 plretry:
620 	while ((pte = __c11_atomic_load((_Atomic pt_entry_t *)lpte, memory_order_relaxed)) & PTE_LOCK(0)) {
621 		__builtin_ia32_pause();
622 	}
623 	if (__c11_atomic_compare_exchange_strong((_Atomic pt_entry_t *)lpte, &pte, pte | PTE_LOCK(0), memory_order_acquire_smp, TRUE)) {
624 		return;
625 	}
626 
627 	goto plretry;
628 }
629 
630 void
631 PTE_LOCK_UNLOCK(pt_entry_t *lpte)
632 {
633 	__c11_atomic_fetch_and((_Atomic pt_entry_t *)lpte, ~PTE_LOCK(0), memory_order_release_smp);
634 }
635 
636 kern_return_t
637 pmap_enter_options_addr(
638 	pmap_t pmap,
639 	vm_map_address_t v,
640 	pmap_paddr_t pa,
641 	vm_prot_t prot,
642 	vm_prot_t fault_type,
643 	unsigned int flags,
644 	boolean_t wired,
645 	unsigned int options,
646 	__unused void   *arg,
647 	pmap_mapping_type_t mapping_type)
648 {
649 	return pmap_enter_options(pmap, v, intel_btop(pa), prot, fault_type, flags, wired, options, arg, mapping_type);
650 }
651 
652 kern_return_t
653 pmap_enter_options(
654 	pmap_t          pmap,
655 	vm_map_offset_t         vaddr,
656 	ppnum_t                 pn,
657 	vm_prot_t               prot,
658 	__unused vm_prot_t      fault_type,
659 	unsigned int            flags,
660 	boolean_t               wired,
661 	unsigned int            options,
662 	void                    *arg,
663 	__unused pmap_mapping_type_t mapping_type)
664 {
665 	pt_entry_t              *pte = NULL;
666 	pv_rooted_entry_t       pv_h;
667 	ppnum_t                 pai;
668 	pv_hashed_entry_t       pvh_e;
669 	pv_hashed_entry_t       pvh_new;
670 	pt_entry_t              template;
671 	pmap_paddr_t            old_pa;
672 	pmap_paddr_t            pa = (pmap_paddr_t) i386_ptob(pn);
673 	boolean_t               need_tlbflush = FALSE;
674 	boolean_t               set_NX;
675 	char                    oattr;
676 	boolean_t               old_pa_locked;
677 	/* 2MiB mappings are confined to x86_64 by VM */
678 	boolean_t               superpage = flags & VM_MEM_SUPERPAGE;
679 	vm_object_t             delpage_pm_obj = NULL;
680 	uint64_t                delpage_pde_index = 0;
681 	pt_entry_t              old_pte;
682 	kern_return_t           kr = KERN_FAILURE;
683 	boolean_t               is_ept;
684 	boolean_t               is_altacct;
685 	boolean_t               ptelocked = FALSE;
686 
687 	pmap_intr_assert();
688 
689 	if (__improbable(pmap == PMAP_NULL)) {
690 		return KERN_INVALID_ARGUMENT;
691 	}
692 	if (__improbable(pn == vm_page_guard_addr)) {
693 		return KERN_INVALID_ARGUMENT;
694 	}
695 
696 	is_ept = is_ept_pmap(pmap);
697 
698 	/* N.B. We can be supplied a zero page frame in the NOENTER case; it's an
699 	 * unused value for that scenario.
700 	 */
701 	assert(pn != vm_page_fictitious_addr);
702 
703 
704 	PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
705 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(vaddr), pn,
706 	    prot);
707 
708 	if ((prot & VM_PROT_EXECUTE) || __improbable(is_ept && (prot & VM_PROT_UEXEC))) {
709 		set_NX = FALSE;
710 	} else {
711 		set_NX = TRUE;
712 	}
713 
714 #if DEVELOPMENT || DEBUG
715 	if (__improbable(set_NX && (!nx_enabled || !pmap->nx_enabled))) {
716 		set_NX = FALSE;
717 	}
718 
719 	if (__improbable(set_NX && (pmap == kernel_pmap) &&
720 	    ((pmap_disable_kstack_nx && (flags & VM_MEM_STACK)) ||
721 	    (pmap_disable_kheap_nx && !(flags & VM_MEM_STACK))))) {
722 		set_NX = FALSE;
723 	}
724 #endif
725 
726 	pvh_new = PV_HASHED_ENTRY_NULL;
727 Retry:
728 	pvh_e = PV_HASHED_ENTRY_NULL;
729 
730 	PMAP_LOCK_SHARED(pmap);
731 
732 	/*
733 	 *	Expand pmap to include this pte.  Assume that
734 	 *	pmap is always expanded to include enough hardware
735 	 *	pages to map one VM page.
736 	 */
737 	if (__improbable(superpage)) {
738 		while ((pte = pmap_pde(pmap, vaddr)) == PD_ENTRY_NULL) {
739 			/* need room for another pde entry */
740 			PMAP_UNLOCK_SHARED(pmap);
741 			kr = pmap_expand_pdpt(pmap, vaddr, options);
742 			if (kr != KERN_SUCCESS) {
743 				goto done1;
744 			}
745 			PMAP_LOCK_SHARED(pmap);
746 		}
747 	} else {
748 		while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
749 			/*
750 			 * Must unlock to expand the pmap;
751 			 * we're going to grow pde-level page(s)
752 			 */
753 			PMAP_UNLOCK_SHARED(pmap);
754 			kr = pmap_expand(pmap, vaddr, options);
755 			if (kr != KERN_SUCCESS) {
756 				goto done1;
757 			}
758 			PMAP_LOCK_SHARED(pmap);
759 		}
760 	}
761 
762 	if (__improbable(options & PMAP_EXPAND_OPTIONS_NOENTER)) {
763 		PMAP_UNLOCK_SHARED(pmap);
764 		kr = KERN_SUCCESS;
765 		goto done1;
766 	}
767 
768 	if (__improbable(superpage && *pte && !(*pte & PTE_PS))) {
769 		/*
770 		 * There is still an empty page table mapped that
771 		 * was used for a previous base page mapping.
772 		 * Remember the PDE and the PDE index, so that we
773 		 * can free the page at the end of this function.
774 		 */
775 		delpage_pde_index = pdeidx(pmap, vaddr);
776 		delpage_pm_obj = pmap->pm_obj;
777 		pmap_store_pte(is_ept, pte, 0);
778 	}
779 
780 	PTE_LOCK_LOCK(pte);
781 	ptelocked = TRUE;
782 
783 	old_pa = pte_to_pa(*pte);
784 	pai = pa_index(old_pa);
785 	old_pa_locked = FALSE;
786 
787 	if (old_pa == 0 &&
788 	    PTE_IS_COMPRESSED(*pte, pte, pmap, vaddr)) {
789 		/*
790 		 * "pmap" should be locked at this point, so this should
791 		 * not race with another pmap_enter() or pmap_remove_range().
792 		 */
793 		assert(pmap != kernel_pmap);
794 
795 		/* one less "compressed" */
796 		pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
797 		    PAGE_SIZE);
798 		if (*pte & PTE_COMPRESSED_ALT) {
799 			pmap_ledger_debit(
800 				pmap,
801 				task_ledgers.alternate_accounting_compressed,
802 				PAGE_SIZE);
803 		} else {
804 			/* was part of the footprint */
805 			pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
806 			    PAGE_SIZE);
807 		}
808 		/* marker will be cleared below */
809 	}
810 
811 	/*
812 	 * if we have a previous managed page, lock the pv entry now. after
813 	 * we lock it, check to see if someone beat us to the lock and if so
814 	 * drop the lock
815 	 */
816 	if ((0 != old_pa) && IS_MANAGED_PAGE(pai)) {
817 		LOCK_PVH(pai);
818 		old_pa_locked = TRUE;
819 		old_pa = pte_to_pa(*pte);
820 		if (0 == old_pa) {
821 			UNLOCK_PVH(pai);        /* another path beat us to it */
822 			old_pa_locked = FALSE;
823 		}
824 	}
825 
826 	/*
827 	 *	Special case if the incoming physical page is already mapped
828 	 *	at this address.
829 	 */
830 	if (old_pa == pa) {
831 		pt_entry_t old_attributes =
832 		    *pte & ~(PTE_REF(is_ept) | PTE_MOD(is_ept) | PTE_LOCK(is_ept));
833 
834 		/*
835 		 *	May be changing its wired attribute or protection
836 		 */
837 
838 		template = pa_to_pte(pa);
839 
840 		if (__probable(!is_ept)) {
841 			template |= INTEL_PTE_VALID;
842 		} else {
843 			template |= INTEL_EPT_IPAT;
844 		}
845 
846 		template |= pmap_get_cache_attributes(pa_index(pa), is_ept);
847 
848 		/*
849 		 * We don't support passing VM_MEM_NOT_CACHEABLE flags for EPT PTEs
850 		 */
851 		if (!is_ept && (VM_MEM_NOT_CACHEABLE ==
852 		    (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)))) {
853 			if (!(flags & VM_MEM_GUARDED)) {
854 				template |= INTEL_PTE_PAT;
855 			}
856 			template |= INTEL_PTE_NCACHE;
857 		}
858 		if (pmap != kernel_pmap && !is_ept) {
859 			template |= INTEL_PTE_USER;
860 		}
861 
862 		if (prot & VM_PROT_READ) {
863 			template |= PTE_READ(is_ept);
864 		}
865 
866 		if (prot & VM_PROT_WRITE) {
867 			template |= PTE_WRITE(is_ept);
868 			if (is_ept && !pmap_ept_support_ad) {
869 				template |= PTE_MOD(is_ept);
870 				if (old_pa_locked) {
871 					assert(IS_MANAGED_PAGE(pai));
872 					pmap_phys_attributes[pai] |= PHYS_MODIFIED;
873 				}
874 			}
875 		}
876 
877 		if (prot & VM_PROT_EXECUTE) {
878 			assert(set_NX == 0);
879 			template = pte_set_ex(template, is_ept);
880 		}
881 
882 		if (__improbable(is_ept && (prot & VM_PROT_UEXEC))) {
883 			assert(set_NX == 0);
884 			template = pte_set_uex(template);
885 		}
886 
887 		if (set_NX) {
888 			template = pte_remove_ex(template, is_ept);
889 		}
890 
891 		if (wired) {
892 			template |= PTE_WIRED;
893 			if (!iswired(old_attributes)) {
894 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
895 			}
896 		} else {
897 			if (iswired(old_attributes)) {
898 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
899 			}
900 		}
901 
902 		if (superpage) {        /* this path can not be used */
903 			template |= PTE_PS;     /* to change the page size! */
904 		}
905 		if (old_attributes == template) {
906 			goto dont_update_pte;
907 		}
908 
909 		/* Determine delta, PV locked */
910 		need_tlbflush =
911 		    ((old_attributes ^ template) != PTE_WIRED);
912 
913 		/* Optimisation: avoid TLB flush when adding writability */
914 		if (need_tlbflush == TRUE && !(old_attributes & PTE_WRITE(is_ept))) {
915 			if ((old_attributes ^ template) == PTE_WRITE(is_ept)) {
916 				need_tlbflush = FALSE;
917 			}
918 		}
919 
920 		/* For hardware that doesn't have EPT AD support, we always set REFMOD for EPT PTEs */
921 		if (__improbable(is_ept && !pmap_ept_support_ad)) {
922 			template |= PTE_REF(is_ept);
923 			if (old_pa_locked) {
924 				assert(IS_MANAGED_PAGE(pai));
925 				pmap_phys_attributes[pai] |= PHYS_REFERENCED;
926 			}
927 		}
928 
929 		/* store modified PTE and preserve RC bits */
930 		pt_entry_t npte, opte;
931 
932 		assert((*pte & PTE_LOCK(is_ept)) != 0);
933 
934 		do {
935 			opte = *pte;
936 			npte = template | (opte & (PTE_REF(is_ept) |
937 			    PTE_MOD(is_ept))) | PTE_LOCK(is_ept);
938 		} while (!pmap_cmpx_pte(pte, opte, npte));
939 
940 		DTRACE_VM3(set_pte, uint64_t, vaddr, uint64_t, opte, uint64_t, npte);
941 
942 dont_update_pte:
943 		if (old_pa_locked) {
944 			UNLOCK_PVH(pai);
945 			old_pa_locked = FALSE;
946 		}
947 		goto done2;
948 	}
949 
950 	/*
951 	 *	Outline of code from here:
952 	 *	   1) If va was mapped, update TLBs, remove the mapping
953 	 *	      and remove old pvlist entry.
954 	 *	   2) Add pvlist entry for new mapping
955 	 *	   3) Enter new mapping.
956 	 *
957 	 *	If the old physical page is not managed step 1) is skipped
958 	 *	(except for updating the TLBs), and the mapping is
959 	 *	overwritten at step 3).  If the new physical page is not
960 	 *	managed, step 2) is skipped.
961 	 */
962 	/* TODO: add opportunistic refmod collect */
963 	if (old_pa != (pmap_paddr_t) 0) {
964 		boolean_t       was_altacct = FALSE;
965 
966 		/*
967 		 *	Don't do anything to pages outside valid memory here.
968 		 *	Instead convince the code that enters a new mapping
969 		 *	to overwrite the old one.
970 		 */
971 
972 		/* invalidate the PTE */
973 		pmap_update_pte(is_ept, pte, PTE_VALID_MASK(is_ept), 0, true);
974 		/* propagate invalidate everywhere */
975 		PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
976 		/* remember reference and change */
977 		old_pte = *pte;
978 		oattr = (char) (old_pte & (PTE_MOD(is_ept) | PTE_REF(is_ept)));
979 		/* completely invalidate the PTE */
980 		pmap_store_pte(is_ept, pte, PTE_LOCK(is_ept));
981 
982 		if (IS_MANAGED_PAGE(pai)) {
983 			/*
984 			 *	Remove the mapping from the pvlist for
985 			 *	this physical page.
986 			 *      We'll end up with either a rooted pv or a
987 			 *      hashed pv
988 			 */
989 			pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, &old_pte, &was_altacct);
990 		}
991 
992 		if (IS_MANAGED_PAGE(pai)) {
993 			pmap_assert(old_pa_locked == TRUE);
994 			pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
995 			if (pmap != kernel_pmap) {
996 				/* update ledgers */
997 				if (was_altacct) {
998 					assert(IS_INTERNAL_PAGE(pai));
999 					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
1000 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
1001 				} else if (IS_REUSABLE_PAGE(pai)) {
1002 					assert(!was_altacct);
1003 					assert(IS_INTERNAL_PAGE(pai));
1004 					pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
1005 					/* was already not in phys_footprint */
1006 				} else if (IS_INTERNAL_PAGE(pai)) {
1007 					assert(!was_altacct);
1008 					assert(!IS_REUSABLE_PAGE(pai));
1009 					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
1010 					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
1011 				} else {
1012 					/* not an internal page */
1013 					pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
1014 				}
1015 			}
1016 			if (iswired(*pte)) {
1017 				pmap_ledger_debit(pmap, task_ledgers.wired_mem,
1018 				    PAGE_SIZE);
1019 			}
1020 
1021 			if (!is_ept) {
1022 				pmap_phys_attributes[pai] |= oattr;
1023 			} else {
1024 				pmap_phys_attributes[pai] |= ept_refmod_to_physmap(oattr);
1025 			}
1026 		} else {
1027 			/*
1028 			 *	old_pa is not managed.
1029 			 *	Do removal part of accounting.
1030 			 */
1031 
1032 			if (pmap != kernel_pmap) {
1033 #if 00
1034 				assert(pmap->stats.device > 0);
1035 				OSAddAtomic(-1, &pmap->stats.device);
1036 #endif
1037 			}
1038 			if (iswired(*pte)) {
1039 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
1040 			}
1041 		}
1042 	}
1043 
1044 	/*
1045 	 * if we had a previously managed page locked, unlock it now
1046 	 */
1047 	if (old_pa_locked) {
1048 		UNLOCK_PVH(pai);
1049 		old_pa_locked = FALSE;
1050 	}
1051 
1052 	pai = pa_index(pa);     /* now working with new incoming phys page */
1053 	if (IS_MANAGED_PAGE(pai)) {
1054 		/*
1055 		 *	Step 2) Enter the mapping in the PV list for this
1056 		 *	physical page.
1057 		 */
1058 		pv_h = pai_to_pvh(pai);
1059 
1060 		LOCK_PVH(pai);
1061 
1062 		if (pv_h->pmap == PMAP_NULL) {
1063 			/*
1064 			 *	No mappings yet, use rooted pv
1065 			 */
1066 			pv_h->va_and_flags = vaddr;
1067 			pv_h->pmap = pmap;
1068 			queue_init(&pv_h->qlink);
1069 
1070 			if (options & PMAP_OPTIONS_INTERNAL) {
1071 				pmap_phys_attributes[pai] |= PHYS_INTERNAL;
1072 			} else {
1073 				pmap_phys_attributes[pai] &= ~PHYS_INTERNAL;
1074 			}
1075 			if (options & PMAP_OPTIONS_REUSABLE) {
1076 				pmap_phys_attributes[pai] |= PHYS_REUSABLE;
1077 			} else {
1078 				pmap_phys_attributes[pai] &= ~PHYS_REUSABLE;
1079 			}
1080 			if ((options & PMAP_OPTIONS_ALT_ACCT) &&
1081 			    IS_INTERNAL_PAGE(pai)) {
1082 				pv_h->va_and_flags |= PVE_IS_ALTACCT;
1083 				is_altacct = TRUE;
1084 			} else {
1085 				pv_h->va_and_flags &= ~PVE_IS_ALTACCT;
1086 				is_altacct = FALSE;
1087 			}
1088 		} else {
1089 			/*
1090 			 *	Add new pv_hashed_entry after header.
1091 			 */
1092 			if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
1093 				pvh_e = pvh_new;
1094 				pvh_new = PV_HASHED_ENTRY_NULL;
1095 			} else if (PV_HASHED_ENTRY_NULL == pvh_e) {
1096 				PV_HASHED_ALLOC(&pvh_e);
1097 				if (PV_HASHED_ENTRY_NULL == pvh_e) {
1098 					/*
1099 					 * the pv list is empty. if we are on
1100 					 * the kernel pmap we'll use one of
1101 					 * the special private kernel pv_e's,
1102 					 * else, we need to unlock
1103 					 * everything, zalloc a pv_e, and
1104 					 * restart bringing in the pv_e with
1105 					 * us.
1106 					 */
1107 					if (kernel_pmap == pmap) {
1108 						PV_HASHED_KERN_ALLOC(&pvh_e);
1109 					} else {
1110 						UNLOCK_PVH(pai);
1111 						PTE_LOCK_UNLOCK(pte);
1112 						PMAP_UNLOCK_SHARED(pmap);
1113 						pmap_pv_throttle(pmap);
1114 						pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
1115 						goto Retry;
1116 					}
1117 				}
1118 			}
1119 
1120 			if (PV_HASHED_ENTRY_NULL == pvh_e) {
1121 				panic("Mapping alias chain exhaustion, possibly induced by numerous kernel virtual double mappings");
1122 			}
1123 
1124 			pvh_e->va_and_flags = vaddr;
1125 			pvh_e->pmap = pmap;
1126 			pvh_e->ppn = pn;
1127 			if ((options & PMAP_OPTIONS_ALT_ACCT) &&
1128 			    IS_INTERNAL_PAGE(pai)) {
1129 				pvh_e->va_and_flags |= PVE_IS_ALTACCT;
1130 				is_altacct = TRUE;
1131 			} else {
1132 				pvh_e->va_and_flags &= ~PVE_IS_ALTACCT;
1133 				is_altacct = FALSE;
1134 			}
1135 			pv_hash_add(pvh_e, pv_h);
1136 
1137 			/*
1138 			 *	Remember that we used the pvlist entry.
1139 			 */
1140 			pvh_e = PV_HASHED_ENTRY_NULL;
1141 		}
1142 
1143 		/*
1144 		 * only count the mapping
1145 		 * for 'managed memory'
1146 		 */
1147 		pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
1148 		if (pmap != kernel_pmap) {
1149 			/* update ledgers */
1150 			if (is_altacct) {
1151 				/* internal but also alternate accounting */
1152 				assert(IS_INTERNAL_PAGE(pai));
1153 				pmap_ledger_credit(pmap, task_ledgers.internal, PAGE_SIZE);
1154 				pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
1155 				/* alternate accounting, so not in footprint */
1156 			} else if (IS_REUSABLE_PAGE(pai)) {
1157 				assert(!is_altacct);
1158 				assert(IS_INTERNAL_PAGE(pai));
1159 				pmap_ledger_credit(pmap, task_ledgers.reusable, PAGE_SIZE);
1160 				/* internal but reusable: not in footprint */
1161 			} else if (IS_INTERNAL_PAGE(pai)) {
1162 				assert(!is_altacct);
1163 				assert(!IS_REUSABLE_PAGE(pai));
1164 				/* internal: add to footprint */
1165 				pmap_ledger_credit(pmap, task_ledgers.internal, PAGE_SIZE);
1166 				pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
1167 			} else {
1168 				/* not internal: not in footprint */
1169 				pmap_ledger_credit(pmap, task_ledgers.external, PAGE_SIZE);
1170 			}
1171 		}
1172 	} else if (last_managed_page == 0) {
1173 		/* Account for early mappings created before "managed pages"
1174 		 * are determined. Consider consulting the available DRAM map.
1175 		 */
1176 		pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
1177 		if (pmap != kernel_pmap) {
1178 #if 00
1179 			OSAddAtomic(+1, &pmap->stats.device);
1180 			PMAP_STATS_PEAK(pmap->stats.device);
1181 #endif
1182 		}
1183 	}
1184 	/*
1185 	 * Step 3) Enter the mapping.
1186 	 *
1187 	 *	Build a template to speed up entering -
1188 	 *	only the pfn changes.
1189 	 */
1190 	template = pa_to_pte(pa);
1191 
1192 	if (!is_ept) {
1193 		template |= INTEL_PTE_VALID;
1194 	} else {
1195 		template |= INTEL_EPT_IPAT;
1196 	}
1197 
1198 	/*
1199 	 * DRK: It may be worth asserting on cache attribute flags that diverge
1200 	 * from the existing physical page attributes.
1201 	 */
1202 
1203 	template |= pmap_get_cache_attributes(pa_index(pa), is_ept);
1204 
1205 	/*
1206 	 * We don't support passing VM_MEM_NOT_CACHEABLE flags for EPT PTEs
1207 	 */
1208 	if (!is_ept && (flags & VM_MEM_NOT_CACHEABLE)) {
1209 		if (!(flags & VM_MEM_GUARDED)) {
1210 			template |= INTEL_PTE_PAT;
1211 		}
1212 		template |= INTEL_PTE_NCACHE;
1213 	}
1214 	if (pmap != kernel_pmap && !is_ept) {
1215 		template |= INTEL_PTE_USER;
1216 	}
1217 	if (prot & VM_PROT_READ) {
1218 		template |= PTE_READ(is_ept);
1219 	}
1220 	if (prot & VM_PROT_WRITE) {
1221 		template |= PTE_WRITE(is_ept);
1222 		if (is_ept && !pmap_ept_support_ad) {
1223 			template |= PTE_MOD(is_ept);
1224 			if (IS_MANAGED_PAGE(pai)) {
1225 				pmap_phys_attributes[pai] |= PHYS_MODIFIED;
1226 			}
1227 		}
1228 	}
1229 	if (prot & VM_PROT_EXECUTE) {
1230 		assert(set_NX == 0);
1231 		template = pte_set_ex(template, is_ept);
1232 	}
1233 	if (__improbable(is_ept && (prot & VM_PROT_UEXEC))) {
1234 		assert(set_NX == 0);
1235 		template = pte_set_uex(template);
1236 	}
1237 
1238 	if (set_NX) {
1239 		template = pte_remove_ex(template, is_ept);
1240 	}
1241 	if (wired) {
1242 		template |= INTEL_PTE_WIRED;
1243 		pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
1244 	}
1245 	if (__improbable(superpage)) {
1246 		template |= INTEL_PTE_PS;
1247 	}
1248 
1249 	/* For hardware that doesn't have EPT AD support, we always set REFMOD for EPT PTEs */
1250 	if (__improbable(is_ept && !pmap_ept_support_ad)) {
1251 		template |= PTE_REF(is_ept);
1252 		if (IS_MANAGED_PAGE(pai)) {
1253 			pmap_phys_attributes[pai] |= PHYS_REFERENCED;
1254 		}
1255 	}
1256 	template |= PTE_LOCK(is_ept);
1257 	pmap_store_pte(is_ept, pte, template);
1258 	DTRACE_VM3(set_pte, uint64_t, vaddr, uint64_t, 0, uint64_t, template);
1259 
1260 	/*
1261 	 * if this was a managed page we delayed unlocking the pv until here
1262 	 * to prevent pmap_page_protect et al from finding it until the pte
1263 	 * has been stored
1264 	 */
1265 	if (IS_MANAGED_PAGE(pai)) {
1266 		UNLOCK_PVH(pai);
1267 	}
1268 done2:
1269 	if (need_tlbflush == TRUE) {
1270 		if (options & PMAP_OPTIONS_NOFLUSH) {
1271 			PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
1272 		} else {
1273 			PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1274 		}
1275 	}
1276 	if (ptelocked) {
1277 		PTE_LOCK_UNLOCK(pte);
1278 	}
1279 	PMAP_UNLOCK_SHARED(pmap);
1280 
1281 	if (pvh_e != PV_HASHED_ENTRY_NULL) {
1282 		PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
1283 	}
1284 	if (pvh_new != PV_HASHED_ENTRY_NULL) {
1285 		PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
1286 	}
1287 
1288 	if (delpage_pm_obj) {
1289 		vm_page_t m;
1290 
1291 		vm_object_lock(delpage_pm_obj);
1292 		m = vm_page_lookup(delpage_pm_obj, (delpage_pde_index * PAGE_SIZE));
1293 		if (m == VM_PAGE_NULL) {
1294 			panic("pmap_enter: pte page not in object");
1295 		}
1296 		VM_PAGE_FREE(m);
1297 		vm_object_unlock(delpage_pm_obj);
1298 		OSAddAtomic(-1, &inuse_ptepages_count);
1299 		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
1300 	}
1301 
1302 	kr = KERN_SUCCESS;
1303 done1:
1304 	if (__improbable((kr == KERN_SUCCESS) && (pmap == kernel_pmap) &&
1305 	    zone_spans_ro_va(vaddr, vaddr + PAGE_SIZE))) {
1306 		pmap_page_protect((ppnum_t)atop_kernel(kvtophys(vaddr)), VM_PROT_READ);
1307 	}
1308 	PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
1309 	return kr;
1310 }
1311 
1312 /*
1313  *	Remove a range of hardware page-table entries.
1314  *	The entries given are the first (inclusive)
1315  *	and last (exclusive) entries for the VM pages.
1316  *	The virtual address is the va for the first pte.
1317  *
1318  *	The pmap must be locked.
1319  *	If the pmap is not the kernel pmap, the range must lie
1320  *	entirely within one pte-page.  This is NOT checked.
1321  *	Assumes that the pte-page exists.
1322  */
1323 
1324 void
1325 pmap_remove_range(
1326 	pmap_t                  pmap,
1327 	vm_map_offset_t         start_vaddr,
1328 	pt_entry_t              *spte,
1329 	pt_entry_t              *epte)
1330 {
1331 	pmap_remove_range_options(pmap, start_vaddr, spte, epte,
1332 	    PMAP_OPTIONS_REMOVE);
1333 }
1334 
1335 static void
1336 pmap_remove_range_options(
1337 	pmap_t                  pmap,
1338 	vm_map_offset_t         start_vaddr,
1339 	pt_entry_t              *spte,
1340 	pt_entry_t              *epte,
1341 	int                     options)
1342 {
1343 	pt_entry_t              *cpte;
1344 	pv_hashed_entry_t       pvh_et = PV_HASHED_ENTRY_NULL;
1345 	pv_hashed_entry_t       pvh_eh = PV_HASHED_ENTRY_NULL;
1346 	pv_hashed_entry_t       pvh_e;
1347 	int                     pvh_cnt = 0;
1348 	int                     num_removed, num_unwired, num_found, num_invalid;
1349 	int                     ledgers_external, ledgers_reusable, ledgers_internal, ledgers_alt_internal;
1350 	uint64_t                ledgers_compressed, ledgers_alt_compressed;
1351 	ppnum_t                 pai;
1352 	pmap_paddr_t            pa;
1353 	vm_map_offset_t         vaddr;
1354 	boolean_t               is_ept = is_ept_pmap(pmap);
1355 	boolean_t               was_altacct;
1356 
1357 	num_removed = 0;
1358 	num_unwired = 0;
1359 	num_found   = 0;
1360 	num_invalid = 0;
1361 	ledgers_external = 0;
1362 	ledgers_reusable = 0;
1363 	ledgers_internal = 0;
1364 	ledgers_compressed = 0;
1365 	ledgers_alt_internal = 0;
1366 	ledgers_alt_compressed = 0;
1367 
1368 	/* invalidate the PTEs first to "freeze" them */
1369 	for (cpte = spte, vaddr = start_vaddr;
1370 	    cpte < epte;
1371 	    cpte++, vaddr += PAGE_SIZE_64) {
1372 		pt_entry_t p = *cpte;
1373 
1374 		pa = pte_to_pa(p);
1375 		if (pa == 0) {
1376 			if ((options & PMAP_OPTIONS_REMOVE) &&
1377 			    (PTE_IS_COMPRESSED(p, cpte, pmap, vaddr))) {
1378 				assert(pmap != kernel_pmap);
1379 				/* one less "compressed"... */
1380 				ledgers_compressed++;
1381 				if (p & PTE_COMPRESSED_ALT) {
1382 					/* ... but it used to be "ALTACCT" */
1383 					ledgers_alt_compressed++;
1384 				}
1385 				/* clear marker(s) */
1386 				/* XXX probably does not need to be atomic! */
1387 				pmap_update_pte(is_ept, cpte, INTEL_PTE_COMPRESSED_MASK, 0, true);
1388 			}
1389 			continue;
1390 		}
1391 		num_found++;
1392 
1393 		if (iswired(p)) {
1394 			num_unwired++;
1395 		}
1396 
1397 		pai = pa_index(pa);
1398 
1399 		if (!IS_MANAGED_PAGE(pai)) {
1400 			/*
1401 			 *	Outside range of managed physical memory.
1402 			 *	Just remove the mappings.
1403 			 */
1404 			pmap_store_pte(is_ept, cpte, 0);
1405 			continue;
1406 		}
1407 
1408 		if ((p & PTE_VALID_MASK(is_ept)) == 0) {
1409 			num_invalid++;
1410 		}
1411 
1412 		/* invalidate the PTE */
1413 		pmap_update_pte(is_ept, cpte, PTE_VALID_MASK(is_ept), 0, true);
1414 	}
1415 
1416 	if (num_found == 0) {
1417 		/* nothing was changed: we're done */
1418 		goto update_counts;
1419 	}
1420 
1421 	/* propagate the invalidates to other CPUs */
1422 
1423 	PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr);
1424 
1425 	for (cpte = spte, vaddr = start_vaddr;
1426 	    cpte < epte;
1427 	    cpte++, vaddr += PAGE_SIZE_64) {
1428 		pa = pte_to_pa(*cpte);
1429 		if (pa == 0) {
1430 check_pte_for_compressed_marker:
1431 			/*
1432 			 * This PTE could have been replaced with a
1433 			 * "compressed" marker after our first "freeze"
1434 			 * loop above, so check again.
1435 			 */
1436 			if ((options & PMAP_OPTIONS_REMOVE) &&
1437 			    (PTE_IS_COMPRESSED(*cpte, cpte, pmap, vaddr))) {
1438 				assert(pmap != kernel_pmap);
1439 				/* one less "compressed"... */
1440 				ledgers_compressed++;
1441 				if (*cpte & PTE_COMPRESSED_ALT) {
1442 					/* ... but it used to be "ALTACCT" */
1443 					ledgers_alt_compressed++;
1444 				}
1445 				pmap_store_pte(is_ept, cpte, 0);
1446 			}
1447 			continue;
1448 		}
1449 
1450 		pai = pa_index(pa);
1451 
1452 		LOCK_PVH(pai);
1453 
1454 		pa = pte_to_pa(*cpte);
1455 		if (pa == 0) {
1456 			UNLOCK_PVH(pai);
1457 			goto check_pte_for_compressed_marker;
1458 		}
1459 
1460 		/*
1461 		 * Remove the mapping from the pvlist for this physical page.
1462 		 */
1463 		pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, cpte, &was_altacct);
1464 
1465 		num_removed++;
1466 		/* update ledgers */
1467 		if (was_altacct) {
1468 			/* internal and alternate accounting */
1469 			assert(IS_INTERNAL_PAGE(pai));
1470 			ledgers_internal++;
1471 			ledgers_alt_internal++;
1472 		} else if (IS_REUSABLE_PAGE(pai)) {
1473 			/* internal but reusable */
1474 			assert(!was_altacct);
1475 			assert(IS_INTERNAL_PAGE(pai));
1476 			ledgers_reusable++;
1477 		} else if (IS_INTERNAL_PAGE(pai)) {
1478 			/* internal */
1479 			assert(!was_altacct);
1480 			assert(!IS_REUSABLE_PAGE(pai));
1481 			ledgers_internal++;
1482 		} else {
1483 			/* not internal */
1484 			ledgers_external++;
1485 		}
1486 
1487 		/*
1488 		 * Get the modify and reference bits, then
1489 		 * nuke the entry in the page table
1490 		 */
1491 		/* remember reference and change */
1492 		if (!is_ept) {
1493 			pmap_phys_attributes[pai] |=
1494 			    *cpte & (PHYS_MODIFIED | PHYS_REFERENCED);
1495 		} else {
1496 			pmap_phys_attributes[pai] |=
1497 			    ept_refmod_to_physmap((*cpte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
1498 		}
1499 
1500 		/* completely invalidate the PTE */
1501 		pmap_store_pte(is_ept, cpte, 0);
1502 
1503 		UNLOCK_PVH(pai);
1504 
1505 		if (pvh_e != PV_HASHED_ENTRY_NULL) {
1506 			pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1507 			pvh_eh = pvh_e;
1508 
1509 			if (pvh_et == PV_HASHED_ENTRY_NULL) {
1510 				pvh_et = pvh_e;
1511 			}
1512 			pvh_cnt++;
1513 		}
1514 		/* We can encounter at most 'num_found' PTEs for this level.
1515 		 * Fewer may be encountered if some were replaced by
1516 		 * compressed markers. No new valid PTEs can be created
1517 		 * since the pmap lock is held exclusively.
1518 		 */
1519 		if (num_removed == num_found) {
1520 			break;
1521 		}
1522 	} /* for loop */
1523 
1524 	if (pvh_eh != PV_HASHED_ENTRY_NULL) {
1525 		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
1526 	}
1527 update_counts:
1528 	/*
1529 	 *	Update the counts
1530 	 */
1531 #if TESTING
1532 	if (pmap->stats.resident_count < num_removed) {
1533 		panic("pmap_remove_range: resident_count");
1534 	}
1535 #endif
1536 	if (num_removed) {
1537 		pmap_ledger_debit(pmap, task_ledgers.phys_mem, machine_ptob(num_removed));
1538 	}
1539 
1540 	if (pmap != kernel_pmap) {
1541 		if (ledgers_external) {
1542 			pmap_ledger_debit(pmap,
1543 			    task_ledgers.external,
1544 			    machine_ptob(ledgers_external));
1545 		}
1546 		if (ledgers_reusable) {
1547 			pmap_ledger_debit(pmap,
1548 			    task_ledgers.reusable,
1549 			    machine_ptob(ledgers_reusable));
1550 		}
1551 		if (ledgers_internal) {
1552 			pmap_ledger_debit(pmap,
1553 			    task_ledgers.internal,
1554 			    machine_ptob(ledgers_internal));
1555 		}
1556 		if (ledgers_compressed) {
1557 			pmap_ledger_debit(pmap,
1558 			    task_ledgers.internal_compressed,
1559 			    machine_ptob(ledgers_compressed));
1560 		}
1561 		if (ledgers_alt_internal) {
1562 			pmap_ledger_debit(pmap,
1563 			    task_ledgers.alternate_accounting,
1564 			    machine_ptob(ledgers_alt_internal));
1565 		}
1566 		if (ledgers_alt_compressed) {
1567 			pmap_ledger_debit(pmap,
1568 			    task_ledgers.alternate_accounting_compressed,
1569 			    machine_ptob(ledgers_alt_compressed));
1570 		}
1571 
1572 		uint64_t net_debit = (ledgers_internal - ledgers_alt_internal) + (ledgers_compressed - ledgers_alt_compressed);
1573 		if (net_debit) {
1574 			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, machine_ptob(net_debit));
1575 		}
1576 	}
1577 
1578 	if (num_unwired != 0) {
1579 		pmap_ledger_debit(pmap, task_ledgers.wired_mem, machine_ptob(num_unwired));
1580 	}
1581 	return;
1582 }
1583 
1584 
1585 /*
1586  *	Remove the given range of addresses
1587  *	from the specified map.
1588  *
1589  *	It is assumed that the start and end are properly
1590  *	rounded to the hardware page size.
1591  */
1592 void
1593 pmap_remove(
1594 	pmap_t          map,
1595 	addr64_t        s64,
1596 	addr64_t        e64)
1597 {
1598 	pmap_remove_options(map, s64, e64, PMAP_OPTIONS_REMOVE);
1599 }
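/*
 * After PLCHECK_THRESHOLD PDE-sized chunks, pmap_remove_options() starts
 * checking the TSC against a preemption-latency deadline and briefly
 * drops the pmap lock once that deadline passes, so large removals do not
 * hold the lock for an unbounded time.
 */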
1600 #define PLCHECK_THRESHOLD (2)
1601 
1602 void
1603 pmap_remove_options(
1604 	pmap_t          map,
1605 	addr64_t        s64,
1606 	addr64_t        e64,
1607 	int             options)
1608 {
1609 	pt_entry_t     *pde;
1610 	pt_entry_t     *spte, *epte;
1611 	addr64_t        l64;
1612 	uint64_t        deadline = 0;
1613 	boolean_t       is_ept;
1614 
1615 	pmap_intr_assert();
1616 
1617 	if (map == PMAP_NULL || s64 == e64) {
1618 		return;
1619 	}
1620 
1621 	is_ept = is_ept_pmap(map);
1622 
1623 	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
1624 	    VM_KERNEL_ADDRHIDE(map), VM_KERNEL_ADDRHIDE(s64),
1625 	    VM_KERNEL_ADDRHIDE(e64));
1626 
1627 	PMAP_LOCK_EXCLUSIVE(map);
1628 	uint32_t traverse_count = 0;
1629 
1630 	while (s64 < e64) {
1631 		pml4_entry_t *pml4e = pmap64_pml4(map, s64);
1632 		if ((pml4e == NULL) ||
1633 		    ((*pml4e & PTE_VALID_MASK(is_ept)) == 0)) {
1634 			if (os_add_overflow(s64, NBPML4, &s64)) {
1635 				/* wrap; clip s64 to e64 */
1636 				s64 = e64;
1637 				break;
1638 			}
1639 			s64 &= ~(PML4MASK);
1640 			continue;
1641 		}
1642 		pdpt_entry_t *pdpte = pmap64_pdpt(map, s64);
1643 		if ((pdpte == NULL) ||
1644 		    ((*pdpte & PTE_VALID_MASK(is_ept)) == 0)) {
1645 			if (os_add_overflow(s64, NBPDPT, &s64)) {
1646 				/* wrap; clip s64 to e64 */
1647 				s64 = e64;
1648 				break;
1649 			}
1650 			s64 &= ~(PDPTMASK);
1651 			continue;
1652 		}
1653 
1654 		if (os_add_overflow(s64, PDE_MAPPED_SIZE, &l64)) {
1655 			l64 = e64;
1656 		} else {
1657 			l64 &= ~(PDE_MAPPED_SIZE - 1);
1658 
1659 			if (l64 > e64) {
1660 				l64 = e64;
1661 			}
1662 		}
1663 
1664 		pde = pmap_pde(map, s64);
1665 
1666 		if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
1667 			if (*pde & PTE_PS) {
1668 				/*
1669 				 * If we're removing a superpage, pmap_remove_range()
1670 				 * must work on level 2 instead of level 1; and we're
1671 				 * only passing a single level 2 entry instead of a
1672 				 * level 1 range.
1673 				 */
1674 				spte = pde;
1675 				epte = spte + 1; /* excluded */
1676 			} else {
1677 				spte = pmap_pte(map, (s64 & ~(PDE_MAPPED_SIZE - 1)));
1678 				spte = &spte[ptenum(s64)];
1679 				epte = &spte[intel_btop(l64 - s64)];
1680 			}
1681 			pmap_remove_range_options(map, s64, spte, epte,
1682 			    options);
1683 		}
1684 		s64 = l64;
1685 
1686 		if ((s64 < e64) && (traverse_count++ > PLCHECK_THRESHOLD)) {
1687 			if (deadline == 0) {
1688 				deadline = rdtsc64_nofence() + max_preemption_latency_tsc;
1689 			} else {
1690 				if (rdtsc64_nofence() > deadline) {
1691 					PMAP_UNLOCK_EXCLUSIVE(map);
1692 					__builtin_ia32_pause();
1693 					PMAP_LOCK_EXCLUSIVE(map);
1694 					deadline = rdtsc64_nofence() + max_preemption_latency_tsc;
1695 				}
1696 			}
1697 		}
1698 	}
1699 
1700 	PMAP_UNLOCK_EXCLUSIVE(map);
1701 
1702 	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
1703 }
1704 
1705 void
1706 pmap_page_protect(
1707 	ppnum_t         pn,
1708 	vm_prot_t       prot)
1709 {
1710 	pmap_page_protect_options(pn, prot, 0, NULL);
1711 }
1712 
1713 /*
1714  *	Routine:	pmap_page_protect_options
1715  *
1716  *	Function:
1717  *		Lower the permission for all mappings to a given
1718  *		page.
1719  */
1720 void
1721 pmap_page_protect_options(
1722 	ppnum_t         pn,
1723 	vm_prot_t       prot,
1724 	unsigned int    options,
1725 	void            *arg)
1726 {
1727 	pv_hashed_entry_t       pvh_eh = PV_HASHED_ENTRY_NULL;
1728 	pv_hashed_entry_t       pvh_et = PV_HASHED_ENTRY_NULL;
1729 	pv_hashed_entry_t       nexth;
1730 	int                     pvh_cnt = 0;
1731 	pv_rooted_entry_t       pv_h;
1732 	pv_rooted_entry_t       pv_e;
1733 	pv_hashed_entry_t       pvh_e;
1734 	pt_entry_t              *pte;
1735 	int                     pai;
1736 	pmap_t                  pmap;
1737 	boolean_t               remove;
1738 	pt_entry_t              new_pte_value;
1739 	boolean_t               is_ept;
1740 
1741 	pmap_intr_assert();
1742 	assert(pn != vm_page_fictitious_addr);
1743 	if (pn == vm_page_guard_addr) {
1744 		return;
1745 	}
1746 
1747 	pai = ppn_to_pai(pn);
1748 
1749 	if (!IS_MANAGED_PAGE(pai)) {
1750 		/*
1751 		 *	Not a managed page.
1752 		 */
1753 		return;
1754 	}
1755 
1756 	PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, pn, prot);
1757 
1758 	/*
1759 	 * Determine the new protection.
1760 	 */
1761 	switch (prot) {
1762 	case VM_PROT_READ:
1763 	case VM_PROT_READ | VM_PROT_EXECUTE:
1764 		remove = FALSE;
1765 		break;
1766 	case VM_PROT_ALL:
1767 		return;         /* nothing to do */
1768 	default:
1769 		remove = TRUE;
1770 		break;
1771 	}
1772 
1773 	pv_h = pai_to_pvh(pai);
1774 
1775 	LOCK_PVH(pai);
1776 
1777 
1778 	/*
1779 	 * Walk down PV list, if any, changing or removing all mappings.
1780 	 */
1781 	if (pv_h->pmap == PMAP_NULL) {
1782 		goto done;
1783 	}
1784 
1785 	pv_e = pv_h;
1786 	pvh_e = (pv_hashed_entry_t) pv_e;       /* cheat */
1787 
1788 	do {
1789 		vm_map_offset_t vaddr;
1790 
1791 		if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED) &&
1792 		    (pmap_phys_attributes[pai] & PHYS_MODIFIED)) {
1793 			/* page was modified, so it will be compressed */
1794 			options &= ~PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1795 			options |= PMAP_OPTIONS_COMPRESSOR;
1796 		}
1797 
1798 		pmap = pv_e->pmap;
1799 		is_ept = is_ept_pmap(pmap);
1800 		vaddr = PVE_VA(pv_e);
1801 		pte = pmap_pte(pmap, vaddr);
1802 
1803 		pmap_assert2((pa_index(pte_to_pa(*pte)) == pn),
1804 		    "pmap_page_protect: PTE mismatch, pn: 0x%x, pmap: %p, vaddr: 0x%llx, pte: 0x%llx", pn, pmap, vaddr, *pte);
1805 
1806 		if (0 == pte) {
1807 			panic("pmap_page_protect() "
1808 			    "pmap=%p pn=0x%x vaddr=0x%llx\n",
1809 			    pmap, pn, vaddr);
1810 		}
1811 		nexth = (pv_hashed_entry_t) queue_next(&pvh_e->qlink);
1812 
1813 		/*
1814 		 * Remove the mapping if new protection is NONE
1815 		 */
1816 		if (remove) {
1817 			/* Remove per-pmap wired count */
1818 			if (iswired(*pte)) {
1819 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
1820 			}
1821 
1822 			if (pmap != kernel_pmap &&
1823 			    (options & PMAP_OPTIONS_COMPRESSOR) &&
1824 			    IS_INTERNAL_PAGE(pai)) {
1825 				assert(!PTE_IS_COMPRESSED(*pte, pte, pmap, vaddr));
1826 				/* mark this PTE as having been "compressed" */
1827 				new_pte_value = PTE_COMPRESSED;
1828 				if (IS_ALTACCT_PAGE(pai, pv_e)) {
1829 					new_pte_value |= PTE_COMPRESSED_ALT;
1830 				}
1831 			} else {
1832 				new_pte_value = 0;
1833 			}
1834 
1835 			if (options & PMAP_OPTIONS_NOREFMOD) {
1836 				pmap_store_pte(is_ept, pte, new_pte_value);
1837 
1838 				if (options & PMAP_OPTIONS_NOFLUSH) {
1839 					PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
1840 				} else {
1841 					PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1842 				}
1843 			} else {
1844 				/*
1845 				 * Remove the mapping, collecting dirty bits.
1846 				 */
1847 				pmap_update_pte(is_ept, pte, PTE_VALID_MASK(is_ept), 0, true);
1848 
1849 				PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1850 				if (!is_ept) {
1851 					pmap_phys_attributes[pai] |=
1852 					    *pte & (PHYS_MODIFIED | PHYS_REFERENCED);
1853 				} else {
1854 					pmap_phys_attributes[pai] |=
1855 					    ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
1856 				}
1857 				if ((options &
1858 				    PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED) &&
1859 				    IS_INTERNAL_PAGE(pai) &&
1860 				    (pmap_phys_attributes[pai] &
1861 				    PHYS_MODIFIED)) {
1862 					/*
1863 					 * Page is actually "modified" and
1864 					 * will be compressed.  Start
1865 					 * accounting for it as "compressed".
1866 					 */
1867 					assert(!(options & PMAP_OPTIONS_COMPRESSOR));
1868 					options &= ~PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1869 					options |= PMAP_OPTIONS_COMPRESSOR;
1870 					assert(new_pte_value == 0);
1871 					if (pmap != kernel_pmap) {
1872 						new_pte_value = PTE_COMPRESSED;
1873 						if (IS_ALTACCT_PAGE(pai, pv_e)) {
1874 							new_pte_value |= PTE_COMPRESSED_ALT;
1875 						}
1876 					}
1877 				}
1878 				pmap_store_pte(is_ept, pte, new_pte_value);
1879 			}
1880 
1881 #if TESTING
1882 			if (pmap->stats.resident_count < 1) {
1883 				panic("pmap_page_protect: resident_count");
1884 			}
1885 #endif
1886 			pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
1887 
1888 			/*
1889 			 * We only ever compress internal pages.
1890 			 */
1891 			if (options & PMAP_OPTIONS_COMPRESSOR) {
1892 				assert(IS_INTERNAL_PAGE(pai));
1893 			}
1894 			if (pmap != kernel_pmap) {
1895 				/* update ledgers */
1896 				if (IS_ALTACCT_PAGE(pai, pv_e)) {
1897 					assert(IS_INTERNAL_PAGE(pai));
1898 					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
1899 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
1900 					if (options & PMAP_OPTIONS_COMPRESSOR) {
1901 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
1902 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, PAGE_SIZE);
1903 					}
1904 				} else if (IS_REUSABLE_PAGE(pai)) {
1905 					assert(!IS_ALTACCT_PAGE(pai, pv_e));
1906 					assert(IS_INTERNAL_PAGE(pai));
1907 					if (options & PMAP_OPTIONS_COMPRESSOR) {
1908 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
1909 						/* was not in footprint, but is now */
1910 						pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
1911 					}
1912 					pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
1913 				} else if (IS_INTERNAL_PAGE(pai)) {
1914 					assert(!IS_ALTACCT_PAGE(pai, pv_e));
1915 					assert(!IS_REUSABLE_PAGE(pai));
1916 					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
1917 					/*
1918 					 * Update all stats related to physical
1919 					 * footprint, which only deals with
1920 					 * internal pages.
1921 					 */
1922 					if (options & PMAP_OPTIONS_COMPRESSOR) {
1923 						/*
1924 						 * This removal is only being
1925 						 * done so we can send this page
1926 						 * to the compressor;  therefore
1927 						 * it mustn't affect total task
1928 						 * footprint.
1929 						 */
1930 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
1931 					} else {
1932 						/*
1933 						 * This internal page isn't
1934 						 * going to the compressor,
1935 						 * so adjust stats to keep
1936 						 * phys_footprint up to date.
1937 						 */
1938 						pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
1939 					}
1940 				} else {
1941 					pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
1942 				}
1943 			}
1944 
1945 			/*
1946 			 * Deal with the pv_rooted_entry.
1947 			 */
1948 
1949 			if (pv_e == pv_h) {
1950 				/*
1951 				 * Fix up head later.
1952 				 */
1953 				pv_h->pmap = PMAP_NULL;
1954 			} else {
1955 				/*
1956 				 * Delete this entry.
1957 				 */
1958 				pv_hash_remove(pvh_e);
1959 				pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1960 				pvh_eh = pvh_e;
1961 
1962 				if (pvh_et == PV_HASHED_ENTRY_NULL) {
1963 					pvh_et = pvh_e;
1964 				}
1965 				pvh_cnt++;
1966 			}
1967 		} else {
1968 			/*
1969 			 * Write-protect, after opportunistic refmod collect
1970 			 */
1971 			if (!is_ept) {
1972 				pmap_phys_attributes[pai] |=
1973 				    *pte & (PHYS_MODIFIED | PHYS_REFERENCED);
1974 			} else {
1975 				pmap_phys_attributes[pai] |=
1976 				    ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
1977 			}
1978 
1979 			pmap_update_pte(is_ept, pte, PTE_WRITE(is_ept), 0, true);
1980 			if (options & PMAP_OPTIONS_NOFLUSH) {
1981 				PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
1982 			} else {
1983 				PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1984 			}
1985 		}
1986 		pvh_e = nexth;
1987 	} while ((pv_e = (pv_rooted_entry_t) nexth) != pv_h);
1988 
1989 
1990 	/*
1991 	 * If pv_head mapping was removed, fix it up.
1992 	 */
1993 	if (pv_h->pmap == PMAP_NULL) {
1994 		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
1995 
1996 		if (pvh_e != (pv_hashed_entry_t) pv_h) {
1997 			pv_hash_remove(pvh_e);
1998 			pv_h->pmap = pvh_e->pmap;
1999 			pv_h->va_and_flags = pvh_e->va_and_flags;
2000 			pvh_e->qlink.next = (queue_entry_t) pvh_eh;
2001 			pvh_eh = pvh_e;
2002 
2003 			if (pvh_et == PV_HASHED_ENTRY_NULL) {
2004 				pvh_et = pvh_e;
2005 			}
2006 			pvh_cnt++;
2007 		}
2008 	}
2009 	if (pvh_eh != PV_HASHED_ENTRY_NULL) {
2010 		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
2011 	}
2012 done:
2013 	UNLOCK_PVH(pai);
2014 
2015 	PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
2016 }
2017 
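/*
 * Hedged sketch of the PMAP_OPTIONS_NOFLUSH path above (assumed caller, not
 * part of this file): several protect operations can be batched and the TLB
 * shootdown deferred through a single pmap_flush_context, e.g.:
 *
 *	pmap_flush_context pfc;
 *
 *	pmap_flush_context_init(&pfc);
 *	pmap_page_protect_options(pn1, VM_PROT_NONE, PMAP_OPTIONS_NOFLUSH, &pfc);
 *	pmap_page_protect_options(pn2, VM_PROT_NONE, PMAP_OPTIONS_NOFLUSH, &pfc);
 *	pmap_flush(&pfc);	// one shootdown covering all deferred ranges
 *
 * pn1/pn2 are placeholder page numbers.
 */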
2018 
2019 /*
2020  *	Clear specified attribute bits.
2021  */
2022 void
2023 phys_attribute_clear(
2024 	ppnum_t         pn,
2025 	int             bits,
2026 	unsigned int    options,
2027 	void            *arg)
2028 {
2029 	pv_rooted_entry_t       pv_h;
2030 	pv_hashed_entry_t       pv_e;
2031 	pt_entry_t              *pte = NULL;
2032 	int                     pai;
2033 	pmap_t                  pmap;
2034 	char                    attributes = 0;
2035 	boolean_t               is_internal, is_reusable, is_altacct, is_ept;
2036 	int                     ept_bits_to_clear;
2037 	boolean_t               ept_keep_global_mod = FALSE;
2038 
2039 	if ((bits & PHYS_MODIFIED) &&
2040 	    (options & PMAP_OPTIONS_NOFLUSH) &&
2041 	    arg == NULL) {
2042 		panic("phys_attribute_clear(0x%x,0x%x,0x%x,%p): "
2043 		    "should not clear 'modified' without flushing TLBs\n",
2044 		    pn, bits, options, arg);
2045 	}
2046 
2047 	/* We only support converting MOD and REF bits for EPT PTEs in this function */
2048 	assert((bits & ~(PHYS_REFERENCED | PHYS_MODIFIED)) == 0);
2049 
2050 	ept_bits_to_clear = (unsigned)physmap_refmod_to_ept(bits & (PHYS_MODIFIED | PHYS_REFERENCED));
2051 
2052 	pmap_intr_assert();
2053 	assert(pn != vm_page_fictitious_addr);
2054 	if (pn == vm_page_guard_addr) {
2055 		return;
2056 	}
2057 
2058 	pai = ppn_to_pai(pn);
2059 
2060 	if (!IS_MANAGED_PAGE(pai)) {
2061 		/*
2062 		 *	Not a managed page.
2063 		 */
2064 		return;
2065 	}
2066 
2067 	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);
2068 
2069 	pv_h = pai_to_pvh(pai);
2070 
2071 	LOCK_PVH(pai);
2072 
2073 
2074 	/*
2075 	 * Walk down PV list, clearing all modify or reference bits.
2076 	 * We do not have to lock the pv_list because we have
2077 	 * the per-pmap lock
2078 	 */
2079 	if (pv_h->pmap != PMAP_NULL) {
2080 		/*
2081 		 * There are some mappings.
2082 		 */
2083 
2084 		is_internal = IS_INTERNAL_PAGE(pai);
2085 		is_reusable = IS_REUSABLE_PAGE(pai);
2086 
2087 		pv_e = (pv_hashed_entry_t)pv_h;
2088 
2089 		do {
2090 			vm_map_offset_t va;
2091 			char pte_bits;
2092 
2093 			pmap = pv_e->pmap;
2094 			is_ept = is_ept_pmap(pmap);
2095 			is_altacct = IS_ALTACCT_PAGE(pai, pv_e);
2096 			va = PVE_VA(pv_e);
2097 			pte_bits = 0;
2098 
2099 			if (bits) {
2100 				pte = pmap_pte(pmap, va);
2101 				/* grab ref/mod bits from this PTE */
2102 				pte_bits = (*pte & (PTE_REF(is_ept) | PTE_MOD(is_ept)));
2103 				/* propagate to page's global attributes */
2104 				if (!is_ept) {
2105 					attributes |= pte_bits;
2106 				} else {
2107 					attributes |= ept_refmod_to_physmap(pte_bits);
2108 					if (!pmap_ept_support_ad && (pte_bits & INTEL_EPT_MOD)) {
2109 						ept_keep_global_mod = TRUE;
2110 					}
2111 				}
2112 				/* which bits to clear for this PTE? */
2113 				if (!is_ept) {
2114 					pte_bits &= bits;
2115 				} else {
2116 					pte_bits &= ept_bits_to_clear;
2117 				}
2118 			}
2119 			if (options & PMAP_OPTIONS_CLEAR_WRITE) {
2120 				pte_bits |= PTE_WRITE(is_ept);
2121 			}
2122 
2123 			/*
2124 			 * Clear modify and/or reference bits.
2125 			 */
2126 			if (pte_bits) {
2127 				pmap_update_pte(is_ept, pte, pte_bits, 0, true);
2128 
2129 				/* Ensure all processors using this translation
2130 				 * invalidate this TLB entry. The invalidation
2131 				 * *must* follow the PTE update, to ensure that
2132 				 * the TLB shadow of the 'D' bit (in particular)
2133 				 * is synchronized with the updated PTE.
2134 				 */
2135 				if (!(options & PMAP_OPTIONS_NOFLUSH)) {
2136 					/* flush TLBS now */
2137 					PMAP_UPDATE_TLBS(pmap,
2138 					    va,
2139 					    va + PAGE_SIZE);
2140 				} else if (arg) {
2141 					/* delayed TLB flush: add "pmap" info */
2142 					PMAP_UPDATE_TLBS_DELAYED(
2143 						pmap,
2144 						va,
2145 						va + PAGE_SIZE,
2146 						(pmap_flush_context *)arg);
2147 				} else {
2148 					/* no TLB flushing at all */
2149 				}
2150 			}
2151 
2152 			/* update pmap "reusable" stats */
2153 			if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
2154 			    is_reusable &&
2155 			    pmap != kernel_pmap) {
2156 				/* one less "reusable" */
2157 				pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
2158 				if (is_internal) {
2159 					/* one more "internal" */
2160 					if (is_altacct) {
2161 						/* no impact on ledgers */
2162 					} else {
2163 						pmap_ledger_credit(pmap,
2164 						    task_ledgers.internal,
2165 						    PAGE_SIZE);
2166 						pmap_ledger_credit(
2167 							pmap,
2168 							task_ledgers.phys_footprint,
2169 							PAGE_SIZE);
2170 					}
2171 				} else {
2172 					/* one more "external" */
2173 					pmap_ledger_credit(pmap, task_ledgers.external, PAGE_SIZE);
2174 				}
2175 			} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
2176 			    !is_reusable &&
2177 			    pmap != kernel_pmap) {
2178 				/* one more "reusable" */
2179 				pmap_ledger_credit(pmap, task_ledgers.reusable, PAGE_SIZE);
2180 				if (is_internal) {
2181 					/* one less "internal" */
2182 					if (is_altacct) {
2183 						/* no impact on footprint */
2184 					} else {
2185 						pmap_ledger_debit(pmap,
2186 						    task_ledgers.internal,
2187 						    PAGE_SIZE);
2188 						pmap_ledger_debit(
2189 							pmap,
2190 							task_ledgers.phys_footprint,
2191 							PAGE_SIZE);
2192 					}
2193 				} else {
2194 					/* one less "external" */
2195 					pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
2196 				}
2197 			}
2198 
2199 			pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
2200 		} while (pv_e != (pv_hashed_entry_t)pv_h);
2201 	}
2202 	/* Opportunistic refmod collection, annulled
2203 	 * if both REF and MOD are being cleared.
2204 	 */
2205 
2206 	pmap_phys_attributes[pai] |= attributes;
2207 
2208 	if (ept_keep_global_mod) {
2209 		/*
2210 		 * If the hardware doesn't support AD bits for EPT PTEs and someone is
2211 		 * requesting that we clear the modified bit for a phys page, we need
2212 		 * to ensure that there are no EPT mappings for the page with the
2213 		 * modified bit set. If there are, we cannot clear the global modified bit.
2214 		 */
2215 		bits &= ~PHYS_MODIFIED;
2216 	}
2217 	pmap_phys_attributes[pai] &= ~(bits);
2218 
2219 	/* update this page's "reusable" status */
2220 	if (options & PMAP_OPTIONS_CLEAR_REUSABLE) {
2221 		pmap_phys_attributes[pai] &= ~PHYS_REUSABLE;
2222 	} else if (options & PMAP_OPTIONS_SET_REUSABLE) {
2223 		pmap_phys_attributes[pai] |= PHYS_REUSABLE;
2224 	}
2225 
2226 	UNLOCK_PVH(pai);
2227 
2228 	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
2229 }
2230 
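/*
 * Minimal sketch, assuming the usual wrapper pattern (wrappers not shown in
 * this excerpt): the exported clear-modify/clear-reference operations reduce
 * to phys_attribute_clear() with the matching bit, e.g.:
 *
 *	phys_attribute_clear(pn, PHYS_MODIFIED, 0, NULL);
 *	phys_attribute_clear(pn, PHYS_REFERENCED, 0, NULL);
 *
 * With options == 0 the per-PTE TLB flush stays enabled, which satisfies the
 * "must flush when clearing 'modified'" check at the top of the routine.
 */
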
2231 /*
2232  *	Check specified attribute bits.
2233  */
2234 int
2235 phys_attribute_test(
2236 	ppnum_t         pn,
2237 	int             bits)
2238 {
2239 	pv_rooted_entry_t       pv_h;
2240 	pv_hashed_entry_t       pv_e;
2241 	pt_entry_t              *pte;
2242 	int                     pai;
2243 	pmap_t                  pmap;
2244 	int                     attributes = 0;
2245 	boolean_t               is_ept;
2246 
2247 	pmap_intr_assert();
2248 	assert(pn != vm_page_fictitious_addr);
2249 	assert((bits & ~(PHYS_MODIFIED | PHYS_REFERENCED)) == 0);
2250 	if (pn == vm_page_guard_addr) {
2251 		return 0;
2252 	}
2253 
2254 	pai = ppn_to_pai(pn);
2255 
2256 	if (!IS_MANAGED_PAGE(pai)) {
2257 		/*
2258 		 *	Not a managed page.
2259 		 */
2260 		return 0;
2261 	}
2262 
2263 	/*
2264 	 * Fast check: if the bits have already been collected,
2265 	 * there is no need to take any locks.
2266 	 * If they are not set, we need to recheck after taking
2267 	 * the lock, in case they got pulled in while
2268 	 * we were waiting for the lock.
2269 	 */
2270 	if ((pmap_phys_attributes[pai] & bits) == bits) {
2271 		return bits;
2272 	}
2273 
2274 	pv_h = pai_to_pvh(pai);
2275 
2276 	LOCK_PVH(pai);
2277 
2278 	attributes = pmap_phys_attributes[pai] & bits;
2279 
2280 
2281 	/*
2282 	 * Walk down PV list, checking the mappings until we
2283 	 * reach the end or we've found the desired attributes.
2284 	 */
2285 	if (attributes != bits &&
2286 	    pv_h->pmap != PMAP_NULL) {
2287 		/*
2288 		 * There are some mappings.
2289 		 */
2290 		pv_e = (pv_hashed_entry_t)pv_h;
2291 		do {
2292 			vm_map_offset_t va;
2293 
2294 			pmap = pv_e->pmap;
2295 			is_ept = is_ept_pmap(pmap);
2296 			va = PVE_VA(pv_e);
2297 			/*
2298 			 * pick up modify and/or reference bits from mapping
2299 			 */
2300 
2301 			pte = pmap_pte(pmap, va);
2302 			if (!is_ept) {
2303 				attributes |= (int)(*pte & bits);
2304 			} else {
2305 				attributes |= (int)(ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED));
2306 			}
2307 
2308 			pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
2309 		} while ((attributes != bits) &&
2310 		    (pv_e != (pv_hashed_entry_t)pv_h));
2311 	}
2312 	pmap_phys_attributes[pai] |= attributes;
2313 
2314 	UNLOCK_PVH(pai);
2315 	return attributes;
2316 }
2317 
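/*
 * Illustrative sketch (hypothetical callers): modified/referenced queries
 * reduce to phys_attribute_test() with the matching bit, e.g.:
 *
 *	boolean_t dirty = (phys_attribute_test(pn, PHYS_MODIFIED) != 0);
 *	boolean_t used  = (phys_attribute_test(pn, PHYS_REFERENCED) != 0);
 *
 * The fast path above returns without locking when the bits are already
 * cached in pmap_phys_attributes[].
 */
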
2318 /*
2319  *	Routine:	pmap_change_wiring
2320  *	Function:	Change the wiring attribute for a map/virtual-address
2321  *			pair.
2322  *	In/out conditions:
2323  *			The mapping must already exist in the pmap.
2324  */
2325 void
2326 pmap_change_wiring(
2327 	pmap_t          map,
2328 	vm_map_offset_t vaddr,
2329 	boolean_t       wired)
2330 {
2331 	pt_entry_t      *pte;
2332 
2333 	PMAP_LOCK_SHARED(map);
2334 
2335 	if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL) {
2336 		panic("pmap_change_wiring(%p,0x%llx,%d): pte missing",
2337 		    map, vaddr, wired);
2338 	}
2339 
2340 	if (wired && !iswired(*pte)) {
2341 		/*
2342 		 * wiring down mapping
2343 		 */
2344 		pmap_ledger_credit(map, task_ledgers.wired_mem, PAGE_SIZE);
2345 		pmap_update_pte(is_ept_pmap(map), pte, 0, PTE_WIRED, false);
2346 	} else if (!wired && iswired(*pte)) {
2347 		/*
2348 		 * unwiring mapping
2349 		 */
2350 		pmap_ledger_debit(map, task_ledgers.wired_mem, PAGE_SIZE);
2351 		pmap_update_pte(is_ept_pmap(map), pte, PTE_WIRED, 0, false);
2352 	}
2353 
2354 	PMAP_UNLOCK_SHARED(map);
2355 }
2356 
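/*
 * Minimal sketch (hypothetical caller): when a region is wired or unwired,
 * each existing mapping is flipped one page at a time, e.g.:
 *
 *	for (vm_map_offset_t va = start; va < end; va += PAGE_SIZE) {
 *		pmap_change_wiring(map, va, TRUE);
 *	}
 *
 * start/end are placeholders; the mapping must already exist, otherwise the
 * missing-PTE panic above fires.
 */
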
2357 /*
2358  *	"Backdoor" direct map routine for early mappings.
2359  *      Useful for mapping memory outside the managed physical range
2360  *      (e.g. device memory). Sets A, D and NC if requested.
2361  */
2362 
2363 vm_offset_t
2364 pmap_map_bd(
2365 	vm_offset_t     virt,
2366 	vm_map_offset_t start_addr,
2367 	vm_map_offset_t end_addr,
2368 	vm_prot_t       prot,
2369 	unsigned int    flags)
2370 {
2371 	pt_entry_t      template;
2372 	pt_entry_t      *ptep;
2373 
2374 	vm_offset_t     base = virt;
2375 	boolean_t       doflush = FALSE;
2376 
2377 	template = pa_to_pte(start_addr)
2378 	    | INTEL_PTE_REF
2379 	    | INTEL_PTE_MOD
2380 	    | INTEL_PTE_WIRED
2381 	    | INTEL_PTE_VALID;
2382 
2383 	if ((flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)) == VM_MEM_NOT_CACHEABLE) {
2384 		template |= INTEL_PTE_NCACHE;
2385 		if (!(flags & (VM_MEM_GUARDED))) {
2386 			template |= INTEL_PTE_PAT;
2387 		}
2388 	}
2389 
2390 	if ((prot & VM_PROT_EXECUTE) == 0) {
2391 		template |= INTEL_PTE_NX;
2392 	}
2393 
2394 	if (prot & VM_PROT_WRITE) {
2395 		template |= INTEL_PTE_WRITE;
2396 	}
2397 	vm_map_offset_t caddr = start_addr;
2398 	while (caddr < end_addr) {
2399 		ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)virt);
2400 		if (ptep == PT_ENTRY_NULL) {
2401 			panic("pmap_map_bd: Invalid kernel address");
2402 		}
2403 		if (pte_to_pa(*ptep)) {
2404 			doflush = TRUE;
2405 		}
2406 		pmap_store_pte(FALSE, ptep, template);
2407 		pte_increment_pa(template);
2408 		virt += PAGE_SIZE;
2409 		caddr += PAGE_SIZE;
2410 	}
2411 	if (doflush) {
2412 		pmap_tlbi_range(0, ~0ULL, true, 0);
2413 		PMAP_UPDATE_TLBS(kernel_pmap, base, base + end_addr - start_addr);
2414 	}
2415 	return virt;
2416 }
2417 
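/*
 * Illustrative early-boot sketch (hypothetical addresses): mapping a device
 * register window uncached through pmap_map_bd(), e.g.:
 *
 *	vm_offset_t next_va;
 *
 *	next_va = pmap_map_bd(io_va, mmio_base, mmio_base + PAGE_SIZE,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_MEM_NOT_CACHEABLE);
 *
 * io_va/mmio_base are placeholders. The return value is the first virtual
 * address past the new mapping; any pre-existing translation in the range
 * triggers the full TLB flush above.
 */
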
2418 /* Create a virtual alias beginning at 'ava' of the specified kernel virtual
2419  * range. The aliased pagetable range is expanded if
2420  * PMAP_EXPAND_OPTIONS_ALIASMAP is specified. Performs no synchronization,
2421  * assumes caller has stabilized the source and destination ranges. Currently
2422  * used to populate sections of the trampoline "doublemap" at CPU startup.
2423  */
2424 
2425 void
2426 pmap_alias(
2427 	vm_offset_t     ava,
2428 	vm_map_offset_t start_addr,
2429 	vm_map_offset_t end_addr,
2430 	vm_prot_t       prot,
2431 	unsigned int    eoptions)
2432 {
2433 	pt_entry_t      prot_template, template;
2434 	pt_entry_t      *aptep, *sptep;
2435 
2436 	prot_template =  INTEL_PTE_REF | INTEL_PTE_MOD | INTEL_PTE_WIRED | INTEL_PTE_VALID;
2437 	if ((prot & VM_PROT_EXECUTE) == 0) {
2438 		prot_template |= INTEL_PTE_NX;
2439 	}
2440 
2441 	if (prot & VM_PROT_WRITE) {
2442 		prot_template |= INTEL_PTE_WRITE;
2443 	}
2444 	assert(((start_addr | end_addr) & PAGE_MASK) == 0);
2445 	while (start_addr < end_addr) {
2446 		aptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ava);
2447 		if (aptep == PT_ENTRY_NULL) {
2448 			if (eoptions & PMAP_EXPAND_OPTIONS_ALIASMAP) {
2449 				pmap_expand(kernel_pmap, ava, PMAP_EXPAND_OPTIONS_ALIASMAP);
2450 				aptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ava);
2451 			} else {
2452 				panic("pmap_alias: Invalid alias address");
2453 			}
2454 		}
2455 		/* The aliased range should not have any active mappings */
2456 		assert(pte_to_pa(*aptep) == 0);
2457 
2458 		sptep = pmap_pte(kernel_pmap, start_addr);
2459 		assert(sptep != PT_ENTRY_NULL && (pte_to_pa(*sptep) != 0));
2460 		template = pa_to_pte(pte_to_pa(*sptep)) | prot_template;
2461 		pmap_store_pte(FALSE, aptep, template);
2462 
2463 		ava += PAGE_SIZE;
2464 		start_addr += PAGE_SIZE;
2465 	}
2466 }
2467 
2468 mach_vm_size_t
2469 pmap_query_resident(
2470 	pmap_t          pmap,
2471 	addr64_t        s64,
2472 	addr64_t        e64,
2473 	mach_vm_size_t  *compressed_bytes_p)
2474 {
2475 	pt_entry_t     *pde;
2476 	pt_entry_t     *spte, *epte;
2477 	addr64_t        l64;
2478 	uint64_t        deadline = 0;
2479 	mach_vm_size_t  resident_bytes;
2480 	mach_vm_size_t  compressed_bytes;
2481 	boolean_t       is_ept;
2482 
2483 	pmap_intr_assert();
2484 
2485 	if (pmap == PMAP_NULL || pmap == kernel_pmap || s64 == e64) {
2486 		if (compressed_bytes_p) {
2487 			*compressed_bytes_p = 0;
2488 		}
2489 		return 0;
2490 	}
2491 
2492 	is_ept = is_ept_pmap(pmap);
2493 
2494 	PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
2495 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(s64),
2496 	    VM_KERNEL_ADDRHIDE(e64));
2497 
2498 	resident_bytes = 0;
2499 	compressed_bytes = 0;
2500 
2501 	PMAP_LOCK_EXCLUSIVE(pmap);
2502 	uint32_t traverse_count = 0;
2503 
2504 	while (s64 < e64) {
2505 		if (os_add_overflow(s64, PDE_MAPPED_SIZE, &l64)) {
2506 			l64 = e64;
2507 		} else {
2508 			l64 &= ~(PDE_MAPPED_SIZE - 1);
2509 
2510 			if (l64 > e64) {
2511 				l64 = e64;
2512 			}
2513 		}
2514 
2515 		pde = pmap_pde(pmap, s64);
2516 
2517 		if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
2518 			if (*pde & PTE_PS) {
2519 				/* superpage: not supported */
2520 			} else {
2521 				spte = pmap_pte(pmap,
2522 				    (s64 & ~(PDE_MAPPED_SIZE - 1)));
2523 				spte = &spte[ptenum(s64)];
2524 				epte = &spte[intel_btop(l64 - s64)];
2525 
2526 				for (; spte < epte; spte++) {
2527 					if (pte_to_pa(*spte) != 0) {
2528 						resident_bytes += PAGE_SIZE;
2529 					} else if (*spte & PTE_COMPRESSED) {
2530 						compressed_bytes += PAGE_SIZE;
2531 					}
2532 				}
2533 			}
2534 		}
2535 		s64 = l64;
2536 
2537 		if ((s64 < e64) && (traverse_count++ > PLCHECK_THRESHOLD)) {
2538 			if (deadline == 0) {
2539 				deadline = rdtsc64() + max_preemption_latency_tsc;
2540 			} else {
2541 				if (rdtsc64() > deadline) {
2542 					PMAP_UNLOCK_EXCLUSIVE(pmap);
2543 					__builtin_ia32_pause();
2544 					PMAP_LOCK_EXCLUSIVE(pmap);
2545 					deadline = rdtsc64() + max_preemption_latency_tsc;
2546 				}
2547 			}
2548 		}
2549 	}
2550 
2551 	PMAP_UNLOCK_EXCLUSIVE(pmap);
2552 
2553 	PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
2554 	    resident_bytes);
2555 
2556 	if (compressed_bytes_p) {
2557 		*compressed_bytes_p = compressed_bytes;
2558 	}
2559 	return resident_bytes;
2560 }
2561 
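/*
 * Minimal sketch (hypothetical caller): accounting for a user VA range,
 * splitting resident bytes from compressor-backed bytes, e.g.:
 *
 *	mach_vm_size_t compressed = 0;
 *	mach_vm_size_t resident;
 *
 *	resident = pmap_query_resident(map->pmap, entry_start, entry_end,
 *	    &compressed);
 *
 * map/entry_start/entry_end are placeholders. The kernel pmap and empty
 * ranges short-circuit to zero, and the walk periodically drops the pmap
 * lock to bound preemption latency.
 */
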
2562 uint64_t pmap_query_page_info_retries;
2563 
2564 kern_return_t
2565 pmap_query_page_info(
2566 	pmap_t          pmap,
2567 	vm_map_offset_t va,
2568 	int             *disp_p)
2569 {
2570 	int             disp;
2571 	boolean_t       is_ept;
2572 	pmap_paddr_t    pa;
2573 	ppnum_t         pai;
2574 	pd_entry_t      *pde_p;
2575 	pt_entry_t      *pte_p, pte;
2576 
2577 	pmap_intr_assert();
2578 	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
2579 		*disp_p = 0;
2580 		return KERN_INVALID_ARGUMENT;
2581 	}
2582 
2583 	disp = 0;
2584 	is_ept = is_ept_pmap(pmap);
2585 
2586 	PMAP_LOCK_EXCLUSIVE(pmap);
2587 
2588 	pde_p = pmap_pde(pmap, va);
2589 	if (!pde_p ||
2590 	    !(*pde_p & PTE_VALID_MASK(is_ept)) ||
2591 	    (*pde_p & PTE_PS)) {
2592 		goto done;
2593 	}
2594 
2595 try_again:
2596 	disp = 0;
2597 
2598 	pte_p = pmap_pte(pmap, va);
2599 	if (pte_p == PT_ENTRY_NULL) {
2600 		goto done;
2601 	}
2602 
2603 	pte = *pte_p;
2604 	pa = pte_to_pa(pte);
2605 	if (pa == 0) {
2606 		if (PTE_IS_COMPRESSED(pte, pte_p, pmap, va)) {
2607 			disp |= PMAP_QUERY_PAGE_COMPRESSED;
2608 			if (pte & PTE_COMPRESSED_ALT) {
2609 				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
2610 			}
2611 		}
2612 	} else {
2613 		disp |= PMAP_QUERY_PAGE_PRESENT;
2614 		pai = pa_index(pa);
2615 		if (!IS_MANAGED_PAGE(pai)) {
2616 		} else if (pmap_pv_is_altacct(pmap, va, pai)) {
2617 			assert(IS_INTERNAL_PAGE(pai));
2618 			disp |= PMAP_QUERY_PAGE_INTERNAL;
2619 			disp |= PMAP_QUERY_PAGE_ALTACCT;
2620 		} else if (IS_REUSABLE_PAGE(pai)) {
2621 			disp |= PMAP_QUERY_PAGE_REUSABLE;
2622 		} else if (IS_INTERNAL_PAGE(pai)) {
2623 			disp |= PMAP_QUERY_PAGE_INTERNAL;
2624 		}
2625 	}
2626 	if (__improbable(pte_p != pmap_pte(pmap, va) || pte != *pte_p)) {
2627 		/* something changed: try again */
2628 		pmap_query_page_info_retries++;
2629 		goto try_again;
2630 	}
2631 done:
2632 	PMAP_UNLOCK_EXCLUSIVE(pmap);
2633 	*disp_p = disp;
2634 	return KERN_SUCCESS;
2635 }
2636 
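/*
 * Illustrative sketch (hypothetical caller): decoding the disposition bits
 * returned for a single virtual page, e.g.:
 *
 *	int disp = 0;
 *
 *	if (pmap_query_page_info(map->pmap, va, &disp) == KERN_SUCCESS &&
 *	    (disp & PMAP_QUERY_PAGE_COMPRESSED)) {
 *		... the page is backed by the compressor, not resident ...
 *	}
 *
 * map/va are placeholders; PMAP_QUERY_PAGE_PRESENT, _INTERNAL, _REUSABLE and
 * _ALTACCT can be tested the same way.
 */
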
2637 void
2638 pmap_set_vm_map_cs_enforced(
2639 	pmap_t pmap,
2640 	bool new_value)
2641 {
2642 	PMAP_LOCK_EXCLUSIVE(pmap);
2643 	pmap->pm_vm_map_cs_enforced = new_value;
2644 	PMAP_UNLOCK_EXCLUSIVE(pmap);
2645 }
2646 extern int cs_process_enforcement_enable;
2647 bool
2648 pmap_get_vm_map_cs_enforced(
2649 	pmap_t pmap)
2650 {
2651 	if (cs_process_enforcement_enable) {
2652 		return true;
2653 	}
2654 	return pmap->pm_vm_map_cs_enforced;
2655 }
2656 
2657 void
2658 pmap_set_jit_entitled(__unused pmap_t pmap)
2659 {
2660 	/* The x86 pmap layer does not care if a map has a JIT entry. */
2661 	return;
2662 }
2663 
2664 bool
2665 pmap_get_jit_entitled(__unused pmap_t pmap)
2666 {
2667 	/* The x86 pmap layer does not care if a map is using JIT. */
2668 	return false;
2669 }
2670 
2671 void
2672 pmap_set_tpro(__unused pmap_t pmap)
2673 {
2674 	/* The x86 pmap layer does not care if a map is using TPRO */
2675 	return;
2676 }
2677 
2678 bool
2679 pmap_get_tpro(__unused pmap_t pmap)
2680 {
2681 	/* The x86 pmap layer does not care if a map is using TPRO */
2682 	return false;
2683 }
2684 
2685 bool
2686 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
2687 {
2688 	/*
2689 	 * The x86 pmap layer does not apply any policy to any protection
2690 	 * types.
2691 	 */
2692 	return false;
2693 }
2694 
2695 uint64_t
2696 pmap_release_pages_fast(void)
2697 {
2698 	return 0;
2699 }
2700 
2701 void
2702 pmap_trim(__unused pmap_t grand, __unused pmap_t subord, __unused addr64_t vstart, __unused uint64_t size)
2703 {
2704 	return;
2705 }
2706 
2707 __dead2
2708 void
2709 pmap_ledger_verify_size(size_t size)
2710 {
2711 	panic("%s: unsupported, "
2712 	    "size=%lu",
2713 	    __func__, size);
2714 }
2715 
2716 __dead2
2717 ledger_t
2718 pmap_ledger_alloc(void)
2719 {
2720 	panic("%s: unsupported",
2721 	    __func__);
2722 }
2723 
2724 __dead2
2725 void
2726 pmap_ledger_free(ledger_t ledger)
2727 {
2728 	panic("%s: unsupported, "
2729 	    "ledger=%p",
2730 	    __func__, ledger);
2731 }
2732 
2733 kern_return_t
2734 pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
2735     unsigned int level_mask __unused, size_t *bytes_copied __unused)
2736 {
2737 	return KERN_NOT_SUPPORTED;
2738 }
2739 
2740 void *
2741 pmap_map_compressor_page(ppnum_t pn)
2742 {
2743 	assertf(IS_MANAGED_PAGE(ppn_to_pai(pn)), "%s called on non-managed page 0x%08x", __func__, pn);
2744 	return PHYSMAP_PTOV((uint64_t)pn << (uint64_t)PAGE_SHIFT);
2745 }
2746 
2747 void
2748 pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
2749 {
2750 }
2751 
2752 bool
2753 pmap_clear_refmod_range_options(
2754 	pmap_t pmap __unused,
2755 	vm_map_address_t start __unused,
2756 	vm_map_address_t end __unused,
2757 	unsigned int mask __unused,
2758 	unsigned int options __unused)
2759 {
2760 	/*
2761 	 * x86 doesn't have ranged tlbi instructions, and we already have
2762 	 * the pmap_flush_context. This operation isn't implemented.
2763 	 */
2764 	return false;
2765 }
2766 
2767 bool
2768 pmap_supported_feature(pmap_t pmap, pmap_feature_flags_t feat)
2769 {
2770 	switch (feat) {
2771 	case PMAP_FEAT_UEXEC:
2772 		return pmap != NULL && is_ept_pmap(pmap);
2773 	default:
2774 		return false;
2775 	}
2776 }
2777
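/*
 * Illustrative sketch (hypothetical caller): optional behavior should be
 * gated on a per-pmap feature probe, e.g.:
 *
 *	if (pmap_supported_feature(pmap, PMAP_FEAT_UEXEC)) {
 *		... UEXEC is only reported for EPT pmaps on x86 ...
 *	}
 */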