xref: /xnu-11215.61.5/osfmk/i386/pmap_x86_common.c (revision 4f1223e81cd707a65cc109d0b8ad6653699da3c4)
1 /*
2  * Copyright (c) 2000-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <mach_assert.h>
30 
31 #include <vm/pmap.h>
32 #include <vm/vm_map_xnu.h>
33 #include <vm/vm_kern_xnu.h>
34 #include <vm/vm_page_internal.h>
35 #include <kern/ledger.h>
36 #include <kern/zalloc_internal.h>
37 #include <i386/pmap_internal.h>
38 
/*
 * Forward declarations for the PTE-range teardown/reuse helpers defined
 * later in this file (bodies not visible in this chunk).
 */
39 void            pmap_remove_range(
40 	pmap_t          pmap,
41 	vm_map_offset_t va,
42 	pt_entry_t      *spte,
43 	pt_entry_t      *epte);
44 
/* Option-taking variant of pmap_remove_range(); file-local. */
45 static void            pmap_remove_range_options(
46 	pmap_t          pmap,
47 	vm_map_offset_t va,
48 	pt_entry_t      *spte,
49 	pt_entry_t      *epte,
50 	int             options);
51 
/* Marks/unmarks a PTE range as reusable (see boolean_t reusable flag). */
52 void            pmap_reusable_range(
53 	pmap_t          pmap,
54 	vm_map_offset_t va,
55 	pt_entry_t      *spte,
56 	pt_entry_t      *epte,
57 	boolean_t       reusable);
58 
/*
 * Pointer to a PTE detected as corrupted — presumably recorded for
 * post-mortem inspection; the writer is elsewhere in this file.
 * NOTE(review): setter not visible in this chunk — confirm.
 */
59 pt_entry_t *PTE_corrupted_ptr;
60 
/* Debug-only fault-injection knob and PTE-update telemetry counters. */
61 #if DEVELOPMENT || DEBUG
62 int pmap_inject_pte_corruption;
63 uint32_t pmap_update_clear_pte_count;
64 uint32_t pmap_update_invalid_pte_count;
65 #endif
66 
67 /*
68  * The Intel platform can nest at the PDE level, so NBPDE (i.e. 2MB) at a time,
69  * on a NBPDE boundary.
70  */
71 
72 uint64_t
pmap_shared_region_size_min(__unused pmap_t pmap)73 pmap_shared_region_size_min(__unused pmap_t pmap)
74 {
75 	return NBPDE;
76 }
77 
78 uint64_t
pmap_commpage_size_min(__unused pmap_t pmap)79 pmap_commpage_size_min(__unused pmap_t pmap)
80 {
81 	return NBPDE;
82 }
83 
84 /*
85  *	kern_return_t pmap_nest(grand, subord, va_start, size)
86  *
87  *	grand  = the pmap that we will nest subord into
88  *	subord = the pmap that goes into the grand
89  *	va_start  = start of range in pmap to be inserted
90  *	size   = Size of nest area (up to 16TB)
91  *
92  *	Inserts a pmap into another.  This is used to implement shared segments.
93  *
94  *	Note that we depend upon higher level VM locks to insure that things don't change while
95  *	we are doing this.  For example, VM should not be doing any pmap enters while it is nesting
96  *	or do 2 nests at once.
97  */
98 
99 /*
100  * This routine can nest subtrees either at the PDPT level (1GiB) or at the
101  * PDE level (2MiB). We currently disallow disparate offsets for the "subord"
102  * container and the "grand" parent. A minor optimization to consider for the
103  * future: make the "subord" truly a container rather than a full-fledged
104  * pagetable hierarchy which can be unnecessarily sparse (DRK).
105  */
106 
107 kern_return_t
pmap_nest(pmap_t grand,pmap_t subord,addr64_t va_start,uint64_t size)108 pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, uint64_t size)
109 {
110 	vm_map_offset_t vaddr;
111 	pd_entry_t      *pde, *npde;
112 	unsigned int    i;
113 	uint64_t        num_pde;
114 
	/* Nesting is not supported for EPT pmaps. */
115 	assert(!is_ept_pmap(grand));
116 	assert(!is_ept_pmap(subord));
117 
	/*
	 * Both size and start must be aligned to the minimum nesting
	 * granule, and the range must not exceed 16TB.
	 */
118 	if ((size & (pmap_shared_region_size_min(grand) - 1)) ||
119 	    (va_start & (pmap_shared_region_size_min(grand) - 1)) ||
120 	    ((size >> 28) > 65536)) {   /* Max size we can nest is 16TB */
121 		return KERN_INVALID_VALUE;
122 	}
123 
124 	if (size == 0) {
125 		panic("pmap_nest: size is invalid - %016llX", size);
126 	}
127 
128 	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
129 	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
130 	    VM_KERNEL_ADDRHIDE(va_start));
131 
132 	vaddr = (vm_map_offset_t)va_start;
133 	num_pde = size >> PDESHIFT;
134 
135 	PMAP_LOCK_EXCLUSIVE(subord);
136 
137 	subord->pm_shared = TRUE;
138 
	/*
	 * Pass 1: make sure "subord" has paging structures covering the
	 * whole range, expanding it where needed.  PDPT-aligned chunks of
	 * at least NPDEPG PDEs are handled one PDPT entry (NBPDPT) at a
	 * time and tagged INTEL_PDPTE_NESTED so pmap_unnest() can
	 * recognize them; the remainder is walked PDE by PDE (NBPDE).
	 */
139 	for (i = 0; i < num_pde;) {
140 		if (((vaddr & PDPTMASK) == 0) && (num_pde - i) >= NPDEPG) {
141 			npde = pmap64_pdpt(subord, vaddr);
142 
			/*
			 * Expansion must happen with the lock dropped;
			 * re-fetch and re-check after relocking.
			 */
143 			while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
144 				PMAP_UNLOCK_EXCLUSIVE(subord);
145 				pmap_expand_pdpt(subord, vaddr, PMAP_EXPAND_OPTIONS_NONE);
146 				PMAP_LOCK_EXCLUSIVE(subord);
147 				npde = pmap64_pdpt(subord, vaddr);
148 			}
149 			*npde |= INTEL_PDPTE_NESTED;
150 			vaddr += NBPDPT;
151 			i += (uint32_t)NPDEPG;
152 		} else {
153 			npde = pmap_pde(subord, vaddr);
154 
155 			while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
156 				PMAP_UNLOCK_EXCLUSIVE(subord);
157 				pmap_expand(subord, vaddr, PMAP_EXPAND_OPTIONS_NONE);
158 				PMAP_LOCK_EXCLUSIVE(subord);
159 				npde = pmap_pde(subord, vaddr);
160 			}
161 			vaddr += NBPDE;
162 			i++;
163 		}
164 	}
165 
166 	PMAP_UNLOCK_EXCLUSIVE(subord);
167 
168 	vaddr = (vm_map_offset_t)va_start;
169 
170 	PMAP_LOCK_EXCLUSIVE(grand);
171 
	/*
	 * Pass 2: copy the subordinate's PDPT/PDE entries into "grand" so
	 * both pmaps share the same lower-level page tables for the range.
	 */
172 	for (i = 0; i < num_pde;) {
173 		pd_entry_t tpde;
174 
175 		if (((vaddr & PDPTMASK) == 0) && ((num_pde - i) >= NPDEPG)) {
176 			npde = pmap64_pdpt(subord, vaddr);
177 			if (npde == 0) {
178 				panic("pmap_nest: no PDPT, subord %p nstart 0x%llx", subord, vaddr);
179 			}
180 			tpde = *npde;
			/* Expand grand's PML4 level if the PDPT slot is absent. */
181 			pde = pmap64_pdpt(grand, vaddr);
182 			if (0 == pde) {
183 				PMAP_UNLOCK_EXCLUSIVE(grand);
184 				pmap_expand_pml4(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
185 				PMAP_LOCK_EXCLUSIVE(grand);
186 				pde = pmap64_pdpt(grand, vaddr);
187 			}
188 			if (pde == 0) {
189 				panic("pmap_nest: no PDPT, grand  %p vaddr 0x%llx", grand, vaddr);
190 			}
191 			pmap_store_pte(FALSE, pde, tpde);
192 			vaddr += NBPDPT;
193 			i += (uint32_t) NPDEPG;
194 		} else {
195 			npde = pmap_pde(subord, vaddr);
196 			if (npde == 0) {
197 				panic("pmap_nest: no npde, subord %p vaddr 0x%llx", subord, vaddr);
198 			}
199 			tpde = *npde;
200 			pde = pmap_pde(grand, vaddr);
201 			if (0 == pde) {
202 				PMAP_UNLOCK_EXCLUSIVE(grand);
203 				pmap_expand_pdpt(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
204 				PMAP_LOCK_EXCLUSIVE(grand);
205 				pde = pmap_pde(grand, vaddr);
206 			}
207 
208 			if (pde == 0) {
209 				panic("pmap_nest: no pde, grand  %p vaddr 0x%llx", grand, vaddr);
210 			}
211 			vaddr += NBPDE;
212 			pmap_store_pte(FALSE, pde, tpde);
213 			i++;
214 		}
215 	}
216 
217 	PMAP_UNLOCK_EXCLUSIVE(grand);
218 
219 	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, KERN_SUCCESS);
220 
221 	return KERN_SUCCESS;
222 }
223 
224 /*
225  *	kern_return_t pmap_unnest(grand, vaddr)
226  *
227  *	grand  = the pmap that we will un-nest subord from
228  *	vaddr  = start of range in pmap to be unnested
229  *
230  *	Removes a pmap from another.  This is used to implement shared segments.
231  */
232 
233 kern_return_t
pmap_unnest(pmap_t grand,addr64_t vaddr,uint64_t size)234 pmap_unnest(pmap_t grand, addr64_t vaddr, uint64_t size)
235 {
236 	pd_entry_t *pde;
237 	unsigned int i;
238 	uint64_t num_pde;
239 	addr64_t va_start, va_end;
	/* Last PDPT slot visited; lets us re-check the PDPT only on boundary crossings. */
240 	uint64_t npdpt = PMAP_INVALID_PDPTNUM;
241 
242 	PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
243 	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
244 
	/* Start and size must both be aligned to the nesting granule. */
245 	if ((size & (pmap_shared_region_size_min(grand) - 1)) ||
246 	    (vaddr & (pmap_shared_region_size_min(grand) - 1))) {
247 		panic("pmap_unnest(%p,0x%llx,0x%llx): unaligned...",
248 		    grand, vaddr, size);
249 	}
250 
	/* Nesting/unnesting is not supported for EPT pmaps. */
251 	assert(!is_ept_pmap(grand));
252 
253 	/* align everything to PDE boundaries */
254 	va_start = vaddr & ~(NBPDE - 1);
255 
	/* Round the end up to a PDE boundary, guarding against 64-bit wrap. */
256 	if (os_add_overflow(vaddr, size + NBPDE - 1, &va_end)) {
257 		panic("pmap_unnest: Overflow when calculating range end: s=0x%llx sz=0x%llx\n", vaddr, size);
258 	}
259 
260 	va_end &= ~(NBPDE - 1);
261 	size = va_end - va_start;
262 
263 	PMAP_LOCK_EXCLUSIVE(grand);
264 
265 	num_pde = size >> PDESHIFT;
266 	vaddr = va_start;
267 
268 	for (i = 0; i < num_pde;) {
		/*
		 * When entering a new PDPT slot, clear it wholesale (NBPDPT
		 * at a time) if it was nested at the PDPT level.
		 */
269 		if (pdptnum(grand, vaddr) != npdpt) {
270 			npdpt = pdptnum(grand, vaddr);
271 			pde = pmap64_pdpt(grand, vaddr);
272 			if (pde && (*pde & INTEL_PDPTE_NESTED)) {
273 				pmap_store_pte(FALSE, pde, (pd_entry_t)0);
274 				i += (uint32_t) NPDEPG;
275 				vaddr += NBPDPT;
276 				continue;
277 			}
278 		}
		/* Otherwise clear the nested mapping one PDE at a time. */
279 		pde = pmap_pde(grand, (vm_map_offset_t)vaddr);
280 		if (pde == 0) {
281 			panic("pmap_unnest: no pde, grand %p vaddr 0x%llx", grand, vaddr);
282 		}
283 		pmap_store_pte(FALSE, pde, (pd_entry_t)0);
284 		i++;
285 		vaddr += NBPDE;
286 	}
287 
	/* Invalidate stale translations for the whole range. */
288 	PMAP_UPDATE_TLBS(grand, va_start, va_end);
289 
290 	PMAP_UNLOCK_EXCLUSIVE(grand);
291 
292 	PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
293 
294 	return KERN_SUCCESS;
295 }
296 
297 kern_return_t
pmap_unnest_options(pmap_t grand,addr64_t vaddr,__unused uint64_t size,__unused unsigned int options)298 pmap_unnest_options(
299 	pmap_t grand,
300 	addr64_t vaddr,
301 	__unused uint64_t size,
302 	__unused unsigned int options)
303 {
304 	return pmap_unnest(grand, vaddr, size);
305 }
306 
307 /* Invoked by the Mach VM to determine the platform specific unnest region */
308 
309 boolean_t
pmap_adjust_unnest_parameters(pmap_t p,vm_map_offset_t * s,vm_map_offset_t * e)310 pmap_adjust_unnest_parameters(pmap_t p, vm_map_offset_t *s, vm_map_offset_t *e)
311 {
312 	pd_entry_t *pdpte;
313 	boolean_t rval = FALSE;
314 
315 	PMAP_LOCK_EXCLUSIVE(p);
316 
317 	pdpte = pmap64_pdpt(p, *s);
318 	if (pdpte && (*pdpte & INTEL_PDPTE_NESTED)) {
319 		*s &= ~(NBPDPT - 1);
320 		rval = TRUE;
321 	}
322 
323 	pdpte = pmap64_pdpt(p, *e);
324 	if (pdpte && (*pdpte & INTEL_PDPTE_NESTED)) {
325 		*e = ((*e + NBPDPT) & ~(NBPDPT - 1));
326 		rval = TRUE;
327 	}
328 
329 	PMAP_UNLOCK_EXCLUSIVE(p);
330 
331 	return rval;
332 }
333 
334 pmap_paddr_t
pmap_find_pa(pmap_t pmap,addr64_t va)335 pmap_find_pa(pmap_t pmap, addr64_t va)
336 {
337 	pt_entry_t      *ptp;
338 	pd_entry_t      *pdep;
339 	pd_entry_t      pde;
340 	pt_entry_t      pte;
341 	boolean_t       is_ept, locked = FALSE;
342 	pmap_paddr_t    pa = 0;
343 
344 	is_ept = is_ept_pmap(pmap);
345 
346 	if ((pmap != kernel_pmap) && not_in_kdp) {
347 		PMAP_LOCK_EXCLUSIVE(pmap);
348 		locked = TRUE;
349 	} else {
350 		mp_disable_preemption();
351 	}
352 
353 	if (os_ref_get_count(&pmap->ref_count) == 0) {
354 		goto pfp_exit;
355 	}
356 
357 	pdep = pmap_pde(pmap, va);
358 
359 	if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & PTE_VALID_MASK(is_ept))) {
360 		if (pde & PTE_PS) {
361 			pa = pte_to_pa(pde) + (va & I386_LPGMASK);
362 		} else {
363 			ptp = pmap_pte(pmap, va);
364 			if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & PTE_VALID_MASK(is_ept)) != 0)) {
365 				pa = pte_to_pa(pte) + (va & PAGE_MASK);
366 			}
367 		}
368 	}
369 pfp_exit:
370 	if (locked) {
371 		PMAP_UNLOCK_EXCLUSIVE(pmap);
372 	} else {
373 		mp_enable_preemption();
374 	}
375 
376 	return pa;
377 }
378 
379 /*
380  * pmap_find_phys returns the (4K) physical page number containing a
381  * given virtual address in a given pmap.
382  * Note that pmap_pte may return a pde if this virtual address is
383  * mapped by a large page and this is taken into account in order
384  * to return the correct page number in this case.
385  */
386 ppnum_t
pmap_find_phys(pmap_t pmap,addr64_t va)387 pmap_find_phys(pmap_t pmap, addr64_t va)
388 {
389 	ppnum_t         ppn = 0;
390 	pmap_paddr_t    pa = 0;
391 
392 	pa = pmap_find_pa(pmap, va);
393 	ppn = (ppnum_t) i386_btop(pa);
394 
395 	return ppn;
396 }
397 
398 ppnum_t
pmap_find_phys_nofault(pmap_t pmap,addr64_t va)399 pmap_find_phys_nofault(pmap_t pmap, addr64_t va)
400 {
401 	if ((pmap == kernel_pmap) ||
402 	    ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map)))) {
403 		return pmap_find_phys(pmap, va);
404 	}
405 	return 0;
406 }
407 
408 /*
409  *  pmap_get_prot returns the equivalent Vm page protections
410  *  set on a given address, 'va'. This function is used in the
411  *  ml_static_verify_page_protections() routine which is used
412  *  by the kext loading code to validate that the TEXT segment
413  *  of a kext is mapped executable.
414  */
415 kern_return_t
pmap_get_prot(pmap_t pmap,addr64_t va,vm_prot_t * protp)416 pmap_get_prot(pmap_t pmap, addr64_t va, vm_prot_t *protp)
417 {
418 	pt_entry_t      *ptp;
419 	pd_entry_t      *pdep;
420 	pd_entry_t      pde;
421 	pt_entry_t      pte;
422 	boolean_t       is_ept, locked = FALSE;
423 	kern_return_t   retval = KERN_FAILURE;
424 	vm_prot_t       prot = 0;
425 
426 	is_ept = is_ept_pmap(pmap);
427 
428 	if ((pmap != kernel_pmap) && not_in_kdp) {
429 		PMAP_LOCK_EXCLUSIVE(pmap);
430 		locked = TRUE;
431 	} else {
432 		mp_disable_preemption();
433 	}
434 
435 	if (os_ref_get_count(&pmap->ref_count) == 0) {
436 		goto pfp_exit;
437 	}
438 
439 	pdep = pmap_pde(pmap, va);
440 
441 	if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & PTE_VALID_MASK(is_ept))) {
442 		if (pde & PTE_PS) {
443 			prot = VM_PROT_READ;
444 
445 			if (pde & PTE_WRITE(is_ept)) {
446 				prot |= VM_PROT_WRITE;
447 			}
448 			if (PTE_IS_EXECUTABLE(is_ept, pde)) {
449 				prot |= VM_PROT_EXECUTE;
450 			}
451 			retval = KERN_SUCCESS;
452 		} else {
453 			ptp = pmap_pte(pmap, va);
454 			if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & PTE_VALID_MASK(is_ept)) != 0)) {
455 				prot = VM_PROT_READ;
456 
457 				if (pte & PTE_WRITE(is_ept)) {
458 					prot |= VM_PROT_WRITE;
459 				}
460 				if (PTE_IS_EXECUTABLE(is_ept, pte)) {
461 					prot |= VM_PROT_EXECUTE;
462 				}
463 				retval = KERN_SUCCESS;
464 			}
465 		}
466 	}
467 
468 pfp_exit:
469 	if (locked) {
470 		PMAP_UNLOCK_EXCLUSIVE(pmap);
471 	} else {
472 		mp_enable_preemption();
473 	}
474 
475 	if (protp) {
476 		*protp = prot;
477 	}
478 
479 	return retval;
480 }
481 
482 /*
483  * Update cache attributes for all extant managed mappings.
484  * Assumes PV for this page is locked, and that the page
485  * is managed. We assume that this physical page may be mapped in
486  * both EPT and normal Intel PTEs, so we convert the attributes
487  * to the corresponding format for each pmap.
488  *
489  * We assert that the passed set of attributes is a subset of the
490  * PHYS_CACHEABILITY_MASK.
491  */
492 void
pmap_update_cache_attributes_locked(ppnum_t pn,unsigned attributes)493 pmap_update_cache_attributes_locked(ppnum_t pn, unsigned attributes)
494 {
495 	pv_rooted_entry_t       pv_h, pv_e;
496 	pv_hashed_entry_t       pvh_e, nexth;
497 	vm_map_offset_t vaddr;
498 	pmap_t  pmap;
499 	pt_entry_t      *ptep;
500 	boolean_t       is_ept;
501 	unsigned        ept_attributes;
502 
	/* Caller must hold the PV lock for a managed page (see header comment). */
503 	assert(IS_MANAGED_PAGE(pn));
504 	assert(((~PHYS_CACHEABILITY_MASK) & attributes) == 0);
505 
506 	/* We don't support the PAT bit for EPT PTEs */
507 	if (attributes & INTEL_PTE_NCACHE) {
508 		ept_attributes = INTEL_EPT_NCACHE;
509 	} else {
510 		ept_attributes = INTEL_EPT_WB;
511 	}
512 
513 	pv_h = pai_to_pvh(pn);
514 	/* TODO: translate the PHYS_* bits to PTE bits, while they're
515 	 * currently identical, they may not remain so
516 	 * Potential optimization (here and in page_protect),
517 	 * parallel shootdowns, check for redundant
518 	 * attribute modifications.
519 	 */
520 
521 	/*
522 	 * Alter attributes on all mappings
523 	 */
	/* An empty PV list (pv_h->pmap == PMAP_NULL) means no mappings exist. */
524 	if (pv_h->pmap != PMAP_NULL) {
525 		pv_e = pv_h;
526 		pvh_e = (pv_hashed_entry_t)pv_e;
527 
		/* Walk the circular PV list rooted at pv_h. */
528 		do {
529 			pmap = pv_e->pmap;
530 			vaddr = PVE_VA(pv_e);
531 			ptep = pmap_pte(pmap, vaddr);
532 
533 			if (0 == ptep) {
534 				panic("pmap_update_cache_attributes_locked: Missing PTE, pmap: %p, pn: 0x%x vaddr: 0x%llx kernel_pmap: %p", pmap, pn, vaddr, kernel_pmap);
535 			}
536 
537 			is_ept = is_ept_pmap(pmap);
538 
			/* Capture the successor before modifying this entry's PTE. */
539 			nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink);
			/* Swap in the new cacheability bits in the format this pmap uses. */
540 			if (!is_ept) {
541 				pmap_update_pte(is_ept, ptep, PHYS_CACHEABILITY_MASK, attributes, true);
542 			} else {
543 				pmap_update_pte(is_ept, ptep, INTEL_EPT_CACHE_MASK, ept_attributes, true);
544 			}
			/* Shoot down the stale translation for this one page. */
545 			PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
546 			pvh_e = nexth;
547 		} while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h);
548 	}
549 }
550 
551 void
x86_filter_TLB_coherency_interrupts(boolean_t dofilter)552 x86_filter_TLB_coherency_interrupts(boolean_t dofilter)
553 {
554 	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
555 
556 	if (dofilter) {
557 		CPU_CR3_MARK_INACTIVE();
558 	} else {
559 		CPU_CR3_MARK_ACTIVE();
560 		mfence();
561 		pmap_update_interrupt();
562 	}
563 }
564 
565 
566 /*
567  *	Insert the given physical page (p) at
568  *	the specified virtual address (v) in the
569  *	target physical map with the protection requested.
570  *
571  *	If specified, the page will be wired down, meaning
572  *	that the related pte cannot be reclaimed.
573  *
574  *	NB:  This is the only routine which MAY NOT lazy-evaluate
575  *	or lose information.  That is, this routine must actually
576  *	insert this page into the given map NOW.
577  */
578 
579 kern_return_t
pmap_enter(pmap_t pmap,vm_map_offset_t vaddr,ppnum_t pn,vm_prot_t prot,vm_prot_t fault_type,unsigned int flags,boolean_t wired,pmap_mapping_type_t mapping_type)580 pmap_enter(
581 	pmap_t          pmap,
582 	vm_map_offset_t         vaddr,
583 	ppnum_t                 pn,
584 	vm_prot_t               prot,
585 	vm_prot_t               fault_type,
586 	unsigned int            flags,
587 	boolean_t               wired,
588 	pmap_mapping_type_t     mapping_type)
589 {
590 	return pmap_enter_options(pmap, vaddr, pn, prot, fault_type, flags, wired, PMAP_EXPAND_OPTIONS_NONE, NULL, mapping_type);
591 }
592 
/*
 * Software lock bit embedded in a PTE, used to serialize concurrent
 * updates of a single PTE.  The EPT argument is currently ignored: the
 * same software bit (INTEL_PTE_SWLOCK) is used for all PTE flavors.
 */
593 #define PTE_LOCK(EPT) INTEL_PTE_SWLOCK
594 
595 static inline void PTE_LOCK_LOCK(pt_entry_t *);
596 static inline void PTE_LOCK_UNLOCK(pt_entry_t *);
597 
598 void
PTE_LOCK_LOCK(pt_entry_t * lpte)599 PTE_LOCK_LOCK(pt_entry_t *lpte)
600 {
601 	pt_entry_t pte;
602 plretry:
603 	while ((pte = __c11_atomic_load((_Atomic pt_entry_t *)lpte, memory_order_relaxed)) & PTE_LOCK(0)) {
604 		__builtin_ia32_pause();
605 	}
606 	if (__c11_atomic_compare_exchange_strong((_Atomic pt_entry_t *)lpte, &pte, pte | PTE_LOCK(0), memory_order_acquire_smp, TRUE)) {
607 		return;
608 	}
609 
610 	goto plretry;
611 }
612 
/*
 * Release the per-PTE software lock by atomically clearing the lock bit
 * with release semantics.
 */
613 void
PTE_LOCK_UNLOCK(pt_entry_t * lpte)614 PTE_LOCK_UNLOCK(pt_entry_t *lpte)
615 {
616 	__c11_atomic_fetch_and((_Atomic pt_entry_t *)lpte, ~PTE_LOCK(0), memory_order_release_smp);
617 }
618 
619 kern_return_t
pmap_enter_options_addr(pmap_t pmap,vm_map_address_t v,pmap_paddr_t pa,vm_prot_t prot,vm_prot_t fault_type,unsigned int flags,boolean_t wired,unsigned int options,__unused void * arg,pmap_mapping_type_t mapping_type)620 pmap_enter_options_addr(
621 	pmap_t pmap,
622 	vm_map_address_t v,
623 	pmap_paddr_t pa,
624 	vm_prot_t prot,
625 	vm_prot_t fault_type,
626 	unsigned int flags,
627 	boolean_t wired,
628 	unsigned int options,
629 	__unused void   *arg,
630 	pmap_mapping_type_t mapping_type)
631 {
632 	return pmap_enter_options(pmap, v, intel_btop(pa), prot, fault_type, flags, wired, options, arg, mapping_type);
633 }
634 
635 kern_return_t
pmap_enter_options(pmap_t pmap,vm_map_offset_t vaddr,ppnum_t pn,vm_prot_t prot,__unused vm_prot_t fault_type,unsigned int flags,boolean_t wired,unsigned int options,void * arg,__unused pmap_mapping_type_t mapping_type)636 pmap_enter_options(
637 	pmap_t          pmap,
638 	vm_map_offset_t         vaddr,
639 	ppnum_t                 pn,
640 	vm_prot_t               prot,
641 	__unused vm_prot_t      fault_type,
642 	unsigned int            flags,
643 	boolean_t               wired,
644 	unsigned int            options,
645 	void                    *arg,
646 	__unused pmap_mapping_type_t mapping_type)
647 {
648 	pt_entry_t              *pte = NULL;
649 	pv_rooted_entry_t       pv_h;
650 	ppnum_t                 pai;
651 	pv_hashed_entry_t       pvh_e;
652 	pv_hashed_entry_t       pvh_new;
653 	pt_entry_t              template;
654 	pmap_paddr_t            old_pa;
655 	pmap_paddr_t            pa = (pmap_paddr_t) i386_ptob(pn);
656 	boolean_t               need_tlbflush = FALSE;
657 	boolean_t               set_NX;
658 	char                    oattr;
659 	boolean_t               old_pa_locked;
660 	/* 2MiB mappings are confined to x86_64 by VM */
661 	boolean_t               superpage = flags & VM_MEM_SUPERPAGE;
662 	vm_object_t             delpage_pm_obj = NULL;
663 	uint64_t                delpage_pde_index = 0;
664 	pt_entry_t              old_pte;
665 	kern_return_t           kr = KERN_FAILURE;
666 	boolean_t               is_ept;
667 	boolean_t               is_altacct;
668 	boolean_t               ptelocked = FALSE;
669 
670 	pmap_intr_assert();
671 
672 	if (__improbable(pmap == PMAP_NULL)) {
673 		return KERN_INVALID_ARGUMENT;
674 	}
675 	if (__improbable(pn == vm_page_guard_addr)) {
676 		return KERN_INVALID_ARGUMENT;
677 	}
678 
679 	is_ept = is_ept_pmap(pmap);
680 
681 	/* N.B. We can be supplied a zero page frame in the NOENTER case, it's an
682 	 * unused value for that scenario.
683 	 */
684 	assert(pn != vm_page_fictitious_addr);
685 
686 
687 	PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
688 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(vaddr), pn,
689 	    prot);
690 
691 	if ((prot & VM_PROT_EXECUTE) || __improbable(is_ept && (prot & VM_PROT_UEXEC))) {
692 		set_NX = FALSE;
693 	} else {
694 		set_NX = TRUE;
695 	}
696 
697 #if DEVELOPMENT || DEBUG
698 	if (__improbable(set_NX && (!nx_enabled || !pmap->nx_enabled))) {
699 		set_NX = FALSE;
700 	}
701 
702 	if (__improbable(set_NX && (pmap == kernel_pmap) &&
703 	    ((pmap_disable_kstack_nx && (flags & VM_MEM_STACK)) ||
704 	    (pmap_disable_kheap_nx && !(flags & VM_MEM_STACK))))) {
705 		set_NX = FALSE;
706 	}
707 #endif
708 
709 	pvh_new = PV_HASHED_ENTRY_NULL;
710 Retry:
711 	pvh_e = PV_HASHED_ENTRY_NULL;
712 
713 	PMAP_LOCK_SHARED(pmap);
714 
715 	/*
716 	 *	Expand pmap to include this pte.  Assume that
717 	 *	pmap is always expanded to include enough hardware
718 	 *	pages to map one VM page.
719 	 */
720 	if (__improbable(superpage)) {
721 		while ((pte = pmap_pde(pmap, vaddr)) == PD_ENTRY_NULL) {
722 			/* need room for another pde entry */
723 			PMAP_UNLOCK_SHARED(pmap);
724 			kr = pmap_expand_pdpt(pmap, vaddr, options);
725 			if (kr != KERN_SUCCESS) {
726 				goto done1;
727 			}
728 			PMAP_LOCK_SHARED(pmap);
729 		}
730 	} else {
731 		while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
732 			/*
733 			 * Must unlock to expand the pmap
734 			 * going to grow pde level page(s)
735 			 */
736 			PMAP_UNLOCK_SHARED(pmap);
737 			kr = pmap_expand(pmap, vaddr, options);
738 			if (kr != KERN_SUCCESS) {
739 				goto done1;
740 			}
741 			PMAP_LOCK_SHARED(pmap);
742 		}
743 	}
744 
745 	if (__improbable(options & PMAP_EXPAND_OPTIONS_NOENTER)) {
746 		PMAP_UNLOCK_SHARED(pmap);
747 		kr = KERN_SUCCESS;
748 		goto done1;
749 	}
750 
751 	if (__improbable(superpage && *pte && !(*pte & PTE_PS))) {
752 		/*
753 		 * There is still an empty page table mapped that
754 		 * was used for a previous base page mapping.
755 		 * Remember the PDE and the PDE index, so that we
756 		 * can free the page at the end of this function.
757 		 */
758 		delpage_pde_index = pdeidx(pmap, vaddr);
759 		delpage_pm_obj = pmap->pm_obj;
760 		pmap_store_pte(is_ept, pte, 0);
761 	}
762 
763 	PTE_LOCK_LOCK(pte);
764 	ptelocked = TRUE;
765 
766 	old_pa = pte_to_pa(*pte);
767 	pai = pa_index(old_pa);
768 	old_pa_locked = FALSE;
769 
770 	if (old_pa == 0 &&
771 	    PTE_IS_COMPRESSED(*pte, pte, pmap, vaddr)) {
772 		/*
773 		 * "pmap" should be locked at this point, so this should
774 		 * not race with another pmap_enter() or pmap_remove_range().
775 		 */
776 		assert(pmap != kernel_pmap);
777 
778 		/* one less "compressed" */
779 		pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
780 		    PAGE_SIZE);
781 		if (*pte & PTE_COMPRESSED_ALT) {
782 			pmap_ledger_debit(
783 				pmap,
784 				task_ledgers.alternate_accounting_compressed,
785 				PAGE_SIZE);
786 		} else {
787 			/* was part of the footprint */
788 			pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
789 			    PAGE_SIZE);
790 		}
791 		/* marker will be cleared below */
792 	}
793 
794 	/*
795 	 * if we have a previous managed page, lock the pv entry now. after
796 	 * we lock it, check to see if someone beat us to the lock and if so
797 	 * drop the lock
798 	 */
799 	if ((0 != old_pa) && IS_MANAGED_PAGE(pai)) {
800 		LOCK_PVH(pai);
801 		old_pa_locked = TRUE;
802 		old_pa = pte_to_pa(*pte);
803 		if (0 == old_pa) {
804 			UNLOCK_PVH(pai);        /* another path beat us to it */
805 			old_pa_locked = FALSE;
806 		}
807 	}
808 
809 	/*
810 	 *	Special case if the incoming physical page is already mapped
811 	 *	at this address.
812 	 */
813 	if (old_pa == pa) {
814 		pt_entry_t old_attributes =
815 		    *pte & ~(PTE_REF(is_ept) | PTE_MOD(is_ept) | PTE_LOCK(is_ept));
816 
817 		/*
818 		 *	May be changing its wired attribute or protection
819 		 */
820 
821 		template = pa_to_pte(pa);
822 
823 		if (__probable(!is_ept)) {
824 			template |= INTEL_PTE_VALID;
825 		} else {
826 			template |= INTEL_EPT_IPAT;
827 		}
828 
829 		template |= pmap_get_cache_attributes(pa_index(pa), is_ept);
830 
831 		/*
832 		 * We don't support passing VM_MEM_NOT_CACHEABLE flags for EPT PTEs
833 		 */
834 		if (!is_ept && (VM_MEM_NOT_CACHEABLE ==
835 		    (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)))) {
836 			if (!(flags & VM_MEM_GUARDED)) {
837 				template |= INTEL_PTE_PAT;
838 			}
839 			template |= INTEL_PTE_NCACHE;
840 		}
841 		if (pmap != kernel_pmap && !is_ept) {
842 			template |= INTEL_PTE_USER;
843 		}
844 
845 		if (prot & VM_PROT_READ) {
846 			template |= PTE_READ(is_ept);
847 		}
848 
849 		if (prot & VM_PROT_WRITE) {
850 			template |= PTE_WRITE(is_ept);
851 			if (is_ept && !pmap_ept_support_ad) {
852 				template |= PTE_MOD(is_ept);
853 				if (old_pa_locked) {
854 					assert(IS_MANAGED_PAGE(pai));
855 					pmap_phys_attributes[pai] |= PHYS_MODIFIED;
856 				}
857 			}
858 		}
859 
860 		if (prot & VM_PROT_EXECUTE) {
861 			assert(set_NX == 0);
862 			template = pte_set_ex(template, is_ept);
863 		}
864 
865 		if (__improbable(is_ept && (prot & VM_PROT_UEXEC))) {
866 			assert(set_NX == 0);
867 			template = pte_set_uex(template);
868 		}
869 
870 		if (set_NX) {
871 			template = pte_remove_ex(template, is_ept);
872 		}
873 
874 		if (wired) {
875 			template |= PTE_WIRED;
876 			if (!iswired(old_attributes)) {
877 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
878 			}
879 		} else {
880 			if (iswired(old_attributes)) {
881 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
882 			}
883 		}
884 
885 		if (superpage) {        /* this path can not be used */
886 			template |= PTE_PS;     /* to change the page size! */
887 		}
888 		if (old_attributes == template) {
889 			goto dont_update_pte;
890 		}
891 
892 		/* Determine delta, PV locked */
893 		need_tlbflush =
894 		    ((old_attributes ^ template) != PTE_WIRED);
895 
896 		/* Optimisation: avoid TLB flush when adding writability */
897 		if (need_tlbflush == TRUE && !(old_attributes & PTE_WRITE(is_ept))) {
898 			if ((old_attributes ^ template) == PTE_WRITE(is_ept)) {
899 				need_tlbflush = FALSE;
900 			}
901 		}
902 
903 		/* For hardware that doesn't have EPT AD support, we always set REFMOD for EPT PTEs */
904 		if (__improbable(is_ept && !pmap_ept_support_ad)) {
905 			template |= PTE_REF(is_ept);
906 			if (old_pa_locked) {
907 				assert(IS_MANAGED_PAGE(pai));
908 				pmap_phys_attributes[pai] |= PHYS_REFERENCED;
909 			}
910 		}
911 
912 		/* store modified PTE and preserve RC bits */
913 		pt_entry_t npte, opte;
914 
915 		assert((*pte & PTE_LOCK(is_ept)) != 0);
916 
917 		do {
918 			opte = *pte;
919 			npte = template | (opte & (PTE_REF(is_ept) |
920 			    PTE_MOD(is_ept))) | PTE_LOCK(is_ept);
921 		} while (!pmap_cmpx_pte(pte, opte, npte));
922 
923 		DTRACE_VM3(set_pte, uint64_t, vaddr, uint64_t, opte, uint64_t, npte);
924 
925 dont_update_pte:
926 		if (old_pa_locked) {
927 			UNLOCK_PVH(pai);
928 			old_pa_locked = FALSE;
929 		}
930 		goto done2;
931 	}
932 
933 	/*
934 	 *	Outline of code from here:
935 	 *	   1) If va was mapped, update TLBs, remove the mapping
936 	 *	      and remove old pvlist entry.
937 	 *	   2) Add pvlist entry for new mapping
938 	 *	   3) Enter new mapping.
939 	 *
940 	 *	If the old physical page is not managed step 1) is skipped
941 	 *	(except for updating the TLBs), and the mapping is
942 	 *	overwritten at step 3).  If the new physical page is not
943 	 *	managed, step 2) is skipped.
944 	 */
945 	/* TODO: add opportunistic refmod collect */
946 	if (old_pa != (pmap_paddr_t) 0) {
947 		boolean_t       was_altacct = FALSE;
948 
949 		/*
950 		 *	Don't do anything to pages outside valid memory here.
951 		 *	Instead convince the code that enters a new mapping
952 		 *	to overwrite the old one.
953 		 */
954 
955 		/* invalidate the PTE */
956 		pmap_update_pte(is_ept, pte, PTE_VALID_MASK(is_ept), 0, true);
957 		/* propagate invalidate everywhere */
958 		PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
959 		/* remember reference and change */
960 		old_pte = *pte;
961 		oattr = (char) (old_pte & (PTE_MOD(is_ept) | PTE_REF(is_ept)));
962 		/* completely invalidate the PTE */
963 		pmap_store_pte(is_ept, pte, PTE_LOCK(is_ept));
964 
965 		if (IS_MANAGED_PAGE(pai)) {
966 			/*
967 			 *	Remove the mapping from the pvlist for
968 			 *	this physical page.
969 			 *      We'll end up with either a rooted pv or a
970 			 *      hashed pv
971 			 */
972 			pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, &old_pte, &was_altacct);
973 		}
974 
975 		if (IS_MANAGED_PAGE(pai)) {
976 			pmap_assert(old_pa_locked == TRUE);
977 			pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
978 			if (pmap != kernel_pmap) {
979 				/* update ledgers */
980 				if (was_altacct) {
981 					assert(IS_INTERNAL_PAGE(pai));
982 					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
983 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
984 				} else if (IS_REUSABLE_PAGE(pai)) {
985 					assert(!was_altacct);
986 					assert(IS_INTERNAL_PAGE(pai));
987 					pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
988 					/* was already not in phys_footprint */
989 				} else if (IS_INTERNAL_PAGE(pai)) {
990 					assert(!was_altacct);
991 					assert(!IS_REUSABLE_PAGE(pai));
992 					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
993 					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
994 				} else {
995 					/* not an internal page */
996 					pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
997 				}
998 			}
999 			if (iswired(*pte)) {
1000 				pmap_ledger_debit(pmap, task_ledgers.wired_mem,
1001 				    PAGE_SIZE);
1002 			}
1003 
1004 			if (!is_ept) {
1005 				pmap_phys_attributes[pai] |= oattr;
1006 			} else {
1007 				pmap_phys_attributes[pai] |= ept_refmod_to_physmap(oattr);
1008 			}
1009 		} else {
1010 			/*
1011 			 *	old_pa is not managed.
1012 			 *	Do removal part of accounting.
1013 			 */
1014 
1015 			if (pmap != kernel_pmap) {
1016 #if 00
1017 				assert(pmap->stats.device > 0);
1018 				OSAddAtomic(-1, &pmap->stats.device);
1019 #endif
1020 			}
1021 			if (iswired(*pte)) {
1022 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
1023 			}
1024 		}
1025 	}
1026 
1027 	/*
1028 	 * if we had a previously managed page locked, unlock it now
1029 	 */
1030 	if (old_pa_locked) {
1031 		UNLOCK_PVH(pai);
1032 		old_pa_locked = FALSE;
1033 	}
1034 
1035 	pai = pa_index(pa);     /* now working with new incoming phys page */
1036 	if (IS_MANAGED_PAGE(pai)) {
1037 		/*
1038 		 *	Step 2) Enter the mapping in the PV list for this
1039 		 *	physical page.
1040 		 */
1041 		pv_h = pai_to_pvh(pai);
1042 
1043 		LOCK_PVH(pai);
1044 
1045 		if (pv_h->pmap == PMAP_NULL) {
1046 			/*
1047 			 *	No mappings yet, use rooted pv
1048 			 */
1049 			pv_h->va_and_flags = vaddr;
1050 			pv_h->pmap = pmap;
1051 			queue_init(&pv_h->qlink);
1052 
1053 			if (options & PMAP_OPTIONS_INTERNAL) {
1054 				pmap_phys_attributes[pai] |= PHYS_INTERNAL;
1055 			} else {
1056 				pmap_phys_attributes[pai] &= ~PHYS_INTERNAL;
1057 			}
1058 			if (options & PMAP_OPTIONS_REUSABLE) {
1059 				pmap_phys_attributes[pai] |= PHYS_REUSABLE;
1060 			} else {
1061 				pmap_phys_attributes[pai] &= ~PHYS_REUSABLE;
1062 			}
1063 			if ((options & PMAP_OPTIONS_ALT_ACCT) &&
1064 			    IS_INTERNAL_PAGE(pai)) {
1065 				pv_h->va_and_flags |= PVE_IS_ALTACCT;
1066 				is_altacct = TRUE;
1067 			} else {
1068 				pv_h->va_and_flags &= ~PVE_IS_ALTACCT;
1069 				is_altacct = FALSE;
1070 			}
1071 		} else {
1072 			/*
1073 			 *	Add new pv_hashed_entry after header.
1074 			 */
1075 			if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
1076 				pvh_e = pvh_new;
1077 				pvh_new = PV_HASHED_ENTRY_NULL;
1078 			} else if (PV_HASHED_ENTRY_NULL == pvh_e) {
1079 				PV_HASHED_ALLOC(&pvh_e);
1080 				if (PV_HASHED_ENTRY_NULL == pvh_e) {
1081 					/*
1082 					 * the pv list is empty. if we are on
1083 					 * the kernel pmap we'll use one of
1084 					 * the special private kernel pv_e's,
1085 					 * else, we need to unlock
1086 					 * everything, zalloc a pv_e, and
1087 					 * restart bringing in the pv_e with
1088 					 * us.
1089 					 */
1090 					if (kernel_pmap == pmap) {
1091 						PV_HASHED_KERN_ALLOC(&pvh_e);
1092 					} else {
1093 						UNLOCK_PVH(pai);
1094 						PTE_LOCK_UNLOCK(pte);
1095 						PMAP_UNLOCK_SHARED(pmap);
1096 						pmap_pv_throttle(pmap);
1097 						pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
1098 						goto Retry;
1099 					}
1100 				}
1101 			}
1102 
1103 			if (PV_HASHED_ENTRY_NULL == pvh_e) {
1104 				panic("Mapping alias chain exhaustion, possibly induced by numerous kernel virtual double mappings");
1105 			}
1106 
1107 			pvh_e->va_and_flags = vaddr;
1108 			pvh_e->pmap = pmap;
1109 			pvh_e->ppn = pn;
1110 			if ((options & PMAP_OPTIONS_ALT_ACCT) &&
1111 			    IS_INTERNAL_PAGE(pai)) {
1112 				pvh_e->va_and_flags |= PVE_IS_ALTACCT;
1113 				is_altacct = TRUE;
1114 			} else {
1115 				pvh_e->va_and_flags &= ~PVE_IS_ALTACCT;
1116 				is_altacct = FALSE;
1117 			}
1118 			pv_hash_add(pvh_e, pv_h);
1119 
1120 			/*
1121 			 *	Remember that we used the pvlist entry.
1122 			 */
1123 			pvh_e = PV_HASHED_ENTRY_NULL;
1124 		}
1125 
1126 		/*
1127 		 * only count the mapping
1128 		 * for 'managed memory'
1129 		 */
1130 		pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
1131 		if (pmap != kernel_pmap) {
1132 			/* update ledgers */
1133 			if (is_altacct) {
1134 				/* internal but also alternate accounting */
1135 				assert(IS_INTERNAL_PAGE(pai));
1136 				pmap_ledger_credit(pmap, task_ledgers.internal, PAGE_SIZE);
1137 				pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
1138 				/* alternate accounting, so not in footprint */
1139 			} else if (IS_REUSABLE_PAGE(pai)) {
1140 				assert(!is_altacct);
1141 				assert(IS_INTERNAL_PAGE(pai));
1142 				pmap_ledger_credit(pmap, task_ledgers.reusable, PAGE_SIZE);
1143 				/* internal but reusable: not in footprint */
1144 			} else if (IS_INTERNAL_PAGE(pai)) {
1145 				assert(!is_altacct);
1146 				assert(!IS_REUSABLE_PAGE(pai));
1147 				/* internal: add to footprint */
1148 				pmap_ledger_credit(pmap, task_ledgers.internal, PAGE_SIZE);
1149 				pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
1150 			} else {
1151 				/* not internal: not in footprint */
1152 				pmap_ledger_credit(pmap, task_ledgers.external, PAGE_SIZE);
1153 			}
1154 		}
1155 	} else if (last_managed_page == 0) {
1156 		/* Account for early mappings created before "managed pages"
1157 		 * are determined. Consider consulting the available DRAM map.
1158 		 */
1159 		pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
1160 		if (pmap != kernel_pmap) {
1161 #if 00
1162 			OSAddAtomic(+1, &pmap->stats.device);
1163 			PMAP_STATS_PEAK(pmap->stats.device);
1164 #endif
1165 		}
1166 	}
1167 	/*
1168 	 * Step 3) Enter the mapping.
1169 	 *
1170 	 *	Build a template to speed up entering -
1171 	 *	only the pfn changes.
1172 	 */
1173 	template = pa_to_pte(pa);
1174 
1175 	if (!is_ept) {
1176 		template |= INTEL_PTE_VALID;
1177 	} else {
1178 		template |= INTEL_EPT_IPAT;
1179 	}
1180 
1181 	/*
1182 	 * DRK: It may be worth asserting on cache attribute flags that diverge
1183 	 * from the existing physical page attributes.
1184 	 */
1185 
1186 	template |= pmap_get_cache_attributes(pa_index(pa), is_ept);
1187 
1188 	/*
1189 	 * We don't support passing VM_MEM_NOT_CACHEABLE flags for EPT PTEs
1190 	 */
1191 	if (!is_ept && (flags & VM_MEM_NOT_CACHEABLE)) {
1192 		if (!(flags & VM_MEM_GUARDED)) {
1193 			template |= INTEL_PTE_PAT;
1194 		}
1195 		template |= INTEL_PTE_NCACHE;
1196 	}
1197 	if (pmap != kernel_pmap && !is_ept) {
1198 		template |= INTEL_PTE_USER;
1199 	}
1200 	if (prot & VM_PROT_READ) {
1201 		template |= PTE_READ(is_ept);
1202 	}
1203 	if (prot & VM_PROT_WRITE) {
1204 		template |= PTE_WRITE(is_ept);
1205 		if (is_ept && !pmap_ept_support_ad) {
1206 			template |= PTE_MOD(is_ept);
1207 			if (IS_MANAGED_PAGE(pai)) {
1208 				pmap_phys_attributes[pai] |= PHYS_MODIFIED;
1209 			}
1210 		}
1211 	}
1212 	if (prot & VM_PROT_EXECUTE) {
1213 		assert(set_NX == 0);
1214 		template = pte_set_ex(template, is_ept);
1215 	}
1216 	if (__improbable(is_ept && (prot & VM_PROT_UEXEC))) {
1217 		assert(set_NX == 0);
1218 		template = pte_set_uex(template);
1219 	}
1220 
1221 	if (set_NX) {
1222 		template = pte_remove_ex(template, is_ept);
1223 	}
1224 	if (wired) {
1225 		template |= INTEL_PTE_WIRED;
1226 		pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
1227 	}
1228 	if (__improbable(superpage)) {
1229 		template |= INTEL_PTE_PS;
1230 	}
1231 
1232 	/* For hardware that doesn't have EPT AD support, we always set REFMOD for EPT PTEs */
1233 	if (__improbable(is_ept && !pmap_ept_support_ad)) {
1234 		template |= PTE_REF(is_ept);
1235 		if (IS_MANAGED_PAGE(pai)) {
1236 			pmap_phys_attributes[pai] |= PHYS_REFERENCED;
1237 		}
1238 	}
1239 	template |= PTE_LOCK(is_ept);
1240 	pmap_store_pte(is_ept, pte, template);
1241 	DTRACE_VM3(set_pte, uint64_t, vaddr, uint64_t, 0, uint64_t, template);
1242 
1243 	/*
1244 	 * if this was a managed page we delayed unlocking the pv until here
1245 	 * to prevent pmap_page_protect et al from finding it until the pte
1246 	 * has been stored
1247 	 */
1248 	if (IS_MANAGED_PAGE(pai)) {
1249 		UNLOCK_PVH(pai);
1250 	}
1251 done2:
1252 	if (need_tlbflush == TRUE) {
1253 		if (options & PMAP_OPTIONS_NOFLUSH) {
1254 			PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
1255 		} else {
1256 			PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1257 		}
1258 	}
1259 	if (ptelocked) {
1260 		PTE_LOCK_UNLOCK(pte);
1261 	}
1262 	PMAP_UNLOCK_SHARED(pmap);
1263 
1264 	if (pvh_e != PV_HASHED_ENTRY_NULL) {
1265 		PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
1266 	}
1267 	if (pvh_new != PV_HASHED_ENTRY_NULL) {
1268 		PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
1269 	}
1270 
1271 	if (delpage_pm_obj) {
1272 		vm_page_t m;
1273 
1274 		vm_object_lock(delpage_pm_obj);
1275 		m = vm_page_lookup(delpage_pm_obj, (delpage_pde_index * PAGE_SIZE));
1276 		if (m == VM_PAGE_NULL) {
1277 			panic("pmap_enter: pte page not in object");
1278 		}
1279 		VM_PAGE_FREE(m);
1280 		vm_object_unlock(delpage_pm_obj);
1281 		OSAddAtomic(-1, &inuse_ptepages_count);
1282 		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
1283 	}
1284 
1285 	kr = KERN_SUCCESS;
1286 done1:
1287 	if (__improbable((kr == KERN_SUCCESS) && (pmap == kernel_pmap) &&
1288 	    zone_spans_ro_va(vaddr, vaddr + PAGE_SIZE))) {
1289 		pmap_page_protect((ppnum_t)atop_kernel(kvtophys(vaddr)), VM_PROT_READ);
1290 	}
1291 	PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
1292 	return kr;
1293 }
1294 
1295 /*
1296  *	Remove a range of hardware page-table entries.
1297  *	The entries given are the first (inclusive)
1298  *	and last (exclusive) entries for the VM pages.
1299  *	The virtual address is the va for the first pte.
1300  *
1301  *	The pmap must be locked.
1302  *	If the pmap is not the kernel pmap, the range must lie
1303  *	entirely within one pte-page.  This is NOT checked.
1304  *	Assumes that the pte-page exists.
1305  */
1306 
1307 void
pmap_remove_range(pmap_t pmap,vm_map_offset_t start_vaddr,pt_entry_t * spte,pt_entry_t * epte)1308 pmap_remove_range(
1309 	pmap_t                  pmap,
1310 	vm_map_offset_t         start_vaddr,
1311 	pt_entry_t              *spte,
1312 	pt_entry_t              *epte)
1313 {
1314 	pmap_remove_range_options(pmap, start_vaddr, spte, epte,
1315 	    PMAP_OPTIONS_REMOVE);
1316 }
1317 
1318 static void
pmap_remove_range_options(pmap_t pmap,vm_map_offset_t start_vaddr,pt_entry_t * spte,pt_entry_t * epte,int options)1319 pmap_remove_range_options(
1320 	pmap_t                  pmap,
1321 	vm_map_offset_t         start_vaddr,
1322 	pt_entry_t              *spte,
1323 	pt_entry_t              *epte,
1324 	int                     options)
1325 {
1326 	pt_entry_t              *cpte;
1327 	pv_hashed_entry_t       pvh_et = PV_HASHED_ENTRY_NULL;
1328 	pv_hashed_entry_t       pvh_eh = PV_HASHED_ENTRY_NULL;
1329 	pv_hashed_entry_t       pvh_e;
1330 	int                     pvh_cnt = 0;
1331 	int                     num_removed, num_unwired, num_found, num_invalid;
1332 	int                     ledgers_external, ledgers_reusable, ledgers_internal, ledgers_alt_internal;
1333 	uint64_t                ledgers_compressed, ledgers_alt_compressed;
1334 	ppnum_t                 pai;
1335 	pmap_paddr_t            pa;
1336 	vm_map_offset_t         vaddr;
1337 	boolean_t               is_ept = is_ept_pmap(pmap);
1338 	boolean_t               was_altacct;
1339 
1340 	num_removed = 0;
1341 	num_unwired = 0;
1342 	num_found   = 0;
1343 	num_invalid = 0;
1344 	ledgers_external = 0;
1345 	ledgers_reusable = 0;
1346 	ledgers_internal = 0;
1347 	ledgers_compressed = 0;
1348 	ledgers_alt_internal = 0;
1349 	ledgers_alt_compressed = 0;
1350 
1351 	/* invalidate the PTEs first to "freeze" them */
1352 	for (cpte = spte, vaddr = start_vaddr;
1353 	    cpte < epte;
1354 	    cpte++, vaddr += PAGE_SIZE_64) {
1355 		pt_entry_t p = *cpte;
1356 
1357 		pa = pte_to_pa(p);
1358 		if (pa == 0) {
1359 			if ((options & PMAP_OPTIONS_REMOVE) &&
1360 			    (PTE_IS_COMPRESSED(p, cpte, pmap, vaddr))) {
1361 				assert(pmap != kernel_pmap);
1362 				/* one less "compressed"... */
1363 				ledgers_compressed++;
1364 				if (p & PTE_COMPRESSED_ALT) {
1365 					/* ... but it used to be "ALTACCT" */
1366 					ledgers_alt_compressed++;
1367 				}
1368 				/* clear marker(s) */
1369 				/* XXX probably does not need to be atomic! */
1370 				pmap_update_pte(is_ept, cpte, INTEL_PTE_COMPRESSED_MASK, 0, true);
1371 			}
1372 			continue;
1373 		}
1374 		num_found++;
1375 
1376 		if (iswired(p)) {
1377 			num_unwired++;
1378 		}
1379 
1380 		pai = pa_index(pa);
1381 
1382 		if (!IS_MANAGED_PAGE(pai)) {
1383 			/*
1384 			 *	Outside range of managed physical memory.
1385 			 *	Just remove the mappings.
1386 			 */
1387 			pmap_store_pte(is_ept, cpte, 0);
1388 			continue;
1389 		}
1390 
1391 		if ((p & PTE_VALID_MASK(is_ept)) == 0) {
1392 			num_invalid++;
1393 		}
1394 
1395 		/* invalidate the PTE */
1396 		pmap_update_pte(is_ept, cpte, PTE_VALID_MASK(is_ept), 0, true);
1397 	}
1398 
1399 	if (num_found == 0) {
1400 		/* nothing was changed: we're done */
1401 		goto update_counts;
1402 	}
1403 
1404 	/* propagate the invalidates to other CPUs */
1405 
1406 	PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr);
1407 
1408 	for (cpte = spte, vaddr = start_vaddr;
1409 	    cpte < epte;
1410 	    cpte++, vaddr += PAGE_SIZE_64) {
1411 		pa = pte_to_pa(*cpte);
1412 		if (pa == 0) {
1413 check_pte_for_compressed_marker:
1414 			/*
1415 			 * This PTE could have been replaced with a
1416 			 * "compressed" marker after our first "freeze"
1417 			 * loop above, so check again.
1418 			 */
1419 			if ((options & PMAP_OPTIONS_REMOVE) &&
1420 			    (PTE_IS_COMPRESSED(*cpte, cpte, pmap, vaddr))) {
1421 				assert(pmap != kernel_pmap);
1422 				/* one less "compressed"... */
1423 				ledgers_compressed++;
1424 				if (*cpte & PTE_COMPRESSED_ALT) {
1425 					/* ... but it used to be "ALTACCT" */
1426 					ledgers_alt_compressed++;
1427 				}
1428 				pmap_store_pte(is_ept, cpte, 0);
1429 			}
1430 			continue;
1431 		}
1432 
1433 		pai = pa_index(pa);
1434 
1435 		LOCK_PVH(pai);
1436 
1437 		pa = pte_to_pa(*cpte);
1438 		if (pa == 0) {
1439 			UNLOCK_PVH(pai);
1440 			goto check_pte_for_compressed_marker;
1441 		}
1442 
1443 		/*
1444 		 * Remove the mapping from the pvlist for this physical page.
1445 		 */
1446 		pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, cpte, &was_altacct);
1447 
1448 		num_removed++;
1449 		/* update ledgers */
1450 		if (was_altacct) {
1451 			/* internal and alternate accounting */
1452 			assert(IS_INTERNAL_PAGE(pai));
1453 			ledgers_internal++;
1454 			ledgers_alt_internal++;
1455 		} else if (IS_REUSABLE_PAGE(pai)) {
1456 			/* internal but reusable */
1457 			assert(!was_altacct);
1458 			assert(IS_INTERNAL_PAGE(pai));
1459 			ledgers_reusable++;
1460 		} else if (IS_INTERNAL_PAGE(pai)) {
1461 			/* internal */
1462 			assert(!was_altacct);
1463 			assert(!IS_REUSABLE_PAGE(pai));
1464 			ledgers_internal++;
1465 		} else {
1466 			/* not internal */
1467 			ledgers_external++;
1468 		}
1469 
1470 		/*
1471 		 * Get the modify and reference bits, then
1472 		 * nuke the entry in the page table
1473 		 */
1474 		/* remember reference and change */
1475 		if (!is_ept) {
1476 			pmap_phys_attributes[pai] |=
1477 			    *cpte & (PHYS_MODIFIED | PHYS_REFERENCED);
1478 		} else {
1479 			pmap_phys_attributes[pai] |=
1480 			    ept_refmod_to_physmap((*cpte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
1481 		}
1482 
1483 		/* completely invalidate the PTE */
1484 		pmap_store_pte(is_ept, cpte, 0);
1485 
1486 		UNLOCK_PVH(pai);
1487 
1488 		if (pvh_e != PV_HASHED_ENTRY_NULL) {
1489 			pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1490 			pvh_eh = pvh_e;
1491 
1492 			if (pvh_et == PV_HASHED_ENTRY_NULL) {
1493 				pvh_et = pvh_e;
1494 			}
1495 			pvh_cnt++;
1496 		}
1497 		/* We can encounter at most 'num_found' PTEs for this level
1498 		 * Fewer may be encountered if some were replaced by
1499 		 * compressed markers. No new valid PTEs can be created
1500 		 * since the pmap lock is held exclusively.
1501 		 */
1502 		if (num_removed == num_found) {
1503 			break;
1504 		}
1505 	} /* for loop */
1506 
1507 	if (pvh_eh != PV_HASHED_ENTRY_NULL) {
1508 		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
1509 	}
1510 update_counts:
1511 	/*
1512 	 *	Update the counts
1513 	 */
1514 #if TESTING
1515 	if (pmap->stats.resident_count < num_removed) {
1516 		panic("pmap_remove_range: resident_count");
1517 	}
1518 #endif
1519 	if (num_removed) {
1520 		pmap_ledger_debit(pmap, task_ledgers.phys_mem, machine_ptob(num_removed));
1521 	}
1522 
1523 	if (pmap != kernel_pmap) {
1524 		if (ledgers_external) {
1525 			pmap_ledger_debit(pmap,
1526 			    task_ledgers.external,
1527 			    machine_ptob(ledgers_external));
1528 		}
1529 		if (ledgers_reusable) {
1530 			pmap_ledger_debit(pmap,
1531 			    task_ledgers.reusable,
1532 			    machine_ptob(ledgers_reusable));
1533 		}
1534 		if (ledgers_internal) {
1535 			pmap_ledger_debit(pmap,
1536 			    task_ledgers.internal,
1537 			    machine_ptob(ledgers_internal));
1538 		}
1539 		if (ledgers_compressed) {
1540 			pmap_ledger_debit(pmap,
1541 			    task_ledgers.internal_compressed,
1542 			    machine_ptob(ledgers_compressed));
1543 		}
1544 		if (ledgers_alt_internal) {
1545 			pmap_ledger_debit(pmap,
1546 			    task_ledgers.alternate_accounting,
1547 			    machine_ptob(ledgers_alt_internal));
1548 		}
1549 		if (ledgers_alt_compressed) {
1550 			pmap_ledger_debit(pmap,
1551 			    task_ledgers.alternate_accounting_compressed,
1552 			    machine_ptob(ledgers_alt_compressed));
1553 		}
1554 
1555 		uint64_t net_debit = (ledgers_internal - ledgers_alt_internal) + (ledgers_compressed - ledgers_alt_compressed);
1556 		if (net_debit) {
1557 			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, machine_ptob(net_debit));
1558 		}
1559 	}
1560 
1561 	if (num_unwired != 0) {
1562 		pmap_ledger_debit(pmap, task_ledgers.wired_mem, machine_ptob(num_unwired));
1563 	}
1564 	return;
1565 }
1566 
1567 
1568 /*
1569  *	Remove the given range of addresses
1570  *	from the specified map.
1571  *
1572  *	It is assumed that the start and end are properly
1573  *	rounded to the hardware page size.
1574  */
1575 void
pmap_remove(pmap_t map,addr64_t s64,addr64_t e64)1576 pmap_remove(
1577 	pmap_t          map,
1578 	addr64_t        s64,
1579 	addr64_t        e64)
1580 {
1581 	pmap_remove_options(map, s64, e64, PMAP_OPTIONS_REMOVE);
1582 }
1583 #define PLCHECK_THRESHOLD (2)
1584 
1585 void
pmap_remove_options(pmap_t map,addr64_t s64,addr64_t e64,int options)1586 pmap_remove_options(
1587 	pmap_t          map,
1588 	addr64_t        s64,
1589 	addr64_t        e64,
1590 	int             options)
1591 {
1592 	pt_entry_t     *pde;
1593 	pt_entry_t     *spte, *epte;
1594 	addr64_t        l64;
1595 	uint64_t        deadline = 0;
1596 	boolean_t       is_ept;
1597 
1598 	pmap_intr_assert();
1599 
1600 	if (map == PMAP_NULL || s64 == e64) {
1601 		return;
1602 	}
1603 
1604 	is_ept = is_ept_pmap(map);
1605 
1606 	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
1607 	    VM_KERNEL_ADDRHIDE(map), VM_KERNEL_ADDRHIDE(s64),
1608 	    VM_KERNEL_ADDRHIDE(e64));
1609 
1610 	PMAP_LOCK_EXCLUSIVE(map);
1611 	uint32_t traverse_count = 0;
1612 
1613 	while (s64 < e64) {
1614 		pml4_entry_t *pml4e = pmap64_pml4(map, s64);
1615 		if ((pml4e == NULL) ||
1616 		    ((*pml4e & PTE_VALID_MASK(is_ept)) == 0)) {
1617 			if (os_add_overflow(s64, NBPML4, &s64)) {
1618 				/* wrap; clip s64 to e64 */
1619 				s64 = e64;
1620 				break;
1621 			}
1622 			s64 &= ~(PML4MASK);
1623 			continue;
1624 		}
1625 		pdpt_entry_t *pdpte = pmap64_pdpt(map, s64);
1626 		if ((pdpte == NULL) ||
1627 		    ((*pdpte & PTE_VALID_MASK(is_ept)) == 0)) {
1628 			if (os_add_overflow(s64, NBPDPT, &s64)) {
1629 				/* wrap; clip s64 to e64 */
1630 				s64 = e64;
1631 				break;
1632 			}
1633 			s64 &= ~(PDPTMASK);
1634 			continue;
1635 		}
1636 
1637 		if (os_add_overflow(s64, PDE_MAPPED_SIZE, &l64)) {
1638 			l64 = e64;
1639 		} else {
1640 			l64 &= ~(PDE_MAPPED_SIZE - 1);
1641 
1642 			if (l64 > e64) {
1643 				l64 = e64;
1644 			}
1645 		}
1646 
1647 		pde = pmap_pde(map, s64);
1648 
1649 		if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
1650 			if (*pde & PTE_PS) {
1651 				/*
1652 				 * If we're removing a superpage, pmap_remove_range()
1653 				 * must work on level 2 instead of level 1; and we're
1654 				 * only passing a single level 2 entry instead of a
1655 				 * level 1 range.
1656 				 */
1657 				spte = pde;
1658 				epte = spte + 1; /* excluded */
1659 			} else {
1660 				spte = pmap_pte(map, (s64 & ~(PDE_MAPPED_SIZE - 1)));
1661 				spte = &spte[ptenum(s64)];
1662 				epte = &spte[intel_btop(l64 - s64)];
1663 			}
1664 			pmap_remove_range_options(map, s64, spte, epte,
1665 			    options);
1666 		}
1667 		s64 = l64;
1668 
1669 		if ((s64 < e64) && (traverse_count++ > PLCHECK_THRESHOLD)) {
1670 			if (deadline == 0) {
1671 				deadline = rdtsc64_nofence() + max_preemption_latency_tsc;
1672 			} else {
1673 				if (rdtsc64_nofence() > deadline) {
1674 					PMAP_UNLOCK_EXCLUSIVE(map);
1675 					__builtin_ia32_pause();
1676 					PMAP_LOCK_EXCLUSIVE(map);
1677 					deadline = rdtsc64_nofence() + max_preemption_latency_tsc;
1678 				}
1679 			}
1680 		}
1681 	}
1682 
1683 	PMAP_UNLOCK_EXCLUSIVE(map);
1684 
1685 	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
1686 }
1687 
1688 void
pmap_page_protect(ppnum_t pn,vm_prot_t prot)1689 pmap_page_protect(
1690 	ppnum_t         pn,
1691 	vm_prot_t       prot)
1692 {
1693 	pmap_page_protect_options(pn, prot, 0, NULL);
1694 }
1695 
1696 /*
1697  *	Routine:	pmap_page_protect_options
1698  *
1699  *	Function:
1700  *		Lower the permission for all mappings to a given
1701  *		page.
1702  */
1703 void
pmap_page_protect_options(ppnum_t pn,vm_prot_t prot,unsigned int options,void * arg)1704 pmap_page_protect_options(
1705 	ppnum_t         pn,
1706 	vm_prot_t       prot,
1707 	unsigned int    options,
1708 	void            *arg)
1709 {
1710 	pv_hashed_entry_t       pvh_eh = PV_HASHED_ENTRY_NULL;
1711 	pv_hashed_entry_t       pvh_et = PV_HASHED_ENTRY_NULL;
1712 	pv_hashed_entry_t       nexth;
1713 	int                     pvh_cnt = 0;
1714 	pv_rooted_entry_t       pv_h;
1715 	pv_rooted_entry_t       pv_e;
1716 	pv_hashed_entry_t       pvh_e;
1717 	pt_entry_t              *pte;
1718 	int                     pai;
1719 	pmap_t                  pmap;
1720 	boolean_t               remove;
1721 	pt_entry_t              new_pte_value;
1722 	boolean_t               is_ept;
1723 
1724 	pmap_intr_assert();
1725 	assert(pn != vm_page_fictitious_addr);
1726 	if (pn == vm_page_guard_addr) {
1727 		return;
1728 	}
1729 
1730 	pai = ppn_to_pai(pn);
1731 
1732 	if (!IS_MANAGED_PAGE(pai)) {
1733 		/*
1734 		 *	Not a managed page.
1735 		 */
1736 		return;
1737 	}
1738 
1739 	PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, pn, prot);
1740 
1741 	/*
1742 	 * Determine the new protection.
1743 	 */
1744 	switch (prot) {
1745 	case VM_PROT_READ:
1746 	case VM_PROT_READ | VM_PROT_EXECUTE:
1747 		remove = FALSE;
1748 		break;
1749 	case VM_PROT_ALL:
1750 		return;         /* nothing to do */
1751 	default:
1752 		remove = TRUE;
1753 		break;
1754 	}
1755 
1756 	pv_h = pai_to_pvh(pai);
1757 
1758 	LOCK_PVH(pai);
1759 
1760 
1761 	/*
1762 	 * Walk down PV list, if any, changing or removing all mappings.
1763 	 */
1764 	if (pv_h->pmap == PMAP_NULL) {
1765 		goto done;
1766 	}
1767 
1768 	pv_e = pv_h;
1769 	pvh_e = (pv_hashed_entry_t) pv_e;       /* cheat */
1770 
1771 	do {
1772 		vm_map_offset_t vaddr;
1773 
1774 		if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED) &&
1775 		    (pmap_phys_attributes[pai] & PHYS_MODIFIED)) {
1776 			/* page was modified, so it will be compressed */
1777 			options &= ~PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1778 			options |= PMAP_OPTIONS_COMPRESSOR;
1779 		}
1780 
1781 		pmap = pv_e->pmap;
1782 		is_ept = is_ept_pmap(pmap);
1783 		vaddr = PVE_VA(pv_e);
1784 		pte = pmap_pte(pmap, vaddr);
1785 
1786 		pmap_assert2((pa_index(pte_to_pa(*pte)) == pn),
1787 		    "pmap_page_protect: PTE mismatch, pn: 0x%x, pmap: %p, vaddr: 0x%llx, pte: 0x%llx", pn, pmap, vaddr, *pte);
1788 
1789 		if (0 == pte) {
1790 			panic("pmap_page_protect() "
1791 			    "pmap=%p pn=0x%x vaddr=0x%llx\n",
1792 			    pmap, pn, vaddr);
1793 		}
1794 		nexth = (pv_hashed_entry_t) queue_next(&pvh_e->qlink);
1795 
1796 		/*
1797 		 * Remove the mapping if new protection is NONE
1798 		 */
1799 		if (remove) {
1800 			/* Remove per-pmap wired count */
1801 			if (iswired(*pte)) {
1802 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
1803 			}
1804 
1805 			if (pmap != kernel_pmap &&
1806 			    (options & PMAP_OPTIONS_COMPRESSOR) &&
1807 			    IS_INTERNAL_PAGE(pai)) {
1808 				assert(!PTE_IS_COMPRESSED(*pte, pte, pmap, vaddr));
1809 				/* mark this PTE as having been "compressed" */
1810 				new_pte_value = PTE_COMPRESSED;
1811 				if (IS_ALTACCT_PAGE(pai, pv_e)) {
1812 					new_pte_value |= PTE_COMPRESSED_ALT;
1813 				}
1814 			} else {
1815 				new_pte_value = 0;
1816 			}
1817 
1818 			if (options & PMAP_OPTIONS_NOREFMOD) {
1819 				pmap_store_pte(is_ept, pte, new_pte_value);
1820 
1821 				if (options & PMAP_OPTIONS_NOFLUSH) {
1822 					PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
1823 				} else {
1824 					PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1825 				}
1826 			} else {
1827 				/*
1828 				 * Remove the mapping, collecting dirty bits.
1829 				 */
1830 				pmap_update_pte(is_ept, pte, PTE_VALID_MASK(is_ept), 0, true);
1831 
1832 				PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1833 				if (!is_ept) {
1834 					pmap_phys_attributes[pai] |=
1835 					    *pte & (PHYS_MODIFIED | PHYS_REFERENCED);
1836 				} else {
1837 					pmap_phys_attributes[pai] |=
1838 					    ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
1839 				}
1840 				if ((options &
1841 				    PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED) &&
1842 				    IS_INTERNAL_PAGE(pai) &&
1843 				    (pmap_phys_attributes[pai] &
1844 				    PHYS_MODIFIED)) {
1845 					/*
1846 					 * Page is actually "modified" and
1847 					 * will be compressed.  Start
1848 					 * accounting for it as "compressed".
1849 					 */
1850 					assert(!(options & PMAP_OPTIONS_COMPRESSOR));
1851 					options &= ~PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1852 					options |= PMAP_OPTIONS_COMPRESSOR;
1853 					assert(new_pte_value == 0);
1854 					if (pmap != kernel_pmap) {
1855 						new_pte_value = PTE_COMPRESSED;
1856 						if (IS_ALTACCT_PAGE(pai, pv_e)) {
1857 							new_pte_value |= PTE_COMPRESSED_ALT;
1858 						}
1859 					}
1860 				}
1861 				pmap_store_pte(is_ept, pte, new_pte_value);
1862 			}
1863 
1864 #if TESTING
1865 			if (pmap->stats.resident_count < 1) {
1866 				panic("pmap_page_protect: resident_count");
1867 			}
1868 #endif
1869 			pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
1870 
1871 			/*
1872 			 * We only ever compress internal pages.
1873 			 */
1874 			if (options & PMAP_OPTIONS_COMPRESSOR) {
1875 				assert(IS_INTERNAL_PAGE(pai));
1876 			}
1877 			if (pmap != kernel_pmap) {
1878 				/* update ledgers */
1879 				if (IS_ALTACCT_PAGE(pai, pv_e)) {
1880 					assert(IS_INTERNAL_PAGE(pai));
1881 					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
1882 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, PAGE_SIZE);
1883 					if (options & PMAP_OPTIONS_COMPRESSOR) {
1884 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
1885 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, PAGE_SIZE);
1886 					}
1887 				} else if (IS_REUSABLE_PAGE(pai)) {
1888 					assert(!IS_ALTACCT_PAGE(pai, pv_e));
1889 					assert(IS_INTERNAL_PAGE(pai));
1890 					if (options & PMAP_OPTIONS_COMPRESSOR) {
1891 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
1892 						/* was not in footprint, but is now */
1893 						pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
1894 					}
1895 					pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
1896 				} else if (IS_INTERNAL_PAGE(pai)) {
1897 					assert(!IS_ALTACCT_PAGE(pai, pv_e));
1898 					assert(!IS_REUSABLE_PAGE(pai));
1899 					pmap_ledger_debit(pmap, task_ledgers.internal, PAGE_SIZE);
1900 					/*
1901 					 * Update all stats related to physical
1902 					 * footprint, which only deals with
1903 					 * internal pages.
1904 					 */
1905 					if (options & PMAP_OPTIONS_COMPRESSOR) {
1906 						/*
1907 						 * This removal is only being
1908 						 * done so we can send this page
1909 						 * to the compressor;  therefore
1910 						 * it mustn't affect total task
1911 						 * footprint.
1912 						 */
1913 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
1914 					} else {
1915 						/*
1916 						 * This internal page isn't
1917 						 * going to the compressor,
1918 						 * so adjust stats to keep
1919 						 * phys_footprint up to date.
1920 						 */
1921 						pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
1922 					}
1923 				} else {
1924 					pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
1925 				}
1926 			}
1927 
1928 			/*
1929 			 * Deal with the pv_rooted_entry.
1930 			 */
1931 
1932 			if (pv_e == pv_h) {
1933 				/*
1934 				 * Fix up head later.
1935 				 */
1936 				pv_h->pmap = PMAP_NULL;
1937 			} else {
1938 				/*
1939 				 * Delete this entry.
1940 				 */
1941 				pv_hash_remove(pvh_e);
1942 				pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1943 				pvh_eh = pvh_e;
1944 
1945 				if (pvh_et == PV_HASHED_ENTRY_NULL) {
1946 					pvh_et = pvh_e;
1947 				}
1948 				pvh_cnt++;
1949 			}
1950 		} else {
1951 			/*
1952 			 * Write-protect, after opportunistic refmod collect
1953 			 */
1954 			if (!is_ept) {
1955 				pmap_phys_attributes[pai] |=
1956 				    *pte & (PHYS_MODIFIED | PHYS_REFERENCED);
1957 			} else {
1958 				pmap_phys_attributes[pai] |=
1959 				    ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED);
1960 			}
1961 
1962 			pmap_update_pte(is_ept, pte, PTE_WRITE(is_ept), 0, true);
1963 			if (options & PMAP_OPTIONS_NOFLUSH) {
1964 				PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
1965 			} else {
1966 				PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1967 			}
1968 		}
1969 		pvh_e = nexth;
1970 	} while ((pv_e = (pv_rooted_entry_t) nexth) != pv_h);
1971 
1972 
1973 	/*
1974 	 * If pv_head mapping was removed, fix it up.
1975 	 */
1976 	if (pv_h->pmap == PMAP_NULL) {
1977 		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
1978 
1979 		if (pvh_e != (pv_hashed_entry_t) pv_h) {
1980 			pv_hash_remove(pvh_e);
1981 			pv_h->pmap = pvh_e->pmap;
1982 			pv_h->va_and_flags = pvh_e->va_and_flags;
1983 			pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1984 			pvh_eh = pvh_e;
1985 
1986 			if (pvh_et == PV_HASHED_ENTRY_NULL) {
1987 				pvh_et = pvh_e;
1988 			}
1989 			pvh_cnt++;
1990 		}
1991 	}
1992 	if (pvh_eh != PV_HASHED_ENTRY_NULL) {
1993 		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
1994 	}
1995 done:
1996 	UNLOCK_PVH(pai);
1997 
1998 	PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
1999 }
2000 
2001 
/*
 *	Clear specified attribute bits.
 *
 *	Walks every mapping of physical page 'pn' and clears the requested
 *	PHYS_REFERENCED/PHYS_MODIFIED bits in each PTE (optionally also
 *	removing write permission via PMAP_OPTIONS_CLEAR_WRITE), folding the
 *	pre-clear ref/mod state into the page's global attribute byte.  May
 *	also toggle the page's "reusable" state, adjusting per-pmap ledgers
 *	to match.
 *
 *	pn:      physical page number whose attributes are to be cleared.
 *	bits:    PHYS_REFERENCED and/or PHYS_MODIFIED only (asserted below).
 *	options: PMAP_OPTIONS_* flags controlling TLB flushing, write
 *	         clearing, and reusable-state transitions.
 *	arg:     pmap_flush_context for deferred TLB flushes when
 *	         PMAP_OPTIONS_NOFLUSH is set; otherwise unused.
 */
void
phys_attribute_clear(
	ppnum_t         pn,
	int             bits,
	unsigned int    options,
	void            *arg)
{
	pv_rooted_entry_t       pv_h;
	pv_hashed_entry_t       pv_e;
	pt_entry_t              *pte = NULL;
	int                     pai;
	pmap_t                  pmap;
	char                    attributes = 0;
	boolean_t               is_internal, is_reusable, is_altacct, is_ept;
	int                     ept_bits_to_clear;
	boolean_t               ept_keep_global_mod = FALSE;

	/*
	 * Clearing 'modified' with neither an immediate flush nor a
	 * deferred-flush context would leave stale TLB 'D'-bit shadows
	 * behind, so treat that combination as a caller bug.
	 */
	if ((bits & PHYS_MODIFIED) &&
	    (options & PMAP_OPTIONS_NOFLUSH) &&
	    arg == NULL) {
		panic("phys_attribute_clear(0x%x,0x%x,0x%x,%p): "
		    "should not clear 'modified' without flushing TLBs\n",
		    pn, bits, options, arg);
	}

	/* We only support converting MOD and REF bits for EPT PTEs in this function */
	assert((bits & ~(PHYS_REFERENCED | PHYS_MODIFIED)) == 0);

	/* Translate the generic ref/mod bits into their EPT PTE encoding. */
	ept_bits_to_clear = (unsigned)physmap_refmod_to_ept(bits & (PHYS_MODIFIED | PHYS_REFERENCED));

	pmap_intr_assert();
	assert(pn != vm_page_fictitious_addr);
	if (pn == vm_page_guard_addr) {
		return;
	}

	pai = ppn_to_pai(pn);

	if (!IS_MANAGED_PAGE(pai)) {
		/*
		 *	Not a managed page: no PV list or attributes to maintain.
		 */
		return;
	}

	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

	pv_h = pai_to_pvh(pai);

	LOCK_PVH(pai);


	/*
	 * Walk down PV list, clearing all modify or reference bits.
	 * We do not have to lock the pv_list because we have
	 * the per-pmap lock
	 */
	if (pv_h->pmap != PMAP_NULL) {
		/*
		 * There are some mappings.
		 */

		is_internal = IS_INTERNAL_PAGE(pai);
		is_reusable = IS_REUSABLE_PAGE(pai);

		pv_e = (pv_hashed_entry_t)pv_h;

		do {
			vm_map_offset_t va;
			char pte_bits;

			pmap = pv_e->pmap;
			is_ept = is_ept_pmap(pmap);
			is_altacct = IS_ALTACCT_PAGE(pai, pv_e);
			va = PVE_VA(pv_e);
			pte_bits = 0;

			if (bits) {
				pte = pmap_pte(pmap, va);
				/* grab ref/mod bits from this PTE */
				pte_bits = (*pte & (PTE_REF(is_ept) | PTE_MOD(is_ept)));
				/* propagate to page's global attributes */
				if (!is_ept) {
					attributes |= pte_bits;
				} else {
					attributes |= ept_refmod_to_physmap(pte_bits);
					/*
					 * Without hardware A/D support for EPT,
					 * a set 'modified' bit must survive in
					 * the page's global attributes (see the
					 * ept_keep_global_mod handling below).
					 */
					if (!pmap_ept_support_ad && (pte_bits & INTEL_EPT_MOD)) {
						ept_keep_global_mod = TRUE;
					}
				}
				/* which bits to clear for this PTE? */
				if (!is_ept) {
					pte_bits &= bits;
				} else {
					pte_bits &= ept_bits_to_clear;
				}
			}
			if (options & PMAP_OPTIONS_CLEAR_WRITE) {
				pte_bits |= PTE_WRITE(is_ept);
			}

			/*
			 * Clear modify and/or reference bits.
			 */
			if (pte_bits) {
				pmap_update_pte(is_ept, pte, pte_bits, 0, true);

				/* Ensure all processors using this translation
				 * invalidate this TLB entry. The invalidation
				 * *must* follow the PTE update, to ensure that
				 * the TLB shadow of the 'D' bit (in particular)
				 * is synchronized with the updated PTE.
				 */
				if (!(options & PMAP_OPTIONS_NOFLUSH)) {
					/* flush TLBS now */
					PMAP_UPDATE_TLBS(pmap,
					    va,
					    va + PAGE_SIZE);
				} else if (arg) {
					/* delayed TLB flush: add "pmap" info */
					PMAP_UPDATE_TLBS_DELAYED(
						pmap,
						va,
						va + PAGE_SIZE,
						(pmap_flush_context *)arg);
				} else {
					/* no TLB flushing at all */
				}
			}

			/* update pmap "reusable" stats */
			if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
			    is_reusable &&
			    pmap != kernel_pmap) {
				/* one less "reusable" */
				pmap_ledger_debit(pmap, task_ledgers.reusable, PAGE_SIZE);
				if (is_internal) {
					/* one more "internal" */
					if (is_altacct) {
						/* no impact on ledgers */
					} else {
						pmap_ledger_credit(pmap,
						    task_ledgers.internal,
						    PAGE_SIZE);
						pmap_ledger_credit(
							pmap,
							task_ledgers.phys_footprint,
							PAGE_SIZE);
					}
				} else {
					/* one more "external" */
					pmap_ledger_credit(pmap, task_ledgers.external, PAGE_SIZE);
				}
			} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
			    !is_reusable &&
			    pmap != kernel_pmap) {
				/* one more "reusable" */
				pmap_ledger_credit(pmap, task_ledgers.reusable, PAGE_SIZE);
				if (is_internal) {
					/* one less "internal" */
					if (is_altacct) {
						/* no impact on footprint */
					} else {
						pmap_ledger_debit(pmap,
						    task_ledgers.internal,
						    PAGE_SIZE);
						pmap_ledger_debit(
							pmap,
							task_ledgers.phys_footprint,
							PAGE_SIZE);
					}
				} else {
					/* one less "external" */
					pmap_ledger_debit(pmap, task_ledgers.external, PAGE_SIZE);
				}
			}

			pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
		} while (pv_e != (pv_hashed_entry_t)pv_h);
	}
	/* Opportunistic refmod collection, annulled
	 * if both REF and MOD are being cleared.
	 */

	pmap_phys_attributes[pai] |= attributes;

	if (ept_keep_global_mod) {
		/*
		 * If the hardware doesn't support AD bits for EPT PTEs and someone is
		 * requesting that we clear the modified bit for a phys page, we need
		 * to ensure that there are no EPT mappings for the page with the
		 * modified bit set. If there are, we cannot clear the global modified bit.
		 */
		bits &= ~PHYS_MODIFIED;
	}
	pmap_phys_attributes[pai] &= ~(bits);

	/* update this page's "reusable" status */
	if (options & PMAP_OPTIONS_CLEAR_REUSABLE) {
		pmap_phys_attributes[pai] &= ~PHYS_REUSABLE;
	} else if (options & PMAP_OPTIONS_SET_REUSABLE) {
		pmap_phys_attributes[pai] |= PHYS_REUSABLE;
	}

	UNLOCK_PVH(pai);

	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
2213 
/*
 *	Check specified attribute bits.
 *
 *	Returns the subset of 'bits' (PHYS_MODIFIED and/or PHYS_REFERENCED
 *	only) that is set for physical page 'pn'.  Consults the page's
 *	cached global attributes first; if those don't already satisfy the
 *	query, walks the PV list and picks up live ref/mod bits from each
 *	mapping's PTE, caching whatever it finds back into the global
 *	attribute byte.
 */
int
phys_attribute_test(
	ppnum_t         pn,
	int             bits)
{
	pv_rooted_entry_t       pv_h;
	pv_hashed_entry_t       pv_e;
	pt_entry_t              *pte;
	int                     pai;
	pmap_t                  pmap;
	int                     attributes = 0;
	boolean_t               is_ept;

	pmap_intr_assert();
	assert(pn != vm_page_fictitious_addr);
	assert((bits & ~(PHYS_MODIFIED | PHYS_REFERENCED)) == 0);
	if (pn == vm_page_guard_addr) {
		return 0;
	}

	pai = ppn_to_pai(pn);

	if (!IS_MANAGED_PAGE(pai)) {
		/*
		 *	Not a managed page: no attributes are tracked.
		 */
		return 0;
	}

	/*
	 * Fast check...  if bits already collected
	 * no need to take any locks...
	 * if not set, we need to recheck after taking
	 * the lock in case they got pulled in while
	 * we were waiting for the lock
	 */
	if ((pmap_phys_attributes[pai] & bits) == bits) {
		return bits;
	}

	pv_h = pai_to_pvh(pai);

	LOCK_PVH(pai);

	/* Re-read under the lock: the bits may have been collected meanwhile. */
	attributes = pmap_phys_attributes[pai] & bits;


	/*
	 * Walk down PV list, checking the mappings until we
	 * reach the end or we've found the desired attributes.
	 */
	if (attributes != bits &&
	    pv_h->pmap != PMAP_NULL) {
		/*
		 * There are some mappings.
		 */
		pv_e = (pv_hashed_entry_t)pv_h;
		do {
			vm_map_offset_t va;

			pmap = pv_e->pmap;
			is_ept = is_ept_pmap(pmap);
			va = PVE_VA(pv_e);
			/*
			 * pick up modify and/or reference bits from mapping
			 */

			pte = pmap_pte(pmap, va);
			if (!is_ept) {
				attributes |= (int)(*pte & bits);
			} else {
				attributes |= (int)(ept_refmod_to_physmap((*pte & (INTEL_EPT_REF | INTEL_EPT_MOD))) & (PHYS_MODIFIED | PHYS_REFERENCED));
			}

			pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
		} while ((attributes != bits) &&
		    (pv_e != (pv_hashed_entry_t)pv_h));
	}
	/* Cache what we found so a future fast check can succeed lock-free. */
	pmap_phys_attributes[pai] |= attributes;

	UNLOCK_PVH(pai);
	return attributes;
}
2300 
2301 /*
2302  *	Routine:	pmap_change_wiring
2303  *	Function:	Change the wiring attribute for a map/virtual-address
2304  *			pair.
2305  *	In/out conditions:
2306  *			The mapping must already exist in the pmap.
2307  */
2308 void
pmap_change_wiring(pmap_t map,vm_map_offset_t vaddr,boolean_t wired)2309 pmap_change_wiring(
2310 	pmap_t          map,
2311 	vm_map_offset_t vaddr,
2312 	boolean_t       wired)
2313 {
2314 	pt_entry_t      *pte;
2315 
2316 	PMAP_LOCK_SHARED(map);
2317 
2318 	if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL) {
2319 		panic("pmap_change_wiring(%p,0x%llx,%d): pte missing",
2320 		    map, vaddr, wired);
2321 	}
2322 
2323 	if (wired && !iswired(*pte)) {
2324 		/*
2325 		 * wiring down mapping
2326 		 */
2327 		pmap_ledger_credit(map, task_ledgers.wired_mem, PAGE_SIZE);
2328 		pmap_update_pte(is_ept_pmap(map), pte, 0, PTE_WIRED, false);
2329 	} else if (!wired && iswired(*pte)) {
2330 		/*
2331 		 * unwiring mapping
2332 		 */
2333 		pmap_ledger_debit(map, task_ledgers.wired_mem, PAGE_SIZE);
2334 		pmap_update_pte(is_ept_pmap(map), pte, PTE_WIRED, 0, false);
2335 	}
2336 
2337 	PMAP_UNLOCK_SHARED(map);
2338 }
2339 
/*
 *	"Backdoor" direct map routine for early mappings.
 *      Useful for mapping memory outside the range
 *      Sets A, D and NC if requested
 *
 *	Maps the physical range [start_addr, end_addr) at kernel virtual
 *	address 'virt' by writing PTEs directly into the kernel pmap's
 *	existing page tables (panics if a PTE slot is missing).  Returns the
 *	virtual address one byte past the last page mapped.  TLBs are
 *	flushed only if a previously-valid mapping was overwritten.
 */

vm_offset_t
pmap_map_bd(
	vm_offset_t     virt,
	vm_map_offset_t start_addr,
	vm_map_offset_t end_addr,
	vm_prot_t       prot,
	unsigned int    flags)
{
	pt_entry_t      template;
	pt_entry_t      *ptep;

	vm_offset_t     base = virt;
	boolean_t       doflush = FALSE;

	/* Pre-set accessed/dirty, wire and validate the template PTE. */
	template = pa_to_pte(start_addr)
	    | INTEL_PTE_REF
	    | INTEL_PTE_MOD
	    | INTEL_PTE_WIRED
	    | INTEL_PTE_VALID;

	/* Non-cacheable mapping requested (and not overridden by default WIMG). */
	if ((flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)) == VM_MEM_NOT_CACHEABLE) {
		template |= INTEL_PTE_NCACHE;
		if (!(flags & (VM_MEM_GUARDED))) {
			template |= INTEL_PTE_PAT;
		}
	}

	if ((prot & VM_PROT_EXECUTE) == 0) {
		template |= INTEL_PTE_NX;
	}

	if (prot & VM_PROT_WRITE) {
		template |= INTEL_PTE_WRITE;
	}
	vm_map_offset_t caddr = start_addr;
	while (caddr < end_addr) {
		ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)virt);
		if (ptep == PT_ENTRY_NULL) {
			panic("pmap_map_bd: Invalid kernel address");
		}
		/* Replacing a live translation means stale TLB entries may exist. */
		if (pte_to_pa(*ptep)) {
			doflush = TRUE;
		}
		pmap_store_pte(FALSE, ptep, template);
		/* Advance the template's physical address to the next page. */
		pte_increment_pa(template);
		virt += PAGE_SIZE;
		caddr += PAGE_SIZE;
	}
	if (doflush) {
		pmap_tlbi_range(0, ~0ULL, true, 0);
		PMAP_UPDATE_TLBS(kernel_pmap, base, base + end_addr - start_addr);
	}
	/* 'virt' has been advanced past the final page mapped. */
	return virt;
}
2400 
/* Create a virtual alias beginning at 'ava' of the specified kernel virtual
 * range. The aliased pagetable range is expanded if
 * PMAP_EXPAND_OPTIONS_ALIASMAP is specified. Performs no synchronization,
 * assumes caller has stabilized the source and destination ranges. Currently
 * used to populate sections of the trampoline "doublemap" at CPU startup.
 *
 * ava:        destination (alias) kernel virtual address.
 * start_addr/end_addr: page-aligned source kernel virtual range (asserted).
 * prot:       protection applied to the alias PTEs (execute/write honored).
 * eoptions:   PMAP_EXPAND_OPTIONS_ALIASMAP to allow on-demand table growth.
 */

void
pmap_alias(
	vm_offset_t     ava,
	vm_map_offset_t start_addr,
	vm_map_offset_t end_addr,
	vm_prot_t       prot,
	unsigned int    eoptions)
{
	pt_entry_t      prot_template, template;
	pt_entry_t      *aptep, *sptep;

	/* Base template: accessed/dirty preset, wired, valid. */
	prot_template =  INTEL_PTE_REF | INTEL_PTE_MOD | INTEL_PTE_WIRED | INTEL_PTE_VALID;
	if ((prot & VM_PROT_EXECUTE) == 0) {
		prot_template |= INTEL_PTE_NX;
	}

	if (prot & VM_PROT_WRITE) {
		prot_template |= INTEL_PTE_WRITE;
	}
	assert(((start_addr | end_addr) & PAGE_MASK) == 0);
	while (start_addr < end_addr) {
		aptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ava);
		if (aptep == PT_ENTRY_NULL) {
			/* No page table yet for the alias VA: grow it if allowed. */
			if (eoptions & PMAP_EXPAND_OPTIONS_ALIASMAP) {
				pmap_expand(kernel_pmap, ava, PMAP_EXPAND_OPTIONS_ALIASMAP);
				aptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ava);
			} else {
				panic("pmap_alias: Invalid alias address");
			}
		}
		/* The aliased range should not have any active mappings */
		assert(pte_to_pa(*aptep) == 0);

		/* Source must be mapped; reuse its physical page with new prot. */
		sptep = pmap_pte(kernel_pmap, start_addr);
		assert(sptep != PT_ENTRY_NULL && (pte_to_pa(*sptep) != 0));
		template = pa_to_pte(pte_to_pa(*sptep)) | prot_template;
		pmap_store_pte(FALSE, aptep, template);

		ava += PAGE_SIZE;
		start_addr += PAGE_SIZE;
	}
}
2450 
/*
 * Count the bytes resident (and, optionally, compressed) in 'pmap' over
 * the virtual range [s64, e64).  Walks the page tables one PDE span at a
 * time, counting each PTE with a physical address as resident and each
 * PTE marked PTE_COMPRESSED as compressed.  Periodically drops and
 * re-takes the pmap lock to bound preemption latency.  Returns resident
 * bytes; compressed bytes are stored via 'compressed_bytes_p' if non-NULL.
 * The kernel pmap and empty ranges report zero.
 */
mach_vm_size_t
pmap_query_resident(
	pmap_t          pmap,
	addr64_t        s64,
	addr64_t        e64,
	mach_vm_size_t  *compressed_bytes_p)
{
	pt_entry_t     *pde;
	pt_entry_t     *spte, *epte;
	addr64_t        l64;
	uint64_t        deadline = 0;
	mach_vm_size_t  resident_bytes;
	mach_vm_size_t  compressed_bytes;
	boolean_t       is_ept;

	pmap_intr_assert();

	if (pmap == PMAP_NULL || pmap == kernel_pmap || s64 == e64) {
		if (compressed_bytes_p) {
			*compressed_bytes_p = 0;
		}
		return 0;
	}

	is_ept = is_ept_pmap(pmap);

	PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(s64),
	    VM_KERNEL_ADDRHIDE(e64));

	resident_bytes = 0;
	compressed_bytes = 0;

	PMAP_LOCK_EXCLUSIVE(pmap);
	uint32_t traverse_count = 0;

	while (s64 < e64) {
		/*
		 * Compute 'l64', the end of this iteration's span: the next
		 * PDE boundary after s64, clamped to e64, with the add
		 * checked for address-space wraparound.
		 */
		if (os_add_overflow(s64, PDE_MAPPED_SIZE, &l64)) {
			l64 = e64;
		} else {
			l64 &= ~(PDE_MAPPED_SIZE - 1);

			if (l64 > e64) {
				l64 = e64;
			}
		}

		pde = pmap_pde(pmap, s64);

		if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
			if (*pde & PTE_PS) {
				/* superpage: not supported */
			} else {
				/* Scan the leaf PTEs covering [s64, l64). */
				spte = pmap_pte(pmap,
				    (s64 & ~(PDE_MAPPED_SIZE - 1)));
				spte = &spte[ptenum(s64)];
				epte = &spte[intel_btop(l64 - s64)];

				for (; spte < epte; spte++) {
					if (pte_to_pa(*spte) != 0) {
						resident_bytes += PAGE_SIZE;
					} else if (*spte & PTE_COMPRESSED) {
						compressed_bytes += PAGE_SIZE;
					}
				}
			}
		}
		s64 = l64;

		/*
		 * After PLCHECK_THRESHOLD spans, start watching elapsed TSC
		 * time and briefly drop the lock when the preemption-latency
		 * budget is exceeded, so other lock waiters can make progress.
		 */
		if ((s64 < e64) && (traverse_count++ > PLCHECK_THRESHOLD)) {
			if (deadline == 0) {
				deadline = rdtsc64() + max_preemption_latency_tsc;
			} else {
				if (rdtsc64() > deadline) {
					PMAP_UNLOCK_EXCLUSIVE(pmap);
					__builtin_ia32_pause();
					PMAP_LOCK_EXCLUSIVE(pmap);
					deadline = rdtsc64() + max_preemption_latency_tsc;
				}
			}
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(pmap);

	PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
	    resident_bytes);

	if (compressed_bytes_p) {
		*compressed_bytes_p = compressed_bytes;
	}
	return resident_bytes;
}
2544 
/* Number of times a concurrent PTE change forced pmap_query_page_info
 * to re-read and re-classify the entry. */
uint64_t pmap_query_page_info_retries;

/*
 * Report the disposition of the page mapped at 'va' in 'pmap' via
 * '*disp_p' as a combination of PMAP_QUERY_PAGE_* flags (present,
 * compressed, internal, reusable, alt-accounted).  Returns
 * KERN_INVALID_ARGUMENT for a NULL or kernel pmap (disp set to 0),
 * KERN_SUCCESS otherwise.  If the PTE changes while being classified,
 * the query is retried (counted in pmap_query_page_info_retries).
 */
kern_return_t
pmap_query_page_info(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
	int             disp;
	boolean_t       is_ept;
	pmap_paddr_t    pa;
	ppnum_t         pai;
	pd_entry_t      *pde_p;
	pt_entry_t      *pte_p, pte;

	pmap_intr_assert();
	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		*disp_p = 0;
		return KERN_INVALID_ARGUMENT;
	}

	disp = 0;
	is_ept = is_ept_pmap(pmap);

	PMAP_LOCK_EXCLUSIVE(pmap);

	/* No valid (non-superpage) PDE means nothing is mapped here. */
	pde_p = pmap_pde(pmap, va);
	if (!pde_p ||
	    !(*pde_p & PTE_VALID_MASK(is_ept)) ||
	    (*pde_p & PTE_PS)) {
		goto done;
	}

try_again:
	disp = 0;

	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}

	/* Snapshot the PTE; we re-check it below to detect a race. */
	pte = *pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		if (PTE_IS_COMPRESSED(pte, pte_p, pmap, va)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		if (!IS_MANAGED_PAGE(pai)) {
		} else if (pmap_pv_is_altacct(pmap, va, pai)) {
			assert(IS_INTERNAL_PAGE(pai));
			disp |= PMAP_QUERY_PAGE_INTERNAL;
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (IS_REUSABLE_PAGE(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (IS_INTERNAL_PAGE(pai)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
	}
	if (__improbable(pte_p != pmap_pte(pmap, va) || pte != *pte_p)) {
		/* something changed: try again */
		pmap_query_page_info_retries++;
		goto try_again;
	}
done:
	PMAP_UNLOCK_EXCLUSIVE(pmap);
	*disp_p = disp;
	return KERN_SUCCESS;
}
2619 
/*
 * Record whether the VM map backed by 'pmap' has code-signing enforcement
 * enabled.  The exclusive pmap lock serializes the update with readers of
 * the pmap's state.
 */
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
	PMAP_LOCK_EXCLUSIVE(pmap);
	pmap->pm_vm_map_cs_enforced = new_value;
	PMAP_UNLOCK_EXCLUSIVE(pmap);
}
2629 extern int cs_process_enforcement_enable;
2630 bool
pmap_get_vm_map_cs_enforced(pmap_t pmap)2631 pmap_get_vm_map_cs_enforced(
2632 	pmap_t pmap)
2633 {
2634 	if (cs_process_enforcement_enable) {
2635 		return true;
2636 	}
2637 	return pmap->pm_vm_map_cs_enforced;
2638 }
2639 
2640 void
pmap_set_jit_entitled(__unused pmap_t pmap)2641 pmap_set_jit_entitled(__unused pmap_t pmap)
2642 {
2643 	/* The x86 pmap layer does not care if a map has a JIT entry. */
2644 	return;
2645 }
2646 
2647 bool
pmap_get_jit_entitled(__unused pmap_t pmap)2648 pmap_get_jit_entitled(__unused pmap_t pmap)
2649 {
2650 	/* The x86 pmap layer does not care if a map is using JIT. */
2651 	return false;
2652 }
2653 
2654 void
pmap_set_tpro(__unused pmap_t pmap)2655 pmap_set_tpro(__unused pmap_t pmap)
2656 {
2657 	/* The x86 pmap layer does not care if a map is using TPRO */
2658 	return;
2659 }
2660 
2661 bool
pmap_get_tpro(__unused pmap_t pmap)2662 pmap_get_tpro(__unused pmap_t pmap)
2663 {
2664 	/* The x86 pmap layer does not care if a map is using TPRO */
2665 	return false;
2666 }
2667 
2668 bool
pmap_has_prot_policy(__unused pmap_t pmap,__unused bool translated_allow_execute,__unused vm_prot_t prot)2669 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
2670 {
2671 	/*
2672 	 * The x86 pmap layer does not apply any policy to any protection
2673 	 * types.
2674 	 */
2675 	return false;
2676 }
2677 
/* No fast page-release path exists on x86; report zero pages reclaimed. */
uint64_t
pmap_release_pages_fast(void)
{
	return 0ULL;
}
2683 
2684 void
pmap_trim(__unused pmap_t grand,__unused pmap_t subord,__unused addr64_t vstart,__unused uint64_t size)2685 pmap_trim(__unused pmap_t grand, __unused pmap_t subord, __unused addr64_t vstart, __unused uint64_t size)
2686 {
2687 	return;
2688 }
2689 
2690 __dead2
2691 void
pmap_ledger_verify_size(size_t size)2692 pmap_ledger_verify_size(size_t size)
2693 {
2694 	panic("%s: unsupported, "
2695 	    "size=%lu",
2696 	    __func__, size);
2697 }
2698 
2699 __dead2
2700 ledger_t
pmap_ledger_alloc(void)2701 pmap_ledger_alloc(void)
2702 {
2703 	panic("%s: unsupported",
2704 	    __func__);
2705 }
2706 
2707 __dead2
2708 void
pmap_ledger_free(ledger_t ledger)2709 pmap_ledger_free(ledger_t ledger)
2710 {
2711 	panic("%s: unsupported, "
2712 	    "ledger=%p",
2713 	    __func__, ledger);
2714 }
2715 
2716 kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused,void * bufp __unused,void * buf_end __unused,unsigned int level_mask __unused,size_t * bytes_copied __unused)2717 pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
2718     unsigned int level_mask __unused, size_t *bytes_copied __unused)
2719 {
2720 	return KERN_NOT_SUPPORTED;
2721 }
2722 
2723 void *
pmap_map_compressor_page(ppnum_t pn)2724 pmap_map_compressor_page(ppnum_t pn)
2725 {
2726 	assertf(IS_MANAGED_PAGE(ppn_to_pai(pn)), "%s called on non-managed page 0x%08x", __func__, pn);
2727 	return PHYSMAP_PTOV((uint64_t)pn << (uint64_t)PAGE_SHIFT);
2728 }
2729 
2730 void
pmap_unmap_compressor_page(ppnum_t pn __unused,void * kva __unused)2731 pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
2732 {
2733 }
2734 
2735 bool
pmap_clear_refmod_range_options(pmap_t pmap __unused,vm_map_address_t start __unused,vm_map_address_t end __unused,unsigned int mask __unused,unsigned int options __unused)2736 pmap_clear_refmod_range_options(
2737 	pmap_t pmap __unused,
2738 	vm_map_address_t start __unused,
2739 	vm_map_address_t end __unused,
2740 	unsigned int mask __unused,
2741 	unsigned int options __unused)
2742 {
2743 	/*
2744 	 * x86 doesn't have ranged tlbi instructions, and we already have
2745 	 * the pmap_flush_context. This operation isn't implemented.
2746 	 */
2747 	return false;
2748 }
2749 
2750 bool
pmap_supported_feature(pmap_t pmap,pmap_feature_flags_t feat)2751 pmap_supported_feature(pmap_t pmap, pmap_feature_flags_t feat)
2752 {
2753 	switch (feat) {
2754 	case PMAP_FEAT_UEXEC:
2755 		return pmap != NULL && is_ept_pmap(pmap);
2756 	default:
2757 		return false;
2758 	}
2759 }
2760