xref: /xnu-11417.101.15/osfmk/x86_64/pmap.c (revision e3723e1f17661b24996789d8afc084c0c3303b26)
1 /*
2  * Copyright (c) 2000-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 
59 /*
60  *	File:	pmap.c
61  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
62  *	(These guys wrote the Vax version)
63  *
64  *	Physical Map management code for Intel i386, i486, and i860.
65  *
66  *	Manages physical address maps.
67  *
68  *	In addition to hardware address maps, this
69  *	module is called upon to provide software-use-only
70  *	maps which may or may not be stored in the same
71  *	form as hardware maps.  These pseudo-maps are
72  *	used to store intermediate results from copy
73  *	operations to and from address spaces.
74  *
75  *	Since the information managed by this module is
76  *	also stored by the logical address mapping module,
77  *	this module may throw away valid virtual-to-physical
78  *	mappings at almost any time.  However, invalidations
79  *	of virtual-to-physical mappings must be done as
80  *	requested.
81  *
82  *	In order to cope with hardware architectures which
83  *	make virtual-to-physical map invalidates expensive,
84  *	this module may delay invalidate or reduced protection
85  *	operations until such time as they are actually
86  *	necessary.  This module is given full information as
87  *	to which processors are currently using which maps,
88  *	and to when physical maps must be made correct.
89  */
90 
91 #include <string.h>
92 #include <mach_ldebug.h>
93 
94 #include <libkern/OSAtomic.h>
95 
96 #include <mach/machine/vm_types.h>
97 
98 #include <mach/boolean.h>
99 #include <kern/thread.h>
100 #include <kern/zalloc.h>
101 #include <kern/zalloc_internal.h>
102 #include <kern/queue.h>
103 #include <kern/ledger.h>
104 #include <kern/mach_param.h>
105 
106 #include <kern/spl.h>
107 
108 #include <vm/pmap.h>
109 #include <vm/pmap_cs.h>
110 #include <vm/vm_map_xnu.h>
111 #include <vm/vm_kern_xnu.h>
112 #include <mach/vm_param.h>
113 #include <mach/vm_prot.h>
114 #include <vm/vm_object_internal.h>
115 #include <vm/vm_page_internal.h>
116 
117 #include <mach/machine/vm_param.h>
118 #include <machine/thread.h>
119 
120 #include <kern/misc_protos.h>                   /* prototyping */
121 #include <i386/misc_protos.h>
122 #include <i386/i386_lowmem.h>
123 #include <x86_64/lowglobals.h>
124 
125 #include <i386/cpuid.h>
126 #include <i386/cpu_data.h>
127 #include <i386/cpu_number.h>
128 #include <i386/machine_cpu.h>
129 #include <i386/seg.h>
130 #include <i386/serial_io.h>
131 #include <i386/cpu_capabilities.h>
132 #include <i386/machine_routines.h>
133 #include <i386/proc_reg.h>
134 #include <i386/tsc.h>
135 #include <i386/pmap_internal.h>
136 #include <i386/pmap_pcid.h>
137 #if CONFIG_VMX
138 #include <i386/vmx/vmx_cpu.h>
139 #endif
140 
141 #include <vm/vm_protos.h>
142 #include <san/kasan.h>
143 
144 #include <i386/mp.h>
145 #include <i386/mp_desc.h>
146 #include <libkern/kernel_mach_header.h>
147 
148 #include <pexpert/i386/efi.h>
149 #include <libkern/section_keywords.h>
150 #if MACH_ASSERT
151 int pmap_stats_assert = 1;
152 #endif /* MACH_ASSERT */
153 
154 #ifdef IWANTTODEBUG
155 #undef  DEBUG
156 #define DEBUG 1
157 #define POSTCODE_DELAY 1
158 #include <i386/postcode.h>
159 #endif /* IWANTTODEBUG */
160 
161 #ifdef  PMAP_DEBUG
162 #define DBG(x...)       kprintf("DBG: " x)
163 #else
164 #define DBG(x...)
165 #endif
166 /* Compile time assert to ensure adjacency/alignment of per-CPU data fields used
167  * in the trampolines for kernel/user boundary TLB coherency.
168  */
169 char pmap_cpu_data_assert[(((offsetof(cpu_data_t, cpu_tlb_invalid) - offsetof(cpu_data_t, cpu_active_cr3)) == 8) && (offsetof(cpu_data_t, cpu_active_cr3) % 64 == 0)) ? 1 : -1];
170 boolean_t pmap_trace = FALSE;
171 
172 boolean_t       no_shared_cr3 = DEBUG;          /* TRUE for DEBUG by default */
173 
174 #if DEVELOPMENT || DEBUG
175 int nx_enabled = 1;                     /* enable no-execute protection -- set during boot */
176 #else
177 const int nx_enabled = 1;
178 #endif
179 
180 #if DEBUG || DEVELOPMENT
181 int allow_data_exec  = VM_ABI_32;       /* 32-bit apps may execute data by default, 64-bit apps may not */
182 int allow_stack_exec = 0;               /* No apps may execute from the stack by default */
183 #else /* DEBUG || DEVELOPMENT */
184 const int allow_data_exec  = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
185 const int allow_stack_exec = 0;         /* No apps may execute from the stack by default */
186 #endif /* DEBUG || DEVELOPMENT */
187 
188 uint64_t max_preemption_latency_tsc = 0;
189 
190 pv_hashed_entry_t     *pv_hash_table;  /* hash lists */
191 
192 uint32_t npvhashmask = 0, npvhashbuckets = 0;
193 
194 pv_hashed_entry_t       pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
195 pv_hashed_entry_t       pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
196 SIMPLE_LOCK_DECLARE(pv_hashed_free_list_lock, 0);
197 SIMPLE_LOCK_DECLARE(pv_hashed_kern_free_list_lock, 0);
198 SIMPLE_LOCK_DECLARE(pv_hash_table_lock, 0);
199 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
200 
201 SECURITY_READ_ONLY_LATE(zone_t) pv_hashed_list_zone;    /* zone of pv_hashed_entry structures */
202 
203 /*
204  *	First and last physical addresses that we maintain any information
205  *	for.  Initialized to zero so that pmap operations done before
206  *	pmap_init won't touch any non-existent structures.
207  */
208 boolean_t       pmap_initialized = FALSE;/* Has pmap_init completed? */
209 
210 static struct vm_object kptobj_object_store VM_PAGE_PACKED_ALIGNED;
211 static struct vm_object kpml4obj_object_store VM_PAGE_PACKED_ALIGNED;
212 static struct vm_object kpdptobj_object_store VM_PAGE_PACKED_ALIGNED;
213 
214 /*
215  *	Array of physical page attribites for managed pages.
216  *	One byte per physical page.
217  */
218 char            *pmap_phys_attributes;
219 ppnum_t         last_managed_page = 0;
220 
221 unsigned pmap_memory_region_count;
222 unsigned pmap_memory_region_current;
223 
224 pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
225 
226 /*
227  *	Other useful macros.
228  */
229 #define current_pmap()          (vm_map_pmap(current_thread()->map))
230 
231 struct pmap     kernel_pmap_store;
232 const pmap_t    kernel_pmap = &kernel_pmap_store;
233 SECURITY_READ_ONLY_LATE(zone_t)          pmap_zone; /* zone of pmap structures */
234 SECURITY_READ_ONLY_LATE(zone_t)          pmap_anchor_zone;
235 SECURITY_READ_ONLY_LATE(zone_t)          pmap_uanchor_zone;
236 int             pmap_debug = 0;         /* flag for debugging prints */
237 
238 unsigned int    inuse_ptepages_count = 0;
239 long long       alloc_ptepages_count __attribute__((aligned(8))) = 0; /* aligned for atomic access */
240 unsigned int    bootstrap_wired_pages = 0;
241 
242 extern  long    NMIPI_acks;
243 
244 SECURITY_READ_ONLY_LATE(boolean_t)       kernel_text_ps_4K = TRUE;
245 
246 extern char     end;
247 
248 static int      nkpt;
249 
250 #if DEVELOPMENT || DEBUG
251 SECURITY_READ_ONLY_LATE(boolean_t)       pmap_disable_kheap_nx = FALSE;
252 SECURITY_READ_ONLY_LATE(boolean_t)       pmap_disable_kstack_nx = FALSE;
253 SECURITY_READ_ONLY_LATE(boolean_t)       wpkernel = TRUE;
254 #else
255 const boolean_t wpkernel = TRUE;
256 #endif
257 
258 extern long __stack_chk_guard[];
259 
260 static uint64_t pmap_eptp_flags = 0;
261 boolean_t pmap_ept_support_ad = FALSE;
262 
263 static void process_pmap_updates(pmap_t, bool, addr64_t, addr64_t);
264 /*
265  *	Map memory at initialization.  The physical addresses being
266  *	mapped are not managed and are never unmapped.
267  *
268  *	For now, VM is already on, we only need to map the
269  *	specified memory.
270  */
271 vm_offset_t
pmap_map(vm_offset_t virt,vm_map_offset_t start_addr,vm_map_offset_t end_addr,vm_prot_t prot,unsigned int flags)272 pmap_map(
273 	vm_offset_t     virt,
274 	vm_map_offset_t start_addr,
275 	vm_map_offset_t end_addr,
276 	vm_prot_t       prot,
277 	unsigned int    flags)
278 {
279 	kern_return_t   kr;
280 	int             ps;
281 
282 	ps = PAGE_SIZE;
283 	while (start_addr < end_addr) {
284 		kr = pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
285 		    (ppnum_t) i386_btop(start_addr), prot, VM_PROT_NONE, flags, TRUE, PMAP_MAPPING_TYPE_INFER);
286 
287 		if (kr != KERN_SUCCESS) {
288 			panic("%s: failed pmap_enter, "
289 			    "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
290 			    __FUNCTION__,
291 			    (void *)virt, (void *)start_addr, (void *)end_addr, prot, flags);
292 		}
293 
294 		virt += ps;
295 		start_addr += ps;
296 	}
297 	return virt;
298 }
299 
300 extern  char                    *first_avail;
301 extern  vm_offset_t             virtual_avail, virtual_end;
302 extern  pmap_paddr_t            avail_start, avail_end;
303 extern  vm_offset_t             sHIB;
304 extern  vm_offset_t             eHIB;
305 extern  vm_offset_t             stext;
306 extern  vm_offset_t             etext;
307 extern  vm_offset_t             sdata, edata;
308 extern  vm_offset_t             sconst, econst;
309 
310 extern void                     *KPTphys;
311 
312 boolean_t pmap_smep_enabled = FALSE;
313 boolean_t pmap_smap_enabled = FALSE;
314 
/*
 * Per-CPU pmap initialization: sets up this processor's TLB-related
 * per-cpu fields and enables the paging security features (PGE, SMEP,
 * SMAP) that the CPU reports support for.  Called on each CPU; also
 * invoked for the boot CPU from pmap_bootstrap().
 */
void
pmap_cpu_init(void)
{
	cpu_data_t      *cdp = current_cpu_datap();

	/* Enable global pages so kernel TLB entries survive CR3 reloads. */
	set_cr4(get_cr4() | CR4_PGE);

	/*
	 * Initialize the per-cpu, TLB-related fields.
	 */
	cdp->cpu_kernel_cr3 = kernel_pmap->pm_cr3;
	/* Keep the shadow (trampoline-visible) copy of the kernel CR3 in sync. */
	cpu_shadowp(cdp->cpu_number)->cpu_kernel_cr3 = cdp->cpu_kernel_cr3;
	cdp->cpu_active_cr3 = kernel_pmap->pm_cr3;
	cdp->cpu_tlb_invalid = 0;
	cdp->cpu_task_map = TASK_MAP_64BIT;

	pmap_pcid_configure();
	/* Supervisor Mode Execute Protection, unless disabled by boot-arg. */
	if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMEP) {
		pmap_smep_enabled = TRUE;
#if     DEVELOPMENT || DEBUG
		boolean_t nsmep;
		if (PE_parse_boot_argn("-pmap_smep_disable", &nsmep, sizeof(nsmep))) {
			pmap_smep_enabled = FALSE;
		}
#endif
		if (pmap_smep_enabled) {
			set_cr4(get_cr4() | CR4_SMEP);
		}
	}
	/* Supervisor Mode Access Protection, unless disabled by boot-arg. */
	if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMAP) {
		pmap_smap_enabled = TRUE;
#if DEVELOPMENT || DEBUG
		boolean_t nsmap;
		if (PE_parse_boot_argn("-pmap_smap_disable", &nsmap, sizeof(nsmap))) {
			pmap_smap_enabled = FALSE;
		}
#endif
		if (pmap_smap_enabled) {
			set_cr4(get_cr4() | CR4_SMAP);
		}
	}

#if !CONFIG_CPU_COUNTERS
	if (cdp->cpu_fixed_pmcs_enabled) {
		boolean_t enable = TRUE;
		cpu_pmc_control(&enable);
	}
#endif /* !CONFIG_CPU_COUNTERS */
}
364 
365 static void
pmap_ro_zone_validate_element_dst(zone_id_t zid,vm_offset_t va,vm_offset_t offset,vm_size_t new_data_size)366 pmap_ro_zone_validate_element_dst(
367 	zone_id_t           zid,
368 	vm_offset_t         va,
369 	vm_offset_t         offset,
370 	vm_size_t           new_data_size)
371 {
372 	if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
373 		panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
374 		    ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
375 	}
376 
377 	vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
378 
379 	/* Check element is from correct zone and properly aligned */
380 	zone_require_ro(zid, elem_size, (void*)va);
381 
382 	if (__improbable(new_data_size > (elem_size - offset))) {
383 		panic("%s: New data size %lu too large for elem size %lu at addr %p",
384 		    __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
385 	}
386 	if (__improbable(offset >= elem_size)) {
387 		panic("%s: Offset %lu too large for elem size %lu at addr %p",
388 		    __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
389 	}
390 }
391 
392 static void
pmap_ro_zone_validate_element(zone_id_t zid,vm_offset_t va,vm_offset_t offset,const vm_offset_t new_data,vm_size_t new_data_size)393 pmap_ro_zone_validate_element(
394 	zone_id_t           zid,
395 	vm_offset_t         va,
396 	vm_offset_t         offset,
397 	const vm_offset_t   new_data,
398 	vm_size_t           new_data_size)
399 {
400 	vm_offset_t sum = 0;
401 
402 	if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
403 		panic("%s: Integer addition overflow %p + %lu = %lu",
404 		    __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
405 	}
406 
407 	pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
408 }
409 
410 void
pmap_ro_zone_memcpy(zone_id_t zid,vm_offset_t va,vm_offset_t offset,const vm_offset_t new_data,vm_size_t new_data_size)411 pmap_ro_zone_memcpy(
412 	zone_id_t             zid,
413 	vm_offset_t           va,
414 	vm_offset_t           offset,
415 	const vm_offset_t     new_data,
416 	vm_size_t             new_data_size)
417 {
418 	const pmap_paddr_t pa = kvtophys(va + offset);
419 
420 	if (!new_data || new_data_size == 0) {
421 		return;
422 	}
423 
424 	pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
425 	/* Write through Physical Aperture */
426 	memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
427 }
428 
429 uint64_t
pmap_ro_zone_atomic_op(zone_id_t zid,vm_offset_t va,vm_offset_t offset,zro_atomic_op_t op,uint64_t value)430 pmap_ro_zone_atomic_op(
431 	zone_id_t             zid,
432 	vm_offset_t           va,
433 	vm_offset_t           offset,
434 	zro_atomic_op_t       op,
435 	uint64_t              value)
436 {
437 	const pmap_paddr_t pa = kvtophys(va + offset);
438 	vm_size_t value_size = op & 0xf;
439 
440 	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
441 	/* Write through Physical Aperture */
442 	return __zalloc_ro_mut_atomic(phystokv(pa), op, value);
443 }
444 
445 void
pmap_ro_zone_bzero(zone_id_t zid,vm_offset_t va,vm_offset_t offset,vm_size_t size)446 pmap_ro_zone_bzero(
447 	zone_id_t         zid,
448 	vm_offset_t       va,
449 	vm_offset_t       offset,
450 	vm_size_t         size)
451 {
452 	const pmap_paddr_t pa = kvtophys(va + offset);
453 	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
454 	bzero((void*)phystokv(pa), size);
455 }
456 
457 static uint32_t
pmap_scale_shift(void)458 pmap_scale_shift(void)
459 {
460 	uint32_t scale = 0;
461 
462 	if (sane_size <= 8 * GB) {
463 		scale = (uint32_t)(sane_size / (2 * GB));
464 	} else if (sane_size <= 32 * GB) {
465 		scale = 4 + (uint32_t)((sane_size - (8 * GB)) / (4 * GB));
466 	} else {
467 		scale = 10 + (uint32_t)MIN(4, ((sane_size - (32 * GB)) / (8 * GB)));
468 	}
469 	return scale;
470 }
471 
472 LCK_GRP_DECLARE(pmap_lck_grp, "pmap");
473 LCK_ATTR_DECLARE(pmap_lck_rw_attr, 0, LCK_ATTR_DEBUG);
474 
475 /*
476  *	Bootstrap the system enough to run with virtual memory.
477  *	Map the kernel's code and data, and allocate the system page table.
478  *	Called with mapping OFF.  Page_size must already be set.
479  */
480 
/*
 * pmap_bootstrap - bootstrap the pmap module enough to run with virtual
 * memory: hand-initialize the statically allocated kernel pmap, set the
 * kernel virtual address limits, size the pv hash, and process the
 * pmap-related boot-args.  Called once, with mapping already on and the
 * page size set, before zones/VM are usable.
 *
 * @param load_start  unused.
 * @param IA32e       must be TRUE (64-bit mode is required).
 */
void
pmap_bootstrap(
	__unused vm_offset_t    load_start,
	__unused boolean_t      IA32e)
{
	assert(IA32e);

	vm_last_addr = VM_MAX_KERNEL_ADDRESS;   /* Set the highest address
	                                         * known to VM */
	/*
	 *	The kernel's pmap is statically allocated so we don't
	 *	have to use pmap_create, which is unlikely to work
	 *	correctly at this part of the boot sequence.
	 */

	os_ref_init(&kernel_pmap->ref_count, NULL);
#if DEVELOPMENT || DEBUG
	kernel_pmap->nx_enabled = TRUE;
#endif
	kernel_pmap->pm_task_map = TASK_MAP_64BIT;
	kernel_pmap->pm_obj = (vm_object_t) NULL;
	/* Kernel and "user" views both point at the boot IdlePML4 for now. */
	kernel_pmap->pm_pml4 = IdlePML4;
	kernel_pmap->pm_upml4 = IdlePML4;
	kernel_pmap->pm_cr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
	kernel_pmap->pm_ucr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
	kernel_pmap->pm_eptp = 0;

	pmap_pcid_initialize_kernel(kernel_pmap);

	current_cpu_datap()->cpu_kernel_cr3 = cpu_shadowp(cpu_number())->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;

	/* Account for the NKPT bootstrap kernel page-table pages. */
	nkpt = NKPT;
	OSAddAtomic(NKPT, &inuse_ptepages_count);
	OSAddAtomic64(NKPT, &alloc_ptepages_count);
	bootstrap_wired_pages = NKPT;

	virtual_avail = (vm_offset_t)(VM_MIN_KERNEL_ADDRESS) + (vm_offset_t)first_avail;
	virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);

	/* Size the pv hash; the "npvhash" boot-arg overrides the scaled default. */
	if (!PE_parse_boot_argn("npvhash", &npvhashmask, sizeof(npvhashmask))) {
		npvhashmask = ((NPVHASHBUCKETS) << pmap_scale_shift()) - 1;
	}

	npvhashbuckets = npvhashmask + 1;

	/* The mask must be (2^N)-1, i.e. bucket count & mask == 0. */
	if (0 != ((npvhashbuckets) & npvhashmask)) {
		panic("invalid hash %d, must be ((2^N)-1), "
		    "using default %d\n", npvhashmask, NPVHASHMASK);
	}

	lck_rw_init(&kernel_pmap->pmap_rwl, &pmap_lck_grp, &pmap_lck_rw_attr);
	kernel_pmap->pmap_rwl.lck_rw_can_sleep = FALSE;

	/* Per-cpu TLB state + SMEP/SMAP enablement for the boot CPU. */
	pmap_cpu_init();

	if (pmap_pcid_ncpus) {
		printf("PMAP: PCID enabled\n");
	}

	if (pmap_smep_enabled) {
		printf("PMAP: Supervisor Mode Execute Protection enabled\n");
	}
	if (pmap_smap_enabled) {
		printf("PMAP: Supervisor Mode Access Protection enabled\n");
	}

#if     DEBUG
	printf("Stack canary: 0x%lx\n", __stack_chk_guard[0]);
	printf("early_random(): 0x%qx\n", early_random());
#endif
#if     DEVELOPMENT || DEBUG
	boolean_t ptmp;
	/* Check if the user has requested disabling stack or heap no-execute
	 * enforcement. These are "const" variables; that qualifier is cast away
	 * when altering them. The TEXT/DATA const sections are marked
	 * write protected later in the kernel startup sequence, so altering
	 * them is possible at this point, in pmap_bootstrap().
	 */
	if (PE_parse_boot_argn("-pmap_disable_kheap_nx", &ptmp, sizeof(ptmp))) {
		boolean_t *pdknxp = (boolean_t *) &pmap_disable_kheap_nx;
		*pdknxp = TRUE;
	}

	if (PE_parse_boot_argn("-pmap_disable_kstack_nx", &ptmp, sizeof(ptmp))) {
		boolean_t *pdknhp = (boolean_t *) &pmap_disable_kstack_nx;
		*pdknhp = TRUE;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* 32-bit EFI firmware constrains the kernel virtual span to 4GB. */
	boot_args *args = (boot_args *)PE_state.bootArgs;
	if (args->efiMode == kBootArgsEfiMode32) {
		printf("EFI32: kernel virtual space limited to 4GB\n");
		virtual_end = VM_MAX_KERNEL_ADDRESS_EFI32;
	}
	kprintf("Kernel virtual space from 0x%lx to 0x%lx.\n",
	    (long)KERNEL_BASE, (long)virtual_end);
	kprintf("Available physical space from 0x%llx to 0x%llx\n",
	    avail_start, avail_end);

	/*
	 * The -no_shared_cr3 boot-arg is a debugging feature (set by default
	 * in the DEBUG kernel) to force the kernel to switch to its own map
	 * (and cr3) when control is in kernelspace. The kernel's map does not
	 * include (i.e. share) userspace so wild references will cause
	 * a panic. Only copyin and copyout are exempt from this.
	 */
	(void) PE_parse_boot_argn("-no_shared_cr3",
	    &no_shared_cr3, sizeof(no_shared_cr3));
	if (no_shared_cr3) {
		kprintf("Kernel not sharing user map\n");
	}

#ifdef  PMAP_TRACES
	if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof(pmap_trace))) {
		kprintf("Kernel traces for pmap operations enabled\n");
	}
#endif  /* PMAP_TRACES */

#if MACH_ASSERT
	PE_parse_boot_argn("pmap_asserts", &pmap_asserts_enabled, sizeof(pmap_asserts_enabled));
	PE_parse_boot_argn("pmap_stats_assert",
	    &pmap_stats_assert,
	    sizeof(pmap_stats_assert));
#endif /* MACH_ASSERT */
}
606 
607 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)608 pmap_virtual_space(
609 	vm_offset_t *startp,
610 	vm_offset_t *endp)
611 {
612 	*startp = virtual_avail;
613 	*endp = virtual_end;
614 }
615 
616 
617 
618 
619 #if HIBERNATION
620 
621 #include <IOKit/IOHibernatePrivate.h>
622 #include <machine/pal_hibernate.h>
623 
624 int32_t         pmap_npages;
625 int32_t         pmap_teardown_last_valid_compact_indx = -1;
626 
627 void    pmap_pack_index(uint32_t);
628 int32_t pmap_unpack_index(pv_rooted_entry_t);
629 
630 int32_t
pmap_unpack_index(pv_rooted_entry_t pv_h)631 pmap_unpack_index(pv_rooted_entry_t pv_h)
632 {
633 	int32_t indx = 0;
634 
635 	indx = (int32_t)(*((uint64_t *)(&pv_h->qlink.next)) >> 48);
636 	indx = indx << 16;
637 	indx |= (int32_t)(*((uint64_t *)(&pv_h->qlink.prev)) >> 48);
638 
639 	*((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)0xffff << 48);
640 	*((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)0xffff << 48);
641 
642 	return indx;
643 }
644 
645 
646 void
pmap_pack_index(uint32_t indx)647 pmap_pack_index(uint32_t indx)
648 {
649 	pv_rooted_entry_t       pv_h;
650 
651 	pv_h = &pv_head_table[indx];
652 
653 	*((uint64_t *)(&pv_h->qlink.next)) &= ~((uint64_t)0xffff << 48);
654 	*((uint64_t *)(&pv_h->qlink.prev)) &= ~((uint64_t)0xffff << 48);
655 
656 	*((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)(indx >> 16)) << 48;
657 	*((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)(indx & 0xffff)) << 48;
658 }
659 
660 
/*
 * Hibernation teardown: compact pv_head_table[] in place so all in-use
 * pv_rooted_entry_t's are contiguous at the bottom of the array, and
 * report the now-unneeded tail via [unneeded_start, unneeded_end].  Each
 * moved entry has its original index packed into its queue pointers (see
 * pmap_pack_index()) so pal_hib_rebuild_pmap_structs() can undo the move
 * on resume.
 */
void
pal_hib_teardown_pmap_structs(addr64_t *unneeded_start, addr64_t *unneeded_end)
{
	int32_t         i;
	int32_t         compact_target_indx;    /* candidate slot to fill next */

	compact_target_indx = 0;

	for (i = 0; i < pmap_npages; i++) {
		if (pv_head_table[i].pmap == PMAP_NULL) {
			/* Empty slot: adopt it as the fill target only if the
			 * current target is still occupied. */
			if (pv_head_table[compact_target_indx].pmap != PMAP_NULL) {
				compact_target_indx = i;
			}
		} else {
			/* In-use entry: record its index inside the entry so it
			 * can be restored to this slot on resume. */
			pmap_pack_index((uint32_t)i);

			if (pv_head_table[compact_target_indx].pmap == PMAP_NULL) {
				/*
				 * we've got a hole to fill, so
				 * move this pv_rooted_entry_t to it's new home
				 */
				pv_head_table[compact_target_indx] = pv_head_table[i];
				pv_head_table[i].pmap = PMAP_NULL;

				pmap_teardown_last_valid_compact_indx = compact_target_indx;
				compact_target_indx++;
			} else {
				pmap_teardown_last_valid_compact_indx = i;
			}
		}
	}
	/* Everything past the last compacted entry need not be preserved. */
	*unneeded_start = (addr64_t)&pv_head_table[pmap_teardown_last_valid_compact_indx + 1];
	*unneeded_end = (addr64_t)&pv_head_table[pmap_npages - 1];

	HIBLOG("pal_hib_teardown_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
}
697 
698 
/*
 * Hibernation resume: undo the compaction performed by
 * pal_hib_teardown_pmap_structs().  Walk the compacted entries from the
 * highest index down, move each entry back to its original slot
 * (recovered via pmap_unpack_index()), and zero the gaps between
 * restored entries.
 */
void
pal_hib_rebuild_pmap_structs(void)
{
	int32_t                 cindx, eindx, rindx = 0;
	pv_rooted_entry_t       pv_h;

	eindx = (int32_t)pmap_npages;

	/* Walk backwards so a restored entry never lands on a compacted
	 * entry that hasn't been processed yet. */
	for (cindx = pmap_teardown_last_valid_compact_indx; cindx >= 0; cindx--) {
		pv_h = &pv_head_table[cindx];

		rindx = pmap_unpack_index(pv_h);
		assert(rindx < pmap_npages);

		if (rindx != cindx) {
			/*
			 * this pv_rooted_entry_t was moved by pal_hib_teardown_pmap_structs,
			 * so move it back to its real location
			 */
			pv_head_table[rindx] = pv_head_table[cindx];
		}
		if (rindx + 1 != eindx) {
			/*
			 * the 'hole' between this vm_rooted_entry_t and the previous
			 * vm_rooted_entry_t we moved needs to be initialized as
			 * a range of zero'd vm_rooted_entry_t's
			 */
			bzero((char *)&pv_head_table[rindx + 1], (eindx - rindx - 1) * sizeof(struct pv_rooted_entry));
		}
		eindx = rindx;
	}
	/* Zero everything below the lowest restored entry as well. */
	if (rindx) {
		bzero((char *)&pv_head_table[0], rindx * sizeof(struct pv_rooted_entry));
	}

	HIBLOG("pal_hib_rebuild_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
}
736 
737 #endif
738 
739 /*
740  * Create pv entries for kernel pages mapped by early startup code.
741  * These have to exist so we can ml_static_mfree() them later.
742  */
/*
 * Create pv entries for kernel pages mapped by early startup code.
 * These have to exist so we can ml_static_mfree() them later.
 *
 * Walks [start_va, end_va) a page at a time (stepping by 2MB when the VA
 * is covered by a large page); for every mapped, managed physical page
 * found, initializes its pv_head_table root entry to record the kernel
 * mapping at that VA.
 */
static void
pmap_pv_fixup(vm_offset_t start_va, vm_offset_t end_va)
{
	ppnum_t           ppn;
	pv_rooted_entry_t pv_h;
	uint32_t          pgsz;          /* step for this iteration: 4K or 2M */

	start_va = round_page(start_va);
	end_va = trunc_page(end_va);
	while (start_va < end_va) {
		pgsz = PAGE_SIZE;
		ppn = pmap_find_phys(kernel_pmap, start_va);
		if (ppn != 0 && IS_MANAGED_PAGE(ppn)) {
			pv_h = pai_to_pvh(ppn);
			assert(pv_h->qlink.next == 0);           /* shouldn't be init'd yet */
			assert(pv_h->pmap == 0);
			pv_h->va_and_flags = start_va;
			pv_h->pmap = kernel_pmap;
			queue_init(&pv_h->qlink);
			/*
			 * Note that pmap_query_pagesize does not enforce start_va is aligned
			 * on a 2M boundary if it's within a large page
			 */
			if (pmap_query_pagesize(kernel_pmap, start_va) == I386_LPGBYTES) {
				pgsz = I386_LPGBYTES;
			}
		}
		/* Advance by the page size just handled, guarding against VA wrap. */
		if (os_add_overflow(start_va, pgsz, &start_va)) {
#if DEVELOPMENT || DEBUG
			panic("pmap_pv_fixup: Unexpected address wrap (0x%lx after adding 0x%x)", start_va, pgsz);
#else
			start_va = end_va;
#endif
		}
	}
}
779 
/* VA range and early suballocation map for the pmap bookkeeping arrays
 * (pv_head_table, pv_hash_table, lock tables, pmap_phys_attributes);
 * sized at startup below and consumed in pmap_init(). */
static SECURITY_READ_ONLY_LATE(struct mach_vm_range) pmap_struct_range = {};
static __startup_data vm_map_t pmap_struct_map;
static __startup_data long pmap_npages_early;      /* i386_btop(avail_end) */
static __startup_data vm_map_size_t pmap_struct_size;
/* Startup callback: compute the total (page-rounded) size needed for the
 * pmap arrays from the physical page count and the pv hash sizing. */
KMEM_RANGE_REGISTER_DYNAMIC(pmap_struct, &pmap_struct_range, ^() {
	vm_map_size_t s;

	pmap_npages_early = i386_btop(avail_end);
	s = (vm_map_size_t) (sizeof(struct pv_rooted_entry) * pmap_npages_early +
	(sizeof(struct pv_hashed_entry_t *) * (npvhashbuckets)) +
	pv_lock_table_size(pmap_npages_early) +
	pv_hash_lock_table_size((npvhashbuckets)) +
	pmap_npages_early);
	pmap_struct_size = round_page(s);
	return pmap_struct_size;
});
796 
797 /*
798  *	Initialize the pmap module.
799  *	Called by vm_init, to initialize any structures that the pmap
800  *	system needs to map virtual memory.
801  */
/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
 *
 *	Sets up the kernel page-table VM objects, carves the pre-registered
 *	pmap_struct range into the pv/lock/attribute arrays, marks managed
 *	physical pages, creates the pmap-related zones, and creates pv
 *	entries for early kernel mappings.
 */
void
pmap_init(void)
{
	long                    npages;
	vm_offset_t             addr;
	vm_size_t               vsize;
	vm_map_offset_t         vaddr;
	ppnum_t                 ppn;

	/* Back the kernel's PML4/PDPT/PDE page-table pages with VM objects. */
	kernel_pmap->pm_obj_pml4 = &kpml4obj_object_store;
	_vm_object_allocate((vm_object_size_t)NPML4PGS * PAGE_SIZE, &kpml4obj_object_store);

	kernel_pmap->pm_obj_pdpt = &kpdptobj_object_store;
	_vm_object_allocate((vm_object_size_t)NPDPTPGS * PAGE_SIZE, &kpdptobj_object_store);

	kernel_pmap->pm_obj = &kptobj_object_store;
	_vm_object_allocate((vm_object_size_t)NPDEPGS * PAGE_SIZE, &kptobj_object_store);

	/*
	 *	Allocate memory for the pv_head_table and its lock bits,
	 *	the modify bit array, and the pte_page table.
	 */

	/*
	 * zero bias all these arrays now instead of off avail_start
	 * so we cover all memory
	 */

	npages = pmap_npages_early;
	assert(npages == i386_btop(avail_end));
#if HIBERNATION
	pmap_npages = (uint32_t)npages;
#endif
	/* Claim the pre-registered pmap_struct range (sized by the
	 * KMEM_RANGE_REGISTER_DYNAMIC callback above) and allocate it
	 * wired, zeroed, and permanent. */
	vm_map_will_allocate_early_map(&pmap_struct_map);
	pmap_struct_map = kmem_suballoc(kernel_map, &pmap_struct_range.min_address,
	    pmap_struct_size, VM_MAP_CREATE_NEVER_FAULTS,
	    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, KMS_NOFAIL | KMS_PERMANENT,
	    VM_KERN_MEMORY_PMAP).kmr_submap;
	kmem_alloc(pmap_struct_map, &addr, pmap_struct_size,
	    KMA_NOFAIL | KMA_ZERO | KMA_KOBJECT | KMA_PERMANENT,
	    VM_KERN_MEMORY_PMAP);

	vaddr = addr;
	vsize = pmap_struct_size;

#if PV_DEBUG
	if (0 == npvhashmask) {
		panic("npvhashmask not initialized");
	}
#endif

	/*
	 *	Allocate the structures first to preserve word-alignment.
	 *	(Pointer-bump the single allocation into the five arrays.)
	 */
	pv_head_table = (pv_rooted_entry_t) addr;
	addr = (vm_offset_t) (pv_head_table + npages);

	pv_hash_table = (pv_hashed_entry_t *)addr;
	addr = (vm_offset_t) (pv_hash_table + (npvhashbuckets));

	pv_lock_table = (char *) addr;
	addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));

	pv_hash_lock_table = (char *) addr;
	addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhashbuckets)));

	pmap_phys_attributes = (char *) addr;

	/* Mark every conventional-memory page below avail_end as managed;
	 * flag the boot-time-used ranges as no-encrypt. */
	ppnum_t  last_pn = i386_btop(avail_end);
	unsigned int i;
	pmap_memory_region_t *pmptr = pmap_memory_regions;
	for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
		if (pmptr->type != kEfiConventionalMemory) {
			continue;
		}
		ppnum_t pn;
		for (pn = pmptr->base; pn <= pmptr->end; pn++) {
			if (pn < last_pn) {
				pmap_phys_attributes[pn] |= PHYS_MANAGED;

				if (pn > last_managed_page) {
					last_managed_page = pn;
				}

				if ((pmap_high_used_bottom <= pn && pn <= pmap_high_used_top) ||
				    (pmap_middle_used_bottom <= pn && pn <= pmap_middle_used_top)) {
					pmap_phys_attributes[pn] |= PHYS_NOENCRYPT;
				}
			}
		}
	}
	/* Flag the pages backing the arrays just carved out as no-encrypt. */
	while (vsize) {
		ppn = pmap_find_phys(kernel_pmap, vaddr);

		pmap_phys_attributes[ppn] |= PHYS_NOENCRYPT;

		vaddr += PAGE_SIZE;
		vsize -= PAGE_SIZE;
	}
	/*
	 *	Create the zone of physical maps,
	 *	and of the physical-to-virtual entries.
	 */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);

	/* The anchor is required to be page aligned. Zone debugging adds
	 * padding which may violate that requirement. Tell the zone
	 * subsystem that alignment is required.
	 */
	pmap_anchor_zone = zone_create("pagetable anchors", PAGE_SIZE,
	    ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);

/* TODO: possible general optimisation...pre-allocate via zones commonly created
 * level3/2 pagetables
 */
	/* The anchor is required to be page aligned. Zone debugging adds
	 * padding which may violate that requirement. Tell the zone
	 * subsystem that alignment is required.
	 */
	pmap_uanchor_zone = zone_create("pagetable user anchors", PAGE_SIZE,
	    ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);

	pv_hashed_list_zone = zone_create("pv_list", sizeof(struct pv_hashed_entry),
	    ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);

	/*
	 * Create pv entries for kernel pages that might get pmap_remove()ed.
	 *
	 * - very low pages that were identity mapped.
	 * - vm_pages[] entries that might be unused and reclaimed.
	 */
	assert((uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start <= (uintptr_t)vm_page_get(0));
	pmap_pv_fixup((uintptr_t)VM_MIN_KERNEL_ADDRESS, (uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start);
	pmap_pv_fixup((uintptr_t)vm_page_get(0), (uintptr_t)vm_pages_end);

	pmap_initialized = TRUE;

	max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);

	/*
	 * Ensure the kernel's PML4 entry exists for the basement
	 * before this is shared with any user.
	 */
	pmap_expand_pml4(kernel_pmap, KERNEL_BASEMENT, PMAP_EXPAND_OPTIONS_NONE);

#if CONFIG_VMX
	/* EPT accessed/dirty capability determines the EPTP flags used for VMs. */
	pmap_ept_support_ad = vmx_hv_support() && (VMX_CAP(MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_VMX_EPT_VPID_CAP_AD_SHIFT, 1) ? TRUE : FALSE);
	pmap_eptp_flags = HV_VMX_EPTP_MEMORY_TYPE_WB | HV_VMX_EPTP_WALK_LENGTH(4) | (pmap_ept_support_ad ? HV_VMX_EPTP_ENABLE_AD_FLAGS : 0);
#endif /* CONFIG_VMX */
}
953 
/*
 * Update the NX and write-protect bits on every existing kernel mapping
 * covering [sv, sv + nxrosz).  Both 2M superpage (PDE with PS set) and
 * 4K (PTE) mappings are rewritten in place; no new mappings are created,
 * and no TLB invalidation is performed here (callers such as
 * pmap_lowmem_finalize() flush afterwards).
 *
 * npmap:  target pmap (must not be an EPT pmap).
 * sv:     page-aligned start virtual address.
 * nxrosz: page-aligned size of the range in bytes.
 * NX:     TRUE to set the no-execute bit, FALSE to clear it.
 * ro:     TRUE to clear the write bit (read-only), FALSE to set it.
 */
void
pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolean_t ro)
{
	uint64_t ev, cv = sv;
	pd_entry_t *pdep;
	pt_entry_t *ptep = NULL;

	if (os_add_overflow(sv, nxrosz, &ev)) {
		panic("pmap_mark_range: Unexpected address overflow: start=0x%llx size=0x%llx", sv, nxrosz);
	}

	/* XXX what if nxrosz is 0?  we end up marking the page whose address is passed in via sv -- is that kosher? */
	assert(!is_ept_pmap(npmap));

	/* Both the start address and the size must be page-aligned. */
	assert(((sv & 0xFFFULL) | (nxrosz & 0xFFFULL)) == 0);

	for (pdep = pmap_pde(npmap, cv); pdep != NULL && (cv < ev);) {
		/* pdev: base VA of the 2M region covered by the current PDE. */
		uint64_t pdev = (cv & ~((uint64_t)PDEMASK));

		if (*pdep & INTEL_PTE_PS) {
			/* 2M superpage: apply the permission bits to the PDE itself. */
#ifdef REMAP_DEBUG
			if ((NX ^ !!(*pdep & INTEL_PTE_NX)) || (ro ^ !!!(*pdep & INTEL_PTE_WRITE))) {
				kprintf("WARNING: Remapping PDE for %p from %s%s%s to %s%s%s\n", (void *)cv,
				    (*pdep & INTEL_PTE_VALID) ? "R" : "",
				    (*pdep & INTEL_PTE_WRITE) ? "W" : "",
				    (*pdep & INTEL_PTE_NX) ? "" : "X",
				    "R",
				    ro ? "" : "W",
				    NX ? "" : "X");
			}
#endif

			if (NX) {
				*pdep |= INTEL_PTE_NX;
			} else {
				*pdep &= ~INTEL_PTE_NX;
			}
			if (ro) {
				*pdep &= ~INTEL_PTE_WRITE;
			} else {
				*pdep |= INTEL_PTE_WRITE;
			}

			/* Advance to the start of the next PDE's region. */
			if (os_add_overflow(cv, NBPD, &cv)) {
				cv = ev;
			} else {
				cv &= ~((uint64_t) PDEMASK);
				pdep = pmap_pde(npmap, cv);
			}
			continue;
		}

		/*
		 * 4K mappings: walk each PTE under this PDE's region.
		 * NOTE(review): pdep is not recomputed after this PTE walk; the
		 * stale PDE pointer is only consulted for the PS check on the next
		 * outer iteration -- verify against pmap_pde/pmap_pte semantics.
		 */
		for (ptep = pmap_pte(npmap, cv); ptep != NULL && (cv < (pdev + NBPD)) && (cv < ev);) {
#ifdef REMAP_DEBUG
			if ((NX ^ !!(*ptep & INTEL_PTE_NX)) || (ro ^ !!!(*ptep & INTEL_PTE_WRITE))) {
				kprintf("WARNING: Remapping PTE for %p from %s%s%s to %s%s%s\n", (void *)cv,
				    (*ptep & INTEL_PTE_VALID) ? "R" : "",
				    (*ptep & INTEL_PTE_WRITE) ? "W" : "",
				    (*ptep & INTEL_PTE_NX) ? "" : "X",
				    "R",
				    ro ? "" : "W",
				    NX ? "" : "X");
			}
#endif
			if (NX) {
				*ptep |= INTEL_PTE_NX;
			} else {
				*ptep &= ~INTEL_PTE_NX;
			}
			if (ro) {
				*ptep &= ~INTEL_PTE_WRITE;
			} else {
				*ptep |= INTEL_PTE_WRITE;
			}
			cv += NBPT;
			ptep = pmap_pte(npmap, cv);
		}
	}
	DPRINTF("%s(0x%llx, 0x%llx, %u, %u): 0x%llx, 0x%llx\n", __FUNCTION__, sv, nxrosz, NX, ro, cv, ptep ? *ptep: 0);
}
1034 
1035 /*
1036  * Reclaim memory for early boot 4K page tables that were converted to large page mappings.
1037  * We know this memory is part of the KPTphys[] array that was allocated in Idle_PTs_init(),
1038  * so we can free it using its address in that array.
1039  */
1040 static void
pmap_free_early_PT(ppnum_t ppn,uint32_t cnt)1041 pmap_free_early_PT(ppnum_t ppn, uint32_t cnt)
1042 {
1043 	ppnum_t KPTphys_ppn;
1044 	vm_offset_t offset;
1045 
1046 	KPTphys_ppn = pmap_find_phys(kernel_pmap, (uintptr_t)KPTphys);
1047 	assert(ppn >= KPTphys_ppn);
1048 	assert(ppn + cnt <= KPTphys_ppn + NKPT);
1049 	offset = (ppn - KPTphys_ppn) << PAGE_SHIFT;
1050 	ml_static_mfree((uintptr_t)KPTphys + offset, PAGE_SIZE * cnt);
1051 }
1052 
1053 /*
1054  * Called once VM is fully initialized so that we can release unused
1055  * sections of low memory to the general pool.
1056  * Also complete the set-up of identity-mapped sections of the kernel:
1057  *  1) write-protect kernel text
1058  *  2) map kernel text using large pages if possible
1059  *  3) read and write-protect page zero (for K32)
1060  *  4) map the global page at the appropriate virtual address.
1061  *
1062  * Use of large pages
1063  * ------------------
1064  * To effectively map and write-protect all kernel text pages, the text
1065  * must be 2M-aligned at the base, and the data section above must also be
1066  * 2M-aligned. That is, there's padding below and above. This is achieved
1067  * through linker directives. Large pages are used only if this alignment
 * exists (and not overridden by the -kernel_text_page_4K boot-arg). The
1069  * memory layout is:
1070  *
1071  *                       :                :
1072  *                       |     __DATA     |
1073  *               sdata:  ==================  2Meg
1074  *                       |                |
1075  *                       |  zero-padding  |
1076  *                       |                |
1077  *               etext:  ------------------
1078  *                       |                |
1079  *                       :                :
1080  *                       |                |
1081  *                       |     __TEXT     |
1082  *                       |                |
1083  *                       :                :
1084  *                       |                |
1085  *               stext:  ==================  2Meg
1086  *                       |                |
1087  *                       |  zero-padding  |
1088  *                       |                |
1089  *               eHIB:   ------------------
1090  *                       |     __HIB      |
1091  *                       :                :
1092  *
1093  * Prior to changing the mapping from 4K to 2M, the zero-padding pages
1094  * [eHIB,stext] and [etext,sdata] are ml_static_mfree()'d. Then all the
1095  * 4K pages covering [stext,etext] are coalesced as 2M large pages.
1096  * The now unused level-1 PTE pages are also freed.
1097  */
1098 extern ppnum_t  vm_kernel_base_page;
1099 static uint32_t dataptes = 0;
1100 
void
pmap_lowmem_finalize(void)
{
	spl_t           spl;
	int             i;

	/*
	 * Update wired memory statistics for early boot pages
	 */
	PMAP_ZINFO_PALLOC(kernel_pmap, bootstrap_wired_pages * PAGE_SIZE);

	/*
	 * Free pages in pmap regions below the base:
	 * rdar://6332712
	 *	We can't free all the pages to VM that EFI reports available.
	 *	Pages in the range 0xc0000-0xff000 aren't safe over sleep/wake.
	 *	There's also a size miscalculation here: pend is one page less
	 *	than it should be but this is not fixed to be backwards
	 *	compatible.
	 * This is important for KASLR because up to 256*2MB = 512MB of space
	 * has to be released to VM.
	 */
	for (i = 0;
	    pmap_memory_regions[i].end < vm_kernel_base_page;
	    i++) {
		vm_offset_t     pbase = i386_ptob(pmap_memory_regions[i].base);
		vm_offset_t     pend  = i386_ptob(pmap_memory_regions[i].end + 1);

		DBG("pmap region %d [%p..[%p\n",
		    i, (void *) pbase, (void *) pend);

		if (pmap_memory_regions[i].attribute & EFI_MEMORY_KERN_RESERVED) {
			continue;
		}
		/*
		 * rdar://6332712
		 * Adjust limits not to free pages in range 0xc0000-0xff000.
		 */
		if (pbase >= 0xc0000 && pend <= 0x100000) {
			continue;
		}
		if (pbase < 0xc0000 && pend > 0x100000) {
			/* page range entirely within region, free lower part */
			DBG("- ml_static_mfree(%p,%p)\n",
			    (void *) ml_static_ptovirt(pbase),
			    (void *) (0xc0000 - pbase));
			ml_static_mfree(ml_static_ptovirt(pbase), 0xc0000 - pbase);
			pbase = 0x100000;
		}
		if (pbase < 0xc0000) {
			pend = MIN(pend, 0xc0000);
		}
		if (pend > 0x100000) {
			pbase = MAX(pbase, 0x100000);
		}
		DBG("- ml_static_mfree(%p,%p)\n",
		    (void *) ml_static_ptovirt(pbase),
		    (void *) (pend - pbase));
		ml_static_mfree(ml_static_ptovirt(pbase), pend - pbase);
	}

	/* A final pass to get rid of all initial identity mappings to
	 * low pages.
	 */
	DPRINTF("%s: Removing mappings from 0->0x%lx\n", __FUNCTION__, vm_kernel_base);

	/*
	 * Remove all mappings past the boot-cpu descriptor aliases and low globals.
	 * Non-boot-cpu GDT aliases will be remapped later as needed.
	 */
	pmap_remove(kernel_pmap, LOWGLOBAL_ALIAS + PAGE_SIZE, vm_kernel_base);

	/*
	 * Release any memory for early boot 4K page table pages that got replaced
	 * with large page mappings for vm_pages[]. We know this memory is part of
	 * the KPTphys[] array that was allocated in Idle_PTs_init(), so we can free
	 * it using that address.
	 */
	pmap_free_early_PT(released_PT_ppn, released_PT_cnt);

	/*
	 * If text and data are both 2MB-aligned,
	 * we can map text with large-pages,
	 * unless the -kernel_text_ps_4K boot-arg overrides.
	 */
	if ((stext & I386_LPGMASK) == 0 && (sdata & I386_LPGMASK) == 0) {
		kprintf("Kernel text is 2MB aligned");
		kernel_text_ps_4K = FALSE;
		if (PE_parse_boot_argn("-kernel_text_ps_4K",
		    &kernel_text_ps_4K,
		    sizeof(kernel_text_ps_4K))) {
			kprintf(" but will be mapped with 4K pages\n");
		} else {
			kprintf(" and will be mapped with 2M pages\n");
		}
	}
#if     DEVELOPMENT || DEBUG
	(void) PE_parse_boot_argn("wpkernel", &wpkernel, sizeof(wpkernel));
#endif
	if (wpkernel) {
		kprintf("Kernel text %p-%p to be write-protected\n",
		    (void *) stext, (void *) etext);
	}

	spl = splhigh();

	/*
	 * Scan over text if mappings are to be changed:
	 * - Remap kernel text readonly unless the "wpkernel" boot-arg is 0
	 * - Change to large-pages if possible and not overridden.
	 */
	if (kernel_text_ps_4K && wpkernel) {
		/* 4K text pages: clear the write bit on each PTE individually. */
		vm_offset_t     myva;
		for (myva = stext; myva < etext; myva += PAGE_SIZE) {
			pt_entry_t     *ptep;

			ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
			if (ptep) {
				pmap_store_pte(FALSE, ptep, *ptep & ~INTEL_PTE_WRITE);
			}
		}
	}

	if (!kernel_text_ps_4K) {
		vm_offset_t     myva;

		/*
		 * Release zero-filled page padding used for 2M-alignment.
		 */
		DBG("ml_static_mfree(%p,%p) for padding below text\n",
		    (void *) eHIB, (void *) (stext - eHIB));
		ml_static_mfree(eHIB, stext - eHIB);
		DBG("ml_static_mfree(%p,%p) for padding above text\n",
		    (void *) etext, (void *) (sdata - etext));
		ml_static_mfree(etext, sdata - etext);

		/*
		 * Coalesce text pages into large pages.
		 */
		for (myva = stext; myva < sdata; myva += I386_LPGBYTES) {
			pt_entry_t      *ptep;
			vm_offset_t     pte_phys;
			pt_entry_t      *pdep;
			pt_entry_t      pde;
			ppnum_t         KPT_ppn;

			pdep = pmap_pde(kernel_pmap, (vm_map_offset_t)myva);
			/* Page number of the 4K page table this PDE points at. */
			KPT_ppn = (ppnum_t)((*pdep & PG_FRAME) >> PAGE_SHIFT);
			ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
			DBG("myva: %p pdep: %p ptep: %p\n",
			    (void *) myva, (void *) pdep, (void *) ptep);
			if ((*ptep & INTEL_PTE_VALID) == 0) {
				continue;
			}
			pte_phys = (vm_offset_t)(*ptep & PG_FRAME);
			pde = *pdep & PTMASK;   /* page attributes from pde */
			pde |= INTEL_PTE_PS;    /* make it a 2M entry */
			pde |= pte_phys;        /* take page frame from pte */

			if (wpkernel) {
				pde &= ~INTEL_PTE_WRITE;
			}
			DBG("pmap_store_pte(%p,0x%llx)\n",
			    (void *)pdep, pde);
			pmap_store_pte(FALSE, pdep, pde);

			/*
			 * Free the now-unused level-1 pte.
			 */
			pmap_free_early_PT(KPT_ppn, 1);
		}

		/* Change variable read by sysctl machdep.pmap */
		pmap_kernel_text_ps = I386_LPGBYTES;
	}

	vm_offset_t dva;

	/* Mark every kernel __DATA page non-executable. */
	for (dva = sdata; dva < edata; dva += I386_PGBYTES) {
		assert(((sdata | edata) & PAGE_MASK) == 0);
		pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva);

		dpte = *dptep;
		assert((dpte & INTEL_PTE_VALID));
		dpte |= INTEL_PTE_NX;
		pmap_store_pte(FALSE, dptep, dpte);
		dataptes++;
	}
	assert(dataptes > 0);

	kernel_segment_command_t * seg;
	kernel_section_t         * sec;
	kc_format_t kc_format;

	PE_get_primary_kc_format(&kc_format);

	/* Apply per-segment protections to the remaining Mach-O segments. */
	for (seg = firstseg(); seg != NULL; seg = nextsegfromheader(&_mh_execute_header, seg)) {
		if (!strcmp(seg->segname, "__TEXT") ||
		    !strcmp(seg->segname, "__DATA")) {
			continue;
		}

		/* XXX: FIXME_IN_dyld: This is a workaround (see below) */
		if (kc_format != KCFormatFileset) {
			//XXX
			if (!strcmp(seg->segname, "__KLD")) {
				continue;
			}
		}

		if (!strcmp(seg->segname, "__HIB")) {
			/* __HIB: only its __text section stays executable (RX); all else NX+RW. */
			for (sec = firstsect(seg); sec != NULL; sec = nextsect(seg, sec)) {
				if (sec->addr & PAGE_MASK) {
					panic("__HIB segment's sections misaligned");
				}
				if (!strcmp(sec->sectname, "__text")) {
					pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), FALSE, TRUE);
				} else {
					pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), TRUE, FALSE);
				}
			}
		} else {
			if (kc_format == KCFormatFileset) {
#if 0
				/*
				 * This block of code is commented out because it may or may not have induced an earlier panic
				 * in ledger init.
				 */


				boolean_t NXbit = !(seg->initprot & VM_PROT_EXECUTE),
				    robit = (seg->initprot & (VM_PROT_READ | VM_PROT_WRITE)) == VM_PROT_READ;

				/*
				 * XXX: FIXME_IN_dyld: This is a workaround for primary KC containing incorrect inaccurate
				 * initprot for segments containing code.
				 */
				if (!strcmp(seg->segname, "__KLD") || !strcmp(seg->segname, "__VECTORS")) {
					NXbit = FALSE;
					robit = FALSE;
				}

				pmap_mark_range(kernel_pmap, seg->vmaddr & ~(uint64_t)PAGE_MASK,
				    round_page_64(seg->vmsize), NXbit, robit);
#endif

				/*
				 * XXX: We are marking *every* segment with rwx permissions as a workaround
				 * XXX: until the primary KC's kernel segments are page-aligned.
				 */
				kprintf("Marking (%p, %p) as rwx\n", (void *)(seg->vmaddr & ~(uint64_t)PAGE_MASK),
				    (void *)((seg->vmaddr & ~(uint64_t)PAGE_MASK) + round_page_64(seg->vmsize)));
				pmap_mark_range(kernel_pmap, seg->vmaddr & ~(uint64_t)PAGE_MASK,
				    round_page_64(seg->vmsize), FALSE, FALSE);
			} else {
				pmap_mark_range(kernel_pmap, seg->vmaddr, round_page_64(seg->vmsize), TRUE, FALSE);
			}
		}
	}

	/*
	 * If we're debugging, map the low global vector page at the fixed
	 * virtual address.  Otherwise, remove the mapping for this.
	 */
	if (debug_boot_arg) {
		pt_entry_t *pte = NULL;
		if (0 == (pte = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS))) {
			panic("lowmem pte");
		}

		/* make sure it is defined on page boundary */
		assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK));
		pmap_store_pte(FALSE, pte, kvtophys((vm_offset_t)&lowGlo)
		    | INTEL_PTE_REF
		    | INTEL_PTE_MOD
		    | INTEL_PTE_WIRED
		    | INTEL_PTE_VALID
		    | INTEL_PTE_WRITE
		    | INTEL_PTE_NX);

#if KASAN
		kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
	} else {
		pmap_remove(kernel_pmap,
		    LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE);
	}
	/* Flush all TLBs so the permission changes above take effect. */
	pmap_tlbi_range(0, ~0ULL, true, 0);
	splx(spl);
}
1391 
1392 /*
1393  *	Mark the const data segment as read-only, non-executable.
1394  */
1395 void
x86_64_protect_data_const()1396 x86_64_protect_data_const()
1397 {
1398 	boolean_t doconstro = TRUE;
1399 #if DEVELOPMENT || DEBUG
1400 	(void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro));
1401 #endif
1402 	if (doconstro) {
1403 		if (sconst & PAGE_MASK) {
1404 			panic("CONST segment misaligned 0x%lx 0x%lx",
1405 			    sconst, econst);
1406 		}
1407 		kprintf("Marking const DATA read-only\n");
1408 		pmap_protect(kernel_pmap, sconst, econst, VM_PROT_READ);
1409 	}
1410 }
1411 /*
1412  * this function is only used for debugging fron the vm layer
1413  */
1414 bool
pmap_verify_free(ppnum_t pn)1415 pmap_verify_free(
1416 	ppnum_t pn)
1417 {
1418 	pv_rooted_entry_t       pv_h;
1419 	int             pai;
1420 	bool            result;
1421 
1422 	assert(pn != vm_page_fictitious_addr);
1423 
1424 	if (!pmap_initialized) {
1425 		return true;
1426 	}
1427 
1428 	if (pn == vm_page_guard_addr) {
1429 		return true;
1430 	}
1431 
1432 	pai = ppn_to_pai(pn);
1433 	if (!IS_MANAGED_PAGE(pai)) {
1434 		return false;
1435 	}
1436 	pv_h = pai_to_pvh(pn);
1437 	result = (pv_h->pmap == PMAP_NULL);
1438 	return result;
1439 }
1440 
1441 /**
1442  * Helper function to check wheter the given physical
1443  * page number is a restricted page.
1444  *
1445  * @param pn the physical page number to query.
1446  */
1447 bool
pmap_is_page_restricted(__unused ppnum_t pn)1448 pmap_is_page_restricted(
1449 	__unused ppnum_t pn)
1450 {
1451 	return false;
1452 }
1453 
1454 #if MACH_ASSERT
/*
 * Panic with diagnostic detail when the given page is not actually free.
 * Collects the page's attribute bits, the owning pmap's name, the mapped
 * VA and the PTE contents so the panic string is actionable.  Returns
 * silently if pmap_verify_free() says the page really is free.
 */
void
pmap_assert_free(ppnum_t pn)
{
	int pai;
	pv_rooted_entry_t pv_h = NULL;
	pmap_t pmap = NULL;
	vm_offset_t va = 0;
	static char buffer[32];
	static char *pr_name = "not managed pn";
	uint_t attr;
	pt_entry_t *ptep;
	pt_entry_t pte = -1ull;

	/* Page is genuinely free: nothing to report. */
	if (pmap_verify_free(pn)) {
		return;
	}

	/* Pages beyond the managed range carry no pv/attribute state. */
	if (pn > last_managed_page) {
		attr = 0xff;
		goto done;
	}

	/* Gather attribute bits and the root pv entry for the page. */
	pai = ppn_to_pai(pn);
	attr = pmap_phys_attributes[pai];
	pv_h = pai_to_pvh(pai);
	va = pv_h->va_and_flags;
	pmap = pv_h->pmap;
	/* Choose a human-readable name for the owning pmap. */
	if (pmap == kernel_pmap) {
		pr_name = "kernel";
	} else if (pmap == NULL) {
		pr_name = "pmap NULL";
	} else if (pmap->pmap_procname[0] != 0) {
		pr_name = &pmap->pmap_procname[0];
	} else {
		snprintf(buffer, sizeof(buffer), "pmap %p", pv_h->pmap);
		pr_name = buffer;
	}

	/* Fetch the PTE contents for the mapping, if one exists. */
	if (pmap != NULL) {
		ptep = pmap_pte(pmap, va);
		if (ptep != NULL) {
			pte = (uintptr_t)*ptep;
		}
	}

done:
	panic("page not FREE page: 0x%lx attr: 0x%x %s va: 0x%lx PTE: 0x%llx",
	    (ulong_t)pn, attr, pr_name, va, pte);
}
1504 #endif /* MACH_ASSERT */
1505 
1506 boolean_t
pmap_is_empty(pmap_t pmap,vm_map_offset_t va_start,vm_map_offset_t va_end)1507 pmap_is_empty(
1508 	pmap_t          pmap,
1509 	vm_map_offset_t va_start,
1510 	vm_map_offset_t va_end)
1511 {
1512 	vm_map_offset_t offset;
1513 	ppnum_t         phys_page;
1514 	ledger_amount_t phys_mem;
1515 
1516 	if (pmap == PMAP_NULL) {
1517 		return TRUE;
1518 	}
1519 
1520 	/*
1521 	 * Check the ledger's phys_mem value
1522 	 * - if it's zero, the pmap is completely empty.
1523 	 * This short-circuit test prevents a virtual address scan which is
1524 	 * painfully slow for 64-bit spaces.
1525 	 * This assumes the count is correct
1526 	 * .. the debug kernel ought to be checking perhaps by page table walk.
1527 	 */
1528 	if (pmap != kernel_pmap) {
1529 		ledger_get_balance(pmap->ledger, task_ledgers.phys_mem, &phys_mem);
1530 		if (phys_mem == 0) {
1531 			return TRUE;
1532 		}
1533 	}
1534 
1535 	for (offset = va_start;
1536 	    offset < va_end;
1537 	    offset += PAGE_SIZE_64) {
1538 		phys_page = pmap_find_phys(pmap, offset);
1539 		if (phys_page) {
1540 			kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1541 			    "page %d at 0x%llx\n",
1542 			    pmap, va_start, va_end, phys_page, offset);
1543 			return FALSE;
1544 		}
1545 	}
1546 
1547 	return TRUE;
1548 }
1549 
1550 void
hv_ept_pmap_create(void ** ept_pmap,void ** eptp)1551 hv_ept_pmap_create(void **ept_pmap, void **eptp)
1552 {
1553 	pmap_t p;
1554 
1555 	if ((ept_pmap == NULL) || (eptp == NULL)) {
1556 		return;
1557 	}
1558 
1559 	p = pmap_create_options(get_task_ledger(current_task()), 0, (PMAP_CREATE_64BIT | PMAP_CREATE_EPT));
1560 	if (p == PMAP_NULL) {
1561 		*ept_pmap = NULL;
1562 		*eptp = NULL;
1563 		return;
1564 	}
1565 
1566 	assert(is_ept_pmap(p));
1567 
1568 	*ept_pmap = (void*)p;
1569 	*eptp = (void*)(p->pm_eptp);
1570 	return;
1571 }
1572 
1573 /*
1574  * pmap_create() is used by some special, legacy 3rd party kexts.
1575  * In our kernel code, always use pmap_create_options().
1576  */
1577 extern pmap_t pmap_create(ledger_t ledger, vm_map_size_t sz, boolean_t is_64bit);
1578 
1579 __attribute__((used))
1580 pmap_t
pmap_create(ledger_t ledger,vm_map_size_t sz,boolean_t is_64bit)1581 pmap_create(
1582 	ledger_t      ledger,
1583 	vm_map_size_t sz,
1584 	boolean_t     is_64bit)
1585 {
1586 	return pmap_create_options(ledger, sz, is_64bit ? PMAP_CREATE_64BIT : 0);
1587 }
1588 
1589 /*
1590  *	Create and return a physical map.
1591  *
1592  *	If the size specified for the map
1593  *	is zero, the map is an actual physical
1594  *	map, and may be referenced by the
1595  *	hardware.
1596  *
1597  *	If the size specified is non-zero,
1598  *	the map will be used in software only, and
1599  *	is bounded by that size.
1600  */
1601 
pmap_t
pmap_create_options(
	ledger_t        ledger,
	vm_map_size_t   sz,
	unsigned int    flags)
{
	pmap_t          p;
	vm_size_t       size;
	pml4_entry_t    *pml4;
	pml4_entry_t    *kpml4;
	int             i;

	PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, sz, flags);

	size = (vm_size_t) sz;

	/*
	 *	A software use-only map doesn't even need a map.
	 */

	if (size != 0) {
		return PMAP_NULL;
	}

	/*
	 *	Return error when unrecognized flags are passed.
	 */
	if (__improbable((flags & ~(PMAP_CREATE_KNOWN_FLAGS)) != 0)) {
		return PMAP_NULL;
	}

	p = zalloc_flags(pmap_zone, Z_WAITOK | Z_ZERO);
	if (PMAP_NULL == p) {
		panic("pmap_create zalloc");
	}

	/* The pmap lock is explicitly marked non-sleeping. */
	lck_rw_init(&p->pmap_rwl, &pmap_lck_grp, &pmap_lck_rw_attr);
	p->pmap_rwl.lck_rw_can_sleep = FALSE;

	os_ref_init(&p->ref_count, NULL);
#if DEVELOPMENT || DEBUG
	p->nx_enabled = 1;
#endif
	p->pm_shared = FALSE;
	ledger_reference(ledger);
	p->ledger = ledger;

	p->pm_task_map = ((flags & PMAP_CREATE_64BIT) ? TASK_MAP_64BIT : TASK_MAP_32BIT);

	p->pagezero_accessible = FALSE;
	p->pm_vm_map_cs_enforced = FALSE;

	if (pmap_pcid_ncpus) {
		pmap_pcid_initialize(p);
	}

	/* Allocate and zero the kernel and user copies of the top-level PML4. */
	p->pm_pml4 = zalloc(pmap_anchor_zone);
	p->pm_upml4 = zalloc(pmap_uanchor_zone); //cleanup for EPT

	/* The anchor zones guarantee page alignment (required by CR3). */
	pmap_assert((((uintptr_t)p->pm_pml4) & PAGE_MASK) == 0);
	pmap_assert((((uintptr_t)p->pm_upml4) & PAGE_MASK) == 0);

	memset((char *)p->pm_pml4, 0, PAGE_SIZE);
	memset((char *)p->pm_upml4, 0, PAGE_SIZE);

	/* EPT pmaps are addressed via pm_eptp; host pmaps via pm_cr3/pm_ucr3. */
	if (flags & PMAP_CREATE_EPT) {
		p->pm_eptp = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4) | pmap_eptp_flags;
		p->pm_cr3 = 0;
	} else {
		p->pm_eptp = 0;
		p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4);
		p->pm_ucr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_upml4);
	}

	/* allocate the vm_objs to hold the pdpt, pde and pte pages */

	p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS) *PAGE_SIZE);
	if (NULL == p->pm_obj_pml4) {
		/* NOTE(review): message says "pdpt" but this is the pml4 object */
		panic("pmap_create pdpt obj");
	}

	p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS) *PAGE_SIZE);
	if (NULL == p->pm_obj_pdpt) {
		panic("pmap_create pdpt obj");
	}

	p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS) *PAGE_SIZE);
	if (NULL == p->pm_obj) {
		panic("pmap_create pte obj");
	}

	if (!(flags & PMAP_CREATE_EPT)) {
		/* All host pmaps share the kernel's pml4 */
		pml4 = pmap64_pml4(p, 0ULL);
		kpml4 = kernel_pmap->pm_pml4;
		for (i = KERNEL_PML4_INDEX; i < (KERNEL_PML4_INDEX + KERNEL_PML4_COUNT); i++) {
			pml4[i] = kpml4[i];
		}
		pml4[KERNEL_KEXTS_INDEX]   = kpml4[KERNEL_KEXTS_INDEX];
		for (i = KERNEL_PHYSMAP_PML4_INDEX; i < (KERNEL_PHYSMAP_PML4_INDEX + KERNEL_PHYSMAP_PML4_COUNT); i++) {
			pml4[i] = kpml4[i];
		}
		pml4[KERNEL_DBLMAP_PML4_INDEX] = kpml4[KERNEL_DBLMAP_PML4_INDEX];
#if KASAN
		for (i = KERNEL_KASAN_PML4_FIRST; i <= KERNEL_KASAN_PML4_LAST; i++) {
			pml4[i] = kpml4[i];
		}
#endif
		/* The user PML4 copy shares only the kernel double-map entry. */
		pml4_entry_t    *pml4u = pmap64_user_pml4(p, 0ULL);
		pml4u[KERNEL_DBLMAP_PML4_INDEX] = kpml4[KERNEL_DBLMAP_PML4_INDEX];
	}

#if MACH_ASSERT
	p->pmap_stats_assert = TRUE;
	p->pmap_pid = 0;
	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
#endif /* MACH_ASSERT */

	PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END,
	    VM_KERNEL_ADDRHIDE(p));

	return p;
}
1725 
1726 /*
1727  * We maintain stats and ledgers so that a task's physical footprint is:
1728  * phys_footprint = ((internal - alternate_accounting)
1729  *                   + (internal_compressed - alternate_accounting_compressed)
1730  *                   + iokit_mapped
1731  *                   + purgeable_nonvolatile
1732  *                   + purgeable_nonvolatile_compressed
1733  *                   + page_table)
1734  * where "alternate_accounting" includes "iokit" and "purgeable" memory.
1735  */
1736 
1737 #if MACH_ASSERT
1738 static void pmap_check_ledgers(pmap_t pmap);
1739 #else /* MACH_ASSERT */
/* Ledger auditing is compiled out on non-MACH_ASSERT kernels: no-op. */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
1744 #endif /* MACH_ASSERT */
1745 
1746 /*
1747  *	Retire the given physical map from service.
1748  *	Should only be called if the map contains
1749  *	no valid mappings.
1750  */
1751 extern int vm_wired_objects_page_count;
1752 
1753 void
pmap_destroy(pmap_t p)1754 pmap_destroy(pmap_t     p)
1755 {
1756 	os_ref_count_t c;
1757 
1758 	if (p == PMAP_NULL) {
1759 		return;
1760 	}
1761 
1762 	PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
1763 	    VM_KERNEL_ADDRHIDe(p));
1764 
1765 	PMAP_LOCK_EXCLUSIVE(p);
1766 
1767 	c = os_ref_release_locked(&p->ref_count);
1768 
1769 	pmap_assert((current_thread() && (current_thread()->map)) ? (current_thread()->map->pmap != p) : TRUE);
1770 
1771 	if (c == 0) {
1772 		/*
1773 		 * If some cpu is not using the physical pmap pointer that it
1774 		 * is supposed to be (see set_dirbase), we might be using the
1775 		 * pmap that is being destroyed! Make sure we are
1776 		 * physically on the right pmap:
1777 		 */
1778 		PMAP_UPDATE_TLBS(p, 0x0ULL, 0xFFFFFFFFFFFFF000ULL);
1779 		if (pmap_pcid_ncpus) {
1780 			pmap_destroy_pcid_sync(p);
1781 		}
1782 	}
1783 
1784 	PMAP_UNLOCK_EXCLUSIVE(p);
1785 
1786 	if (c != 0) {
1787 		PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
1788 		pmap_assert(p == kernel_pmap);
1789 		return; /* still in use */
1790 	}
1791 
1792 	/*
1793 	 *	Free the memory maps, then the
1794 	 *	pmap structure.
1795 	 */
1796 	int inuse_ptepages = 0;
1797 
1798 	zfree(pmap_anchor_zone, p->pm_pml4);
1799 	zfree(pmap_uanchor_zone, p->pm_upml4);
1800 
1801 	inuse_ptepages += p->pm_obj_pml4->resident_page_count;
1802 	vm_object_deallocate(p->pm_obj_pml4);
1803 
1804 	inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
1805 	vm_object_deallocate(p->pm_obj_pdpt);
1806 
1807 	inuse_ptepages += p->pm_obj->resident_page_count;
1808 	vm_object_deallocate(p->pm_obj);
1809 
1810 	OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count);
1811 	PMAP_ZINFO_PFREE(p, inuse_ptepages * PAGE_SIZE);
1812 
1813 	pmap_check_ledgers(p);
1814 	ledger_dereference(p->ledger);
1815 	lck_rw_destroy(&p->pmap_rwl, &pmap_lck_grp);
1816 	zfree(pmap_zone, p);
1817 
1818 	PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
1819 }
1820 
1821 /*
1822  *	Add a reference to the specified pmap.
1823  */
1824 
1825 void
pmap_reference(pmap_t p)1826 pmap_reference(pmap_t   p)
1827 {
1828 	if (p != PMAP_NULL) {
1829 		PMAP_LOCK_EXCLUSIVE(p);
1830 		os_ref_retain_locked(&p->ref_count);
1831 		PMAP_UNLOCK_EXCLUSIVE(p);
1832 	}
1833 }
1834 
1835 /*
1836  *	Remove phys addr if mapped in specified map
1837  *
1838  */
void
pmap_remove_some_phys(
	__unused pmap_t         map,
	__unused ppnum_t         pn)
{
/* Deliberately a no-op; implement to support working set code */
}
1846 
1847 
1848 void
pmap_protect(pmap_t map,vm_map_offset_t sva,vm_map_offset_t eva,vm_prot_t prot)1849 pmap_protect(
1850 	pmap_t          map,
1851 	vm_map_offset_t sva,
1852 	vm_map_offset_t eva,
1853 	vm_prot_t       prot)
1854 {
1855 	pmap_protect_options(map, sva, eva, prot, 0, NULL);
1856 }
1857 
1858 
/*
 *	Set the physical protection on the
 *	specified range of this map as requested.
 *
 * VERY IMPORTANT: Will *NOT* increase permissions.
 *	pmap_protect_options() should protect the range against any access types
 *      that are not in "prot" but it should never grant extra access.
 *	For example, if "prot" is READ|EXECUTE, that means "remove write
 *      access" but it does *not* mean "add read and execute" access.
 *	VM relies on getting soft-faults to enforce extra checks (code
 *	signing, for example), for example.
 *	New access permissions are granted via pmap_enter() only.
 *      ***NOTE***:
 *	The only exception is for EPT pmaps, where we MUST populate all exec
 *      bits when the protection API is invoked (so that the HV fault handler
 *      can make decisions based on the exit qualification information, which
 *      includes the execute bits in the EPT entries.  Soft-faulting them
 *      in would cause a chicken-and-egg problem where the HV fault handler
 *      would not be able to identify mode-based execute control (MBE) faults.)
 */
void
pmap_protect_options(
	pmap_t          map,
	vm_map_offset_t sva,
	vm_map_offset_t eva,
	vm_prot_t       prot,
	unsigned int    options,
	void            *arg)
{
	pt_entry_t      *pde;
	pt_entry_t      *spte, *epte;   /* current and end (excluded) PTE within a PDE span */
	vm_map_offset_t lva;            /* end of the current PDE-aligned chunk */
	vm_map_offset_t orig_sva;
	boolean_t       set_NX;
	int             num_found = 0;  /* count of valid PTEs updated; gates the TLB flush */
	boolean_t       is_ept;
	uint64_t        cur_vaddr;

	pmap_intr_assert();

	if (map == PMAP_NULL) {
		return;
	}

	/* Removing all access is equivalent to removing the mappings. */
	if (prot == VM_PROT_NONE) {
		pmap_remove_options(map, sva, eva, options);
		return;
	}

	PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(map), VM_KERNEL_ADDRHIDE(sva),
	    VM_KERNEL_ADDRHIDE(eva));

	is_ept = is_ept_pmap(map);

	/* No-execute unless some execute permission is being retained. */
	if ((prot & VM_PROT_EXECUTE) || __improbable(is_ept && (prot & VM_PROT_UEXEC))) {
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

#if DEVELOPMENT || DEBUG
	/* Honor the global and per-pmap NX disable switches (debug only). */
	if (__improbable(set_NX && (!nx_enabled || !map->nx_enabled))) {
		set_NX = FALSE;
	}
#endif
	PMAP_LOCK_EXCLUSIVE(map);

	orig_sva = sva;
	cur_vaddr = sva;
	/* Walk the range one PDE-sized chunk at a time. */
	while (sva < eva) {
		uint64_t vaddr_incr;

		/* Compute the end of this chunk, clamped to eva and guarded
		 * against VA overflow at the top of the address space. */
		if (os_add_overflow(sva, PDE_MAPPED_SIZE, &lva)) {
			lva = eva;
		} else {
			lva &= ~(PDE_MAPPED_SIZE - 1);

			if (lva > eva) {
				lva = eva;
			}
		}

		pde = pmap_pde(map, sva);
		if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
			if (*pde & PTE_PS) {
				/* superpage: the PDE itself is the (single) PTE to edit */
				spte = pde;
				epte = spte + 1; /* excluded */
				vaddr_incr = I386_LPGBYTES;
			} else {
				/* 4K pages: locate the PTE range covering [sva, lva) */
				spte = pmap_pte(map, (sva & ~(PDE_MAPPED_SIZE - 1)));
				spte = &spte[ptenum(sva)];
				epte = &spte[intel_btop(lva - sva)];
				vaddr_incr = I386_PGBYTES;
			}

			for (; spte < epte; spte++) {
				uint64_t clear_bits, set_bits;

				/* Skip PTEs that are not valid; protections are
				 * only ever reduced on existing mappings. */
				if (!(*spte & PTE_VALID_MASK(is_ept))) {
					continue;
				}

				clear_bits = 0;
				set_bits = 0;

				/* EPT entries carry an explicit read bit; legacy
				 * PTEs are implicitly readable when present. */
				if (is_ept) {
					if (!(prot & VM_PROT_READ)) {
						clear_bits |= PTE_READ(is_ept);
					}
				}
				if (!(prot & VM_PROT_WRITE)) {
					clear_bits |= PTE_WRITE(is_ept);
				}
#if DEVELOPMENT || DEBUG
				else if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) &&
				    map == kernel_pmap) {
					set_bits |= PTE_WRITE(is_ept);
				}
#endif /* DEVELOPMENT || DEBUG */

				if (set_NX) {
					if (!is_ept) {
						set_bits |= INTEL_PTE_NX;
					} else {
						clear_bits |= INTEL_EPT_EX | INTEL_EPT_UEX;
					}
				} else if (is_ept) {
					/* This is the exception to the "Don't add permissions" statement, above */
					set_bits |= ((prot & VM_PROT_EXECUTE) ? INTEL_EPT_EX : 0) |
					    ((prot & VM_PROT_UEXEC) ? INTEL_EPT_UEX : 0);
				}

				pmap_update_pte(is_ept, spte, clear_bits, set_bits, false);

				DTRACE_VM3(set_pte, pmap_t, map, void *, cur_vaddr, uint64_t, *spte);
				cur_vaddr += vaddr_incr;

				num_found++;
			}
		}
		sva = lva;
	}
	/* Only shoot down TLBs if at least one PTE was actually changed. */
	if (num_found) {
		if (options & PMAP_OPTIONS_NOFLUSH) {
			/* Defer the shootdown into the caller's flush context. */
			PMAP_UPDATE_TLBS_DELAYED(map, orig_sva, eva, (pmap_flush_context *)arg);
		} else {
			PMAP_UPDATE_TLBS(map, orig_sva, eva);
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(map);

	PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
}
2015 
/* Map a (possibly) autogenned block */
kern_return_t
pmap_map_block_addr(
	pmap_t          pmap,
	addr64_t        va,
	pmap_paddr_t    pa,
	uint32_t        size,
	vm_prot_t       prot,
	int             attr,
	unsigned int    flags)
{
	/* Convert the physical address into a page number and defer to pmap_map_block(). */
	return pmap_map_block(pmap, va, intel_btop(pa), size, prot, attr, flags);
}
2029 
/*
 * Enter wired mappings for "size" pages starting at (va, pa) into "pmap".
 * "size" is expressed in PAGE_SIZE units; if VM_MEM_SUPERPAGE is set in
 * "attr", the loop advances in superpage-sized steps instead.
 * Panics if any individual pmap_enter() fails.
 */
kern_return_t
pmap_map_block(
	pmap_t          pmap,
	addr64_t        va,
	ppnum_t         pa,
	uint32_t        size,
	vm_prot_t       prot,
	int             attr,
	__unused unsigned int   flags)
{
	kern_return_t   kr;
	addr64_t        original_va = va;
	uint32_t        page;
	int             cur_page_size;

	if (attr & VM_MEM_SUPERPAGE) {
		cur_page_size =  SUPERPAGE_SIZE;
	} else {
		cur_page_size =  PAGE_SIZE;
	}

	for (page = 0; page < size; page += cur_page_size / PAGE_SIZE) {
		kr = pmap_enter(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE, PMAP_MAPPING_TYPE_INFER);

		if (kr != KERN_SUCCESS) {
			/*
			 * This will panic for now, as it is unclear that
			 * removing the mappings is correct.
			 * NOTE(review): the cleanup below is unreachable
			 * until the panic is removed.
			 */
			panic("%s: failed pmap_enter, "
			    "pmap=%p, va=%#llx, pa=%u, size=%u, prot=%#x, flags=%#x",
			    __FUNCTION__,
			    pmap, va, pa, size, prot, flags);

			pmap_remove(pmap, original_va, va - original_va);
			return kr;
		}

		va += cur_page_size;
		pa += cur_page_size / PAGE_SIZE; /* pa is a page number, so advance in pages */
	}

	return KERN_SUCCESS;
}
2074 
/*
 * Populate the PML4 entry covering "vaddr" in "map" by allocating and
 * installing a zeroed PDPT page.  May block in vm_page_grab() unless
 * PMAP_EXPAND_OPTIONS_NOWAIT is set, in which case it can return
 * KERN_RESOURCE_SHORTAGE.  A race with a concurrent expander is resolved
 * under the pmap lock: the loser frees its page and returns KERN_SUCCESS.
 */
kern_return_t
pmap_expand_pml4(
	pmap_t          map,
	vm_map_offset_t vaddr,
	unsigned int options)
{
	vm_page_t       m;
	pmap_paddr_t    pa;
	uint64_t        i;
	ppnum_t         pn;
	pml4_entry_t    *pml4p;
	boolean_t       is_ept = is_ept_pmap(map);

	DBG("pmap_expand_pml4(%p,%p)\n", map, (void *)vaddr);

	/* With the exception of the kext "basement", the kernel's level 4
	 * pagetables must not be dynamically expanded.
	 */
	assert(map != kernel_pmap || (vaddr == KERNEL_BASEMENT));
	/*
	 *	Allocate a VM page for the pml4 page
	 */
	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
		if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
			return KERN_RESOURCE_SHORTAGE;
		}
		VM_PAGE_WAIT();
	}
	/*
	 *	put the page into the pmap's obj list so it
	 *	can be found later.
	 */
	pn = VM_PAGE_GET_PHYS_PAGE(m);
	pa = i386_ptob(pn);
	i = pml4idx(map, vaddr);

	/*
	 *	Zero the page.
	 */
	pmap_zero_page(pn);

	vm_page_lockspin_queues();
	vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
	vm_page_unlock_queues();

	/* Account for the new pagetable page. */
	OSAddAtomic(1, &inuse_ptepages_count);
	OSAddAtomic64(1, &alloc_ptepages_count);
	PMAP_ZINFO_PALLOC(map, PAGE_SIZE);

	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
	vm_object_lock(map->pm_obj_pml4);

	PMAP_LOCK_EXCLUSIVE(map);
	/*
	 *	See if someone else expanded us first
	 */
	if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
		PMAP_UNLOCK_EXCLUSIVE(map);
		vm_object_unlock(map->pm_obj_pml4);

		VM_PAGE_FREE(m);

		/* Undo the accounting for the page we didn't end up using. */
		OSAddAtomic(-1, &inuse_ptepages_count);
		PMAP_ZINFO_PFREE(map, PAGE_SIZE);
		return KERN_SUCCESS;
	}

#if 0 /* DEBUG */
	if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE)) {
		panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx",
		    map, map->pm_obj_pml4, vaddr, i);
	}
#endif
	vm_page_insert_wired(m, map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
	vm_object_unlock(map->pm_obj_pml4);

	/*
	 *	Set the page directory entry for this page table.
	 */
	pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */

	/*
	 * Note that INTEL_EPT_UEX is unconditionally set (as is INTEL_EPT_EX) for
	 * all intermediate paging levels, from PML4Es to PDEs.  Processors with
	 * VT-x implementations that do not support MBE ignore the INTEL_EPT_UEX
	 * bit at all levels of the EPT, so there is no risk of inducing EPT
	 * violation faults.
	 */
	pmap_store_pte(is_ept, pml4p, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));
	pml4_entry_t    *upml4p;

	/* Mirror the new entry into the user copy of the top-level table. */
	upml4p = pmap64_user_pml4(map, vaddr);
	pmap_store_pte(is_ept, upml4p, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));

	PMAP_UNLOCK_EXCLUSIVE(map);

	return KERN_SUCCESS;
}
2179 
/*
 * Populate the PDPT entry covering "vaddr" in "map" by allocating and
 * installing a zeroed page-directory page, expanding the PML4 level
 * first if needed.  May block in vm_page_grab() unless
 * PMAP_EXPAND_OPTIONS_NOWAIT is set (returns KERN_RESOURCE_SHORTAGE).
 * Races with concurrent expanders are resolved under the pmap lock.
 */
kern_return_t
pmap_expand_pdpt(pmap_t map, vm_map_offset_t vaddr, unsigned int options)
{
	vm_page_t       m;
	pmap_paddr_t    pa;
	uint64_t        i;
	ppnum_t         pn;
	pdpt_entry_t    *pdptp;
	boolean_t       is_ept = is_ept_pmap(map);

	DBG("pmap_expand_pdpt(%p,%p)\n", map, (void *)vaddr);

	/* Make sure the level above (PML4) is populated first. */
	while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
		kern_return_t pep4kr = pmap_expand_pml4(map, vaddr, options);
		if (pep4kr != KERN_SUCCESS) {
			return pep4kr;
		}
	}

	/*
	 *	Allocate a VM page for the pdpt page
	 */
	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
		if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
			return KERN_RESOURCE_SHORTAGE;
		}
		VM_PAGE_WAIT();
	}

	/*
	 *	put the page into the pmap's obj list so it
	 *	can be found later.
	 */
	pn = VM_PAGE_GET_PHYS_PAGE(m);
	pa = i386_ptob(pn);
	i = pdptidx(map, vaddr);

	/*
	 *	Zero the page.
	 */
	pmap_zero_page(pn);

	vm_page_lockspin_queues();
	vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
	vm_page_unlock_queues();

	/* Account for the new pagetable page. */
	OSAddAtomic(1, &inuse_ptepages_count);
	OSAddAtomic64(1, &alloc_ptepages_count);
	PMAP_ZINFO_PALLOC(map, PAGE_SIZE);

	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
	vm_object_lock(map->pm_obj_pdpt);

	PMAP_LOCK_EXCLUSIVE(map);
	/*
	 *	See if someone else expanded us first
	 */
	if (pmap_pde(map, vaddr) != PD_ENTRY_NULL) {
		PMAP_UNLOCK_EXCLUSIVE(map);
		vm_object_unlock(map->pm_obj_pdpt);

		VM_PAGE_FREE(m);

		/* Undo the accounting for the page we didn't end up using. */
		OSAddAtomic(-1, &inuse_ptepages_count);
		PMAP_ZINFO_PFREE(map, PAGE_SIZE);
		return KERN_SUCCESS;
	}

#if 0 /* DEBUG */
	if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE)) {
		panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx",
		    map, map->pm_obj_pdpt, vaddr, i);
	}
#endif
	vm_page_insert_wired(m, map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
	vm_object_unlock(map->pm_obj_pdpt);

	/*
	 *	Set the page directory entry for this page table.
	 */
	pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */

	pmap_store_pte(is_ept, pdptp, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));

	PMAP_UNLOCK_EXCLUSIVE(map);

	return KERN_SUCCESS;
}
2271 
2272 
2273 
/*
 *	Routine:	pmap_expand
 *
 *	Expands a pmap to be able to map the specified virtual address.
 *
 *	Allocates new virtual memory for the P0 or P1 portion of the
 *	pmap, then re-maps the physical pages that were in the old
 *	pmap to be in the new pmap.
 *
 *	Must be called with the pmap system and the pmap unlocked,
 *	since these must be unlocked to use vm_allocate or vm_deallocate.
 *	Thus it must be called in a loop that checks whether the map
 *	has been expanded enough.
 *	(We won't loop forever, since page tables aren't shrunk.)
 */
kern_return_t
pmap_expand(
	pmap_t          map,
	vm_map_offset_t vaddr,
	unsigned int options)
{
	pt_entry_t              *pdp;
	vm_page_t               m;
	pmap_paddr_t            pa;
	uint64_t                i;
	ppnum_t                 pn;
	boolean_t               is_ept = is_ept_pmap(map);


	/*
	 * For the kernel, the virtual address must be in or above the basement
	 * which is for kexts and is in the 512GB immediately below the kernel.
	 * XXX - should use VM_MIN_KERNEL_AND_KEXT_ADDRESS not KERNEL_BASEMENT
	 */
	if (__improbable(map == kernel_pmap &&
	    !(vaddr >= KERNEL_BASEMENT && vaddr <= VM_MAX_KERNEL_ADDRESS))) {
		if ((options & PMAP_EXPAND_OPTIONS_ALIASMAP) == 0) {
			panic("pmap_expand: bad vaddr 0x%llx for kernel pmap", vaddr);
		}
	}

	/* Make sure the level above (the page-directory-pointer table) exists. */
	while ((pdp = pmap_pde(map, vaddr)) == PD_ENTRY_NULL) {
		assert((options & PMAP_EXPAND_OPTIONS_ALIASMAP) == 0);
		kern_return_t pepkr = pmap_expand_pdpt(map, vaddr, options);
		if (pepkr != KERN_SUCCESS) {
			return pepkr;
		}
	}

	/*
	 *	Allocate a VM page for the pde entries.
	 */
	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
		if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
			return KERN_RESOURCE_SHORTAGE;
		}
		VM_PAGE_WAIT();
	}

	/*
	 *	put the page into the pmap's obj list so it
	 *	can be found later.
	 */
	pn = VM_PAGE_GET_PHYS_PAGE(m);
	pa = i386_ptob(pn);
	i = pdeidx(map, vaddr);

	/*
	 *	Zero the page.
	 */
	pmap_zero_page(pn);

	vm_page_lockspin_queues();
	vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
	vm_page_unlock_queues();

	/* Account for the new pagetable page. */
	OSAddAtomic(1, &inuse_ptepages_count);
	OSAddAtomic64(1, &alloc_ptepages_count);
	PMAP_ZINFO_PALLOC(map, PAGE_SIZE);

	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
	vm_object_lock(map->pm_obj);

	PMAP_LOCK_EXCLUSIVE(map);

	/*
	 *	See if someone else expanded us first
	 */
	if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
		PMAP_UNLOCK_EXCLUSIVE(map);
		vm_object_unlock(map->pm_obj);

		VM_PAGE_FREE(m);

		OSAddAtomic(-1, &inuse_ptepages_count); //todo replace all with inlines
		PMAP_ZINFO_PFREE(map, PAGE_SIZE);
		return KERN_SUCCESS;
	}

#if 0 /* DEBUG */
	if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE)) {
		panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx",
		    map, map->pm_obj, vaddr, i);
	}
#endif
	vm_page_insert_wired(m, map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
	vm_object_unlock(map->pm_obj);

	/*
	 *	Set the page directory entry for this page table.
	 */
	pdp = pmap_pde(map, vaddr);

	pmap_store_pte(is_ept, pdp, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));

	PMAP_UNLOCK_EXCLUSIVE(map);

	return KERN_SUCCESS;
}
/*
 * Query a pmap to see what size a given virtual address is mapped with.
 * If the vaddr is not mapped, returns 0.
 */
vm_size_t
pmap_query_pagesize(
	pmap_t          pmap,
	vm_map_offset_t vaddr)
{
	pd_entry_t      *pdep;
	vm_size_t       size = 0;       /* 0 => not mapped */

	/* This query does not support EPT pmaps. */
	assert(!is_ept_pmap(pmap));
	PMAP_LOCK_EXCLUSIVE(pmap);

	pdep = pmap_pde(pmap, vaddr);
	if (pdep != PD_ENTRY_NULL) {
		if (*pdep & INTEL_PTE_PS) {
			/* The PDE maps a large page directly. */
			size = I386_LPGBYTES;
		} else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) {
			/* A 4K-page table exists for this address. */
			size = I386_PGBYTES;
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(pmap);

	return size;
}
2424 
2425 uint32_t
pmap_user_va_bits(pmap_t pmap __unused)2426 pmap_user_va_bits(pmap_t pmap __unused)
2427 {
2428 	/* x86 has constant set of bits based on 4 level paging. */
2429 	return 48;
2430 }
2431 
uint32_t
pmap_kernel_va_bits(void)
{
	/*
	 * Four-level paging on x86_64 always yields a 48-bit virtual
	 * address space for the kernel.
	 */
	enum { X86_64_VA_BITS = 48 };

	return X86_64_VA_BITS;
}
2438 
/*
 * Ensure the page table hierarchy is filled in down to
 * the large page level. Additionally returns FAILURE if
 * a lower page table already exists.
 *
 * Caller must hold the pmap lock.  Pages are taken from
 * pmap_next_page_hi() (the early-boot pool), and the routine panics
 * if that pool is exhausted.
 */
static kern_return_t
pmap_pre_expand_large_internal(
	pmap_t          pmap,
	vm_map_offset_t vaddr)
{
	ppnum_t         pn;
	pt_entry_t      *pte;
	boolean_t       is_ept = is_ept_pmap(pmap);
	kern_return_t   kr = KERN_SUCCESS;

	/* Populate the PML4 entry (i.e. create a PDPT page) if missing. */
	if (pmap64_pdpt(pmap, vaddr) == PDPT_ENTRY_NULL) {
		if (!pmap_next_page_hi(&pn, FALSE)) {
			panic("pmap_pre_expand_large no PDPT");
		}

		pmap_zero_page(pn);

		pte = pmap64_pml4(pmap, vaddr);

		pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
		    PTE_READ(is_ept) |
		    (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
		    PTE_WRITE(is_ept));

		/* Mirror the entry into the user copy of the PML4. */
		pte = pmap64_user_pml4(pmap, vaddr);

		pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
		    PTE_READ(is_ept) |
		    (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
		    PTE_WRITE(is_ept));
	}

	/* Populate the PDPT entry (i.e. create a page-directory page) if missing. */
	if (pmap_pde(pmap, vaddr) == PD_ENTRY_NULL) {
		if (!pmap_next_page_hi(&pn, FALSE)) {
			panic("pmap_pre_expand_large no PDE");
		}

		pmap_zero_page(pn);

		pte = pmap64_pdpt(pmap, vaddr);

		pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
		    PTE_READ(is_ept) |
		    (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
		    PTE_WRITE(is_ept));
	} else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) {
		/* A 4K page table already hangs below this PDE. */
		kr = KERN_FAILURE;
	}

	return kr;
}
2495 
2496 /*
2497  * Wrapper that locks the pmap.
2498  */
2499 kern_return_t
pmap_pre_expand_large(pmap_t pmap,vm_map_offset_t vaddr)2500 pmap_pre_expand_large(
2501 	pmap_t          pmap,
2502 	vm_map_offset_t vaddr)
2503 {
2504 	kern_return_t   kr;
2505 
2506 	PMAP_LOCK_EXCLUSIVE(pmap);
2507 	kr = pmap_pre_expand_large_internal(pmap, vaddr);
2508 	PMAP_UNLOCK_EXCLUSIVE(pmap);
2509 	return kr;
2510 }
2511 
/*
 * On large memory machines, pmap_steal_memory() will allocate past
 * the 1GB of pre-allocated/mapped virtual kernel area. This function
 * expands kernel the page tables to cover a given vaddr. It uses pages
 * from the same pool that pmap_steal_memory() uses, since vm_page_grab()
 * isn't available yet.
 */
void
pmap_pre_expand(
	pmap_t          pmap,
	vm_map_offset_t vaddr)
{
	ppnum_t         pn;
	pt_entry_t      *pte;
	boolean_t       is_ept = is_ept_pmap(pmap);

	/*
	 * This returns failure if a 4K page table already exists.
	 * Otherwise it fills in the page table hierarchy down
	 * to that level.
	 */
	PMAP_LOCK_EXCLUSIVE(pmap);
	if (pmap_pre_expand_large_internal(pmap, vaddr) == KERN_FAILURE) {
		/* A 4K page table is already present; nothing to do. */
		PMAP_UNLOCK_EXCLUSIVE(pmap);
		return;
	}

	/* Add the lowest table */
	if (!pmap_next_page_hi(&pn, FALSE)) {
		panic("pmap_pre_expand");
	}

	pmap_zero_page(pn);

	pte = pmap_pde(pmap, vaddr);

	pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
	    PTE_READ(is_ept) |
	    (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
	    PTE_WRITE(is_ept));
	PMAP_UNLOCK_EXCLUSIVE(pmap);
}
2554 
2555 /*
2556  * pmap_sync_page_data_phys(ppnum_t pa)
2557  *
2558  * Invalidates all of the instruction cache on a physical page and
2559  * pushes any dirty data from the data cache for the same physical page
2560  * Not required in i386.
2561  */
2562 void
pmap_sync_page_data_phys(__unused ppnum_t pa)2563 pmap_sync_page_data_phys(__unused ppnum_t pa)
2564 {
2565 	return;
2566 }
2567 
/*
 * pmap_sync_page_attributes_phys(ppnum_t pa)
 *
 * Write back and invalidate all cachelines on a physical page.
 */
void
pmap_sync_page_attributes_phys(ppnum_t pa)
{
	/* Delegate the flush+invalidate of the page's cachelines. */
	cache_flush_page_phys(pa);
}
2578 
2579 void
pmap_copy_page(ppnum_t src,ppnum_t dst,int options __unused)2580 pmap_copy_page(ppnum_t src, ppnum_t dst, int options __unused)
2581 {
2582 	bcopy_phys((addr64_t)i386_ptob(src),
2583 	    (addr64_t)i386_ptob(dst),
2584 	    PAGE_SIZE);
2585 }
2586 
2587 
/*
 *	Routine:	pmap_pageable
 *	Function:
 *		Make the specified pages (by pmap, offset)
 *		pageable (or not) as requested.
 *
 *		A page which is not pageable may not take
 *		a fault; therefore, its page table entry
 *		must remain valid for the duration.
 *
 *		This routine is merely advisory; pmap_enter
 *		will specify that these pages are to be wired
 *		down (or not) as appropriate.
 *
 *		A no-op on x86_64.
 */
void
pmap_pageable(
	__unused pmap_t                 pmap,
	__unused vm_map_offset_t        start_addr,
	__unused vm_map_offset_t        end_addr,
	__unused boolean_t              pageable)
{
#ifdef  lint
	/* Quiet lint's unused-parameter warnings. */
	pmap++; start_addr++; end_addr++; pageable++;
#endif  /* lint */
}
2613 
2614 void
invalidate_icache(__unused vm_offset_t addr,__unused unsigned cnt,__unused int phys)2615 invalidate_icache(__unused vm_offset_t  addr,
2616     __unused unsigned     cnt,
2617     __unused int          phys)
2618 {
2619 	return;
2620 }
2621 
2622 void
flush_dcache(__unused vm_offset_t addr,__unused unsigned count,__unused int phys)2623 flush_dcache(__unused vm_offset_t       addr,
2624     __unused unsigned          count,
2625     __unused int               phys)
2626 {
2627 	return;
2628 }
2629 
2630 #if CONFIG_DTRACE
2631 /*
2632  * Constrain DTrace copyin/copyout actions
2633  */
2634 extern kern_return_t dtrace_copyio_preflight(addr64_t);
2635 extern kern_return_t dtrace_copyio_postflight(addr64_t);
2636 
/*
 * Decide whether a DTrace copyin/copyout of user memory at "va" is safe
 * in the current CPU state.  Fails when running on the kernel map, or
 * when the loaded CR3 does not match what the CR3-sharing mode requires.
 */
kern_return_t
dtrace_copyio_preflight(__unused addr64_t va)
{
	thread_t thread = current_thread();
	uint64_t ccr3;
	/* User copyio makes no sense from the kernel map. */
	if (current_map() == kernel_map) {
		return KERN_FAILURE;
	} else if (((ccr3 = get_cr3_base()) != thread->map->pmap->pm_cr3) && (no_shared_cr3 == FALSE)) {
		/* Shared-CR3 mode: the active CR3 must be the thread's pmap CR3. */
		return KERN_FAILURE;
	} else if (no_shared_cr3 && (ccr3 != kernel_pmap->pm_cr3)) {
		/* Separate-CR3 mode: the kernel's CR3 must be active. */
		return KERN_FAILURE;
	} else {
		return KERN_SUCCESS;
	}
}
2652 
/* No post-copyio cleanup is required on x86_64. */
kern_return_t
dtrace_copyio_postflight(__unused addr64_t va)
{
	return KERN_SUCCESS;
}
2658 #endif /* CONFIG_DTRACE */
2659 
2660 #include <mach_vm_debug.h>
2661 #if     MACH_VM_DEBUG
2662 #include <vm/vm_debug_internal.h>
2663 
/* Stub: resident-page listing is not implemented on x86_64; always 0. */
int
pmap_list_resident_pages(
	__unused pmap_t         pmap,
	__unused vm_offset_t    *listp,
	__unused int            space)
{
	return 0;
}
2672 #endif  /* MACH_VM_DEBUG */
2673 
2674 
2675 #if CONFIG_COREDUMP
/* temporary workaround */
/*
 * Decide whether the page at "va" in "map" is safe to include in a core
 * dump.  Currently returns FALSE only for device-pager-backed entries;
 * the disabled (#if 0) variant instead inspected the PTE's
 * NCACHE|WIRED bits.
 */
boolean_t
coredumpok(vm_map_t map, mach_vm_offset_t va)
{
#if 0
	pt_entry_t     *ptep;

	ptep = pmap_pte(map->pmap, va);
	if (0 == ptep) {
		return FALSE;
	}
	return (*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED);
#else
	if (vm_map_entry_has_device_pager(map, va)) {
		return FALSE;
	}
	return TRUE;
#endif
}
2695 #endif
2696 
2697 boolean_t
phys_page_exists(ppnum_t pn)2698 phys_page_exists(ppnum_t pn)
2699 {
2700 	assert(pn != vm_page_fictitious_addr);
2701 
2702 	if (!pmap_initialized) {
2703 		return TRUE;
2704 	}
2705 
2706 	if (pn == vm_page_guard_addr) {
2707 		return FALSE;
2708 	}
2709 
2710 	if (!IS_MANAGED_PAGE(ppn_to_pai(pn))) {
2711 		return FALSE;
2712 	}
2713 
2714 	return TRUE;
2715 }
2716 
2717 
2718 
/*
 * Switch this CPU's address space to "tpmap" by loading its directory
 * base.  Must be called with interrupts disabled.
 */
void
pmap_switch(pmap_t tpmap, thread_t thread __unused)
{
	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(tpmap));
	assert(ml_get_interrupts_enabled() == FALSE);
	set_dirbase(tpmap, current_thread(), cpu_number());
	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
2727 
/*
 * Validate that "pmap" is a legitimate pmap pointer: either the kernel
 * pmap or an element of the pmap zone (checked via zone_id_require()).
 */
void
pmap_require(pmap_t pmap)
{
	if (pmap != kernel_pmap) {
		zone_id_require(ZONE_ID_PMAP, sizeof(struct pmap), pmap);
	}
}
2735 
/*
 * disable no-execute capability on
 * the specified pmap
 * (only effective on DEVELOPMENT/DEBUG kernels; a no-op otherwise)
 */
void
pmap_disable_NX(__unused pmap_t pmap)
{
#if DEVELOPMENT || DEBUG
	pmap->nx_enabled = 0;
#endif
}
2747 
/*
 * Reset a pmap_flush_context before it is used to accumulate deferred
 * TLB invalidations: no CPUs marked, no global invalidations pending.
 */
void
pmap_flush_context_init(pmap_flush_context *pfc)
{
	pfc->pfc_cpus = 0;
	pfc->pfc_invalid_global = 0;
}
2754 
2755 static bool
pmap_tlbi_response(uint32_t lcpu,uint32_t rcpu,bool ngflush)2756 pmap_tlbi_response(uint32_t lcpu, uint32_t rcpu, bool ngflush)
2757 {
2758 	bool responded = false;
2759 	bool gflushed = (cpu_datap(rcpu)->cpu_tlb_invalid_global_count !=
2760 	    cpu_datap(lcpu)->cpu_tlb_gen_counts_global[rcpu]);
2761 
2762 	if (ngflush) {
2763 		if (gflushed) {
2764 			responded = true;
2765 		}
2766 	} else {
2767 		if (gflushed) {
2768 			responded = true;
2769 		} else {
2770 			bool lflushed = (cpu_datap(rcpu)->cpu_tlb_invalid_local_count !=
2771 			    cpu_datap(lcpu)->cpu_tlb_gen_counts_local[rcpu]);
2772 			if (lflushed) {
2773 				responded = true;
2774 			}
2775 		}
2776 	}
2777 
2778 	if (responded == false) {
2779 		if ((cpu_datap(rcpu)->cpu_tlb_invalid == 0) ||
2780 		    !CPU_CR3_IS_ACTIVE(rcpu) ||
2781 		    !cpu_is_running(rcpu)) {
2782 			responded = true;
2783 		}
2784 	}
2785 	return responded;
2786 }
2787 
extern uint64_t TLBTimeOut;
/*
 * Execute the TLB shootdowns that were accumulated (deferred) into "pfc":
 * mark and signal each CPU recorded in pfc->pfc_cpus, flush the local TLB
 * if this CPU was marked, and then spin (with timeout diagnostics) until
 * every signalled CPU has acknowledged.
 */
void
pmap_flush(
	pmap_flush_context *pfc)
{
	unsigned int    my_cpu;
	unsigned int    cpu;
	cpumask_t       cpu_bit;
	cpumask_t       cpus_to_respond = 0;
	cpumask_t       cpus_to_signal = 0;
	cpumask_t       cpus_signaled = 0;
	boolean_t       flush_self = FALSE;
	uint64_t        deadline;
	bool            need_global_flush = false;

	mp_disable_preemption();

	my_cpu = cpu_number();
	cpus_to_signal = pfc->pfc_cpus;

	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_START,
	    NULL, cpus_to_signal);

	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus && cpus_to_signal; cpu++, cpu_bit <<= 1) {
		if (cpus_to_signal & cpu_bit) {
			cpus_to_signal &= ~cpu_bit;

			if (!cpu_is_running(cpu)) {
				continue;
			}

			/*
			 * Mark the remote CPU's pending-invalidation state and
			 * snapshot its generation counts so pmap_tlbi_response()
			 * can later tell whether it has flushed.
			 */
			if (pfc->pfc_invalid_global & cpu_bit) {
				cpu_datap(cpu)->cpu_tlb_invalid_global = 1;
				need_global_flush = true;
			} else {
				cpu_datap(cpu)->cpu_tlb_invalid_local = 1;
			}
			cpu_datap(my_cpu)->cpu_tlb_gen_counts_global[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_global_count;
			cpu_datap(my_cpu)->cpu_tlb_gen_counts_local[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_local_count;
			/* Ensure the flags/snapshots are visible before the IPI. */
			mfence();

			if (cpu == my_cpu) {
				flush_self = TRUE;
				continue;
			}
			/*
			 * Only CPUs actively running on a pmap CR3 need an IPI;
			 * others will notice the pending flag on their own.
			 */
			if (CPU_CR3_IS_ACTIVE(cpu)) {
				cpus_to_respond |= cpu_bit;
				i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
			}
		}
	}
	cpus_signaled = cpus_to_respond;

	/*
	 * Flush local tlb if required.
	 * Do this now to overlap with other processors responding.
	 */
	if (flush_self) {
		process_pmap_updates(NULL, (pfc->pfc_invalid_global != 0), 0ULL, ~0ULL);
	}

	if (cpus_to_respond) {
		deadline = mach_absolute_time() +
		    (TLBTimeOut ? TLBTimeOut : LockTimeOut);
		boolean_t is_timeout_traced = FALSE;

		/*
		 * Wait for those other cpus to acknowledge
		 */
		while (cpus_to_respond != 0) {
			long orig_acks = 0;

			for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
				bool responded = false;
				if ((cpus_to_respond & cpu_bit) != 0) {
					responded = pmap_tlbi_response(my_cpu, cpu, need_global_flush);
					if (responded) {
						cpus_to_respond &= ~cpu_bit;
					}
					cpu_pause();
				}

				if (cpus_to_respond == 0) {
					break;
				}
			}
			/*
			 * Past the deadline: if TLBTimeOut is disabled, emit a
			 * one-shot trace and keep waiting; otherwise NMI the
			 * stragglers and panic.
			 */
			if (cpus_to_respond && (mach_absolute_time() > deadline)) {
				if (machine_timeout_suspended()) {
					continue;
				}
				if (TLBTimeOut == 0) {
					if (is_timeout_traced) {
						continue;
					}

					PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
					    NULL, cpus_to_signal, cpus_to_respond);

					is_timeout_traced = TRUE;
					continue;
				}
				orig_acks = NMIPI_acks;
				NMIPI_panic(cpus_to_respond, TLB_FLUSH_TIMEOUT);
				panic("Uninterruptible processor(s): CPU bitmap: 0x%llx, NMIPI acks: 0x%lx, now: 0x%lx, deadline: %llu",
				    cpus_to_respond, orig_acks, NMIPI_acks, deadline);
			}
		}
	}

	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_END,
	    NULL, cpus_signaled, flush_self);

	mp_enable_preemption();
}
2902 
2903 
2904 static void
invept(void * eptp)2905 invept(void *eptp)
2906 {
2907 	struct {
2908 		uint64_t eptp;
2909 		uint64_t reserved;
2910 	} __attribute__((aligned(16), packed)) invept_descriptor = {(uint64_t)eptp, 0};
2911 
2912 	__asm__ volatile ("invept (%%rax), %%rcx"
2913                  : : "c" (PMAP_INVEPT_SINGLE_CONTEXT), "a" (&invept_descriptor)
2914                  : "cc", "memory");
2915 }
2916 
2917 /*
2918  * Called with pmap locked, we:
2919  *  - scan through per-cpu data to see which other cpus need to flush
2920  *  - send an IPI to each non-idle cpu to be flushed
2921  *  - wait for all to signal back that they are inactive or we see that
2922  *    they are at a safe point (idle).
2923  *  - flush the local tlb if active for this pmap
2924  *  - return ... the caller will unlock the pmap
2925  */
2926 
void
pmap_flush_tlbs(pmap_t  pmap, vm_map_offset_t startv, vm_map_offset_t endv, int options, pmap_flush_context *pfc)
{
	unsigned int    cpu;
	cpumask_t       cpu_bit;
	cpumask_t       cpus_to_signal = 0;     /* CPUs actually sent a shootdown IPI */
	unsigned int    my_cpu = cpu_number();
	pmap_paddr_t    pmap_cr3 = pmap->pm_cr3;
	boolean_t       flush_self = FALSE;     /* set if this CPU's TLB also needs flushing */
	uint64_t        deadline;
	boolean_t       pmap_is_shared = (pmap->pm_shared || (pmap == kernel_pmap));
	bool            need_global_flush = false;
	uint32_t        event_code = 0;
	vm_map_offset_t event_startv = 0, event_endv = 0;
	boolean_t       is_ept = is_ept_pmap(pmap);

	/*
	 * On MP systems the caller must have interrupts enabled (so remote
	 * CPUs' shootdown IPIs can be serviced while we spin below) while
	 * holding preemption disabled (so we remain bound to my_cpu).
	 */
	assert((processor_avail_count < 2) ||
	    (ml_get_interrupts_enabled() && get_preemption_level() != 0));

	/* The flush range must be non-empty and page-aligned at both ends. */
	assert((endv - startv) >= PAGE_SIZE);
	assert(((endv | startv) & PAGE_MASK) == 0);

	/* Choose a tracepoint code only when kdebug tracing is enabled. */
	if (__improbable(kdebug_enable)) {
		if (pmap == kernel_pmap) {
			event_code = PMAP_CODE(PMAP__FLUSH_KERN_TLBS);
			event_startv = VM_KERNEL_UNSLIDE_OR_PERM(startv);
			event_endv = VM_KERNEL_UNSLIDE_OR_PERM(endv);
		} else if (__improbable(is_ept)) {
			event_code = PMAP_CODE(PMAP__FLUSH_EPT);
			event_startv = startv;
			event_endv = endv;
		} else {
			event_code = PMAP_CODE(PMAP__FLUSH_TLBS);
			event_startv = startv;
			event_endv = endv;
		}
	}

	PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(pmap), options,
	    event_startv, event_endv);

	/*
	 * EPT pmaps aren't matched by the CR3 scan below; broadcast an
	 * INVEPT to all CPUs and skip the shootdown protocol entirely.
	 */
	if (__improbable(is_ept)) {
		mp_cpus_call(CPUMASK_ALL, ASYNC, invept, (void*)pmap->pm_eptp);
		goto out;
	}

	/*
	 * Scan other cpus for matching active or task CR3.
	 * For idle cpus (with no active map) we mark them invalid but
	 * don't signal -- they'll check as they go busy.
	 */
	if (pmap_pcid_ncpus) {
		/*
		 * With PCIDs, translations for a shared pmap may be cached
		 * under any PCID, so the flush must be a global one.
		 */
		if (pmap_is_shared) {
			need_global_flush = true;
		}
		pmap_pcid_invalidate_all_cpus(pmap);
		/* Order the PCID invalidations before the per-cpu scan below. */
		mfence();
	}

	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
		if (!cpu_is_running(cpu)) {
			continue;
		}
		uint64_t        cpu_active_cr3 = CPU_GET_ACTIVE_CR3(cpu);
		uint64_t        cpu_task_cr3 = CPU_GET_TASK_CR3(cpu);

		if ((pmap_cr3 == cpu_task_cr3) ||
		    (pmap_cr3 == cpu_active_cr3) ||
		    (pmap_is_shared)) {
			/*
			 * Deferred mode: just accumulate the target CPUs in
			 * the caller's flush context; the shootdown happens
			 * later when the context is processed.
			 */
			if (options & PMAP_DELAY_TLB_FLUSH) {
				if (need_global_flush == true) {
					pfc->pfc_invalid_global |= cpu_bit;
				}
				pfc->pfc_cpus |= cpu_bit;

				continue;
			}
			/*
			 * Snapshot the target's invalidation generation count
			 * before raising its pending flag, so that
			 * pmap_tlbi_response() below can tell whether that CPU
			 * has processed this particular request.
			 */
			if (need_global_flush == true) {
				cpu_datap(my_cpu)->cpu_tlb_gen_counts_global[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_global_count;
				cpu_datap(cpu)->cpu_tlb_invalid_global = 1;
			} else {
				cpu_datap(my_cpu)->cpu_tlb_gen_counts_local[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_local_count;
				cpu_datap(cpu)->cpu_tlb_invalid_local = 1;
			}

			/* This CPU flushes itself after the scan, not via IPI. */
			if (cpu == my_cpu) {
				flush_self = TRUE;
				continue;
			}

			mfence();

			/*
			 * We don't need to signal processors which will flush
			 * lazily at the idle state or kernel boundary.
			 * For example, if we're invalidating the kernel pmap,
			 * processors currently in userspace don't need to flush
			 * their TLBs until the next time they enter the kernel.
			 * Alterations to the address space of a task active
			 * on a remote processor result in a signal, to
			 * account for copy operations. (There may be room
			 * for optimization in such cases).
			 * The order of the loads below with respect
			 * to the store to the "cpu_tlb_invalid" field above
			 * is important--hence the barrier.
			 */
			if (CPU_CR3_IS_ACTIVE(cpu) &&
			    (pmap_cr3 == CPU_GET_ACTIVE_CR3(cpu) ||
			    pmap->pm_shared ||
			    (pmap_cr3 == CPU_GET_TASK_CR3(cpu)))) {
				cpus_to_signal |= cpu_bit;
				i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
			}
		}
	}

	/* Deferred mode: nothing further to do now. */
	if ((options & PMAP_DELAY_TLB_FLUSH)) {
		goto out;
	}

	/*
	 * Flush local tlb if required.
	 * Do this now to overlap with other processors responding.
	 */
	if (flush_self) {
		process_pmap_updates(pmap, pmap_is_shared, startv, endv);
	}

	if (cpus_to_signal) {
		cpumask_t       cpus_to_respond = cpus_to_signal;

		/* TLBTimeOut == 0 selects trace-only (no panic) on timeout; see below. */
		deadline = mach_absolute_time() +
		    (TLBTimeOut ? TLBTimeOut : LockTimeOut);
		boolean_t is_timeout_traced = FALSE;

		/*
		 * Wait for those other cpus to acknowledge
		 */
		while (cpus_to_respond != 0) {
			long orig_acks = 0;

			for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
				bool responded = false;
				if ((cpus_to_respond & cpu_bit) != 0) {
					responded = pmap_tlbi_response(my_cpu, cpu, need_global_flush);
					if (responded) {
						cpus_to_respond &= ~cpu_bit;
					}
					cpu_pause();
				}
				if (cpus_to_respond == 0) {
					break;
				}
			}
			/* Deadline passed with responders still outstanding. */
			if (cpus_to_respond && (mach_absolute_time() > deadline)) {
				if (machine_timeout_suspended()) {
					continue;
				}
				if (TLBTimeOut == 0) {
					/* cut tracepoint but don't panic */
					if (is_timeout_traced) {
						continue;
					}

					PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
					    VM_KERNEL_UNSLIDE_OR_PERM(pmap),
					    cpus_to_signal,
					    cpus_to_respond);

					is_timeout_traced = TRUE;
					continue;
				}
				/* NMI the unresponsive CPUs to capture their state, then panic. */
				orig_acks = NMIPI_acks;
				uint64_t tstamp1 = mach_absolute_time();
				NMIPI_panic(cpus_to_respond, TLB_FLUSH_TIMEOUT);
				uint64_t tstamp2 = mach_absolute_time();
				panic("IPI timeout, unresponsive CPU bitmap: 0x%llx, NMIPI acks: 0x%lx, now: 0x%lx, deadline: %llu, pre-NMIPI time: 0x%llx, current: 0x%llx, global: %d",
				    cpus_to_respond, orig_acks, NMIPI_acks, deadline, tstamp1, tstamp2, need_global_flush);
			}
		}
	}

	/* The kernel pmap is active on every CPU, so we must have flushed ourselves. */
	if (__improbable((pmap == kernel_pmap) && (flush_self != TRUE))) {
		panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, pmap_cr3: 0x%llx, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, pmap_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map);
	}

out:
	PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(pmap), cpus_to_signal,
	    event_startv, event_endv);
}
3119 
/*
 * Service a pending TLB invalidation on the current CPU.
 * 'p' is the pmap being invalidated (NULL when called from
 * pmap_update_interrupt(), i.e. flush for whatever is current);
 * 'pshared' forces a global flush; [istart, iend) bounds the range.
 * Must run with interrupts disabled or preemption disabled.
 */
static void
process_pmap_updates(pmap_t p, bool pshared, addr64_t istart, addr64_t iend)
{
	int ccpu = cpu_number();
	bool gtlbf = false;     /* a global flush was requested for this CPU */

	pmap_assert(ml_get_interrupts_enabled() == 0 ||
	    get_preemption_level() != 0);

	/*
	 * Acknowledge the request: bump the generation count that the
	 * requester snapshotted in pmap_flush_tlbs(), then clear the
	 * pending flag.
	 */
	if (cpu_datap(ccpu)->cpu_tlb_invalid_global) {
		cpu_datap(ccpu)->cpu_tlb_invalid_global_count++;
		/* NOTE(review): clearing cpu_tlb_invalid here appears to clear
		 * both the local and global pending bits at once -- confirm the
		 * field overlays cpu_tlb_invalid_{local,global}. */
		cpu_datap(ccpu)->cpu_tlb_invalid = 0;
		gtlbf = true;
	} else {
		cpu_datap(ccpu)->cpu_tlb_invalid_local_count++;
		cpu_datap(ccpu)->cpu_tlb_invalid_local = 0;
	}

	if (pmap_pcid_ncpus) {
		if (p) {
			/* TODO global generation count to
			 * avoid potentially redundant
			 * csw invalidations post-global invalidation
			 */
			pmap_pcid_validate_cpu(p, ccpu);
			/* Flush the range under p's PCID on this CPU; go global
			 * if the pmap is shared or a global flush was flagged. */
			pmap_tlbi_range(istart, iend, (pshared || gtlbf), p->pmap_pcid_cpus[ccpu]);
		} else {
			pmap_pcid_validate_current();
			pmap_tlbi_range(istart, iend, true, 0);
		}
	} else {
		/* No PCID support: invalidate the entire address range globally. */
		pmap_tlbi_range(0, ~0ULL, true, 0);
	}
}
3154 
3155 void
pmap_update_interrupt(void)3156 pmap_update_interrupt(void)
3157 {
3158 	PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START);
3159 
3160 	if (current_cpu_datap()->cpu_tlb_invalid) {
3161 		process_pmap_updates(NULL, true, 0ULL, ~0ULL);
3162 	}
3163 
3164 	PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END);
3165 }
3166 
3167 #include <mach/mach_vm.h>       /* mach_vm_region_recurse() */
3168 /* Scan kernel pmap for W+X PTEs, scan kernel VM map for W+X map entries
3169  * and identify ranges with mismatched VM permissions and PTE permissions
3170  */
kern_return_t
pmap_permissions_verify(pmap_t ipmap, vm_map_t ivmmap, vm_offset_t sv, vm_offset_t ev)
{
	/* NOTE(review): cv is captured before sv is page-truncated below, so the
	 * first pass starts at the untruncated address if sv was unaligned. */
	vm_offset_t cv = sv;
	kern_return_t rv = KERN_SUCCESS;
	/* Diagnostic counters of PML4/PDE holes skipped; not reported. */
	uint64_t skip4 = 0, skip2 = 0;

	/* This walker understands only the x86 page-table format, not EPT. */
	assert(!is_ept_pmap(ipmap));

	sv &= ~PAGE_MASK_64;
	ev &= ~PAGE_MASK_64;
	/* Pass 1: scan PTEs for pages that are both writable and executable. */
	while (cv < ev) {
		/* Hop over the non-canonical address hole. */
		if (__improbable((cv > 0x00007FFFFFFFFFFFULL) &&
		    (cv < 0xFFFF800000000000ULL))) {
			cv = 0xFFFF800000000000ULL;
		}
		/* Potential inconsistencies from not holding pmap lock
		 * but harmless for the moment.
		 */
		/* Skip a whole PML4 slot when its entry is absent; the
		 * (cv + NBPML4) > cv test guards wraparound at the top. */
		if (((cv & PML4MASK) == 0) && (pmap64_pml4(ipmap, cv) == 0)) {
			if ((cv + NBPML4) > cv) {
				cv += NBPML4;
			} else {
				break;
			}
			skip4++;
			continue;
		}
		/* Likewise skip a whole PDE slot when its entry is absent. */
		if (((cv & PDMASK) == 0) && (pmap_pde(ipmap, cv) == 0)) {
			if ((cv + NBPD) > cv) {
				cv += NBPD;
			} else {
				break;
			}
			skip2++;
			continue;
		}

		/* A valid, writable PTE with NX clear is a W+X violation. */
		pt_entry_t *ptep = pmap_pte(ipmap, cv);
		if (ptep && (*ptep & INTEL_PTE_VALID)) {
			if (*ptep & INTEL_PTE_WRITE) {
				if (!(*ptep & INTEL_PTE_NX)) {
					kprintf("W+X PTE at 0x%lx, P4: 0x%llx, P3: 0x%llx, P2: 0x%llx, PT: 0x%llx, VP: %u\n", cv, *pmap64_pml4(ipmap, cv), *pmap64_pdpt(ipmap, cv), *pmap_pde(ipmap, cv), *ptep, pmap_valid_page((ppnum_t)(i386_btop(pte_to_pa(*ptep)))));
					rv = KERN_FAILURE;
				}
			}
		}
		cv += PAGE_SIZE;
	}
	kprintf("Completed pmap scan\n");
	cv = sv;

	/* Pass 2: walk the VM map and compare each entry's protection
	 * against the live PTE permissions underneath it. */
	struct vm_region_submap_info_64 vbr;
	mach_msg_type_number_t vbrcount = 0;
	mach_vm_size_t  vmsize;
	vm_prot_t       prot;
	uint32_t nesting_depth = 0;
	kern_return_t kret;

	while (cv < ev) {
		/* Recurse into submaps until a terminal entry is found. */
		for (;;) {
			vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64;
			if ((kret = mach_vm_region_recurse(ivmmap,
			    (mach_vm_address_t *) &cv, &vmsize, &nesting_depth,
			    (vm_region_recurse_info_t)&vbr,
			    &vbrcount)) != KERN_SUCCESS) {
				break;
			}

			if (vbr.is_submap) {
				nesting_depth++;
				continue;
			} else {
				break;
			}
		}

		/* No more regions (or lookup failed): stop the scan. */
		if (kret != KERN_SUCCESS) {
			break;
		}

		prot = vbr.protection;

		/* W+X at the map-entry level is itself a violation. */
		if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) {
			kprintf("W+X map entry at address 0x%lx\n", cv);
			rv = KERN_FAILURE;
		}

		/* For accessible entries, verify each resident PTE matches
		 * the map entry's protection exactly. */
		if (prot) {
			vm_offset_t pcv;
			for (pcv = cv; pcv < cv + vmsize; pcv += PAGE_SIZE) {
				pt_entry_t *ptep = pmap_pte(ipmap, pcv);
				vm_prot_t tprot;

				if ((ptep == NULL) || !(*ptep & INTEL_PTE_VALID)) {
					continue;
				}
				/* Reconstruct the protection implied by the PTE bits. */
				tprot = VM_PROT_READ;
				if (*ptep & INTEL_PTE_WRITE) {
					tprot |= VM_PROT_WRITE;
				}
				if ((*ptep & INTEL_PTE_NX) == 0) {
					tprot |= VM_PROT_EXECUTE;
				}
				if (tprot != prot) {
					kprintf("PTE/map entry permissions mismatch at address 0x%lx, pte: 0x%llx, protection: 0x%x\n", pcv, *ptep, prot);
					rv = KERN_FAILURE;
				}
			}
		}
		cv += vmsize;
	}
	return rv;
}
3285 
3286 #if MACH_ASSERT
3287 extern int pmap_ledgers_panic;
3288 extern int pmap_ledgers_panic_leeway;
3289 
3290 static void
pmap_check_ledgers(pmap_t pmap)3291 pmap_check_ledgers(
3292 	pmap_t pmap)
3293 {
3294 	int     pid;
3295 	char    *procname;
3296 
3297 	if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
3298 		/*
3299 		 * This pmap was not or is no longer fully associated
3300 		 * with a task (e.g. the old pmap after a fork()/exec() or
3301 		 * spawn()).  Its "ledger" still points at a task that is
3302 		 * now using a different (and active) address space, so
3303 		 * we can't check that all the pmap ledgers are balanced here.
3304 		 *
3305 		 * If the "pid" is set, that means that we went through
3306 		 * pmap_set_process() in task_terminate_internal(), so
3307 		 * this task's ledger should not have been re-used and
3308 		 * all the pmap ledgers should be back to 0.
3309 		 */
3310 		return;
3311 	}
3312 
3313 	pid = pmap->pmap_pid;
3314 	procname = pmap->pmap_procname;
3315 
3316 	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
3317 }
3318 
3319 void
pmap_set_process(pmap_t pmap,int pid,char * procname)3320 pmap_set_process(
3321 	pmap_t pmap,
3322 	int pid,
3323 	char *procname)
3324 {
3325 	if (pmap == NULL || pmap->pmap_pid == -1) {
3326 		return;
3327 	}
3328 
3329 	pmap->pmap_pid = pid;
3330 	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
3331 	if (pmap_ledgers_panic_leeway) {
3332 		/*
3333 		 * XXX FBDP
3334 		 * Some processes somehow trigger some issues that make
3335 		 * the pmap stats and ledgers go off track, causing
3336 		 * some assertion failures and ledger panics.
3337 		 * Turn off the sanity checks if we allow some ledger leeway
3338 		 * because of that.  We'll still do a final check in
3339 		 * pmap_check_ledgers() for discrepancies larger than the
3340 		 * allowed leeway after the address space has been fully
3341 		 * cleaned up.
3342 		 */
3343 		pmap->pmap_stats_assert = FALSE;
3344 		ledger_disable_panic_on_negative(pmap->ledger,
3345 		    task_ledgers.phys_footprint);
3346 		ledger_disable_panic_on_negative(pmap->ledger,
3347 		    task_ledgers.internal);
3348 		ledger_disable_panic_on_negative(pmap->ledger,
3349 		    task_ledgers.internal_compressed);
3350 		ledger_disable_panic_on_negative(pmap->ledger,
3351 		    task_ledgers.iokit_mapped);
3352 		ledger_disable_panic_on_negative(pmap->ledger,
3353 		    task_ledgers.alternate_accounting);
3354 		ledger_disable_panic_on_negative(pmap->ledger,
3355 		    task_ledgers.alternate_accounting_compressed);
3356 	}
3357 }
3358 #endif /* MACH_ASSERT */
3359 
3360 
3361 #if DEVELOPMENT || DEBUG
3362 int pmap_pagezero_mitigation = 1;
3363 #endif
3364 
3365 void
pmap_advise_pagezero_range(pmap_t lpmap,uint64_t low_bound)3366 pmap_advise_pagezero_range(pmap_t lpmap, uint64_t low_bound)
3367 {
3368 #if DEVELOPMENT || DEBUG
3369 	if (pmap_pagezero_mitigation == 0) {
3370 		lpmap->pagezero_accessible = FALSE;
3371 		return;
3372 	}
3373 #endif
3374 	lpmap->pagezero_accessible = ((pmap_smap_enabled == FALSE) && (low_bound < 0x1000));
3375 	if (lpmap == current_pmap()) {
3376 		mp_disable_preemption();
3377 		current_cpu_datap()->cpu_pagezero_mapped = lpmap->pagezero_accessible;
3378 		mp_enable_preemption();
3379 	}
3380 }
3381 
3382 uintptr_t
pmap_verify_noncacheable(uintptr_t vaddr)3383 pmap_verify_noncacheable(uintptr_t vaddr)
3384 {
3385 	pt_entry_t *ptep = NULL;
3386 	ptep = pmap_pte(kernel_pmap, vaddr);
3387 	if (ptep == NULL) {
3388 		panic("pmap_verify_noncacheable: no translation for 0x%lx", vaddr);
3389 	}
3390 	/* Non-cacheable OK */
3391 	if (*ptep & (INTEL_PTE_NCACHE)) {
3392 		return pte_to_pa(*ptep) | (vaddr & INTEL_OFFMASK);
3393 	}
3394 	/* Write-combined OK */
3395 	if (*ptep & (INTEL_PTE_PAT)) {
3396 		return pte_to_pa(*ptep) | (vaddr & INTEL_OFFMASK);
3397 	}
3398 	panic("pmap_verify_noncacheable: IO read from a cacheable address? address: 0x%lx, PTE: %p, *PTE: 0x%llx", vaddr, ptep, *ptep);
3399 	/*NOTREACHED*/
3400 	return 0;
3401 }
3402 
3403 bool
pmap_lookup_in_loaded_trust_caches(const uint8_t __unused cdhash[20])3404 pmap_lookup_in_loaded_trust_caches(const uint8_t __unused cdhash[20])
3405 {
3406 	// Unsupported on this architecture.
3407 	return false;
3408 }
3409 
3410 uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t __unused cdhash[20])3411 pmap_lookup_in_static_trust_cache(const uint8_t __unused cdhash[20])
3412 {
3413 	// Unsupported on this architecture.
3414 	return false;
3415 }
3416 
int
pmap_cs_configuration(void)
{
	/* Unsupported on this architecture: no configuration flags to report. */
	return 0;
}
3423 
bool
pmap_in_ppl(void)
{
	/* There is no PPL on this architecture, so we are never inside it. */
	return false;
}
3430 
bool
pmap_has_iofilter_protected_write(void)
{
	// Not supported on this architecture.
	// Declared with an explicit (void) parameter list: an empty () in a
	// C definition means "unspecified parameters", and the file's other
	// no-argument functions (e.g. pmap_in_ppl) already use (void).
	return false;
}
3437 
3438 __attribute__((__noreturn__))
3439 void
pmap_iofilter_protected_write(__unused vm_address_t addr,__unused uint64_t value,__unused uint64_t width)3440 pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
3441 {
3442 	panic("%s called on an unsupported platform.", __FUNCTION__);
3443 }
void *
pmap_claim_reserved_ppl_page(void)
{
	/* No PPL on this architecture, hence no reserved pages to hand out. */
	return NULL;
}
3450 
3451 void
pmap_free_reserved_ppl_page(void __unused * kva)3452 pmap_free_reserved_ppl_page(void __unused *kva)
3453 {
3454 	// Unsupported on this architecture.
3455 }
3456 
3457 #if DEVELOPMENT || DEBUG
3458 /*
3459  * Used for unit testing recovery from text corruptions.
3460  */
3461 kern_return_t
pmap_test_text_corruption(pmap_paddr_t pa)3462 pmap_test_text_corruption(pmap_paddr_t pa)
3463 {
3464 	int pai;
3465 	uint8_t *va;
3466 
3467 	pai = ppn_to_pai(atop(pa));
3468 	if (!IS_MANAGED_PAGE(pai)) {
3469 		return KERN_FAILURE;
3470 	}
3471 
3472 	va = (uint8_t *)PHYSMAP_PTOV(pa);
3473 	va[0] = 0x0f; /* opcode for UD2 */
3474 	va[1] = 0x0b;
3475 
3476 	return KERN_SUCCESS;
3477 }
3478 #endif /* DEVELOPMENT || DEBUG */
3479