xref: /xnu-8792.41.9/osfmk/x86_64/pmap.c (revision 5c2921b07a2480ab43ec66f5b9e41cb872bc554f)
1 /*
2  * Copyright (c) 2000-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 
59 /*
60  *	File:	pmap.c
61  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
62  *	(These guys wrote the Vax version)
63  *
64  *	Physical Map management code for Intel i386, i486, and i860.
65  *
66  *	Manages physical address maps.
67  *
68  *	In addition to hardware address maps, this
69  *	module is called upon to provide software-use-only
70  *	maps which may or may not be stored in the same
71  *	form as hardware maps.  These pseudo-maps are
72  *	used to store intermediate results from copy
73  *	operations to and from address spaces.
74  *
75  *	Since the information managed by this module is
76  *	also stored by the logical address mapping module,
77  *	this module may throw away valid virtual-to-physical
78  *	mappings at almost any time.  However, invalidations
79  *	of virtual-to-physical mappings must be done as
80  *	requested.
81  *
82  *	In order to cope with hardware architectures which
83  *	make virtual-to-physical map invalidates expensive,
84  *	this module may delay invalidate or reduced protection
85  *	operations until such time as they are actually
86  *	necessary.  This module is given full information as
87  *	to which processors are currently using which maps,
88  *	and to when physical maps must be made correct.
89  */
90 
91 #include <string.h>
92 #include <mach_ldebug.h>
93 
94 #include <libkern/OSAtomic.h>
95 
96 #include <mach/machine/vm_types.h>
97 
98 #include <mach/boolean.h>
99 #include <kern/thread.h>
100 #include <kern/zalloc.h>
101 #include <kern/zalloc_internal.h>
102 #include <kern/queue.h>
103 #include <kern/ledger.h>
104 #include <kern/mach_param.h>
105 
106 #include <kern/spl.h>
107 
108 #include <vm/pmap.h>
109 #include <vm/pmap_cs.h>
110 #include <vm/vm_map.h>
111 #include <vm/vm_kern.h>
112 #include <mach/vm_param.h>
113 #include <mach/vm_prot.h>
114 #include <vm/vm_object.h>
115 #include <vm/vm_page.h>
116 
117 #include <mach/machine/vm_param.h>
118 #include <machine/thread.h>
119 
120 #include <kern/misc_protos.h>                   /* prototyping */
121 #include <i386/misc_protos.h>
122 #include <i386/i386_lowmem.h>
123 #include <x86_64/lowglobals.h>
124 
125 #include <i386/cpuid.h>
126 #include <i386/cpu_data.h>
127 #include <i386/cpu_number.h>
128 #include <i386/machine_cpu.h>
129 #include <i386/seg.h>
130 #include <i386/serial_io.h>
131 #include <i386/cpu_capabilities.h>
132 #include <i386/machine_routines.h>
133 #include <i386/proc_reg.h>
134 #include <i386/tsc.h>
135 #include <i386/pmap_internal.h>
136 #include <i386/pmap_pcid.h>
137 #if CONFIG_VMX
138 #include <i386/vmx/vmx_cpu.h>
139 #endif
140 
141 #include <vm/vm_protos.h>
142 #include <san/kasan.h>
143 
144 #include <i386/mp.h>
145 #include <i386/mp_desc.h>
146 #include <libkern/kernel_mach_header.h>
147 
148 #include <pexpert/i386/efi.h>
149 #include <libkern/section_keywords.h>
150 #if MACH_ASSERT
151 int pmap_stats_assert = 1;
152 #endif /* MACH_ASSERT */
153 
154 #ifdef IWANTTODEBUG
155 #undef  DEBUG
156 #define DEBUG 1
157 #define POSTCODE_DELAY 1
158 #include <i386/postcode.h>
159 #endif /* IWANTTODEBUG */
160 
161 #ifdef  PMAP_DEBUG
162 #define DBG(x...)       kprintf("DBG: " x)
163 #else
164 #define DBG(x...)
165 #endif
166 /* Compile time assert to ensure adjacency/alignment of per-CPU data fields used
167  * in the trampolines for kernel/user boundary TLB coherency.
168  */
169 char pmap_cpu_data_assert[(((offsetof(cpu_data_t, cpu_tlb_invalid) - offsetof(cpu_data_t, cpu_active_cr3)) == 8) && (offsetof(cpu_data_t, cpu_active_cr3) % 64 == 0)) ? 1 : -1];
170 boolean_t pmap_trace = FALSE;
171 
172 boolean_t       no_shared_cr3 = DEBUG;          /* TRUE for DEBUG by default */
173 
174 #if DEVELOPMENT || DEBUG
175 int nx_enabled = 1;                     /* enable no-execute protection -- set during boot */
176 #else
177 const int nx_enabled = 1;
178 #endif
179 
180 #if DEBUG || DEVELOPMENT
181 int allow_data_exec  = VM_ABI_32;       /* 32-bit apps may execute data by default, 64-bit apps may not */
182 int allow_stack_exec = 0;               /* No apps may execute from the stack by default */
183 #else /* DEBUG || DEVELOPMENT */
184 const int allow_data_exec  = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
185 const int allow_stack_exec = 0;         /* No apps may execute from the stack by default */
186 #endif /* DEBUG || DEVELOPMENT */
187 
188 uint64_t max_preemption_latency_tsc = 0;
189 
190 pv_hashed_entry_t     *pv_hash_table;  /* hash lists */
191 
192 uint32_t npvhashmask = 0, npvhashbuckets = 0;
193 
194 pv_hashed_entry_t       pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
195 pv_hashed_entry_t       pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
196 SIMPLE_LOCK_DECLARE(pv_hashed_free_list_lock, 0);
197 SIMPLE_LOCK_DECLARE(pv_hashed_kern_free_list_lock, 0);
198 SIMPLE_LOCK_DECLARE(pv_hash_table_lock, 0);
199 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
200 
201 SECURITY_READ_ONLY_LATE(zone_t) pv_hashed_list_zone;    /* zone of pv_hashed_entry structures */
202 
203 /*
204  *	First and last physical addresses that we maintain any information
205  *	for.  Initialized to zero so that pmap operations done before
206  *	pmap_init won't touch any non-existent structures.
207  */
208 boolean_t       pmap_initialized = FALSE;/* Has pmap_init completed? */
209 
210 static struct vm_object kptobj_object_store VM_PAGE_PACKED_ALIGNED;
211 static struct vm_object kpml4obj_object_store VM_PAGE_PACKED_ALIGNED;
212 static struct vm_object kpdptobj_object_store VM_PAGE_PACKED_ALIGNED;
213 
214 /*
215  *	Array of physical page attribites for managed pages.
216  *	One byte per physical page.
217  */
218 char            *pmap_phys_attributes;
219 ppnum_t         last_managed_page = 0;
220 
221 unsigned pmap_memory_region_count;
222 unsigned pmap_memory_region_current;
223 
224 pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
225 
226 /*
227  *	Other useful macros.
228  */
229 #define current_pmap()          (vm_map_pmap(current_thread()->map))
230 
231 struct pmap     kernel_pmap_store;
232 const pmap_t    kernel_pmap = &kernel_pmap_store;
233 SECURITY_READ_ONLY_LATE(zone_t)          pmap_zone; /* zone of pmap structures */
234 SECURITY_READ_ONLY_LATE(zone_t)          pmap_anchor_zone;
235 SECURITY_READ_ONLY_LATE(zone_t)          pmap_uanchor_zone;
236 int             pmap_debug = 0;         /* flag for debugging prints */
237 
238 unsigned int    inuse_ptepages_count = 0;
239 long long       alloc_ptepages_count __attribute__((aligned(8))) = 0; /* aligned for atomic access */
240 unsigned int    bootstrap_wired_pages = 0;
241 
242 extern  long    NMIPI_acks;
243 
244 SECURITY_READ_ONLY_LATE(boolean_t)       kernel_text_ps_4K = TRUE;
245 
246 extern char     end;
247 
248 static int      nkpt;
249 
250 #if DEVELOPMENT || DEBUG
251 SECURITY_READ_ONLY_LATE(boolean_t)       pmap_disable_kheap_nx = FALSE;
252 SECURITY_READ_ONLY_LATE(boolean_t)       pmap_disable_kstack_nx = FALSE;
253 SECURITY_READ_ONLY_LATE(boolean_t)       wpkernel = TRUE;
254 #else
255 const boolean_t wpkernel = TRUE;
256 #endif
257 
258 extern long __stack_chk_guard[];
259 
260 static uint64_t pmap_eptp_flags = 0;
261 boolean_t pmap_ept_support_ad = FALSE;
262 
263 static void process_pmap_updates(pmap_t, bool, addr64_t, addr64_t);
264 /*
265  *	Map memory at initialization.  The physical addresses being
266  *	mapped are not managed and are never unmapped.
267  *
268  *	For now, VM is already on, we only need to map the
269  *	specified memory.
270  */
271 vm_offset_t
pmap_map(vm_offset_t virt,vm_map_offset_t start_addr,vm_map_offset_t end_addr,vm_prot_t prot,unsigned int flags)272 pmap_map(
273 	vm_offset_t     virt,
274 	vm_map_offset_t start_addr,
275 	vm_map_offset_t end_addr,
276 	vm_prot_t       prot,
277 	unsigned int    flags)
278 {
279 	kern_return_t   kr;
280 	int             ps;
281 
282 	ps = PAGE_SIZE;
283 	while (start_addr < end_addr) {
284 		kr = pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
285 		    (ppnum_t) i386_btop(start_addr), prot, VM_PROT_NONE, flags, TRUE);
286 
287 		if (kr != KERN_SUCCESS) {
288 			panic("%s: failed pmap_enter, "
289 			    "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
290 			    __FUNCTION__,
291 			    (void *)virt, (void *)start_addr, (void *)end_addr, prot, flags);
292 		}
293 
294 		virt += ps;
295 		start_addr += ps;
296 	}
297 	return virt;
298 }
299 
300 extern  char                    *first_avail;
301 extern  vm_offset_t             virtual_avail, virtual_end;
302 extern  pmap_paddr_t            avail_start, avail_end;
303 extern  vm_offset_t             sHIB;
304 extern  vm_offset_t             eHIB;
305 extern  vm_offset_t             stext;
306 extern  vm_offset_t             etext;
307 extern  vm_offset_t             sdata, edata;
308 extern  vm_offset_t             sconst, econst;
309 
310 extern void                     *KPTphys;
311 
312 boolean_t pmap_smep_enabled = FALSE;
313 boolean_t pmap_smap_enabled = FALSE;
314 
/*
 * Per-CPU pmap initialization, run on each processor during bringup.
 * Enables global pages, seeds the per-CPU TLB/cr3 bookkeeping from the
 * kernel pmap, configures PCID, and turns on SMEP/SMAP when the CPU
 * advertises them (unless disabled by boot-arg on DEVELOPMENT/DEBUG).
 */
void
pmap_cpu_init(void)
{
	cpu_data_t      *cdp = current_cpu_datap();

	/* Enable global pages so kernel TLB entries survive cr3 reloads. */
	set_cr4(get_cr4() | CR4_PGE);

	/*
	 * Initialize the per-cpu, TLB-related fields.
	 */
	cdp->cpu_kernel_cr3 = kernel_pmap->pm_cr3;
	/* Keep the trampoline shadow copy in sync with the real value. */
	cpu_shadowp(cdp->cpu_number)->cpu_kernel_cr3 = cdp->cpu_kernel_cr3;
	cdp->cpu_active_cr3 = kernel_pmap->pm_cr3;
	cdp->cpu_tlb_invalid = 0;
	cdp->cpu_task_map = TASK_MAP_64BIT;

	pmap_pcid_configure();
	if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMEP) {
		pmap_smep_enabled = TRUE;
#if     DEVELOPMENT || DEBUG
		/* "-pmap_smep_disable" boot-arg overrides hardware support. */
		boolean_t nsmep;
		if (PE_parse_boot_argn("-pmap_smep_disable", &nsmep, sizeof(nsmep))) {
			pmap_smep_enabled = FALSE;
		}
#endif
		if (pmap_smep_enabled) {
			set_cr4(get_cr4() | CR4_SMEP);
		}
	}
	if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMAP) {
		pmap_smap_enabled = TRUE;
#if DEVELOPMENT || DEBUG
		/* "-pmap_smap_disable" boot-arg overrides hardware support. */
		boolean_t nsmap;
		if (PE_parse_boot_argn("-pmap_smap_disable", &nsmap, sizeof(nsmap))) {
			pmap_smap_enabled = FALSE;
		}
#endif
		if (pmap_smap_enabled) {
			set_cr4(get_cr4() | CR4_SMAP);
		}
	}

#if !MONOTONIC
	/* Re-enable fixed-function PMCs on this CPU if previously requested. */
	if (cdp->cpu_fixed_pmcs_enabled) {
		boolean_t enable = TRUE;
		cpu_pmc_control(&enable);
	}
#endif /* !MONOTONIC */
}
364 
/*
 * Validate that a write of new_data_size bytes at byte 'offset' into the
 * read-only-zone element at 'va' stays inside that element.  Panics on
 * any violation; returns normally only if the destination is in bounds.
 *
 * NOTE: the size check precedes the offset check.  When offset > elem_size
 * the unsigned subtraction (elem_size - offset) wraps to a huge value, so
 * the first test cannot fire and the second (offset) panic is the one
 * reported — the ordering is safe, just subtle.
 */
static void
pmap_ro_zone_validate_element_dst(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	vm_size_t           new_data_size)
{
	vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;

	/* Check element is from correct zone and properly aligned */
	zone_require_ro(zid, elem_size, (void*)va);

	if (__improbable(new_data_size > (elem_size - offset))) {
		panic("%s: New data size %lu too large for elem size %lu at addr %p",
		    __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
	}
	if (__improbable(offset >= elem_size)) {
		panic("%s: Offset %lu too large for elem size %lu at addr %p",
		    __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
	}
}
386 
387 static void
pmap_ro_zone_validate_element(zone_id_t zid,vm_offset_t va,vm_offset_t offset,const vm_offset_t new_data,vm_size_t new_data_size)388 pmap_ro_zone_validate_element(
389 	zone_id_t           zid,
390 	vm_offset_t         va,
391 	vm_offset_t         offset,
392 	const vm_offset_t   new_data,
393 	vm_size_t           new_data_size)
394 {
395 	vm_offset_t sum = 0;
396 
397 	if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
398 		panic("%s: Integer addition overflow %p + %lu = %lu",
399 		    __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
400 	}
401 
402 	pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
403 }
404 
405 void
pmap_ro_zone_memcpy(zone_id_t zid,vm_offset_t va,vm_offset_t offset,const vm_offset_t new_data,vm_size_t new_data_size)406 pmap_ro_zone_memcpy(
407 	zone_id_t             zid,
408 	vm_offset_t           va,
409 	vm_offset_t           offset,
410 	const vm_offset_t     new_data,
411 	vm_size_t             new_data_size)
412 {
413 	const pmap_paddr_t pa = kvtophys(va + offset);
414 
415 	if (!new_data || new_data_size == 0) {
416 		return;
417 	}
418 
419 	pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
420 	/* Write through Physical Aperture */
421 	memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
422 }
423 
424 uint64_t
pmap_ro_zone_atomic_op(zone_id_t zid,vm_offset_t va,vm_offset_t offset,zro_atomic_op_t op,uint64_t value)425 pmap_ro_zone_atomic_op(
426 	zone_id_t             zid,
427 	vm_offset_t           va,
428 	vm_offset_t           offset,
429 	zro_atomic_op_t       op,
430 	uint64_t              value)
431 {
432 	const pmap_paddr_t pa = kvtophys(va + offset);
433 	vm_size_t value_size = op & 0xf;
434 
435 	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
436 	/* Write through Physical Aperture */
437 	return __zalloc_ro_mut_atomic(phystokv(pa), op, value);
438 }
439 
440 void
pmap_ro_zone_bzero(zone_id_t zid,vm_offset_t va,vm_offset_t offset,vm_size_t size)441 pmap_ro_zone_bzero(
442 	zone_id_t         zid,
443 	vm_offset_t       va,
444 	vm_offset_t       offset,
445 	vm_size_t         size)
446 {
447 	const pmap_paddr_t pa = kvtophys(va + offset);
448 	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
449 	bzero((void*)phystokv(pa), size);
450 }
451 
452 static uint32_t
pmap_scale_shift(void)453 pmap_scale_shift(void)
454 {
455 	uint32_t scale = 0;
456 
457 	if (sane_size <= 8 * GB) {
458 		scale = (uint32_t)(sane_size / (2 * GB));
459 	} else if (sane_size <= 32 * GB) {
460 		scale = 4 + (uint32_t)((sane_size - (8 * GB)) / (4 * GB));
461 	} else {
462 		scale = 10 + (uint32_t)MIN(4, ((sane_size - (32 * GB)) / (8 * GB)));
463 	}
464 	return scale;
465 }
466 
467 LCK_GRP_DECLARE(pmap_lck_grp, "pmap");
468 LCK_ATTR_DECLARE(pmap_lck_rw_attr, 0, LCK_ATTR_DEBUG);
469 
470 /*
471  *	Bootstrap the system enough to run with virtual memory.
472  *	Map the kernel's code and data, and allocate the system page table.
473  *	Called with mapping OFF.  Page_size must already be set.
474  */
475 
void
pmap_bootstrap(
	__unused vm_offset_t    load_start,
	__unused boolean_t      IA32e)
{
	/* x86_64 always runs in long mode. */
	assert(IA32e);

	vm_last_addr = VM_MAX_KERNEL_ADDRESS;   /* Set the highest address
	                                         * known to VM */
	/*
	 *	The kernel's pmap is statically allocated so we don't
	 *	have to use pmap_create, which is unlikely to work
	 *	correctly at this part of the boot sequence.
	 */

	os_ref_init(&kernel_pmap->ref_count, NULL);
#if DEVELOPMENT || DEBUG
	kernel_pmap->nx_enabled = TRUE;
#endif
	kernel_pmap->pm_task_map = TASK_MAP_64BIT;
	kernel_pmap->pm_obj = (vm_object_t) NULL;
	/* Kernel and user top-level tables both start as the boot IdlePML4. */
	kernel_pmap->pm_pml4 = IdlePML4;
	kernel_pmap->pm_upml4 = IdlePML4;
	kernel_pmap->pm_cr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
	kernel_pmap->pm_ucr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
	kernel_pmap->pm_eptp = 0;

	pmap_pcid_initialize_kernel(kernel_pmap);

	/* Seed the boot CPU's cr3 (and its trampoline shadow copy). */
	current_cpu_datap()->cpu_kernel_cr3 = cpu_shadowp(cpu_number())->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;

	/* Account for the NKPT page-table pages set up by early boot. */
	nkpt = NKPT;
	OSAddAtomic(NKPT, &inuse_ptepages_count);
	OSAddAtomic64(NKPT, &alloc_ptepages_count);
	bootstrap_wired_pages = NKPT;

	virtual_avail = (vm_offset_t)(VM_MIN_KERNEL_ADDRESS) + (vm_offset_t)first_avail;
	virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);

	/* Size the pv hash from memory size unless overridden by boot-arg. */
	if (!PE_parse_boot_argn("npvhash", &npvhashmask, sizeof(npvhashmask))) {
		npvhashmask = ((NPVHASHBUCKETS) << pmap_scale_shift()) - 1;
	}

	npvhashbuckets = npvhashmask + 1;

	/* The mask must be (2^N)-1 so the bucket count is a power of two. */
	if (0 != ((npvhashbuckets) & npvhashmask)) {
		panic("invalid hash %d, must be ((2^N)-1), "
		    "using default %d\n", npvhashmask, NPVHASHMASK);
	}

	lck_rw_init(&kernel_pmap->pmap_rwl, &pmap_lck_grp, &pmap_lck_rw_attr);
	kernel_pmap->pmap_rwl.lck_rw_can_sleep = FALSE;

	pmap_cpu_init();

	if (pmap_pcid_ncpus) {
		printf("PMAP: PCID enabled\n");
	}

	if (pmap_smep_enabled) {
		printf("PMAP: Supervisor Mode Execute Protection enabled\n");
	}
	if (pmap_smap_enabled) {
		printf("PMAP: Supervisor Mode Access Protection enabled\n");
	}

#if     DEBUG
	printf("Stack canary: 0x%lx\n", __stack_chk_guard[0]);
	printf("early_random(): 0x%qx\n", early_random());
#endif
#if     DEVELOPMENT || DEBUG
	boolean_t ptmp;
	/* Check if the user has requested disabling stack or heap no-execute
	 * enforcement. These are "const" variables; that qualifier is cast away
	 * when altering them. The TEXT/DATA const sections are marked
	 * write protected later in the kernel startup sequence, so altering
	 * them is possible at this point, in pmap_bootstrap().
	 */
	if (PE_parse_boot_argn("-pmap_disable_kheap_nx", &ptmp, sizeof(ptmp))) {
		boolean_t *pdknxp = (boolean_t *) &pmap_disable_kheap_nx;
		*pdknxp = TRUE;
	}

	if (PE_parse_boot_argn("-pmap_disable_kstack_nx", &ptmp, sizeof(ptmp))) {
		boolean_t *pdknhp = (boolean_t *) &pmap_disable_kstack_nx;
		*pdknhp = TRUE;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* 32-bit EFI firmware constrains the usable kernel virtual range. */
	boot_args *args = (boot_args *)PE_state.bootArgs;
	if (args->efiMode == kBootArgsEfiMode32) {
		printf("EFI32: kernel virtual space limited to 4GB\n");
		virtual_end = VM_MAX_KERNEL_ADDRESS_EFI32;
	}
	kprintf("Kernel virtual space from 0x%lx to 0x%lx.\n",
	    (long)KERNEL_BASE, (long)virtual_end);
	kprintf("Available physical space from 0x%llx to 0x%llx\n",
	    avail_start, avail_end);

	/*
	 * The -no_shared_cr3 boot-arg is a debugging feature (set by default
	 * in the DEBUG kernel) to force the kernel to switch to its own map
	 * (and cr3) when control is in kernelspace. The kernel's map does not
	 * include (i.e. share) userspace so wild references will cause
	 * a panic. Only copyin and copyout are exempt from this.
	 */
	(void) PE_parse_boot_argn("-no_shared_cr3",
	    &no_shared_cr3, sizeof(no_shared_cr3));
	if (no_shared_cr3) {
		kprintf("Kernel not sharing user map\n");
	}

#ifdef  PMAP_TRACES
	if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof(pmap_trace))) {
		kprintf("Kernel traces for pmap operations enabled\n");
	}
#endif  /* PMAP_TRACES */

#if MACH_ASSERT
	PE_parse_boot_argn("pmap_asserts", &pmap_asserts_enabled, sizeof(pmap_asserts_enabled));
	PE_parse_boot_argn("pmap_stats_assert",
	    &pmap_stats_assert,
	    sizeof(pmap_stats_assert));
#endif /* MACH_ASSERT */
}
601 
602 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)603 pmap_virtual_space(
604 	vm_offset_t *startp,
605 	vm_offset_t *endp)
606 {
607 	*startp = virtual_avail;
608 	*endp = virtual_end;
609 }
610 
611 
612 
613 
614 #if HIBERNATION
615 
616 #include <IOKit/IOHibernatePrivate.h>
617 #include <machine/pal_hibernate.h>
618 
619 int32_t         pmap_npages;
620 int32_t         pmap_teardown_last_valid_compact_indx = -1;
621 
622 void    pmap_pack_index(uint32_t);
623 int32_t pmap_unpack_index(pv_rooted_entry_t);
624 
625 int32_t
pmap_unpack_index(pv_rooted_entry_t pv_h)626 pmap_unpack_index(pv_rooted_entry_t pv_h)
627 {
628 	int32_t indx = 0;
629 
630 	indx = (int32_t)(*((uint64_t *)(&pv_h->qlink.next)) >> 48);
631 	indx = indx << 16;
632 	indx |= (int32_t)(*((uint64_t *)(&pv_h->qlink.prev)) >> 48);
633 
634 	*((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)0xffff << 48);
635 	*((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)0xffff << 48);
636 
637 	return indx;
638 }
639 
640 
641 void
pmap_pack_index(uint32_t indx)642 pmap_pack_index(uint32_t indx)
643 {
644 	pv_rooted_entry_t       pv_h;
645 
646 	pv_h = &pv_head_table[indx];
647 
648 	*((uint64_t *)(&pv_h->qlink.next)) &= ~((uint64_t)0xffff << 48);
649 	*((uint64_t *)(&pv_h->qlink.prev)) &= ~((uint64_t)0xffff << 48);
650 
651 	*((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)(indx >> 16)) << 48;
652 	*((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)(indx & 0xffff)) << 48;
653 }
654 
655 
/*
 * Hibernation teardown: compact pv_head_table in place so that only
 * in-use entries (pmap != PMAP_NULL) remain, packed at the front.  Each
 * moved entry has its original index packed into its qlink pointers (see
 * pmap_pack_index) so pal_hib_rebuild_pmap_structs() can undo the move
 * after resume.  Returns, via the out-parameters, the address range of
 * the table that no longer holds live data and need not be saved in the
 * hibernation image.
 */
void
pal_hib_teardown_pmap_structs(addr64_t *unneeded_start, addr64_t *unneeded_end)
{
	int32_t         i;
	int32_t         compact_target_indx;

	compact_target_indx = 0;

	for (i = 0; i < pmap_npages; i++) {
		if (pv_head_table[i].pmap == PMAP_NULL) {
			/*
			 * Unused slot: remember the first free slot as the
			 * next compaction destination.
			 */
			if (pv_head_table[compact_target_indx].pmap != PMAP_NULL) {
				compact_target_indx = i;
			}
		} else {
			/* Record the original index inside the entry itself. */
			pmap_pack_index((uint32_t)i);

			if (pv_head_table[compact_target_indx].pmap == PMAP_NULL) {
				/*
				 * we've got a hole to fill, so
				 * move this pv_rooted_entry_t to it's new home
				 */
				pv_head_table[compact_target_indx] = pv_head_table[i];
				pv_head_table[i].pmap = PMAP_NULL;

				pmap_teardown_last_valid_compact_indx = compact_target_indx;
				compact_target_indx++;
			} else {
				/* Already densely packed up to here. */
				pmap_teardown_last_valid_compact_indx = i;
			}
		}
	}
	/* Everything past the last compacted entry is dead weight. */
	*unneeded_start = (addr64_t)&pv_head_table[pmap_teardown_last_valid_compact_indx + 1];
	*unneeded_end = (addr64_t)&pv_head_table[pmap_npages - 1];

	HIBLOG("pal_hib_teardown_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
}
692 
693 
/*
 * Hibernation resume: undo pal_hib_teardown_pmap_structs().  Walks the
 * compacted entries from the back, moves each one to the original index
 * recovered by pmap_unpack_index(), and zero-fills the gaps between
 * restored entries (those slots were unused at teardown time).  Iterating
 * back-to-front guarantees a destination slot is never overwritten before
 * its own occupant has been moved out.
 */
void
pal_hib_rebuild_pmap_structs(void)
{
	int32_t                 cindx, eindx, rindx = 0;
	pv_rooted_entry_t       pv_h;

	/* eindx tracks the start of the previously-restored region. */
	eindx = (int32_t)pmap_npages;

	for (cindx = pmap_teardown_last_valid_compact_indx; cindx >= 0; cindx--) {
		pv_h = &pv_head_table[cindx];

		/* Recover this entry's pre-teardown index. */
		rindx = pmap_unpack_index(pv_h);
		assert(rindx < pmap_npages);

		if (rindx != cindx) {
			/*
			 * this pv_rooted_entry_t was moved by pal_hib_teardown_pmap_structs,
			 * so move it back to its real location
			 */
			pv_head_table[rindx] = pv_head_table[cindx];
		}
		if (rindx + 1 != eindx) {
			/*
			 * the 'hole' between this vm_rooted_entry_t and the previous
			 * vm_rooted_entry_t we moved needs to be initialized as
			 * a range of zero'd vm_rooted_entry_t's
			 */
			bzero((char *)&pv_head_table[rindx + 1], (eindx - rindx - 1) * sizeof(struct pv_rooted_entry));
		}
		eindx = rindx;
	}
	/* Zero any leading slots below the first restored entry. */
	if (rindx) {
		bzero((char *)&pv_head_table[0], rindx * sizeof(struct pv_rooted_entry));
	}

	HIBLOG("pal_hib_rebuild_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
}
731 
732 #endif
733 
734 /*
735  * Create pv entries for kernel pages mapped by early startup code.
736  * These have to exist so we can ml_static_mfree() them later.
737  */
/*
 * Create pv entries for kernel pages mapped by early startup code.
 * These have to exist so we can ml_static_mfree() them later.
 *
 * Walks [start_va, end_va) (page-rounded inward) and, for every mapped,
 * managed page, initializes its pv_head_table root entry to point at the
 * kernel_pmap mapping.  Steps by 2MB when the VA falls inside a large
 * page so a single pv entry covers the whole large mapping.
 */
static void
pmap_pv_fixup(vm_offset_t start_va, vm_offset_t end_va)
{
	ppnum_t           ppn;
	pv_rooted_entry_t pv_h;
	uint32_t          pgsz;

	start_va = round_page(start_va);
	end_va = trunc_page(end_va);
	while (start_va < end_va) {
		pgsz = PAGE_SIZE;
		ppn = pmap_find_phys(kernel_pmap, start_va);
		if (ppn != 0 && IS_MANAGED_PAGE(ppn)) {
			pv_h = pai_to_pvh(ppn);
			assert(pv_h->qlink.next == 0);           /* shouldn't be init'd yet */
			assert(pv_h->pmap == 0);
			pv_h->va_and_flags = start_va;
			pv_h->pmap = kernel_pmap;
			queue_init(&pv_h->qlink);
			/*
			 * Note that pmap_query_pagesize does not enforce start_va is aligned
			 * on a 2M boundary if it's within a large page
			 */
			if (pmap_query_pagesize(kernel_pmap, start_va) == I386_LPGBYTES) {
				pgsz = I386_LPGBYTES;
			}
		}
		/* Advance; guard against VA wrap at the top of the address space. */
		if (os_add_overflow(start_va, pgsz, &start_va)) {
#if DEVELOPMENT || DEBUG
			panic("pmap_pv_fixup: Unexpected address wrap (0x%lx after adding 0x%x)", start_va, pgsz);
#else
			start_va = end_va;
#endif
		}
	}
}
774 
775 /*
776  *	Initialize the pmap module.
777  *	Called by vm_init, to initialize any structures that the pmap
778  *	system needs to map virtual memory.
779  */
/*
 * Initialize the pmap module proper (called by vm_init once the VM
 * system can allocate memory): set up the kernel pmap's page-table
 * backing objects, carve one kmem allocation into the pv_head_table,
 * pv hash table, lock tables and physical-attribute array, mark managed
 * and no-encrypt pages, create the pmap-related zones, and pre-create
 * pv entries for early-boot mappings that may later be freed.
 */
void
pmap_init(void)
{
	long                    npages;
	vm_offset_t             addr;
	vm_size_t               s, vsize;
	vm_map_offset_t         vaddr;
	ppnum_t ppn;


	/* Backing VM objects for the kernel's PML4/PDPT/PDE table pages. */
	kernel_pmap->pm_obj_pml4 = &kpml4obj_object_store;
	_vm_object_allocate((vm_object_size_t)NPML4PGS * PAGE_SIZE, &kpml4obj_object_store);

	kernel_pmap->pm_obj_pdpt = &kpdptobj_object_store;
	_vm_object_allocate((vm_object_size_t)NPDPTPGS * PAGE_SIZE, &kpdptobj_object_store);

	kernel_pmap->pm_obj = &kptobj_object_store;
	_vm_object_allocate((vm_object_size_t)NPDEPGS * PAGE_SIZE, &kptobj_object_store);

	/*
	 *	Allocate memory for the pv_head_table and its lock bits,
	 *	the modify bit array, and the pte_page table.
	 */

	/*
	 * zero bias all these arrays now instead of off avail_start
	 * so we cover all memory
	 */

	npages = i386_btop(avail_end);
#if HIBERNATION
	pmap_npages = (uint32_t)npages;
#endif
	/*
	 * Total size of all per-page/per-bucket arrays; the trailing
	 * '+ npages' is one attribute byte per physical page.
	 * NOTE(review): 'sizeof(struct pv_hashed_entry_t *)' looks like it
	 * was meant to be 'sizeof(pv_hashed_entry_t)' — both are pointer
	 * sized, so the computed size is the same; confirm intent.
	 */
	s = (vm_size_t) (sizeof(struct pv_rooted_entry) * npages
	    + (sizeof(struct pv_hashed_entry_t *) * (npvhashbuckets))
	    + pv_lock_table_size(npages)
	    + pv_hash_lock_table_size((npvhashbuckets))
	    + npages);
	s = round_page(s);

	/* KMA_NOFAIL: this allocation must succeed or boot cannot proceed. */
	kmem_alloc(kernel_map, &addr, s,
	    KMA_NOFAIL | KMA_ZERO | KMA_KOBJECT | KMA_PERMANENT,
	    VM_KERN_MEMORY_PMAP);

	vaddr = addr;
	vsize = s;

#if PV_DEBUG
	if (0 == npvhashmask) {
		panic("npvhashmask not initialized");
	}
#endif

	/*
	 *	Allocate the structures first to preserve word-alignment.
	 */
	pv_head_table = (pv_rooted_entry_t) addr;
	addr = (vm_offset_t) (pv_head_table + npages);

	pv_hash_table = (pv_hashed_entry_t *)addr;
	addr = (vm_offset_t) (pv_hash_table + (npvhashbuckets));

	pv_lock_table = (char *) addr;
	addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));

	pv_hash_lock_table = (char *) addr;
	addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhashbuckets)));

	pmap_phys_attributes = (char *) addr;

	/*
	 * Mark every page of conventional memory as managed, and flag the
	 * boot-time "used" ranges as non-encryptable.
	 */
	ppnum_t  last_pn = i386_btop(avail_end);
	unsigned int i;
	pmap_memory_region_t *pmptr = pmap_memory_regions;
	for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
		if (pmptr->type != kEfiConventionalMemory) {
			continue;
		}
		ppnum_t pn;
		for (pn = pmptr->base; pn <= pmptr->end; pn++) {
			if (pn < last_pn) {
				pmap_phys_attributes[pn] |= PHYS_MANAGED;

				if (pn > last_managed_page) {
					last_managed_page = pn;
				}

				if ((pmap_high_used_bottom <= pn && pn <= pmap_high_used_top) ||
				    (pmap_middle_used_bottom <= pn && pn <= pmap_middle_used_top)) {
					pmap_phys_attributes[pn] |= PHYS_NOENCRYPT;
				}
			}
		}
	}
	/* The pages backing these tables must themselves not be encrypted. */
	while (vsize) {
		ppn = pmap_find_phys(kernel_pmap, vaddr);

		pmap_phys_attributes[ppn] |= PHYS_NOENCRYPT;

		vaddr += PAGE_SIZE;
		vsize -= PAGE_SIZE;
	}
	/*
	 *	Create the zone of physical maps,
	 *	and of the physical-to-virtual entries.
	 */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);

	/* The anchor is required to be page aligned. Zone debugging adds
	 * padding which may violate that requirement. Tell the zone
	 * subsystem that alignment is required.
	 */
	pmap_anchor_zone = zone_create("pagetable anchors", PAGE_SIZE,
	    ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);

/* TODO: possible general optimisation...pre-allocate via zones commonly created
 * level3/2 pagetables
 */
	/* The anchor is required to be page aligned. Zone debugging adds
	 * padding which may violate that requirement. Tell the zone
	 * subsystem that alignment is required.
	 */
	pmap_uanchor_zone = zone_create("pagetable user anchors", PAGE_SIZE,
	    ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);

	pv_hashed_list_zone = zone_create("pv_list", sizeof(struct pv_hashed_entry),
	    ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);

	/*
	 * Create pv entries for kernel pages that might get pmap_remove()ed.
	 *
	 * - very low pages that were identity mapped.
	 * - vm_pages[] entries that might be unused and reclaimed.
	 */
	assert((uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start <= (uintptr_t)vm_page_array_beginning_addr);
	pmap_pv_fixup((uintptr_t)VM_MIN_KERNEL_ADDRESS, (uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start);
	pmap_pv_fixup((uintptr_t)vm_page_array_beginning_addr, (uintptr_t)vm_page_array_ending_addr);

	pmap_initialized = TRUE;

	max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);

	/*
	 * Ensure the kernel's PML4 entry exists for the basement
	 * before this is shared with any user.
	 */
	pmap_expand_pml4(kernel_pmap, KERNEL_BASEMENT, PMAP_EXPAND_OPTIONS_NONE);

#if CONFIG_VMX
	/* EPT accessed/dirty flag support depends on VMX capability MSRs. */
	pmap_ept_support_ad = vmx_hv_support() && (VMX_CAP(MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_VMX_EPT_VPID_CAP_AD_SHIFT, 1) ? TRUE : FALSE);
	pmap_eptp_flags = HV_VMX_EPTP_MEMORY_TYPE_WB | HV_VMX_EPTP_WALK_LENGTH(4) | (pmap_ept_support_ad ? HV_VMX_EPTP_ENABLE_AD_FLAGS : 0);
#endif /* CONFIG_VMX */
}
933 
/*
 * pmap_mark_range: apply NX and/or read-only permissions to all existing
 * mappings covering [sv, sv + nxrosz) in pmap "npmap".
 *
 *	npmap:	target pmap; must NOT be an EPT pmap (asserted below).
 *	sv:	starting virtual address; must be 4K-aligned (asserted).
 *	nxrosz:	size of the range in bytes; must be 4K-aligned (asserted).
 *	NX:	TRUE to set the no-execute bit, FALSE to clear it.
 *	ro:	TRUE to clear the write bit (read-only), FALSE to set it.
 *
 * Handles both 2M superpages (PDEs with INTEL_PTE_PS set) and 4K PTEs.
 * Entries are modified in place; any required TLB invalidation is the
 * caller's responsibility.
 */
void
pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolean_t ro)
{
	uint64_t ev, cv = sv;
	pd_entry_t *pdep;
	pt_entry_t *ptep = NULL;

	/* Compute the exclusive end address, rejecting address wraparound. */
	if (os_add_overflow(sv, nxrosz, &ev)) {
		panic("pmap_mark_range: Unexpected address overflow: start=0x%llx size=0x%llx", sv, nxrosz);
	}

	/* XXX what if nxrosz is 0?  we end up marking the page whose address is passed in via sv -- is that kosher? */
	assert(!is_ept_pmap(npmap));

	assert(((sv & 0xFFFULL) | (nxrosz & 0xFFFULL)) == 0);

	for (pdep = pmap_pde(npmap, cv); pdep != NULL && (cv < ev);) {
		/* pdev: base virtual address of the 2M region mapped by this PDE. */
		uint64_t pdev = (cv & ~((uint64_t)PDEMASK));

		if (*pdep & INTEL_PTE_PS) {
			/* 2M superpage: adjust permissions directly on the PDE. */
#ifdef REMAP_DEBUG
			/* Warn when the requested permissions differ from the current ones. */
			if ((NX ^ !!(*pdep & INTEL_PTE_NX)) || (ro ^ !!!(*pdep & INTEL_PTE_WRITE))) {
				kprintf("WARNING: Remapping PDE for %p from %s%s%s to %s%s%s\n", (void *)cv,
				    (*pdep & INTEL_PTE_VALID) ? "R" : "",
				    (*pdep & INTEL_PTE_WRITE) ? "W" : "",
				    (*pdep & INTEL_PTE_NX) ? "" : "X",
				    "R",
				    ro ? "" : "W",
				    NX ? "" : "X");
			}
#endif

			if (NX) {
				*pdep |= INTEL_PTE_NX;
			} else {
				*pdep &= ~INTEL_PTE_NX;
			}
			if (ro) {
				*pdep &= ~INTEL_PTE_WRITE;
			} else {
				*pdep |= INTEL_PTE_WRITE;
			}

			/* Advance to the next 2M-aligned address; clamp to ev on overflow. */
			if (os_add_overflow(cv, NBPD, &cv)) {
				cv = ev;
			} else {
				cv &= ~((uint64_t) PDEMASK);
				pdep = pmap_pde(npmap, cv);
			}
			continue;
		}

		/* 4K mappings: walk each PTE within this PDE's 2M span. */
		for (ptep = pmap_pte(npmap, cv); ptep != NULL && (cv < (pdev + NBPD)) && (cv < ev);) {
#ifdef REMAP_DEBUG
			if ((NX ^ !!(*ptep & INTEL_PTE_NX)) || (ro ^ !!!(*ptep & INTEL_PTE_WRITE))) {
				kprintf("WARNING: Remapping PTE for %p from %s%s%s to %s%s%s\n", (void *)cv,
				    (*ptep & INTEL_PTE_VALID) ? "R" : "",
				    (*ptep & INTEL_PTE_WRITE) ? "W" : "",
				    (*ptep & INTEL_PTE_NX) ? "" : "X",
				    "R",
				    ro ? "" : "W",
				    NX ? "" : "X");
			}
#endif
			if (NX) {
				*ptep |= INTEL_PTE_NX;
			} else {
				*ptep &= ~INTEL_PTE_NX;
			}
			if (ro) {
				*ptep &= ~INTEL_PTE_WRITE;
			} else {
				*ptep |= INTEL_PTE_WRITE;
			}
			cv += NBPT;
			ptep = pmap_pte(npmap, cv);
		}
	}
	DPRINTF("%s(0x%llx, 0x%llx, %u, %u): 0x%llx, 0x%llx\n", __FUNCTION__, sv, nxrosz, NX, ro, cv, ptep ? *ptep: 0);
}
1014 
1015 /*
1016  * Reclaim memory for early boot 4K page tables that were converted to large page mappings.
1017  * We know this memory is part of the KPTphys[] array that was allocated in Idle_PTs_init(),
1018  * so we can free it using its address in that array.
1019  */
1020 static void
pmap_free_early_PT(ppnum_t ppn,uint32_t cnt)1021 pmap_free_early_PT(ppnum_t ppn, uint32_t cnt)
1022 {
1023 	ppnum_t KPTphys_ppn;
1024 	vm_offset_t offset;
1025 
1026 	KPTphys_ppn = pmap_find_phys(kernel_pmap, (uintptr_t)KPTphys);
1027 	assert(ppn >= KPTphys_ppn);
1028 	assert(ppn + cnt <= KPTphys_ppn + NKPT);
1029 	offset = (ppn - KPTphys_ppn) << PAGE_SHIFT;
1030 	ml_static_mfree((uintptr_t)KPTphys + offset, PAGE_SIZE * cnt);
1031 }
1032 
1033 /*
1034  * Called once VM is fully initialized so that we can release unused
1035  * sections of low memory to the general pool.
1036  * Also complete the set-up of identity-mapped sections of the kernel:
1037  *  1) write-protect kernel text
1038  *  2) map kernel text using large pages if possible
1039  *  3) read and write-protect page zero (for K32)
1040  *  4) map the global page at the appropriate virtual address.
1041  *
1042  * Use of large pages
1043  * ------------------
1044  * To effectively map and write-protect all kernel text pages, the text
1045  * must be 2M-aligned at the base, and the data section above must also be
1046  * 2M-aligned. That is, there's padding below and above. This is achieved
1047  * through linker directives. Large pages are used only if this alignment
 * exists (and not overridden by the -kernel_text_ps_4K boot-arg). The
1049  * memory layout is:
1050  *
1051  *                       :                :
1052  *                       |     __DATA     |
1053  *               sdata:  ==================  2Meg
1054  *                       |                |
1055  *                       |  zero-padding  |
1056  *                       |                |
1057  *               etext:  ------------------
1058  *                       |                |
1059  *                       :                :
1060  *                       |                |
1061  *                       |     __TEXT     |
1062  *                       |                |
1063  *                       :                :
1064  *                       |                |
1065  *               stext:  ==================  2Meg
1066  *                       |                |
1067  *                       |  zero-padding  |
1068  *                       |                |
1069  *               eHIB:   ------------------
1070  *                       |     __HIB      |
1071  *                       :                :
1072  *
1073  * Prior to changing the mapping from 4K to 2M, the zero-padding pages
1074  * [eHIB,stext] and [etext,sdata] are ml_static_mfree()'d. Then all the
1075  * 4K pages covering [stext,etext] are coalesced as 2M large pages.
1076  * The now unused level-1 PTE pages are also freed.
1077  */
1078 extern ppnum_t  vm_kernel_base_page;
1079 static uint32_t dataptes = 0;
1080 
void
pmap_lowmem_finalize(void)
{
	spl_t           spl;
	int             i;

	/*
	 * Update wired memory statistics for early boot pages
	 */
	PMAP_ZINFO_PALLOC(kernel_pmap, bootstrap_wired_pages * PAGE_SIZE);

	/*
	 * Free pages in pmap regions below the base:
	 * rdar://6332712
	 *	We can't free all the pages to VM that EFI reports available.
	 *	Pages in the range 0xc0000-0xff000 aren't safe over sleep/wake.
	 *	There's also a size miscalculation here: pend is one page less
	 *	than it should be but this is not fixed to be backwards
	 *	compatible.
	 * This is important for KASLR because up to 256*2MB = 512MB of space
	 * has to be released to VM.
	 */
	for (i = 0;
	    pmap_memory_regions[i].end < vm_kernel_base_page;
	    i++) {
		vm_offset_t     pbase = i386_ptob(pmap_memory_regions[i].base);
		vm_offset_t     pend  = i386_ptob(pmap_memory_regions[i].end + 1);

		DBG("pmap region %d [%p..[%p\n",
		    i, (void *) pbase, (void *) pend);

		if (pmap_memory_regions[i].attribute & EFI_MEMORY_KERN_RESERVED) {
			continue;
		}
		/*
		 * rdar://6332712
		 * Adjust limits not to free pages in range 0xc0000-0xff000.
		 */
		if (pbase >= 0xc0000 && pend <= 0x100000) {
			continue;
		}
		if (pbase < 0xc0000 && pend > 0x100000) {
			/* page range entirely within region, free lower part */
			DBG("- ml_static_mfree(%p,%p)\n",
			    (void *) ml_static_ptovirt(pbase),
			    (void *) (0xc0000 - pbase));
			ml_static_mfree(ml_static_ptovirt(pbase), 0xc0000 - pbase);
			pbase = 0x100000;
		}
		if (pbase < 0xc0000) {
			pend = MIN(pend, 0xc0000);
		}
		if (pend > 0x100000) {
			pbase = MAX(pbase, 0x100000);
		}
		DBG("- ml_static_mfree(%p,%p)\n",
		    (void *) ml_static_ptovirt(pbase),
		    (void *) (pend - pbase));
		ml_static_mfree(ml_static_ptovirt(pbase), pend - pbase);
	}

	/* A final pass to get rid of all initial identity mappings to
	 * low pages.
	 */
	DPRINTF("%s: Removing mappings from 0->0x%lx\n", __FUNCTION__, vm_kernel_base);

	/*
	 * Remove all mappings past the boot-cpu descriptor aliases and low globals.
	 * Non-boot-cpu GDT aliases will be remapped later as needed.
	 */
	pmap_remove(kernel_pmap, LOWGLOBAL_ALIAS + PAGE_SIZE, vm_kernel_base);

	/*
	 * Release any memory for early boot 4K page table pages that got replaced
	 * with large page mappings for vm_pages[]. We know this memory is part of
	 * the KPTphys[] array that was allocated in Idle_PTs_init(), so we can free
	 * it using that address.
	 */
	pmap_free_early_PT(released_PT_ppn, released_PT_cnt);

	/*
	 * If text and data are both 2MB-aligned,
	 * we can map text with large-pages,
	 * unless the -kernel_text_ps_4K boot-arg overrides.
	 */
	if ((stext & I386_LPGMASK) == 0 && (sdata & I386_LPGMASK) == 0) {
		kprintf("Kernel text is 2MB aligned");
		kernel_text_ps_4K = FALSE;
		if (PE_parse_boot_argn("-kernel_text_ps_4K",
		    &kernel_text_ps_4K,
		    sizeof(kernel_text_ps_4K))) {
			kprintf(" but will be mapped with 4K pages\n");
		} else {
			kprintf(" and will be mapped with 2M pages\n");
		}
	}
#if     DEVELOPMENT || DEBUG
	(void) PE_parse_boot_argn("wpkernel", &wpkernel, sizeof(wpkernel));
#endif
	if (wpkernel) {
		kprintf("Kernel text %p-%p to be write-protected\n",
		    (void *) stext, (void *) etext);
	}

	spl = splhigh();

	/*
	 * Scan over text if mappings are to be changed:
	 * - Remap kernel text readonly unless the "wpkernel" boot-arg is 0
	 * - Change to large-pages if possible and not overridden.
	 */
	if (kernel_text_ps_4K && wpkernel) {
		vm_offset_t     myva;
		/* Clear the write bit on every 4K text PTE. */
		for (myva = stext; myva < etext; myva += PAGE_SIZE) {
			pt_entry_t     *ptep;

			ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
			if (ptep) {
				pmap_store_pte(FALSE, ptep, *ptep & ~INTEL_PTE_WRITE);
			}
		}
	}

	if (!kernel_text_ps_4K) {
		vm_offset_t     myva;

		/*
		 * Release zero-filled page padding used for 2M-alignment.
		 */
		DBG("ml_static_mfree(%p,%p) for padding below text\n",
		    (void *) eHIB, (void *) (stext - eHIB));
		ml_static_mfree(eHIB, stext - eHIB);
		DBG("ml_static_mfree(%p,%p) for padding above text\n",
		    (void *) etext, (void *) (sdata - etext));
		ml_static_mfree(etext, sdata - etext);

		/*
		 * Coalesce text pages into large pages.
		 */
		for (myva = stext; myva < sdata; myva += I386_LPGBYTES) {
			pt_entry_t      *ptep;
			vm_offset_t     pte_phys;
			pt_entry_t      *pdep;
			pt_entry_t      pde;
			ppnum_t         KPT_ppn;

			pdep = pmap_pde(kernel_pmap, (vm_map_offset_t)myva);
			/* KPT_ppn: the level-1 page table page this PDE points at,
			 * freed below once the PDE becomes a 2M leaf. */
			KPT_ppn = (ppnum_t)((*pdep & PG_FRAME) >> PAGE_SHIFT);
			ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
			DBG("myva: %p pdep: %p ptep: %p\n",
			    (void *) myva, (void *) pdep, (void *) ptep);
			if ((*ptep & INTEL_PTE_VALID) == 0) {
				continue;
			}
			pte_phys = (vm_offset_t)(*ptep & PG_FRAME);
			pde = *pdep & PTMASK;   /* page attributes from pde */
			pde |= INTEL_PTE_PS;    /* make it a 2M entry */
			pde |= pte_phys;        /* take page frame from pte */

			if (wpkernel) {
				pde &= ~INTEL_PTE_WRITE;
			}
			DBG("pmap_store_pte(%p,0x%llx)\n",
			    (void *)pdep, pde);
			pmap_store_pte(FALSE, pdep, pde);

			/*
			 * Free the now-unused level-1 pte.
			 */
			pmap_free_early_PT(KPT_ppn, 1);
		}

		/* Change variable read by sysctl machdep.pmap */
		pmap_kernel_text_ps = I386_LPGBYTES;
	}

	vm_offset_t dva;

	/* Mark all of __DATA no-execute. */
	for (dva = sdata; dva < edata; dva += I386_PGBYTES) {
		assert(((sdata | edata) & PAGE_MASK) == 0);
		pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva);

		dpte = *dptep;
		assert((dpte & INTEL_PTE_VALID));
		dpte |= INTEL_PTE_NX;
		pmap_store_pte(FALSE, dptep, dpte);
		dataptes++;
	}
	assert(dataptes > 0);

	kernel_segment_command_t * seg;
	kernel_section_t         * sec;
	kc_format_t kc_format;

	PE_get_primary_kc_format(&kc_format);

	/* Apply per-segment protections to the remaining Mach-O segments. */
	for (seg = firstseg(); seg != NULL; seg = nextsegfromheader(&_mh_execute_header, seg)) {
		if (!strcmp(seg->segname, "__TEXT") ||
		    !strcmp(seg->segname, "__DATA")) {
			continue;
		}

		/* XXX: FIXME_IN_dyld: This is a workaround (see below) */
		if (kc_format != KCFormatFileset) {
			//XXX
			if (!strcmp(seg->segname, "__KLD")) {
				continue;
			}
		}

		if (!strcmp(seg->segname, "__HIB")) {
			/* __HIB: only its __text section stays executable (read-only). */
			for (sec = firstsect(seg); sec != NULL; sec = nextsect(seg, sec)) {
				if (sec->addr & PAGE_MASK) {
					panic("__HIB segment's sections misaligned");
				}
				if (!strcmp(sec->sectname, "__text")) {
					pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), FALSE, TRUE);
				} else {
					pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), TRUE, FALSE);
				}
			}
		} else {
			if (kc_format == KCFormatFileset) {
#if 0
				/*
				 * This block of code is commented out because it may or may not have induced an earlier panic
				 * in ledger init.
				 */


				boolean_t NXbit = !(seg->initprot & VM_PROT_EXECUTE),
				    robit = (seg->initprot & (VM_PROT_READ | VM_PROT_WRITE)) == VM_PROT_READ;

				/*
				 * XXX: FIXME_IN_dyld: This is a workaround for primary KC containing incorrect inaccurate
				 * initprot for segments containing code.
				 */
				if (!strcmp(seg->segname, "__KLD") || !strcmp(seg->segname, "__VECTORS")) {
					NXbit = FALSE;
					robit = FALSE;
				}

				pmap_mark_range(kernel_pmap, seg->vmaddr & ~(uint64_t)PAGE_MASK,
				    round_page_64(seg->vmsize), NXbit, robit);
#endif

				/*
				 * XXX: We are marking *every* segment with rwx permissions as a workaround
				 * XXX: until the primary KC's kernel segments are page-aligned.
				 */
				kprintf("Marking (%p, %p) as rwx\n", (void *)(seg->vmaddr & ~(uint64_t)PAGE_MASK),
				    (void *)((seg->vmaddr & ~(uint64_t)PAGE_MASK) + round_page_64(seg->vmsize)));
				pmap_mark_range(kernel_pmap, seg->vmaddr & ~(uint64_t)PAGE_MASK,
				    round_page_64(seg->vmsize), FALSE, FALSE);
			} else {
				pmap_mark_range(kernel_pmap, seg->vmaddr, round_page_64(seg->vmsize), TRUE, FALSE);
			}
		}
	}

	/*
	 * If we're debugging, map the low global vector page at the fixed
	 * virtual address.  Otherwise, remove the mapping for this.
	 */
	if (debug_boot_arg) {
		pt_entry_t *pte = NULL;
		if (0 == (pte = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS))) {
			panic("lowmem pte");
		}

		/* make sure it is defined on page boundary */
		assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK));
		pmap_store_pte(FALSE, pte, kvtophys((vm_offset_t)&lowGlo)
		    | INTEL_PTE_REF
		    | INTEL_PTE_MOD
		    | INTEL_PTE_WIRED
		    | INTEL_PTE_VALID
		    | INTEL_PTE_WRITE
		    | INTEL_PTE_NX);

#if KASAN
		kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
	} else {
		pmap_remove(kernel_pmap,
		    LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE);
	}
	/* Flush all TLBs after the batch of permission changes above. */
	pmap_tlbi_range(0, ~0ULL, true, 0);
	splx(spl);
}
1371 
1372 /*
1373  *	Mark the const data segment as read-only, non-executable.
1374  */
1375 void
x86_64_protect_data_const()1376 x86_64_protect_data_const()
1377 {
1378 	boolean_t doconstro = TRUE;
1379 #if DEVELOPMENT || DEBUG
1380 	(void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro));
1381 #endif
1382 	if (doconstro) {
1383 		if (sconst & PAGE_MASK) {
1384 			panic("CONST segment misaligned 0x%lx 0x%lx",
1385 			    sconst, econst);
1386 		}
1387 		kprintf("Marking const DATA read-only\n");
1388 		pmap_protect(kernel_pmap, sconst, econst, VM_PROT_READ);
1389 	}
1390 }
/*
 * This function is only used for debugging from the VM layer.
 */
1394 bool
pmap_verify_free(ppnum_t pn)1395 pmap_verify_free(
1396 	ppnum_t pn)
1397 {
1398 	pv_rooted_entry_t       pv_h;
1399 	int             pai;
1400 	bool            result;
1401 
1402 	assert(pn != vm_page_fictitious_addr);
1403 
1404 	if (!pmap_initialized) {
1405 		return true;
1406 	}
1407 
1408 	if (pn == vm_page_guard_addr) {
1409 		return true;
1410 	}
1411 
1412 	pai = ppn_to_pai(pn);
1413 	if (!IS_MANAGED_PAGE(pai)) {
1414 		return false;
1415 	}
1416 	pv_h = pai_to_pvh(pn);
1417 	result = (pv_h->pmap == PMAP_NULL);
1418 	return result;
1419 }
1420 
#if MACH_ASSERT
/*
 * Panic with diagnostic detail when physical page "pn" is not free:
 * reports the page's physical attributes, the owning pmap (by name,
 * when one can be determined), the mapping VA, and the raw PTE.
 * Returns silently if pmap_verify_free() says the page is free.
 */
void
pmap_assert_free(ppnum_t pn)
{
	int pai;
	pv_rooted_entry_t pv_h = NULL;
	pmap_t pmap = NULL;
	vm_offset_t va = 0;
	/* Static scratch for formatting the pmap pointer; this is a
	 * panic-only path, so lack of reentrancy is acceptable. */
	static char buffer[32];
	static char *pr_name = "not managed pn";
	uint_t attr;
	pt_entry_t *ptep;
	pt_entry_t pte = -1ull;

	if (pmap_verify_free(pn)) {
		return;
	}

	/* Page outside the managed range: no attributes or pv data to report. */
	if (pn > last_managed_page) {
		attr = 0xff;
		goto done;
	}

	pai = ppn_to_pai(pn);
	attr = pmap_phys_attributes[pai];
	pv_h = pai_to_pvh(pai);
	va = pv_h->va_and_flags;
	pmap = pv_h->pmap;
	/* Choose the most descriptive name available for the owning pmap. */
	if (pmap == kernel_pmap) {
		pr_name = "kernel";
	} else if (pmap == NULL) {
		pr_name = "pmap NULL";
	} else if (pmap->pmap_procname[0] != 0) {
		pr_name = &pmap->pmap_procname[0];
	} else {
		snprintf(buffer, sizeof(buffer), "pmap %p", pv_h->pmap);
		pr_name = buffer;
	}

	/* Fetch the raw PTE for the mapping, if any, for the panic string. */
	if (pmap != NULL) {
		ptep = pmap_pte(pmap, va);
		if (ptep != NULL) {
			pte = (uintptr_t)*ptep;
		}
	}

done:
	panic("page not FREE page: 0x%lx attr: 0x%x %s va: 0x%lx PTE: 0x%llx",
	    (ulong_t)pn, attr, pr_name, va, pte);
}
#endif /* MACH_ASSERT */
1472 
1473 boolean_t
pmap_is_empty(pmap_t pmap,vm_map_offset_t va_start,vm_map_offset_t va_end)1474 pmap_is_empty(
1475 	pmap_t          pmap,
1476 	vm_map_offset_t va_start,
1477 	vm_map_offset_t va_end)
1478 {
1479 	vm_map_offset_t offset;
1480 	ppnum_t         phys_page;
1481 	ledger_amount_t phys_mem;
1482 
1483 	if (pmap == PMAP_NULL) {
1484 		return TRUE;
1485 	}
1486 
1487 	/*
1488 	 * Check the ledger's phys_mem value
1489 	 * - if it's zero, the pmap is completely empty.
1490 	 * This short-circuit test prevents a virtual address scan which is
1491 	 * painfully slow for 64-bit spaces.
1492 	 * This assumes the count is correct
1493 	 * .. the debug kernel ought to be checking perhaps by page table walk.
1494 	 */
1495 	if (pmap != kernel_pmap) {
1496 		ledger_get_balance(pmap->ledger, task_ledgers.phys_mem, &phys_mem);
1497 		if (phys_mem == 0) {
1498 			return TRUE;
1499 		}
1500 	}
1501 
1502 	for (offset = va_start;
1503 	    offset < va_end;
1504 	    offset += PAGE_SIZE_64) {
1505 		phys_page = pmap_find_phys(pmap, offset);
1506 		if (phys_page) {
1507 			kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1508 			    "page %d at 0x%llx\n",
1509 			    pmap, va_start, va_end, phys_page, offset);
1510 			return FALSE;
1511 		}
1512 	}
1513 
1514 	return TRUE;
1515 }
1516 
1517 void
hv_ept_pmap_create(void ** ept_pmap,void ** eptp)1518 hv_ept_pmap_create(void **ept_pmap, void **eptp)
1519 {
1520 	pmap_t p;
1521 
1522 	if ((ept_pmap == NULL) || (eptp == NULL)) {
1523 		return;
1524 	}
1525 
1526 	p = pmap_create_options(get_task_ledger(current_task()), 0, (PMAP_CREATE_64BIT | PMAP_CREATE_EPT));
1527 	if (p == PMAP_NULL) {
1528 		*ept_pmap = NULL;
1529 		*eptp = NULL;
1530 		return;
1531 	}
1532 
1533 	assert(is_ept_pmap(p));
1534 
1535 	*ept_pmap = (void*)p;
1536 	*eptp = (void*)(p->pm_eptp);
1537 	return;
1538 }
1539 
1540 /*
1541  * pmap_create() is used by some special, legacy 3rd party kexts.
1542  * In our kernel code, always use pmap_create_options().
1543  */
1544 extern pmap_t pmap_create(ledger_t ledger, vm_map_size_t sz, boolean_t is_64bit);
1545 
1546 __attribute__((used))
1547 pmap_t
pmap_create(ledger_t ledger,vm_map_size_t sz,boolean_t is_64bit)1548 pmap_create(
1549 	ledger_t      ledger,
1550 	vm_map_size_t sz,
1551 	boolean_t     is_64bit)
1552 {
1553 	return pmap_create_options(ledger, sz, is_64bit ? PMAP_CREATE_64BIT : 0);
1554 }
1555 
1556 /*
1557  *	Create and return a physical map.
1558  *
1559  *	If the size specified for the map
1560  *	is zero, the map is an actual physical
1561  *	map, and may be referenced by the
1562  *	hardware.
1563  *
1564  *	If the size specified is non-zero,
1565  *	the map will be used in software only, and
1566  *	is bounded by that size.
1567  */
1568 
/*
 *	ledger: ledger to charge this pmap's allocations to (referenced here).
 *	sz:     must be 0 — a hardware-usable pmap; any other size fails.
 *	flags:  PMAP_CREATE_64BIT / PMAP_CREATE_EPT (unknown flags fail).
 *	Returns the new pmap, or PMAP_NULL on bad arguments.
 */
pmap_t
pmap_create_options(
	ledger_t        ledger,
	vm_map_size_t   sz,
	unsigned int    flags)
{
	pmap_t          p;
	vm_size_t       size;
	pml4_entry_t    *pml4;
	pml4_entry_t    *kpml4;
	int             i;

	PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, sz, flags);

	size = (vm_size_t) sz;

	/*
	 *	A software use-only map doesn't even need a map.
	 */

	if (size != 0) {
		return PMAP_NULL;
	}

	/*
	 *	Return error when unrecognized flags are passed.
	 */
	if (__improbable((flags & ~(PMAP_CREATE_KNOWN_FLAGS)) != 0)) {
		return PMAP_NULL;
	}

	p = zalloc_flags(pmap_zone, Z_WAITOK | Z_ZERO);
	if (PMAP_NULL == p) {
		panic("pmap_create zalloc");
	}

	/* The pmap rw lock is taken with interrupts disabled, so it must not sleep. */
	lck_rw_init(&p->pmap_rwl, &pmap_lck_grp, &pmap_lck_rw_attr);
	p->pmap_rwl.lck_rw_can_sleep = FALSE;

	os_ref_init(&p->ref_count, NULL);
#if DEVELOPMENT || DEBUG
	p->nx_enabled = 1;
#endif
	p->pm_shared = FALSE;
	ledger_reference(ledger);
	p->ledger = ledger;

	p->pm_task_map = ((flags & PMAP_CREATE_64BIT) ? TASK_MAP_64BIT : TASK_MAP_32BIT);

	p->pagezero_accessible = FALSE;
	p->pm_vm_map_cs_enforced = FALSE;

	if (pmap_pcid_ncpus) {
		pmap_pcid_initialize(p);
	}

	/* Top-level translation tables: kernel-mode and user-mode (double map). */
	p->pm_pml4 = zalloc(pmap_anchor_zone);
	p->pm_upml4 = zalloc(pmap_uanchor_zone); //cleanup for EPT

	/* The anchor zones guarantee page alignment (required by hardware). */
	pmap_assert((((uintptr_t)p->pm_pml4) & PAGE_MASK) == 0);
	pmap_assert((((uintptr_t)p->pm_upml4) & PAGE_MASK) == 0);

	memset((char *)p->pm_pml4, 0, PAGE_SIZE);
	memset((char *)p->pm_upml4, 0, PAGE_SIZE);

	/* EPT pmaps publish an EPTP; host pmaps publish CR3/UCR3 values. */
	if (flags & PMAP_CREATE_EPT) {
		p->pm_eptp = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4) | pmap_eptp_flags;
		p->pm_cr3 = 0;
	} else {
		p->pm_eptp = 0;
		p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4);
		p->pm_ucr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_upml4);
	}

	/* allocate the vm_objs to hold the pdpt, pde and pte pages */

	p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS) *PAGE_SIZE);
	if (NULL == p->pm_obj_pml4) {
		/* NOTE(review): panic string says "pdpt" though this is the pml4 object */
		panic("pmap_create pdpt obj");
	}

	p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS) *PAGE_SIZE);
	if (NULL == p->pm_obj_pdpt) {
		panic("pmap_create pdpt obj");
	}

	p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS) *PAGE_SIZE);
	if (NULL == p->pm_obj) {
		panic("pmap_create pte obj");
	}

	if (!(flags & PMAP_CREATE_EPT)) {
		/* All host pmaps share the kernel's pml4 */
		pml4 = pmap64_pml4(p, 0ULL);
		kpml4 = kernel_pmap->pm_pml4;
		for (i = KERNEL_PML4_INDEX; i < (KERNEL_PML4_INDEX + KERNEL_PML4_COUNT); i++) {
			pml4[i] = kpml4[i];
		}
		pml4[KERNEL_KEXTS_INDEX]   = kpml4[KERNEL_KEXTS_INDEX];
		for (i = KERNEL_PHYSMAP_PML4_INDEX; i < (KERNEL_PHYSMAP_PML4_INDEX + KERNEL_PHYSMAP_PML4_COUNT); i++) {
			pml4[i] = kpml4[i];
		}
		pml4[KERNEL_DBLMAP_PML4_INDEX] = kpml4[KERNEL_DBLMAP_PML4_INDEX];
#if KASAN
		for (i = KERNEL_KASAN_PML4_FIRST; i <= KERNEL_KASAN_PML4_LAST; i++) {
			pml4[i] = kpml4[i];
		}
#endif
		/* The user-mode table shares only the double-mapped trampoline entry. */
		pml4_entry_t    *pml4u = pmap64_user_pml4(p, 0ULL);
		pml4u[KERNEL_DBLMAP_PML4_INDEX] = kpml4[KERNEL_DBLMAP_PML4_INDEX];
	}

#if MACH_ASSERT
	p->pmap_stats_assert = TRUE;
	p->pmap_pid = 0;
	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
#endif /* MACH_ASSERT */

	PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END,
	    VM_KERNEL_ADDRHIDE(p));

	return p;
}
1692 
1693 /*
1694  * We maintain stats and ledgers so that a task's physical footprint is:
1695  * phys_footprint = ((internal - alternate_accounting)
1696  *                   + (internal_compressed - alternate_accounting_compressed)
1697  *                   + iokit_mapped
1698  *                   + purgeable_nonvolatile
1699  *                   + purgeable_nonvolatile_compressed
1700  *                   + page_table)
1701  * where "alternate_accounting" includes "iokit" and "purgeable" memory.
1702  */
1703 
#if MACH_ASSERT
/* Debug kernels: verify the pmap's ledgers balance at destroy time
 * (definition elsewhere in this file). */
static void pmap_check_ledgers(pmap_t pmap);
#else /* MACH_ASSERT */
/* Release kernels: ledger checking compiles away entirely. */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
#endif /* MACH_ASSERT */
1712 
1713 /*
1714  *	Retire the given physical map from service.
1715  *	Should only be called if the map contains
1716  *	no valid mappings.
1717  */
1718 extern int vm_wired_objects_page_count;
1719 
1720 void
pmap_destroy(pmap_t p)1721 pmap_destroy(pmap_t     p)
1722 {
1723 	os_ref_count_t c;
1724 
1725 	if (p == PMAP_NULL) {
1726 		return;
1727 	}
1728 
1729 	PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
1730 	    VM_KERNEL_ADDRHIDe(p));
1731 
1732 	PMAP_LOCK_EXCLUSIVE(p);
1733 
1734 	c = os_ref_release_locked(&p->ref_count);
1735 
1736 	pmap_assert((current_thread() && (current_thread()->map)) ? (current_thread()->map->pmap != p) : TRUE);
1737 
1738 	if (c == 0) {
1739 		/*
1740 		 * If some cpu is not using the physical pmap pointer that it
1741 		 * is supposed to be (see set_dirbase), we might be using the
1742 		 * pmap that is being destroyed! Make sure we are
1743 		 * physically on the right pmap:
1744 		 */
1745 		PMAP_UPDATE_TLBS(p, 0x0ULL, 0xFFFFFFFFFFFFF000ULL);
1746 		if (pmap_pcid_ncpus) {
1747 			pmap_destroy_pcid_sync(p);
1748 		}
1749 	}
1750 
1751 	PMAP_UNLOCK_EXCLUSIVE(p);
1752 
1753 	if (c != 0) {
1754 		PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
1755 		pmap_assert(p == kernel_pmap);
1756 		return; /* still in use */
1757 	}
1758 
1759 	/*
1760 	 *	Free the memory maps, then the
1761 	 *	pmap structure.
1762 	 */
1763 	int inuse_ptepages = 0;
1764 
1765 	zfree(pmap_anchor_zone, p->pm_pml4);
1766 	zfree(pmap_uanchor_zone, p->pm_upml4);
1767 
1768 	inuse_ptepages += p->pm_obj_pml4->resident_page_count;
1769 	vm_object_deallocate(p->pm_obj_pml4);
1770 
1771 	inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
1772 	vm_object_deallocate(p->pm_obj_pdpt);
1773 
1774 	inuse_ptepages += p->pm_obj->resident_page_count;
1775 	vm_object_deallocate(p->pm_obj);
1776 
1777 	OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count);
1778 	PMAP_ZINFO_PFREE(p, inuse_ptepages * PAGE_SIZE);
1779 
1780 	pmap_check_ledgers(p);
1781 	ledger_dereference(p->ledger);
1782 	lck_rw_destroy(&p->pmap_rwl, &pmap_lck_grp);
1783 	zfree(pmap_zone, p);
1784 
1785 	PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
1786 }
1787 
1788 /*
1789  *	Add a reference to the specified pmap.
1790  */
1791 
1792 void
pmap_reference(pmap_t p)1793 pmap_reference(pmap_t   p)
1794 {
1795 	if (p != PMAP_NULL) {
1796 		PMAP_LOCK_EXCLUSIVE(p);
1797 		os_ref_retain_locked(&p->ref_count);
1798 		PMAP_UNLOCK_EXCLUSIVE(p);
1799 	}
1800 }
1801 
1802 /*
1803  *	Remove phys addr if mapped in specified map
1804  *
1805  */
/*
 * Remove the mapping of physical page "pn" from pmap "map", if present.
 * Currently an intentional no-op on x86_64.
 */
void
pmap_remove_some_phys(
	__unused pmap_t         map,
	__unused ppnum_t         pn)
{
/* Implement to support working set code */
}
1813 
1814 
/*
 * Set protection "prot" on [sva, eva) of "map".
 * Convenience wrapper around pmap_protect_options() with no options.
 */
void
pmap_protect(
	pmap_t          map,
	vm_map_offset_t sva,
	vm_map_offset_t eva,
	vm_prot_t       prot)
{
	pmap_protect_options(map, sva, eva, prot, 0, NULL);
}
1824 
1825 
1826 /*
1827  *	Set the physical protection on the
1828  *	specified range of this map as requested.
1829  *
1830  * VERY IMPORTANT: Will *NOT* increase permissions.
1831  *	pmap_protect_options() should protect the range against any access types
1832  *      that are not in "prot" but it should never grant extra access.
1833  *	For example, if "prot" is READ|EXECUTE, that means "remove write
1834  *      access" but it does *not* mean "add read and execute" access.
1835  *	VM relies on getting soft-faults to enforce extra checks (code
1836  *	signing, for example), for example.
1837  *	New access permissions are granted via pmap_enter() only.
1838  *      ***NOTE***:
1839  *	The only exception is for EPT pmaps, where we MUST populate all exec
1840  *      bits when the protection API is invoked (so that the HV fault handler
1841  *      can make decisions based on the exit qualification information, which
1842  *      includes the execute bits in the EPT entries.  Soft-faulting them
1843  *      in would cause a chicken-and-egg problem where the HV fault handler
1844  *      would not be able to identify mode-based execute control (MBE) faults.)
1845  */
void
pmap_protect_options(
	pmap_t          map,
	vm_map_offset_t sva,
	vm_map_offset_t eva,
	vm_prot_t       prot,
	unsigned int    options,
	void            *arg)
{
	pt_entry_t      *pde;
	pt_entry_t      *spte, *epte;   /* [start, end) of the PTE run for the current PDE */
	vm_map_offset_t lva;            /* end of the VA range covered by the current PDE */
	vm_map_offset_t orig_sva;
	boolean_t       set_NX;
	int             num_found = 0;  /* number of valid PTEs actually updated */
	boolean_t       is_ept;
	uint64_t        cur_vaddr;      /* VA tracked only for the DTrace probe below */

	pmap_intr_assert();

	if (map == PMAP_NULL) {
		return;
	}

	/* Reducing to no access at all is equivalent to removing the mappings. */
	if (prot == VM_PROT_NONE) {
		pmap_remove_options(map, sva, eva, options);
		return;
	}

	PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(map), VM_KERNEL_ADDRHIDE(sva),
	    VM_KERNEL_ADDRHIDE(eva));

	is_ept = is_ept_pmap(map);

	/*
	 * NX will be set unless some execute permission is retained.
	 * For EPT pmaps, user-execute (MBE) also counts as executable.
	 */
	if ((prot & VM_PROT_EXECUTE) || __improbable(is_ept && (prot & VM_PROT_UEXEC))) {
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

#if DEVELOPMENT || DEBUG
	/* Honor global or per-pmap NX disablement (debug kernels only). */
	if (__improbable(set_NX && (!nx_enabled || !map->nx_enabled))) {
		set_NX = FALSE;
	}
#endif
	PMAP_LOCK_EXCLUSIVE(map);

	orig_sva = sva;
	cur_vaddr = sva;
	/* Walk the range one page-directory entry at a time. */
	while (sva < eva) {
		uint64_t vaddr_incr;

		/*
		 * lva = end of this PDE's coverage, clipped to eva.
		 * The addition is overflow-checked for ranges near the top of
		 * the address space.
		 */
		if (os_add_overflow(sva, PDE_MAPPED_SIZE, &lva)) {
			lva = eva;
		} else {
			lva &= ~(PDE_MAPPED_SIZE - 1);

			if (lva > eva) {
				lva = eva;
			}
		}

		pde = pmap_pde(map, sva);
		if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
			if (*pde & PTE_PS) {
				/* superpage: the PDE itself is the single "PTE" */
				spte = pde;
				epte = spte + 1; /* excluded */
				vaddr_incr = I386_LPGBYTES;
			} else {
				/* 4K mappings: the span of PTEs covering [sva, lva) */
				spte = pmap_pte(map, (sva & ~(PDE_MAPPED_SIZE - 1)));
				spte = &spte[ptenum(sva)];
				epte = &spte[intel_btop(lva - sva)];
				vaddr_incr = I386_PGBYTES;
			}

			for (; spte < epte; spte++) {
				uint64_t clear_bits, set_bits;

				if (!(*spte & PTE_VALID_MASK(is_ept))) {
					continue;
				}

				clear_bits = 0;
				set_bits = 0;

				/* Only EPT entries carry an explicit read bit. */
				if (is_ept) {
					if (!(prot & VM_PROT_READ)) {
						clear_bits |= PTE_READ(is_ept);
					}
				}
				if (!(prot & VM_PROT_WRITE)) {
					clear_bits |= PTE_WRITE(is_ept);
				}
#if DEVELOPMENT || DEBUG
				else if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) &&
				    map == kernel_pmap) {
					set_bits |= PTE_WRITE(is_ept);
				}
#endif /* DEVELOPMENT || DEBUG */

				if (set_NX) {
					if (!is_ept) {
						set_bits |= INTEL_PTE_NX;
					} else {
						clear_bits |= INTEL_EPT_EX | INTEL_EPT_UEX;
					}
				} else if (is_ept) {
					/* This is the exception to the "Don't add permissions" statement, above */
					set_bits |= ((prot & VM_PROT_EXECUTE) ? INTEL_EPT_EX : 0) |
					    ((prot & VM_PROT_UEXEC) ? INTEL_EPT_UEX : 0);
				}

				pmap_update_pte(is_ept, spte, clear_bits, set_bits, false);

				DTRACE_VM3(set_pte, pmap_t, map, void *, cur_vaddr, uint64_t, *spte);
				cur_vaddr += vaddr_incr;

				num_found++;
			}
		}
		sva = lva;
	}
	/* Flush TLBs only if at least one entry actually changed. */
	if (num_found) {
		if (options & PMAP_OPTIONS_NOFLUSH) {
			/* Defer: the caller will flush later via pmap_flush(arg). */
			PMAP_UPDATE_TLBS_DELAYED(map, orig_sva, eva, (pmap_flush_context *)arg);
		} else {
			PMAP_UPDATE_TLBS(map, orig_sva, eva);
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(map);

	PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
}
1982 
1983 /* Map a (possibly) autogenned block */
1984 kern_return_t
pmap_map_block_addr(pmap_t pmap,addr64_t va,pmap_paddr_t pa,uint32_t size,vm_prot_t prot,int attr,unsigned int flags)1985 pmap_map_block_addr(
1986 	pmap_t          pmap,
1987 	addr64_t        va,
1988 	pmap_paddr_t    pa,
1989 	uint32_t        size,
1990 	vm_prot_t       prot,
1991 	int             attr,
1992 	unsigned int    flags)
1993 {
1994 	return pmap_map_block(pmap, va, intel_btop(pa), size, prot, attr, flags);
1995 }
1996 
/*
 * Enter a physically contiguous run of `size` 4K pages, starting at
 * physical page number `pa`, at virtual address `va`.  If the attribute
 * VM_MEM_SUPERPAGE is set, the run is entered using superpage mappings.
 * Returns KERN_SUCCESS, or panics if any pmap_enter() fails.
 */
kern_return_t
pmap_map_block(
	pmap_t          pmap,
	addr64_t        va,
	ppnum_t         pa,
	uint32_t        size,
	vm_prot_t       prot,
	int             attr,
	__unused unsigned int   flags)
{
	kern_return_t   kr;
	addr64_t        original_va = va;
	uint32_t        page;
	int             cur_page_size;

	if (attr & VM_MEM_SUPERPAGE) {
		cur_page_size =  SUPERPAGE_SIZE;
	} else {
		cur_page_size =  PAGE_SIZE;
	}

	/* `page` advances by the number of 4K pages covered per mapping. */
	for (page = 0; page < size; page += cur_page_size / PAGE_SIZE) {
		kr = pmap_enter(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE);

		if (kr != KERN_SUCCESS) {
			/*
			 * This will panic for now, as it is unclear that
			 * removing the mappings is correct.
			 */
			panic("%s: failed pmap_enter, "
			    "pmap=%p, va=%#llx, pa=%u, size=%u, prot=%#x, flags=%#x",
			    __FUNCTION__,
			    pmap, va, pa, size, prot, flags);

			/* NOTE(review): unreachable while the panic above remains. */
			pmap_remove(pmap, original_va, va - original_va);
			return kr;
		}

		va += cur_page_size;
		pa += cur_page_size / PAGE_SIZE;
	}

	return KERN_SUCCESS;
}
2041 
/*
 * Allocate a new PDPT page and install it into `map`'s PML4 (and the
 * user copy of the PML4) for `vaddr`.  May block for a free page unless
 * PMAP_EXPAND_OPTIONS_NOWAIT is set, in which case
 * KERN_RESOURCE_SHORTAGE may be returned.
 */
kern_return_t
pmap_expand_pml4(
	pmap_t          map,
	vm_map_offset_t vaddr,
	unsigned int options)
{
	vm_page_t       m;
	pmap_paddr_t    pa;
	uint64_t        i;      /* index of the new page within pm_obj_pml4 */
	ppnum_t         pn;
	pml4_entry_t    *pml4p;
	boolean_t       is_ept = is_ept_pmap(map);

	DBG("pmap_expand_pml4(%p,%p)\n", map, (void *)vaddr);

	/* With the exception of the kext "basement", the kernel's level 4
	 * pagetables must not be dynamically expanded.
	 */
	assert(map != kernel_pmap || (vaddr == KERNEL_BASEMENT));
	/*
	 *	Allocate a VM page for the pml4 page
	 */
	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
		if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
			return KERN_RESOURCE_SHORTAGE;
		}
		VM_PAGE_WAIT();
	}
	/*
	 *	put the page into the pmap's obj list so it
	 *	can be found later.
	 */
	pn = VM_PAGE_GET_PHYS_PAGE(m);
	pa = i386_ptob(pn);
	i = pml4idx(map, vaddr);

	/*
	 *	Zero the page.
	 */
	pmap_zero_page(pn);

	vm_page_lockspin_queues();
	vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
	vm_page_unlock_queues();

	OSAddAtomic(1, &inuse_ptepages_count);
	OSAddAtomic64(1, &alloc_ptepages_count);
	PMAP_ZINFO_PALLOC(map, PAGE_SIZE);

	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
	vm_object_lock(map->pm_obj_pml4);

	PMAP_LOCK_EXCLUSIVE(map);
	/*
	 *	See if someone else expanded us first
	 */
	if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
		PMAP_UNLOCK_EXCLUSIVE(map);
		vm_object_unlock(map->pm_obj_pml4);

		/* Lost the race: release the page we grabbed and undo accounting. */
		VM_PAGE_FREE(m);

		OSAddAtomic(-1, &inuse_ptepages_count);
		PMAP_ZINFO_PFREE(map, PAGE_SIZE);
		return KERN_SUCCESS;
	}

#if 0 /* DEBUG */
	if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE)) {
		panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx",
		    map, map->pm_obj_pml4, vaddr, i);
	}
#endif
	vm_page_insert_wired(m, map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
	vm_object_unlock(map->pm_obj_pml4);

	/*
	 *	Set the page directory entry for this page table.
	 */
	pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */

	/*
	 * Note that INTEL_EPT_UEX is unconditionally set (as is INTEL_EPT_EX) for
	 * all intermediate paging levels, from PML4Es to PDEs.  Processors with
	 * VT-x implementations that do not support MBE ignore the INTEL_EPT_UEX
	 * bit at all levels of the EPT, so there is no risk of inducing EPT
	 * violation faults.
	 */
	pmap_store_pte(is_ept, pml4p, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));
	pml4_entry_t    *upml4p;

	/* Install the same entry in the user PML4 as well. */
	upml4p = pmap64_user_pml4(map, vaddr);
	pmap_store_pte(is_ept, upml4p, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));

	PMAP_UNLOCK_EXCLUSIVE(map);

	return KERN_SUCCESS;
}
2146 
/*
 * Allocate a new page-directory page and install it into `map`'s PDPT
 * for `vaddr`, expanding the PML4 level first if necessary.  May block
 * for a free page unless PMAP_EXPAND_OPTIONS_NOWAIT is set.
 */
kern_return_t
pmap_expand_pdpt(pmap_t map, vm_map_offset_t vaddr, unsigned int options)
{
	vm_page_t       m;
	pmap_paddr_t    pa;
	uint64_t        i;      /* index of the new page within pm_obj_pdpt */
	ppnum_t         pn;
	pdpt_entry_t    *pdptp;
	boolean_t       is_ept = is_ept_pmap(map);

	DBG("pmap_expand_pdpt(%p,%p)\n", map, (void *)vaddr);

	/* Ensure the PML4 level is populated first. */
	while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
		kern_return_t pep4kr = pmap_expand_pml4(map, vaddr, options);
		if (pep4kr != KERN_SUCCESS) {
			return pep4kr;
		}
	}

	/*
	 *	Allocate a VM page for the pdpt page
	 */
	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
		if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
			return KERN_RESOURCE_SHORTAGE;
		}
		VM_PAGE_WAIT();
	}

	/*
	 *	put the page into the pmap's obj list so it
	 *	can be found later.
	 */
	pn = VM_PAGE_GET_PHYS_PAGE(m);
	pa = i386_ptob(pn);
	i = pdptidx(map, vaddr);

	/*
	 *	Zero the page.
	 */
	pmap_zero_page(pn);

	vm_page_lockspin_queues();
	vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
	vm_page_unlock_queues();

	OSAddAtomic(1, &inuse_ptepages_count);
	OSAddAtomic64(1, &alloc_ptepages_count);
	PMAP_ZINFO_PALLOC(map, PAGE_SIZE);

	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
	vm_object_lock(map->pm_obj_pdpt);

	PMAP_LOCK_EXCLUSIVE(map);
	/*
	 *	See if someone else expanded us first
	 */
	if (pmap_pde(map, vaddr) != PD_ENTRY_NULL) {
		PMAP_UNLOCK_EXCLUSIVE(map);
		vm_object_unlock(map->pm_obj_pdpt);

		/* Lost the race: release the page we grabbed and undo accounting. */
		VM_PAGE_FREE(m);

		OSAddAtomic(-1, &inuse_ptepages_count);
		PMAP_ZINFO_PFREE(map, PAGE_SIZE);
		return KERN_SUCCESS;
	}

#if 0 /* DEBUG */
	if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE)) {
		panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx",
		    map, map->pm_obj_pdpt, vaddr, i);
	}
#endif
	vm_page_insert_wired(m, map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
	vm_object_unlock(map->pm_obj_pdpt);

	/*
	 *	Set the page directory entry for this page table.
	 */
	pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */

	pmap_store_pte(is_ept, pdptp, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));

	PMAP_UNLOCK_EXCLUSIVE(map);

	return KERN_SUCCESS;
}
2238 
2239 
2240 
2241 /*
2242  *	Routine:	pmap_expand
2243  *
2244  *	Expands a pmap to be able to map the specified virtual address.
2245  *
2246  *	Allocates new virtual memory for the P0 or P1 portion of the
2247  *	pmap, then re-maps the physical pages that were in the old
2248  *	pmap to be in the new pmap.
2249  *
2250  *	Must be called with the pmap system and the pmap unlocked,
2251  *	since these must be unlocked to use vm_allocate or vm_deallocate.
2252  *	Thus it must be called in a loop that checks whether the map
2253  *	has been expanded enough.
2254  *	(We won't loop forever, since page tables aren't shrunk.)
2255  */
kern_return_t
pmap_expand(
	pmap_t          map,
	vm_map_offset_t vaddr,
	unsigned int options)
{
	pt_entry_t              *pdp;
	vm_page_t               m;
	pmap_paddr_t            pa;
	uint64_t                i;      /* index of the new page within pm_obj */
	ppnum_t                 pn;
	boolean_t               is_ept = is_ept_pmap(map);


	/*
	 * For the kernel, the virtual address must be in or above the basement
	 * which is for kexts and is in the 512GB immediately below the kernel.
	 * XXX - should use VM_MIN_KERNEL_AND_KEXT_ADDRESS not KERNEL_BASEMENT
	 */
	if (__improbable(map == kernel_pmap &&
	    !(vaddr >= KERNEL_BASEMENT && vaddr <= VM_MAX_KERNEL_ADDRESS))) {
		if ((options & PMAP_EXPAND_OPTIONS_ALIASMAP) == 0) {
			panic("pmap_expand: bad vaddr 0x%llx for kernel pmap", vaddr);
		}
	}

	/* Ensure the upper levels (PML4/PDPT) are populated first. */
	while ((pdp = pmap_pde(map, vaddr)) == PD_ENTRY_NULL) {
		assert((options & PMAP_EXPAND_OPTIONS_ALIASMAP) == 0);
		kern_return_t pepkr = pmap_expand_pdpt(map, vaddr, options);
		if (pepkr != KERN_SUCCESS) {
			return pepkr;
		}
	}

	/*
	 *	Allocate a VM page for the pde entries.
	 */
	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
		if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
			return KERN_RESOURCE_SHORTAGE;
		}
		VM_PAGE_WAIT();
	}

	/*
	 *	put the page into the pmap's obj list so it
	 *	can be found later.
	 */
	pn = VM_PAGE_GET_PHYS_PAGE(m);
	pa = i386_ptob(pn);
	i = pdeidx(map, vaddr);

	/*
	 *	Zero the page.
	 */
	pmap_zero_page(pn);

	vm_page_lockspin_queues();
	vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
	vm_page_unlock_queues();

	OSAddAtomic(1, &inuse_ptepages_count);
	OSAddAtomic64(1, &alloc_ptepages_count);
	PMAP_ZINFO_PALLOC(map, PAGE_SIZE);

	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
	vm_object_lock(map->pm_obj);

	PMAP_LOCK_EXCLUSIVE(map);

	/*
	 *	See if someone else expanded us first
	 */
	if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
		PMAP_UNLOCK_EXCLUSIVE(map);
		vm_object_unlock(map->pm_obj);

		/* Lost the race: release the page we grabbed and undo accounting. */
		VM_PAGE_FREE(m);

		OSAddAtomic(-1, &inuse_ptepages_count); //todo replace all with inlines
		PMAP_ZINFO_PFREE(map, PAGE_SIZE);
		return KERN_SUCCESS;
	}

#if 0 /* DEBUG */
	if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE)) {
		panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx",
		    map, map->pm_obj, vaddr, i);
	}
#endif
	vm_page_insert_wired(m, map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
	vm_object_unlock(map->pm_obj);

	/*
	 *	Set the page directory entry for this page table.
	 */
	pdp = pmap_pde(map, vaddr); /* refetch under lock */

	pmap_store_pte(is_ept, pdp, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));

	PMAP_UNLOCK_EXCLUSIVE(map);

	return KERN_SUCCESS;
}
2363 /*
2364  * Query a pmap to see what size a given virtual address is mapped with.
2365  * If the vaddr is not mapped, returns 0.
2366  */
vm_size_t
pmap_query_pagesize(
	pmap_t          pmap,
	vm_map_offset_t vaddr)
{
	pd_entry_t      *pdep;
	vm_size_t       size = 0;       /* 0 => no mapping structure for vaddr */

	/* EPT pmaps are not supported here. */
	assert(!is_ept_pmap(pmap));
	PMAP_LOCK_EXCLUSIVE(pmap);

	pdep = pmap_pde(pmap, vaddr);
	if (pdep != PD_ENTRY_NULL) {
		if (*pdep & INTEL_PTE_PS) {
			/* The PDE maps a superpage directly. */
			size = I386_LPGBYTES;
		} else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) {
			/* A 4K page table exists for this address. */
			size = I386_PGBYTES;
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(pmap);

	return size;
}
2391 
2392 /*
2393  * Ensure the page table hierarchy is filled in down to
2394  * the large page level. Additionally returns FAILURE if
2395  * a lower page table already exists.
2396  */
static kern_return_t
pmap_pre_expand_large_internal(
	pmap_t          pmap,
	vm_map_offset_t vaddr)
{
	ppnum_t         pn;
	pt_entry_t      *pte;
	boolean_t       is_ept = is_ept_pmap(pmap);
	kern_return_t   kr = KERN_SUCCESS;

	/*
	 * Pages come from pmap_next_page_hi() (boot-time pool) rather than
	 * vm_page_grab(), since this can run before the VM is fully up.
	 */

	/* Populate the PML4 slot with a fresh PDPT page, if absent. */
	if (pmap64_pdpt(pmap, vaddr) == PDPT_ENTRY_NULL) {
		if (!pmap_next_page_hi(&pn, FALSE)) {
			panic("pmap_pre_expand_large no PDPT");
		}

		pmap_zero_page(pn);

		pte = pmap64_pml4(pmap, vaddr);

		pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
		    PTE_READ(is_ept) |
		    (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
		    PTE_WRITE(is_ept));

		/* Mirror the entry in the user PML4. */
		pte = pmap64_user_pml4(pmap, vaddr);

		pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
		    PTE_READ(is_ept) |
		    (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
		    PTE_WRITE(is_ept));
	}

	/* Populate the PDPT slot with a fresh page-directory page, if absent. */
	if (pmap_pde(pmap, vaddr) == PD_ENTRY_NULL) {
		if (!pmap_next_page_hi(&pn, FALSE)) {
			panic("pmap_pre_expand_large no PDE");
		}

		pmap_zero_page(pn);

		pte = pmap64_pdpt(pmap, vaddr);

		pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
		    PTE_READ(is_ept) |
		    (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
		    PTE_WRITE(is_ept));
	} else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) {
		/* A lower-level (4K) page table already exists. */
		kr = KERN_FAILURE;
	}

	return kr;
}
2448 
2449 /*
2450  * Wrapper that locks the pmap.
2451  */
2452 kern_return_t
pmap_pre_expand_large(pmap_t pmap,vm_map_offset_t vaddr)2453 pmap_pre_expand_large(
2454 	pmap_t          pmap,
2455 	vm_map_offset_t vaddr)
2456 {
2457 	kern_return_t   kr;
2458 
2459 	PMAP_LOCK_EXCLUSIVE(pmap);
2460 	kr = pmap_pre_expand_large_internal(pmap, vaddr);
2461 	PMAP_UNLOCK_EXCLUSIVE(pmap);
2462 	return kr;
2463 }
2464 
2465 /*
2466  * On large memory machines, pmap_steal_memory() will allocate past
2467  * the 1GB of pre-allocated/mapped virtual kernel area. This function
2468  * expands kernel the page tables to cover a given vaddr. It uses pages
2469  * from the same pool that pmap_steal_memory() uses, since vm_page_grab()
2470  * isn't available yet.
2471  */
void
pmap_pre_expand(
	pmap_t          pmap,
	vm_map_offset_t vaddr)
{
	ppnum_t         pn;
	pt_entry_t      *pte;
	boolean_t       is_ept = is_ept_pmap(pmap);

	/*
	 * This returns failure if a 4K page table already exists.
	 * Otherwise it fills in the page table hierarchy down
	 * to that level.
	 */
	PMAP_LOCK_EXCLUSIVE(pmap);
	if (pmap_pre_expand_large_internal(pmap, vaddr) == KERN_FAILURE) {
		PMAP_UNLOCK_EXCLUSIVE(pmap);
		return;
	}

	/* Add the lowest table */
	if (!pmap_next_page_hi(&pn, FALSE)) {
		panic("pmap_pre_expand");
	}

	pmap_zero_page(pn);

	/* Install the new 4K page table into its PDE slot. */
	pte = pmap_pde(pmap, vaddr);

	pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
	    PTE_READ(is_ept) |
	    (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
	    PTE_WRITE(is_ept));
	PMAP_UNLOCK_EXCLUSIVE(pmap);
}
2507 
2508 /*
2509  * pmap_sync_page_data_phys(ppnum_t pa)
2510  *
2511  * Invalidates all of the instruction cache on a physical page and
2512  * pushes any dirty data from the data cache for the same physical page
2513  * Not required in i386.
2514  */
2515 void
pmap_sync_page_data_phys(__unused ppnum_t pa)2516 pmap_sync_page_data_phys(__unused ppnum_t pa)
2517 {
2518 	return;
2519 }
2520 
2521 /*
2522  * pmap_sync_page_attributes_phys(ppnum_t pa)
2523  *
2524  * Write back and invalidate all cachelines on a physical page.
2525  */
void
pmap_sync_page_attributes_phys(ppnum_t pa)
{
	/* Write back and invalidate every cache line of physical page `pa`. */
	cache_flush_page_phys(pa);
}
2531 
2532 void
pmap_copy_page(ppnum_t src,ppnum_t dst)2533 pmap_copy_page(ppnum_t src, ppnum_t dst)
2534 {
2535 	bcopy_phys((addr64_t)i386_ptob(src),
2536 	    (addr64_t)i386_ptob(dst),
2537 	    PAGE_SIZE);
2538 }
2539 
2540 
2541 /*
2542  *	Routine:	pmap_pageable
2543  *	Function:
2544  *		Make the specified pages (by pmap, offset)
2545  *		pageable (or not) as requested.
2546  *
2547  *		A page which is not pageable may not take
2548  *		a fault; therefore, its page table entry
2549  *		must remain valid for the duration.
2550  *
2551  *		This routine is merely advisory; pmap_enter
2552  *		will specify that these pages are to be wired
2553  *		down (or not) as appropriate.
2554  */
void
pmap_pageable(
	__unused pmap_t                 pmap,
	__unused vm_map_offset_t        start_addr,
	__unused vm_map_offset_t        end_addr,
	__unused boolean_t              pageable)
{
	/* Advisory only (see the comment above); nothing to do on x86_64. */
#ifdef  lint
	pmap++; start_addr++; end_addr++; pageable++;
#endif  /* lint */
}
2566 
2567 void
invalidate_icache(__unused vm_offset_t addr,__unused unsigned cnt,__unused int phys)2568 invalidate_icache(__unused vm_offset_t  addr,
2569     __unused unsigned     cnt,
2570     __unused int          phys)
2571 {
2572 	return;
2573 }
2574 
2575 void
flush_dcache(__unused vm_offset_t addr,__unused unsigned count,__unused int phys)2576 flush_dcache(__unused vm_offset_t       addr,
2577     __unused unsigned          count,
2578     __unused int               phys)
2579 {
2580 	return;
2581 }
2582 
2583 #if CONFIG_DTRACE
2584 /*
2585  * Constrain DTrace copyin/copyout actions
2586  */
2587 extern kern_return_t dtrace_copyio_preflight(addr64_t);
2588 extern kern_return_t dtrace_copyio_postflight(addr64_t);
2589 
kern_return_t
dtrace_copyio_preflight(__unused addr64_t va)
{
	thread_t thread = current_thread();
	uint64_t ccr3;
	/* DTrace copyin/copyout requires a user address space to act on. */
	if (current_map() == kernel_map) {
		return KERN_FAILURE;
	} else if (((ccr3 = get_cr3_base()) != thread->map->pmap->pm_cr3) && (no_shared_cr3 == FALSE)) {
		/* Shared-CR3 mode: the current CR3 must be this thread's user pmap. */
		return KERN_FAILURE;
	} else if (no_shared_cr3 && (ccr3 != kernel_pmap->pm_cr3)) {
		/* Separate-CR3 mode: we must currently be on the kernel CR3. */
		return KERN_FAILURE;
	} else {
		return KERN_SUCCESS;
	}
}
2605 
kern_return_t
dtrace_copyio_postflight(__unused addr64_t va)
{
	/* No post-copyio cleanup is needed on x86_64. */
	return KERN_SUCCESS;
}
2611 #endif /* CONFIG_DTRACE */
2612 
2613 #include <mach_vm_debug.h>
2614 #if     MACH_VM_DEBUG
2615 #include <vm/vm_debug.h>
2616 
int
pmap_list_resident_pages(
	__unused pmap_t         pmap,
	__unused vm_offset_t    *listp,
	__unused int            space)
{
	/* Not implemented on this platform; report zero resident pages. */
	return 0;
}
2625 #endif  /* MACH_VM_DEBUG */
2626 
2627 
2628 #if CONFIG_COREDUMP
2629 /* temporary workaround */
/*
 * Decide whether `va` in `map` is safe to touch from the coredump path.
 */
boolean_t
coredumpok(vm_map_t map, mach_vm_offset_t va)
{
#if 0
	pt_entry_t     *ptep;

	ptep = pmap_pte(map->pmap, va);
	if (0 == ptep) {
		return FALSE;
	}
	return (*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED);
#else
	/* Device-pager-backed ranges must not be read by the coredump. */
	if (vm_map_entry_has_device_pager(map, va)) {
		return FALSE;
	}
	return TRUE;
#endif
}
2648 #endif
2649 
2650 boolean_t
phys_page_exists(ppnum_t pn)2651 phys_page_exists(ppnum_t pn)
2652 {
2653 	assert(pn != vm_page_fictitious_addr);
2654 
2655 	if (!pmap_initialized) {
2656 		return TRUE;
2657 	}
2658 
2659 	if (pn == vm_page_guard_addr) {
2660 		return FALSE;
2661 	}
2662 
2663 	if (!IS_MANAGED_PAGE(ppn_to_pai(pn))) {
2664 		return FALSE;
2665 	}
2666 
2667 	return TRUE;
2668 }
2669 
2670 
2671 
/*
 * Switch this cpu to the translation maps of `tpmap`.
 */
void
pmap_switch(pmap_t tpmap)
{
	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(tpmap));
	/* Must run with interrupts disabled. */
	assert(ml_get_interrupts_enabled() == FALSE);
	/* Load the new pmap's page-table base for the current cpu/thread. */
	set_dirbase(tpmap, current_thread(), cpu_number());
	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
2680 
2681 void
pmap_require(pmap_t pmap)2682 pmap_require(pmap_t pmap)
2683 {
2684 	if (pmap != kernel_pmap) {
2685 		zone_id_require(ZONE_ID_PMAP, sizeof(struct pmap), pmap);
2686 	}
2687 }
2688 
2689 /*
2690  * disable no-execute capability on
2691  * the specified pmap
2692  */
void
pmap_disable_NX(__unused pmap_t pmap)
{
#if DEVELOPMENT || DEBUG
	/* Per-pmap NX may only be disabled on DEVELOPMENT/DEBUG kernels. */
	pmap->nx_enabled = 0;
#endif
}
2700 
void
pmap_flush_context_init(pmap_flush_context *pfc)
{
	/* Start with no cpus pending and no global invalidations recorded. */
	pfc->pfc_cpus = 0;
	pfc->pfc_invalid_global = 0;
}
2707 
2708 static bool
pmap_tlbi_response(uint32_t lcpu,uint32_t rcpu,bool ngflush)2709 pmap_tlbi_response(uint32_t lcpu, uint32_t rcpu, bool ngflush)
2710 {
2711 	bool responded = false;
2712 	bool gflushed = (cpu_datap(rcpu)->cpu_tlb_invalid_global_count !=
2713 	    cpu_datap(lcpu)->cpu_tlb_gen_counts_global[rcpu]);
2714 
2715 	if (ngflush) {
2716 		if (gflushed) {
2717 			responded = true;
2718 		}
2719 	} else {
2720 		if (gflushed) {
2721 			responded = true;
2722 		} else {
2723 			bool lflushed = (cpu_datap(rcpu)->cpu_tlb_invalid_local_count !=
2724 			    cpu_datap(lcpu)->cpu_tlb_gen_counts_local[rcpu]);
2725 			if (lflushed) {
2726 				responded = true;
2727 			}
2728 		}
2729 	}
2730 
2731 	if (responded == false) {
2732 		if ((cpu_datap(rcpu)->cpu_tlb_invalid == 0) ||
2733 		    !CPU_CR3_IS_ACTIVE(rcpu) ||
2734 		    !cpu_is_running(rcpu)) {
2735 			responded = true;
2736 		}
2737 	}
2738 	return responded;
2739 }
2740 
2741 extern uint64_t TLBTimeOut;
/*
 * Perform the TLB shootdowns that were deferred into `pfc` by earlier
 * PMAP_UPDATE_TLBS_DELAYED() calls: signal each recorded cpu, flush the
 * local TLB if this cpu was recorded, then wait (with timeout/NMI
 * escalation) for the remote cpus to acknowledge.
 */
void
pmap_flush(
	pmap_flush_context *pfc)
{
	unsigned int    my_cpu;
	unsigned int    cpu;
	cpumask_t       cpu_bit;
	cpumask_t       cpus_to_respond = 0;    /* cpus we signalled and must wait on */
	cpumask_t       cpus_to_signal = 0;     /* working copy of pfc->pfc_cpus */
	cpumask_t       cpus_signaled = 0;      /* saved for the trace at the end */
	boolean_t       flush_self = FALSE;
	uint64_t        deadline;
	bool            need_global_flush = false;

	mp_disable_preemption();

	my_cpu = cpu_number();
	cpus_to_signal = pfc->pfc_cpus;

	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_START,
	    NULL, cpus_to_signal);

	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus && cpus_to_signal; cpu++, cpu_bit <<= 1) {
		if (cpus_to_signal & cpu_bit) {
			cpus_to_signal &= ~cpu_bit;

			if (!cpu_is_running(cpu)) {
				continue;
			}

			/* Mark the kind of invalidation the target cpu owes us. */
			if (pfc->pfc_invalid_global & cpu_bit) {
				cpu_datap(cpu)->cpu_tlb_invalid_global = 1;
				need_global_flush = true;
			} else {
				cpu_datap(cpu)->cpu_tlb_invalid_local = 1;
			}
			/*
			 * Sample the target's generation counts; pmap_tlbi_response()
			 * later detects a response by the counts advancing.
			 */
			cpu_datap(my_cpu)->cpu_tlb_gen_counts_global[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_global_count;
			cpu_datap(my_cpu)->cpu_tlb_gen_counts_local[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_local_count;
			mfence();

			if (cpu == my_cpu) {
				flush_self = TRUE;
				continue;
			}
			/* Only cpus with an active CR3 need an IPI right now. */
			if (CPU_CR3_IS_ACTIVE(cpu)) {
				cpus_to_respond |= cpu_bit;
				i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
			}
		}
	}
	cpus_signaled = cpus_to_respond;

	/*
	 * Flush local tlb if required.
	 * Do this now to overlap with other processors responding.
	 */
	if (flush_self) {
		process_pmap_updates(NULL, (pfc->pfc_invalid_global != 0), 0ULL, ~0ULL);
	}

	if (cpus_to_respond) {
		deadline = mach_absolute_time() +
		    (TLBTimeOut ? TLBTimeOut : LockTimeOut);
		boolean_t is_timeout_traced = FALSE;

		/*
		 * Wait for those other cpus to acknowledge
		 */
		while (cpus_to_respond != 0) {
			long orig_acks = 0;

			for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
				bool responded = false;
				if ((cpus_to_respond & cpu_bit) != 0) {
					responded = pmap_tlbi_response(my_cpu, cpu, need_global_flush);
					if (responded) {
						cpus_to_respond &= ~cpu_bit;
					}
					cpu_pause();
				}

				if (cpus_to_respond == 0) {
					break;
				}
			}
			/* Past the deadline: trace once, or escalate to an NMI panic. */
			if (cpus_to_respond && (mach_absolute_time() > deadline)) {
				if (machine_timeout_suspended()) {
					continue;
				}
				if (TLBTimeOut == 0) {
					/* No hard timeout configured: trace once and keep waiting. */
					if (is_timeout_traced) {
						continue;
					}

					PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
					    NULL, cpus_to_signal, cpus_to_respond);

					is_timeout_traced = TRUE;
					continue;
				}
				orig_acks = NMIPI_acks;
				NMIPI_panic(cpus_to_respond, TLB_FLUSH_TIMEOUT);
				panic("Uninterruptible processor(s): CPU bitmap: 0x%llx, NMIPI acks: 0x%lx, now: 0x%lx, deadline: %llu",
				    cpus_to_respond, orig_acks, NMIPI_acks, deadline);
			}
		}
	}

	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_END,
	    NULL, cpus_signaled, flush_self);

	mp_enable_preemption();
}
2855 
2856 
/*
 * Execute INVEPT to invalidate EPT-derived translations for the given
 * EPT pointer (single-context invalidation).
 */
static void
invept(void *eptp)
{
	/*
	 * INVEPT descriptor: 128 bits, 16-byte aligned; first quadword is
	 * the EPT pointer, second quadword must be zero.
	 */
	struct {
		uint64_t eptp;
		uint64_t reserved;
	} __attribute__((aligned(16), packed)) invept_descriptor = {(uint64_t)eptp, 0};

	__asm__ volatile ("invept (%%rax), %%rcx"
                 : : "c" (PMAP_INVEPT_SINGLE_CONTEXT), "a" (&invept_descriptor)
                 : "cc", "memory");
}
2869 
2870 /*
2871  * Called with pmap locked, we:
2872  *  - scan through per-cpu data to see which other cpus need to flush
2873  *  - send an IPI to each non-idle cpu to be flushed
2874  *  - wait for all to signal back that they are inactive or we see that
2875  *    they are at a safe point (idle).
2876  *  - flush the local tlb if active for this pmap
2877  *  - return ... the caller will unlock the pmap
2878  */
2879 
/*
 * Invalidate the TLB entries covering [startv, endv) for "pmap" on every
 * CPU that may be caching them.  Remote CPUs are signalled with an IPI and
 * this routine spins until they acknowledge, unless PMAP_DELAY_TLB_FLUSH
 * is set in "options", in which case the affected CPU set is accumulated
 * in *pfc for a later batched flush instead.
 */
void
pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv, int options, pmap_flush_context *pfc)
{
	unsigned int    cpu;
	cpumask_t       cpu_bit;
	cpumask_t       cpus_to_signal = 0;
	unsigned int    my_cpu = cpu_number();
	pmap_paddr_t    pmap_cr3 = pmap->pm_cr3;
	boolean_t       flush_self = FALSE;
	uint64_t        deadline;
	boolean_t       pmap_is_shared = (pmap->pm_shared || (pmap == kernel_pmap));
	bool            need_global_flush = false;
	uint32_t        event_code = 0;
	vm_map_offset_t event_startv = 0, event_endv = 0;
	boolean_t       is_ept = is_ept_pmap(pmap);

	/*
	 * On MP systems we must be called with interrupts enabled (so inbound
	 * TLB-shootdown IPIs can still be serviced while we wait for acks) but
	 * with preemption disabled (so we stay on "my_cpu").
	 */
	assert((processor_avail_count < 2) ||
	    (ml_get_interrupts_enabled() && get_preemption_level() != 0));

	/* The range must span at least one page and be page-aligned. */
	assert((endv - startv) >= PAGE_SIZE);
	assert(((endv | startv) & PAGE_MASK) == 0);

	/* Pick a tracepoint code; kernel addresses are unslid for kdebug. */
	if (__improbable(kdebug_enable)) {
		if (pmap == kernel_pmap) {
			event_code = PMAP_CODE(PMAP__FLUSH_KERN_TLBS);
			event_startv = VM_KERNEL_UNSLIDE_OR_PERM(startv);
			event_endv = VM_KERNEL_UNSLIDE_OR_PERM(endv);
		} else if (__improbable(is_ept)) {
			event_code = PMAP_CODE(PMAP__FLUSH_EPT);
			event_startv = startv;
			event_endv = endv;
		} else {
			event_code = PMAP_CODE(PMAP__FLUSH_TLBS);
			event_startv = startv;
			event_endv = endv;
		}
	}

	PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(pmap), options,
	    event_startv, event_endv);

	/* EPT pmaps are flushed with a broadcast INVEPT; nothing further. */
	if (__improbable(is_ept)) {
		mp_cpus_call(CPUMASK_ALL, ASYNC, invept, (void*)pmap->pm_eptp);
		goto out;
	}

	/*
	 * Scan other cpus for matching active or task CR3.
	 * For idle cpus (with no active map) we mark them invalid but
	 * don't signal -- they'll check as they go busy.
	 */
	if (pmap_pcid_ncpus) {
		if (pmap_is_shared) {
			need_global_flush = true;
		}
		pmap_pcid_invalidate_all_cpus(pmap);
		mfence();
	}

	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
		if (!cpu_is_running(cpu)) {
			continue;
		}
		uint64_t        cpu_active_cr3 = CPU_GET_ACTIVE_CR3(cpu);
		uint64_t        cpu_task_cr3 = CPU_GET_TASK_CR3(cpu);

		if ((pmap_cr3 == cpu_task_cr3) ||
		    (pmap_cr3 == cpu_active_cr3) ||
		    (pmap_is_shared)) {
			/* Deferred mode: only record the target CPU in *pfc. */
			if (options & PMAP_DELAY_TLB_FLUSH) {
				if (need_global_flush == true) {
					pfc->pfc_invalid_global |= cpu_bit;
				}
				pfc->pfc_cpus |= cpu_bit;

				continue;
			}
			/*
			 * Snapshot the target's invalidation generation count
			 * before posting the request, so pmap_tlbi_response()
			 * can later tell whether the target processed it.
			 */
			if (need_global_flush == true) {
				cpu_datap(my_cpu)->cpu_tlb_gen_counts_global[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_global_count;
				cpu_datap(cpu)->cpu_tlb_invalid_global = 1;
			} else {
				cpu_datap(my_cpu)->cpu_tlb_gen_counts_local[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_local_count;
				cpu_datap(cpu)->cpu_tlb_invalid_local = 1;
			}

			/* The local flush is performed below, after signalling. */
			if (cpu == my_cpu) {
				flush_self = TRUE;
				continue;
			}

			mfence();

			/*
			 * We don't need to signal processors which will flush
			 * lazily at the idle state or kernel boundary.
			 * For example, if we're invalidating the kernel pmap,
			 * processors currently in userspace don't need to flush
			 * their TLBs until the next time they enter the kernel.
			 * Alterations to the address space of a task active
			 * on a remote processor result in a signal, to
			 * account for copy operations. (There may be room
			 * for optimization in such cases).
			 * The order of the loads below with respect
			 * to the store to the "cpu_tlb_invalid" field above
			 * is important--hence the barrier.
			 */
			if (CPU_CR3_IS_ACTIVE(cpu) &&
			    (pmap_cr3 == CPU_GET_ACTIVE_CR3(cpu) ||
			    pmap->pm_shared ||
			    (pmap_cr3 == CPU_GET_TASK_CR3(cpu)))) {
				cpus_to_signal |= cpu_bit;
				i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
			}
		}
	}

	if ((options & PMAP_DELAY_TLB_FLUSH)) {
		goto out;
	}

	/*
	 * Flush local tlb if required.
	 * Do this now to overlap with other processors responding.
	 */
	if (flush_self) {
		process_pmap_updates(pmap, pmap_is_shared, startv, endv);
	}

	if (cpus_to_signal) {
		cpumask_t       cpus_to_respond = cpus_to_signal;

		deadline = mach_absolute_time() +
		    (TLBTimeOut ? TLBTimeOut : LockTimeOut);
		boolean_t is_timeout_traced = FALSE;

		/*
		 * Wait for those other cpus to acknowledge
		 */
		while (cpus_to_respond != 0) {
			long orig_acks = 0;

			for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
				bool responded = false;
				if ((cpus_to_respond & cpu_bit) != 0) {
					responded = pmap_tlbi_response(my_cpu, cpu, need_global_flush);
					if (responded) {
						cpus_to_respond &= ~cpu_bit;
					}
					cpu_pause();
				}
				if (cpus_to_respond == 0) {
					break;
				}
			}
			/*
			 * Past the deadline: either cut a tracepoint and keep
			 * waiting (TLBTimeOut == 0), or NMI the laggards and
			 * panic with diagnostic state.
			 */
			if (cpus_to_respond && (mach_absolute_time() > deadline)) {
				if (machine_timeout_suspended()) {
					continue;
				}
				if (TLBTimeOut == 0) {
					/* cut tracepoint but don't panic */
					if (is_timeout_traced) {
						continue;
					}

					PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
					    VM_KERNEL_UNSLIDE_OR_PERM(pmap),
					    cpus_to_signal,
					    cpus_to_respond);

					is_timeout_traced = TRUE;
					continue;
				}
				orig_acks = NMIPI_acks;
				uint64_t tstamp1 = mach_absolute_time();
				NMIPI_panic(cpus_to_respond, TLB_FLUSH_TIMEOUT);
				uint64_t tstamp2 = mach_absolute_time();
				panic("IPI timeout, unresponsive CPU bitmap: 0x%llx, NMIPI acks: 0x%lx, now: 0x%lx, deadline: %llu, pre-NMIPI time: 0x%llx, current: 0x%llx, global: %d",
				    cpus_to_respond, orig_acks, NMIPI_acks, deadline, tstamp1, tstamp2, need_global_flush);
			}
		}
	}

	/* A kernel-pmap flush must always have included the current CPU. */
	if (__improbable((pmap == kernel_pmap) && (flush_self != TRUE))) {
		panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, pmap_cr3: 0x%llx, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, pmap_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map);
	}

out:
	PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(pmap), cpus_to_signal,
	    event_startv, event_endv);
}
3072 
/*
 * Process a TLB invalidation posted against the current CPU, either inline
 * (self-flush from pmap_flush_tlbs) or from the shootdown interrupt path
 * (pmap_update_interrupt).  "p" may be NULL when the originating pmap is
 * unknown (interrupt path), in which case everything is invalidated.
 */
static void
process_pmap_updates(pmap_t p, bool pshared, addr64_t istart, addr64_t iend)
{
	int ccpu = cpu_number();
	bool gtlbf = false;

	/* Must hold the CPU: interrupts off or preemption disabled. */
	pmap_assert(ml_get_interrupts_enabled() == 0 ||
	    get_preemption_level() != 0);

	if (cpu_datap(ccpu)->cpu_tlb_invalid_global) {
		/* Acknowledge by bumping the generation count the initiator
		 * snapshotted; clearing the combined "cpu_tlb_invalid" field
		 * presumably clears both pending flags at once — confirm
		 * against the cpu_data layout.
		 */
		cpu_datap(ccpu)->cpu_tlb_invalid_global_count++;
		cpu_datap(ccpu)->cpu_tlb_invalid = 0;
		gtlbf = true;
	} else {
		cpu_datap(ccpu)->cpu_tlb_invalid_local_count++;
		cpu_datap(ccpu)->cpu_tlb_invalid_local = 0;
	}

	if (pmap_pcid_ncpus) {
		if (p) {
			/* TODO global generation count to
			 * avoid potentially redundant
			 * csw invalidations post-global invalidation
			 */
			pmap_pcid_validate_cpu(p, ccpu);
			pmap_tlbi_range(istart, iend, (pshared || gtlbf), p->pmap_pcid_cpus[ccpu]);
		} else {
			/* Unknown pmap: revalidate the current PCID and do a
			 * global-range invalidation. */
			pmap_pcid_validate_current();
			pmap_tlbi_range(istart, iend, true, 0);
		}
	} else {
		/* No PCID support: invalidate the entire TLB. */
		pmap_tlbi_range(0, ~0ULL, true, 0);
	}
}
3107 
3108 void
pmap_update_interrupt(void)3109 pmap_update_interrupt(void)
3110 {
3111 	PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START);
3112 
3113 	if (current_cpu_datap()->cpu_tlb_invalid) {
3114 		process_pmap_updates(NULL, true, 0ULL, ~0ULL);
3115 	}
3116 
3117 	PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END);
3118 }
3119 
3120 #include <mach/mach_vm.h>       /* mach_vm_region_recurse() */
/* Scan kernel pmap for W+X PTEs, scan kernel VM map for W+X map entries
 * and identify ranges with mismatched VM permissions and PTE permissions
 *
 * Findings are reported via kprintf; returns KERN_FAILURE if any W+X
 * mapping or VM/PTE permission mismatch was seen, else KERN_SUCCESS.
 */
kern_return_t
pmap_permissions_verify(pmap_t ipmap, vm_map_t ivmmap, vm_offset_t sv, vm_offset_t ev)
{
	vm_offset_t cv = sv;
	kern_return_t rv = KERN_SUCCESS;
	uint64_t skip4 = 0, skip2 = 0;  /* PML4/PDE holes skipped (diagnostic only) */

	assert(!is_ept_pmap(ipmap));

	/* Truncate both bounds to page boundaries. */
	sv &= ~PAGE_MASK_64;
	ev &= ~PAGE_MASK_64;

	/* Pass 1: walk the PTEs in [sv, ev) looking for writable+executable pages. */
	while (cv < ev) {
		/* Hop over the non-canonical x86_64 address hole. */
		if (__improbable((cv > 0x00007FFFFFFFFFFFULL) &&
		    (cv < 0xFFFF800000000000ULL))) {
			cv = 0xFFFF800000000000ULL;
		}
		/* Potential inconsistencies from not holding pmap lock
		 * but harmless for the moment.
		 */
		if (((cv & PML4MASK) == 0) && (pmap64_pml4(ipmap, cv) == 0)) {
			/* Empty PML4 slot: skip the whole region, guarding
			 * against address wrap-around. */
			if ((cv + NBPML4) > cv) {
				cv += NBPML4;
			} else {
				break;
			}
			skip4++;
			continue;
		}
		if (((cv & PDMASK) == 0) && (pmap_pde(ipmap, cv) == 0)) {
			/* Empty PDE slot: skip it, guarding against wrap. */
			if ((cv + NBPD) > cv) {
				cv += NBPD;
			} else {
				break;
			}
			skip2++;
			continue;
		}

		pt_entry_t *ptep = pmap_pte(ipmap, cv);
		if (ptep && (*ptep & INTEL_PTE_VALID)) {
			if (*ptep & INTEL_PTE_WRITE) {
				if (!(*ptep & INTEL_PTE_NX)) {
					/* Writable and not no-execute: W+X. */
					kprintf("W+X PTE at 0x%lx, P4: 0x%llx, P3: 0x%llx, P2: 0x%llx, PT: 0x%llx, VP: %u\n", cv, *pmap64_pml4(ipmap, cv), *pmap64_pdpt(ipmap, cv), *pmap_pde(ipmap, cv), *ptep, pmap_valid_page((ppnum_t)(i386_btop(pte_to_pa(*ptep)))));
					rv = KERN_FAILURE;
				}
			}
		}
		cv += PAGE_SIZE;
	}
	kprintf("Completed pmap scan\n");
	cv = sv;

	/* Pass 2: walk VM map entries, flag W+X protections, and compare each
	 * entry's protection with the PTEs actually mapped beneath it. */
	struct vm_region_submap_info_64 vbr;
	mach_msg_type_number_t vbrcount = 0;
	mach_vm_size_t  vmsize;
	vm_prot_t       prot;
	uint32_t nesting_depth = 0;
	kern_return_t kret;

	while (cv < ev) {
		/* Recurse through submaps until a terminal entry is reached. */
		for (;;) {
			vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64;
			if ((kret = mach_vm_region_recurse(ivmmap,
			    (mach_vm_address_t *) &cv, &vmsize, &nesting_depth,
			    (vm_region_recurse_info_t)&vbr,
			    &vbrcount)) != KERN_SUCCESS) {
				break;
			}

			if (vbr.is_submap) {
				nesting_depth++;
				continue;
			} else {
				break;
			}
		}

		/* No more entries (or lookup error): stop scanning. */
		if (kret != KERN_SUCCESS) {
			break;
		}

		prot = vbr.protection;

		if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) {
			kprintf("W+X map entry at address 0x%lx\n", cv);
			rv = KERN_FAILURE;
		}

		if (prot) {
			/* Compare each resident PTE's effective protection with
			 * the map entry's protection; absent PTEs are skipped. */
			vm_offset_t pcv;
			for (pcv = cv; pcv < cv + vmsize; pcv += PAGE_SIZE) {
				pt_entry_t *ptep = pmap_pte(ipmap, pcv);
				vm_prot_t tprot;

				if ((ptep == NULL) || !(*ptep & INTEL_PTE_VALID)) {
					continue;
				}
				tprot = VM_PROT_READ;
				if (*ptep & INTEL_PTE_WRITE) {
					tprot |= VM_PROT_WRITE;
				}
				if ((*ptep & INTEL_PTE_NX) == 0) {
					tprot |= VM_PROT_EXECUTE;
				}
				if (tprot != prot) {
					kprintf("PTE/map entry permissions mismatch at address 0x%lx, pte: 0x%llx, protection: 0x%x\n", pcv, *ptep, prot);
					rv = KERN_FAILURE;
				}
			}
		}
		cv += vmsize;
	}
	return rv;
}
3238 
3239 #if MACH_ASSERT
3240 extern int pmap_ledgers_panic;
3241 extern int pmap_ledgers_panic_leeway;
3242 
3243 static void
pmap_check_ledgers(pmap_t pmap)3244 pmap_check_ledgers(
3245 	pmap_t pmap)
3246 {
3247 	int     pid;
3248 	char    *procname;
3249 
3250 	if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
3251 		/*
3252 		 * This pmap was not or is no longer fully associated
3253 		 * with a task (e.g. the old pmap after a fork()/exec() or
3254 		 * spawn()).  Its "ledger" still points at a task that is
3255 		 * now using a different (and active) address space, so
3256 		 * we can't check that all the pmap ledgers are balanced here.
3257 		 *
3258 		 * If the "pid" is set, that means that we went through
3259 		 * pmap_set_process() in task_terminate_internal(), so
3260 		 * this task's ledger should not have been re-used and
3261 		 * all the pmap ledgers should be back to 0.
3262 		 */
3263 		return;
3264 	}
3265 
3266 	pid = pmap->pmap_pid;
3267 	procname = pmap->pmap_procname;
3268 
3269 	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
3270 }
3271 
3272 void
pmap_set_process(pmap_t pmap,int pid,char * procname)3273 pmap_set_process(
3274 	pmap_t pmap,
3275 	int pid,
3276 	char *procname)
3277 {
3278 	if (pmap == NULL || pmap->pmap_pid == -1) {
3279 		return;
3280 	}
3281 
3282 	pmap->pmap_pid = pid;
3283 	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
3284 	if (pmap_ledgers_panic_leeway) {
3285 		/*
3286 		 * XXX FBDP
3287 		 * Some processes somehow trigger some issues that make
3288 		 * the pmap stats and ledgers go off track, causing
3289 		 * some assertion failures and ledger panics.
3290 		 * Turn off the sanity checks if we allow some ledger leeway
3291 		 * because of that.  We'll still do a final check in
3292 		 * pmap_check_ledgers() for discrepancies larger than the
3293 		 * allowed leeway after the address space has been fully
3294 		 * cleaned up.
3295 		 */
3296 		pmap->pmap_stats_assert = FALSE;
3297 		ledger_disable_panic_on_negative(pmap->ledger,
3298 		    task_ledgers.phys_footprint);
3299 		ledger_disable_panic_on_negative(pmap->ledger,
3300 		    task_ledgers.internal);
3301 		ledger_disable_panic_on_negative(pmap->ledger,
3302 		    task_ledgers.internal_compressed);
3303 		ledger_disable_panic_on_negative(pmap->ledger,
3304 		    task_ledgers.iokit_mapped);
3305 		ledger_disable_panic_on_negative(pmap->ledger,
3306 		    task_ledgers.alternate_accounting);
3307 		ledger_disable_panic_on_negative(pmap->ledger,
3308 		    task_ledgers.alternate_accounting_compressed);
3309 	}
3310 }
3311 #endif /* MACH_ASSERT */
3312 
3313 
3314 #if DEVELOPMENT || DEBUG
3315 int pmap_pagezero_mitigation = 1;
3316 #endif
3317 
3318 void
pmap_advise_pagezero_range(pmap_t lpmap,uint64_t low_bound)3319 pmap_advise_pagezero_range(pmap_t lpmap, uint64_t low_bound)
3320 {
3321 #if DEVELOPMENT || DEBUG
3322 	if (pmap_pagezero_mitigation == 0) {
3323 		lpmap->pagezero_accessible = FALSE;
3324 		return;
3325 	}
3326 #endif
3327 	lpmap->pagezero_accessible = ((pmap_smap_enabled == FALSE) && (low_bound < 0x1000));
3328 	if (lpmap == current_pmap()) {
3329 		mp_disable_preemption();
3330 		current_cpu_datap()->cpu_pagezero_mapped = lpmap->pagezero_accessible;
3331 		mp_enable_preemption();
3332 	}
3333 }
3334 
3335 uintptr_t
pmap_verify_noncacheable(uintptr_t vaddr)3336 pmap_verify_noncacheable(uintptr_t vaddr)
3337 {
3338 	pt_entry_t *ptep = NULL;
3339 	ptep = pmap_pte(kernel_pmap, vaddr);
3340 	if (ptep == NULL) {
3341 		panic("pmap_verify_noncacheable: no translation for 0x%lx", vaddr);
3342 	}
3343 	/* Non-cacheable OK */
3344 	if (*ptep & (INTEL_PTE_NCACHE)) {
3345 		return pte_to_pa(*ptep) | (vaddr & INTEL_OFFMASK);
3346 	}
3347 	/* Write-combined OK */
3348 	if (*ptep & (INTEL_PTE_PAT)) {
3349 		return pte_to_pa(*ptep) | (vaddr & INTEL_OFFMASK);
3350 	}
3351 	panic("pmap_verify_noncacheable: IO read from a cacheable address? address: 0x%lx, PTE: %p, *PTE: 0x%llx", vaddr, ptep, *ptep);
3352 	/*NOTREACHED*/
3353 	return 0;
3354 }
3355 
3356 bool
pmap_lookup_in_loaded_trust_caches(const uint8_t __unused cdhash[20])3357 pmap_lookup_in_loaded_trust_caches(const uint8_t __unused cdhash[20])
3358 {
3359 	// Unsupported on this architecture.
3360 	return false;
3361 }
3362 
3363 uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t __unused cdhash[20])3364 pmap_lookup_in_static_trust_cache(const uint8_t __unused cdhash[20])
3365 {
3366 	// Unsupported on this architecture.
3367 	return false;
3368 }
3369 
int
pmap_cs_configuration(void)
{
	/* PMAP_CS does not exist on this architecture; no configuration bits. */
	return 0;
}
3376 
3377 SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
3378 uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
3379 
3380 void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])3381 pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
3382 {
3383 	simple_lock(&pmap_compilation_service_cdhash_lock, LCK_GRP_NULL);
3384 	memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
3385 	simple_unlock(&pmap_compilation_service_cdhash_lock);
3386 
3387 #if DEVELOPMENT || DEBUG
3388 	printf("Added Compilation Service CDHash through the PMAP: 0x%02X 0x%02X 0x%02X 0x%02X\n", cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
3389 #endif
3390 }
3391 
3392 bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])3393 pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
3394 {
3395 	bool match = false;
3396 
3397 	simple_lock(&pmap_compilation_service_cdhash_lock, LCK_GRP_NULL);
3398 	if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
3399 		match = true;
3400 	}
3401 	simple_unlock(&pmap_compilation_service_cdhash_lock);
3402 
3403 #if DEVELOPMENT || DEBUG
3404 	if (match) {
3405 		printf("Matched Compilation Service CDHash through the PMAP\n");
3406 	}
3407 #endif
3408 
3409 	return match;
3410 }
3411 
3412 static bool pmap_local_signing_public_key_set = false;
3413 static uint8_t pmap_local_signing_public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE] = { 0 };
3414 
3415 static bool
pmap_local_signing_public_key_is_set(void)3416 pmap_local_signing_public_key_is_set(void)
3417 {
3418 	return os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
3419 }
3420 
3421 void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE])3422 pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE])
3423 {
3424 	bool key_set = false;
3425 
3426 	/*
3427 	 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
3428 	 * a successful exchange means that the local signing public key has _not_ been
3429 	 * set. In case the key has been set, we panic as we would never expect the
3430 	 * kernel to attempt to set the key more than once.
3431 	 */
3432 	key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
3433 
3434 	if (key_set) {
3435 		panic("attempted to set the local signing public key multiple times");
3436 	}
3437 
3438 	memcpy(pmap_local_signing_public_key, public_key, PMAP_ECC_P384_PUBLIC_KEY_SIZE);
3439 
3440 #if DEVELOPMENT || DEBUG
3441 	printf("Set local signing public key\n");
3442 #endif
3443 }
3444 
3445 uint8_t*
pmap_get_local_signing_public_key(void)3446 pmap_get_local_signing_public_key(void)
3447 {
3448 	if (pmap_local_signing_public_key_is_set()) {
3449 		return pmap_local_signing_public_key;
3450 	}
3451 	return NULL;
3452 }
3453 
3454 void
pmap_unrestrict_local_signing(__unused const uint8_t cdhash[CS_CDHASH_LEN])3455 pmap_unrestrict_local_signing(
3456 	__unused const uint8_t cdhash[CS_CDHASH_LEN])
3457 {
3458 	// TODO: Once all changes across XNU and AMFI have been submitted, panic.
3459 }
3460 
/*
 * Entitlement queries are a PMAP_CS feature; this architecture has no
 * PMAP_CS support, so any call here is a programming error and panics.
 */
bool
pmap_query_entitlements(
	__unused pmap_t pmap,
	__unused CEQuery_t query,
	__unused size_t queryLength,
	__unused CEQueryContext_t finalContext)
{
	/* Callers must guard usage with PMAP_SUPPORTS_ENTITLEMENT_CHECKS. */
#if !PMAP_SUPPORTS_ENTITLEMENT_CHECKS
	panic("PMAP_CS: do not use this API without checking for \'#if PMAP_SUPPORTS_ENTITLEMENT_CHECKS\'");
#endif

	panic("PMAP_SUPPORTS_ENTITLEMENT_CHECKS should not be defined on this platform");
}
3474 
bool
pmap_cs_enabled(void)
{
	/* PMAP_CS is never enabled on this architecture. */
	return false;
}
3480 
bool
pmap_in_ppl(void)
{
	/* There is no Page Protection Layer on this architecture. */
	return false;
}
3487 
bool
pmap_has_ppl(void)
{
	/* The Page Protection Layer is not supported on this architecture. */
	return false;
}
3494 
bool
pmap_has_iofilter_protected_write(void)
{
	/*
	 * Not supported on this architecture.  Note: declared (void) rather
	 * than () — an empty parameter list is an unspecified-arguments
	 * declarator in C, inconsistent with every sibling stub here.
	 */
	return false;
}
3501 
3502 __attribute__((__noreturn__))
3503 void
pmap_iofilter_protected_write(__unused vm_address_t addr,__unused uint64_t value,__unused uint64_t width)3504 pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
3505 {
3506 	panic("%s called on an unsupported platform.", __FUNCTION__);
3507 }
3508 
3509 void* __attribute__((noreturn))
pmap_image4_pmap_data(__unused size_t * allocated_size)3510 pmap_image4_pmap_data(
3511 	__unused size_t *allocated_size)
3512 {
3513 	panic("PMAP_IMG4: image4 data not available on this architecture");
3514 }
3515 
3516 void __attribute__((noreturn))
pmap_image4_set_nonce(__unused const img4_nonce_domain_index_t ndi,__unused const img4_nonce_t * nonce)3517 pmap_image4_set_nonce(
3518 	__unused const img4_nonce_domain_index_t ndi,
3519 	__unused const img4_nonce_t *nonce)
3520 {
3521 	panic("PMAP_IMG4: set nonce API not supported on this architecture");
3522 }
3523 
3524 void __attribute__((noreturn))
pmap_image4_roll_nonce(__unused const img4_nonce_domain_index_t ndi)3525 pmap_image4_roll_nonce(
3526 	__unused const img4_nonce_domain_index_t ndi)
3527 {
3528 	panic("PMAP_IMG4: roll nonce API not supported on this architecture");
3529 }
3530 
3531 errno_t __attribute__((noreturn))
pmap_image4_copy_nonce(__unused const img4_nonce_domain_index_t ndi,__unused img4_nonce_t * nonce_out)3532 pmap_image4_copy_nonce(
3533 	__unused const img4_nonce_domain_index_t ndi,
3534 	__unused img4_nonce_t *nonce_out
3535 	)
3536 {
3537 	panic("PMAP_IMG4: copy nonce API not supported on this architecture");
3538 }
3539 
3540 errno_t __attribute__((noreturn))
pmap_image4_execute_object(__unused img4_runtime_object_spec_index_t obj_spec_index,__unused const img4_buff_t * payload,__unused const img4_buff_t * _Nullable manifest)3541 pmap_image4_execute_object(
3542 	__unused img4_runtime_object_spec_index_t obj_spec_index,
3543 	__unused const img4_buff_t *payload,
3544 	__unused const img4_buff_t *_Nullable manifest)
3545 {
3546 	panic("PMAP_IMG4: execute object API not supported on this architecture");
3547 }
3548 
3549 errno_t __attribute__((noreturn))
pmap_image4_copy_object(__unused img4_runtime_object_spec_index_t obj_spec_index,__unused vm_address_t object_out,__unused size_t * object_length)3550 pmap_image4_copy_object(
3551 	__unused img4_runtime_object_spec_index_t obj_spec_index,
3552 	__unused vm_address_t object_out,
3553 	__unused size_t *object_length)
3554 {
3555 	panic("PMAP_IMG4: copy object API not supported on this architecture");
3556 }
3557 
3558 kern_return_t
pmap_cs_allow_invalid(__unused pmap_t pmap)3559 pmap_cs_allow_invalid(__unused pmap_t pmap)
3560 {
3561 	// Unsupported on this architecture.
3562 	return KERN_SUCCESS;
3563 }
3564 
3565 void *
pmap_claim_reserved_ppl_page(void)3566 pmap_claim_reserved_ppl_page(void)
3567 {
3568 	// Unsupported on this architecture.
3569 	return NULL;
3570 }
3571 
3572 void
pmap_free_reserved_ppl_page(void __unused * kva)3573 pmap_free_reserved_ppl_page(void __unused *kva)
3574 {
3575 	// Unsupported on this architecture.
3576 }
3577 
3578 kern_return_t
pmap_cs_fork_prepare(__unused pmap_t old_pmap,__unused pmap_t new_pmap)3579 pmap_cs_fork_prepare(__unused pmap_t old_pmap, __unused pmap_t new_pmap)
3580 {
3581 	// PMAP_CS isn't enabled for x86_64.
3582 	return KERN_SUCCESS;
3583 }
3584 
3585 #if DEVELOPMENT || DEBUG
3586 /*
3587  * Used for unit testing recovery from text corruptions.
3588  */
3589 kern_return_t
pmap_test_text_corruption(pmap_paddr_t pa)3590 pmap_test_text_corruption(pmap_paddr_t pa)
3591 {
3592 	int pai;
3593 	uint8_t *va;
3594 
3595 	pai = ppn_to_pai(atop(pa));
3596 	if (!IS_MANAGED_PAGE(pai)) {
3597 		return KERN_FAILURE;
3598 	}
3599 
3600 	va = (uint8_t *)PHYSMAP_PTOV(pa);
3601 	va[0] = 0x0f; /* opcode for UD2 */
3602 	va[1] = 0x0b;
3603 
3604 	return KERN_SUCCESS;
3605 }
3606 #endif /* DEVELOPMENT || DEBUG */
3607