1 /*
2 * Copyright (c) 2000-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58
59 /*
60 * File: pmap.c
61 * Author: Avadis Tevanian, Jr., Michael Wayne Young
62 * (These guys wrote the Vax version)
63 *
64 * Physical Map management code for Intel i386, i486, and i860.
65 *
66 * Manages physical address maps.
67 *
68 * In addition to hardware address maps, this
69 * module is called upon to provide software-use-only
70 * maps which may or may not be stored in the same
71 * form as hardware maps. These pseudo-maps are
72 * used to store intermediate results from copy
73 * operations to and from address spaces.
74 *
75 * Since the information managed by this module is
76 * also stored by the logical address mapping module,
77 * this module may throw away valid virtual-to-physical
78 * mappings at almost any time. However, invalidations
79 * of virtual-to-physical mappings must be done as
80 * requested.
81 *
82 * In order to cope with hardware architectures which
83 * make virtual-to-physical map invalidates expensive,
84 * this module may delay invalidate or reduced protection
85 * operations until such time as they are actually
86 * necessary. This module is given full information as
87 * to which processors are currently using which maps,
88 * and to when physical maps must be made correct.
89 */
90
91 #include <string.h>
92 #include <mach_ldebug.h>
93
94 #include <libkern/OSAtomic.h>
95
96 #include <mach/machine/vm_types.h>
97
98 #include <mach/boolean.h>
99 #include <kern/thread.h>
100 #include <kern/zalloc.h>
101 #include <kern/zalloc_internal.h>
102 #include <kern/queue.h>
103 #include <kern/ledger.h>
104 #include <kern/mach_param.h>
105
106 #include <kern/spl.h>
107
108 #include <vm/pmap.h>
109 #include <vm/pmap_cs.h>
110 #include <vm/vm_map.h>
111 #include <vm/vm_kern.h>
112 #include <mach/vm_param.h>
113 #include <mach/vm_prot.h>
114 #include <vm/vm_object.h>
115 #include <vm/vm_page.h>
116
117 #include <mach/machine/vm_param.h>
118 #include <machine/thread.h>
119
120 #include <kern/misc_protos.h> /* prototyping */
121 #include <i386/misc_protos.h>
122 #include <i386/i386_lowmem.h>
123 #include <x86_64/lowglobals.h>
124
125 #include <i386/cpuid.h>
126 #include <i386/cpu_data.h>
127 #include <i386/cpu_number.h>
128 #include <i386/machine_cpu.h>
129 #include <i386/seg.h>
130 #include <i386/serial_io.h>
131 #include <i386/cpu_capabilities.h>
132 #include <i386/machine_routines.h>
133 #include <i386/proc_reg.h>
134 #include <i386/tsc.h>
135 #include <i386/pmap_internal.h>
136 #include <i386/pmap_pcid.h>
137 #if CONFIG_VMX
138 #include <i386/vmx/vmx_cpu.h>
139 #endif
140
141 #include <vm/vm_protos.h>
142 #include <san/kasan.h>
143
144 #include <i386/mp.h>
145 #include <i386/mp_desc.h>
146 #include <libkern/kernel_mach_header.h>
147
148 #include <pexpert/i386/efi.h>
149 #include <libkern/section_keywords.h>
150 #if MACH_ASSERT
151 int pmap_stats_assert = 1;
152 #endif /* MACH_ASSERT */
153
154 #ifdef IWANTTODEBUG
155 #undef DEBUG
156 #define DEBUG 1
157 #define POSTCODE_DELAY 1
158 #include <i386/postcode.h>
159 #endif /* IWANTTODEBUG */
160
161 #ifdef PMAP_DEBUG
162 #define DBG(x...) kprintf("DBG: " x)
163 #else
164 #define DBG(x...)
165 #endif
166 /* Compile time assert to ensure adjacency/alignment of per-CPU data fields used
167 * in the trampolines for kernel/user boundary TLB coherency.
168 */
169 char pmap_cpu_data_assert[(((offsetof(cpu_data_t, cpu_tlb_invalid) - offsetof(cpu_data_t, cpu_active_cr3)) == 8) && (offsetof(cpu_data_t, cpu_active_cr3) % 64 == 0)) ? 1 : -1];
170 boolean_t pmap_trace = FALSE;
171
172 boolean_t no_shared_cr3 = DEBUG; /* TRUE for DEBUG by default */
173
174 #if DEVELOPMENT || DEBUG
175 int nx_enabled = 1; /* enable no-execute protection -- set during boot */
176 #else
177 const int nx_enabled = 1;
178 #endif
179
180 #if DEBUG || DEVELOPMENT
181 int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
182 int allow_stack_exec = 0; /* No apps may execute from the stack by default */
183 #else /* DEBUG || DEVELOPMENT */
184 const int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
185 const int allow_stack_exec = 0; /* No apps may execute from the stack by default */
186 #endif /* DEBUG || DEVELOPMENT */
187
188 uint64_t max_preemption_latency_tsc = 0;
189
190 pv_hashed_entry_t *pv_hash_table; /* hash lists */
191
192 uint32_t npvhashmask = 0, npvhashbuckets = 0;
193
194 pv_hashed_entry_t pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
195 pv_hashed_entry_t pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
196 SIMPLE_LOCK_DECLARE(pv_hashed_free_list_lock, 0);
197 SIMPLE_LOCK_DECLARE(pv_hashed_kern_free_list_lock, 0);
198 SIMPLE_LOCK_DECLARE(pv_hash_table_lock, 0);
199 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
200
201 SECURITY_READ_ONLY_LATE(zone_t) pv_hashed_list_zone; /* zone of pv_hashed_entry structures */
202
203 /*
204 * First and last physical addresses that we maintain any information
205 * for. Initialized to zero so that pmap operations done before
206 * pmap_init won't touch any non-existent structures.
207 */
208 boolean_t pmap_initialized = FALSE;/* Has pmap_init completed? */
209
210 static struct vm_object kptobj_object_store VM_PAGE_PACKED_ALIGNED;
211 static struct vm_object kpml4obj_object_store VM_PAGE_PACKED_ALIGNED;
212 static struct vm_object kpdptobj_object_store VM_PAGE_PACKED_ALIGNED;
213
214 /*
 * Array of physical page attributes for managed pages.
216 * One byte per physical page.
217 */
218 char *pmap_phys_attributes;
219 ppnum_t last_managed_page = 0;
220
221 unsigned pmap_memory_region_count;
222 unsigned pmap_memory_region_current;
223
224 pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
225
226 /*
227 * Other useful macros.
228 */
229 #define current_pmap() (vm_map_pmap(current_thread()->map))
230
231 struct pmap kernel_pmap_store;
232 const pmap_t kernel_pmap = &kernel_pmap_store;
233 SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */
234 SECURITY_READ_ONLY_LATE(zone_t) pmap_anchor_zone;
235 SECURITY_READ_ONLY_LATE(zone_t) pmap_uanchor_zone;
236 int pmap_debug = 0; /* flag for debugging prints */
237
238 unsigned int inuse_ptepages_count = 0;
239 long long alloc_ptepages_count __attribute__((aligned(8))) = 0; /* aligned for atomic access */
240 unsigned int bootstrap_wired_pages = 0;
241
242 extern long NMIPI_acks;
243
244 SECURITY_READ_ONLY_LATE(boolean_t) kernel_text_ps_4K = TRUE;
245
246 extern char end;
247
248 static int nkpt;
249
250 #if DEVELOPMENT || DEBUG
251 SECURITY_READ_ONLY_LATE(boolean_t) pmap_disable_kheap_nx = FALSE;
252 SECURITY_READ_ONLY_LATE(boolean_t) pmap_disable_kstack_nx = FALSE;
253 SECURITY_READ_ONLY_LATE(boolean_t) wpkernel = TRUE;
254 #else
255 const boolean_t wpkernel = TRUE;
256 #endif
257
258 extern long __stack_chk_guard[];
259
260 static uint64_t pmap_eptp_flags = 0;
261 boolean_t pmap_ept_support_ad = FALSE;
262
263 static void process_pmap_updates(pmap_t, bool, addr64_t, addr64_t);
264 /*
265 * Map memory at initialization. The physical addresses being
266 * mapped are not managed and are never unmapped.
267 *
268 * For now, VM is already on, we only need to map the
269 * specified memory.
270 */
271 vm_offset_t
pmap_map(vm_offset_t virt,vm_map_offset_t start_addr,vm_map_offset_t end_addr,vm_prot_t prot,unsigned int flags)272 pmap_map(
273 vm_offset_t virt,
274 vm_map_offset_t start_addr,
275 vm_map_offset_t end_addr,
276 vm_prot_t prot,
277 unsigned int flags)
278 {
279 kern_return_t kr;
280 int ps;
281
282 ps = PAGE_SIZE;
283 while (start_addr < end_addr) {
284 kr = pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
285 (ppnum_t) i386_btop(start_addr), prot, VM_PROT_NONE, flags, TRUE);
286
287 if (kr != KERN_SUCCESS) {
288 panic("%s: failed pmap_enter, "
289 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
290 __FUNCTION__,
291 (void *)virt, (void *)start_addr, (void *)end_addr, prot, flags);
292 }
293
294 virt += ps;
295 start_addr += ps;
296 }
297 return virt;
298 }
299
300 extern char *first_avail;
301 extern vm_offset_t virtual_avail, virtual_end;
302 extern pmap_paddr_t avail_start, avail_end;
303 extern vm_offset_t sHIB;
304 extern vm_offset_t eHIB;
305 extern vm_offset_t stext;
306 extern vm_offset_t etext;
307 extern vm_offset_t sdata, edata;
308 extern vm_offset_t sconst, econst;
309
310 extern void *KPTphys;
311
312 boolean_t pmap_smep_enabled = FALSE;
313 boolean_t pmap_smap_enabled = FALSE;
314
/*
 * Per-CPU pmap initialization, run on each processor during bring-up:
 * enables paging-related CR4 features (PGE always; SMEP/SMAP when the
 * CPU supports them), points the CPU at the kernel's CR3, and resets
 * the per-CPU TLB bookkeeping fields consumed by the trampolines (see
 * the compile-time layout assert near the top of this file).
 */
void
pmap_cpu_init(void)
{
	cpu_data_t *cdp = current_cpu_datap();

	/* Enable global pages so kernel TLB entries survive CR3 reloads. */
	set_cr4(get_cr4() | CR4_PGE);

	/*
	 * Initialize the per-cpu, TLB-related fields.
	 */
	cdp->cpu_kernel_cr3 = kernel_pmap->pm_cr3;
	/* Mirror the kernel CR3 into this CPU's shadow structure as well. */
	cpu_shadowp(cdp->cpu_number)->cpu_kernel_cr3 = cdp->cpu_kernel_cr3;
	cdp->cpu_active_cr3 = kernel_pmap->pm_cr3;
	cdp->cpu_tlb_invalid = 0;
	cdp->cpu_task_map = TASK_MAP_64BIT;

	pmap_pcid_configure();
	/* Supervisor Mode Execute Prevention, if present in CPUID leaf 7. */
	if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMEP) {
		pmap_smep_enabled = TRUE;
#if DEVELOPMENT || DEBUG
		/* Boot-arg escape hatch for debugging. */
		boolean_t nsmep;
		if (PE_parse_boot_argn("-pmap_smep_disable", &nsmep, sizeof(nsmep))) {
			pmap_smep_enabled = FALSE;
		}
#endif
		if (pmap_smep_enabled) {
			set_cr4(get_cr4() | CR4_SMEP);
		}
	}
	/* Supervisor Mode Access Prevention, if present in CPUID leaf 7. */
	if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMAP) {
		pmap_smap_enabled = TRUE;
#if DEVELOPMENT || DEBUG
		/* Boot-arg escape hatch for debugging. */
		boolean_t nsmap;
		if (PE_parse_boot_argn("-pmap_smap_disable", &nsmap, sizeof(nsmap))) {
			pmap_smap_enabled = FALSE;
		}
#endif
		if (pmap_smap_enabled) {
			set_cr4(get_cr4() | CR4_SMAP);
		}
	}

#if !MONOTONIC
	if (cdp->cpu_fixed_pmcs_enabled) {
		boolean_t enable = TRUE;
		cpu_pmc_control(&enable);
	}
#endif /* !MONOTONIC */
}
364
365 static void
pmap_ro_zone_validate_element_dst(zone_id_t zid,vm_offset_t va,vm_offset_t offset,vm_size_t new_data_size)366 pmap_ro_zone_validate_element_dst(
367 zone_id_t zid,
368 vm_offset_t va,
369 vm_offset_t offset,
370 vm_size_t new_data_size)
371 {
372 vm_size_t elem_size = zone_elem_size_ro(zid);
373 vm_offset_t sum = 0, page = trunc_page(va);
374 if (__improbable(new_data_size > (elem_size - offset))) {
375 panic("%s: New data size %lu too large for elem size %lu at addr %p",
376 __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
377 }
378 if (__improbable(offset >= elem_size)) {
379 panic("%s: Offset %lu too large for elem size %lu at addr %p",
380 __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
381 }
382 if (__improbable(os_add3_overflow(va, offset, new_data_size, &sum))) {
383 panic("%s: Integer addition overflow %p + %lu + %lu = %lu",
384 __func__, (void*)va, (uintptr_t)offset, (uintptr_t) new_data_size,
385 (uintptr_t)sum);
386 }
387 if (__improbable((va - page) % elem_size)) {
388 panic("%s: Start of element %p is not aligned to element size %lu",
389 __func__, (void *)va, (uintptr_t)elem_size);
390 }
391
392 /* Check element is from correct zone */
393 zone_require_ro(zid, elem_size, (void*)va);
394 }
395
396 static void
pmap_ro_zone_validate_element(zone_id_t zid,vm_offset_t va,vm_offset_t offset,const vm_offset_t new_data,vm_size_t new_data_size)397 pmap_ro_zone_validate_element(
398 zone_id_t zid,
399 vm_offset_t va,
400 vm_offset_t offset,
401 const vm_offset_t new_data,
402 vm_size_t new_data_size)
403 {
404 vm_offset_t sum = 0;
405
406 if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
407 panic("%s: Integer addition overflow %p + %lu = %lu",
408 __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
409 }
410
411 pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
412 }
413
414 void
pmap_ro_zone_memcpy(zone_id_t zid,vm_offset_t va,vm_offset_t offset,const vm_offset_t new_data,vm_size_t new_data_size)415 pmap_ro_zone_memcpy(
416 zone_id_t zid,
417 vm_offset_t va,
418 vm_offset_t offset,
419 const vm_offset_t new_data,
420 vm_size_t new_data_size)
421 {
422 const pmap_paddr_t pa = kvtophys(va + offset);
423
424 if (!new_data || new_data_size == 0) {
425 return;
426 }
427
428 pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
429 /* Write through Physical Aperture */
430 memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
431 }
432
433 uint64_t
pmap_ro_zone_atomic_op(zone_id_t zid,vm_offset_t va,vm_offset_t offset,zro_atomic_op_t op,uint64_t value)434 pmap_ro_zone_atomic_op(
435 zone_id_t zid,
436 vm_offset_t va,
437 vm_offset_t offset,
438 zro_atomic_op_t op,
439 uint64_t value)
440 {
441 const pmap_paddr_t pa = kvtophys(va + offset);
442 vm_size_t value_size = op & 0xf;
443
444 pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
445 /* Write through Physical Aperture */
446 return __zalloc_ro_mut_atomic(phystokv(pa), op, value);
447 }
448
449 void
pmap_ro_zone_bzero(zone_id_t zid,vm_offset_t va,vm_offset_t offset,vm_size_t size)450 pmap_ro_zone_bzero(
451 zone_id_t zid,
452 vm_offset_t va,
453 vm_offset_t offset,
454 vm_size_t size)
455 {
456 const pmap_paddr_t pa = kvtophys(va + offset);
457 pmap_ro_zone_validate_element(zid, va, offset, 0, size);
458 bzero((void*)phystokv(pa), size);
459 }
460
461 static uint32_t
pmap_scale_shift(void)462 pmap_scale_shift(void)
463 {
464 uint32_t scale = 0;
465
466 if (sane_size <= 8 * GB) {
467 scale = (uint32_t)(sane_size / (2 * GB));
468 } else if (sane_size <= 32 * GB) {
469 scale = 4 + (uint32_t)((sane_size - (8 * GB)) / (4 * GB));
470 } else {
471 scale = 10 + (uint32_t)MIN(4, ((sane_size - (32 * GB)) / (8 * GB)));
472 }
473 return scale;
474 }
475
476 LCK_GRP_DECLARE(pmap_lck_grp, "pmap");
477 LCK_ATTR_DECLARE(pmap_lck_rw_attr, 0, LCK_ATTR_DEBUG);
478
/*
 * Bootstrap the system enough to run with virtual memory.
 * Map the kernel's code and data, and allocate the system page table.
 * Called with mapping OFF.  Page_size must already be set.
 *
 * Initializes the statically allocated kernel pmap (page-table roots,
 * PCID state, per-CPU CR3), sizes the PV hash, and processes
 * pmap-related boot-args.  Runs before pmap_init().
 */
void
pmap_bootstrap(
	__unused vm_offset_t load_start,
	__unused boolean_t IA32e)
{
	assert(IA32e);

	vm_last_addr = VM_MAX_KERNEL_ADDRESS;   /* Set the highest address
	                                         * known to VM */
	/*
	 * The kernel's pmap is statically allocated so we don't
	 * have to use pmap_create, which is unlikely to work
	 * correctly at this part of the boot sequence.
	 */

	os_ref_init(&kernel_pmap->ref_count, NULL);
#if DEVELOPMENT || DEBUG
	kernel_pmap->nx_enabled = TRUE;
#endif
	kernel_pmap->pm_task_map = TASK_MAP_64BIT;
	kernel_pmap->pm_obj = (vm_object_t) NULL;
	/* Kernel and user page-table roots both point at the boot IdlePML4. */
	kernel_pmap->pm_pml4 = IdlePML4;
	kernel_pmap->pm_upml4 = IdlePML4;
	kernel_pmap->pm_cr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
	kernel_pmap->pm_ucr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
	kernel_pmap->pm_eptp = 0;

	pmap_pcid_initialize_kernel(kernel_pmap);

	/* Publish the kernel CR3 for the boot CPU and its shadow copy. */
	current_cpu_datap()->cpu_kernel_cr3 = cpu_shadowp(cpu_number())->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;

	/* Account for the NKPT boot-time kernel page-table pages. */
	nkpt = NKPT;
	OSAddAtomic(NKPT, &inuse_ptepages_count);
	OSAddAtomic64(NKPT, &alloc_ptepages_count);
	bootstrap_wired_pages = NKPT;

	virtual_avail = (vm_offset_t)(VM_MIN_KERNEL_ADDRESS) + (vm_offset_t)first_avail;
	virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);

	/* Size the PV hash: boot-arg override, else scale with memory size. */
	if (!PE_parse_boot_argn("npvhash", &npvhashmask, sizeof(npvhashmask))) {
		npvhashmask = ((NPVHASHBUCKETS) << pmap_scale_shift()) - 1;
	}

	npvhashbuckets = npvhashmask + 1;

	/* The mask must be of the form (2^N)-1 for the hash to work. */
	if (0 != ((npvhashbuckets) & npvhashmask)) {
		panic("invalid hash %d, must be ((2^N)-1), "
		    "using default %d\n", npvhashmask, NPVHASHMASK);
	}

	lck_rw_init(&kernel_pmap->pmap_rwl, &pmap_lck_grp, &pmap_lck_rw_attr);
	kernel_pmap->pmap_rwl.lck_rw_can_sleep = FALSE;

	pmap_cpu_init();

	if (pmap_pcid_ncpus) {
		printf("PMAP: PCID enabled\n");
	}

	if (pmap_smep_enabled) {
		printf("PMAP: Supervisor Mode Execute Protection enabled\n");
	}
	if (pmap_smap_enabled) {
		printf("PMAP: Supervisor Mode Access Protection enabled\n");
	}

#if DEBUG
	printf("Stack canary: 0x%lx\n", __stack_chk_guard[0]);
	printf("early_random(): 0x%qx\n", early_random());
#endif
#if DEVELOPMENT || DEBUG
	boolean_t ptmp;
	/* Check if the user has requested disabling stack or heap no-execute
	 * enforcement. These are "const" variables; that qualifier is cast away
	 * when altering them. The TEXT/DATA const sections are marked
	 * write protected later in the kernel startup sequence, so altering
	 * them is possible at this point, in pmap_bootstrap().
	 */
	if (PE_parse_boot_argn("-pmap_disable_kheap_nx", &ptmp, sizeof(ptmp))) {
		boolean_t *pdknxp = (boolean_t *) &pmap_disable_kheap_nx;
		*pdknxp = TRUE;
	}

	if (PE_parse_boot_argn("-pmap_disable_kstack_nx", &ptmp, sizeof(ptmp))) {
		boolean_t *pdknhp = (boolean_t *) &pmap_disable_kstack_nx;
		*pdknhp = TRUE;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* 32-bit EFI firmware constrains the usable kernel virtual range. */
	boot_args *args = (boot_args *)PE_state.bootArgs;
	if (args->efiMode == kBootArgsEfiMode32) {
		printf("EFI32: kernel virtual space limited to 4GB\n");
		virtual_end = VM_MAX_KERNEL_ADDRESS_EFI32;
	}
	kprintf("Kernel virtual space from 0x%lx to 0x%lx.\n",
	    (long)KERNEL_BASE, (long)virtual_end);
	kprintf("Available physical space from 0x%llx to 0x%llx\n",
	    avail_start, avail_end);

	/*
	 * The -no_shared_cr3 boot-arg is a debugging feature (set by default
	 * in the DEBUG kernel) to force the kernel to switch to its own map
	 * (and cr3) when control is in kernelspace. The kernel's map does not
	 * include (i.e. share) userspace so wild references will cause
	 * a panic. Only copyin and copyout are exempt from this.
	 */
	(void) PE_parse_boot_argn("-no_shared_cr3",
	    &no_shared_cr3, sizeof(no_shared_cr3));
	if (no_shared_cr3) {
		kprintf("Kernel not sharing user map\n");
	}

#ifdef PMAP_TRACES
	if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof(pmap_trace))) {
		kprintf("Kernel traces for pmap operations enabled\n");
	}
#endif /* PMAP_TRACES */

#if MACH_ASSERT
	PE_parse_boot_argn("pmap_asserts", &pmap_asserts_enabled, sizeof(pmap_asserts_enabled));
	PE_parse_boot_argn("pmap_stats_assert",
	    &pmap_stats_assert,
	    sizeof(pmap_stats_assert));
#endif /* MACH_ASSERT */
}
610
/*
 * Report the [virtual_avail, virtual_end) range of kernel virtual
 * address space still available to the VM system.
 *
 * startp: out - first available kernel virtual address.
 * endp:   out - end of the kernel virtual address range.
 */
void
pmap_virtual_space(
	vm_offset_t *startp,
	vm_offset_t *endp)
{
	*startp = virtual_avail;
	*endp = virtual_end;
}
619
620
621
622
623 #if HIBERNATION
624
625 #include <IOKit/IOHibernatePrivate.h>
626 #include <machine/pal_hibernate.h>
627
628 int32_t pmap_npages;
629 int32_t pmap_teardown_last_valid_compact_indx = -1;
630
631 void pmap_pack_index(uint32_t);
632 int32_t pmap_unpack_index(pv_rooted_entry_t);
633
634 int32_t
pmap_unpack_index(pv_rooted_entry_t pv_h)635 pmap_unpack_index(pv_rooted_entry_t pv_h)
636 {
637 int32_t indx = 0;
638
639 indx = (int32_t)(*((uint64_t *)(&pv_h->qlink.next)) >> 48);
640 indx = indx << 16;
641 indx |= (int32_t)(*((uint64_t *)(&pv_h->qlink.prev)) >> 48);
642
643 *((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)0xffff << 48);
644 *((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)0xffff << 48);
645
646 return indx;
647 }
648
649
650 void
pmap_pack_index(uint32_t indx)651 pmap_pack_index(uint32_t indx)
652 {
653 pv_rooted_entry_t pv_h;
654
655 pv_h = &pv_head_table[indx];
656
657 *((uint64_t *)(&pv_h->qlink.next)) &= ~((uint64_t)0xffff << 48);
658 *((uint64_t *)(&pv_h->qlink.prev)) &= ~((uint64_t)0xffff << 48);
659
660 *((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)(indx >> 16)) << 48;
661 *((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)(indx & 0xffff)) << 48;
662 }
663
664
/*
 * Hibernation teardown: compact pv_head_table in place so that only
 * entries with a non-NULL pmap remain at the front, packing each moved
 * entry's original index into its qlink fields (pmap_pack_index) so
 * pal_hib_rebuild_pmap_structs() can restore the table on resume.
 * Returns, via the out parameters, the unused tail of the table that
 * need not be preserved in the hibernation image.
 */
void
pal_hib_teardown_pmap_structs(addr64_t *unneeded_start, addr64_t *unneeded_end)
{
	int32_t i;
	int32_t compact_target_indx;

	compact_target_indx = 0;

	for (i = 0; i < pmap_npages; i++) {
		if (pv_head_table[i].pmap == PMAP_NULL) {
			/* Remember the first hole of the current run of holes. */
			if (pv_head_table[compact_target_indx].pmap != PMAP_NULL) {
				compact_target_indx = i;
			}
		} else {
			/* Stash this entry's index in its own qlink fields. */
			pmap_pack_index((uint32_t)i);

			if (pv_head_table[compact_target_indx].pmap == PMAP_NULL) {
				/*
				 * we've got a hole to fill, so
				 * move this pv_rooted_entry_t to its new home
				 */
				pv_head_table[compact_target_indx] = pv_head_table[i];
				pv_head_table[i].pmap = PMAP_NULL;

				pmap_teardown_last_valid_compact_indx = compact_target_indx;
				compact_target_indx++;
			} else {
				/* No hole before us; the entry stays where it is. */
				pmap_teardown_last_valid_compact_indx = i;
			}
		}
	}
	/* Everything past the last compacted entry can be discarded. */
	*unneeded_start = (addr64_t)&pv_head_table[pmap_teardown_last_valid_compact_indx + 1];
	*unneeded_end = (addr64_t)&pv_head_table[pmap_npages - 1];

	HIBLOG("pal_hib_teardown_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
}
701
702
/*
 * Hibernation resume: undo pal_hib_teardown_pmap_structs().  Walk the
 * compacted pv_head_table from its last valid entry backwards, unpack
 * each entry's original index from its qlink fields, move the entry
 * back to that slot, and zero-fill the holes in between.
 */
void
pal_hib_rebuild_pmap_structs(void)
{
	int32_t cindx, eindx, rindx = 0;
	pv_rooted_entry_t pv_h;

	eindx = (int32_t)pmap_npages;

	for (cindx = pmap_teardown_last_valid_compact_indx; cindx >= 0; cindx--) {
		pv_h = &pv_head_table[cindx];

		/* Recover the original index (also restores the qlink bits). */
		rindx = pmap_unpack_index(pv_h);
		assert(rindx < pmap_npages);

		if (rindx != cindx) {
			/*
			 * this pv_rooted_entry_t was moved by pal_hib_teardown_pmap_structs,
			 * so move it back to its real location
			 */
			pv_head_table[rindx] = pv_head_table[cindx];
		}
		if (rindx + 1 != eindx) {
			/*
			 * the 'hole' between this vm_rooted_entry_t and the previous
			 * vm_rooted_entry_t we moved needs to be initialized as
			 * a range of zero'd vm_rooted_entry_t's
			 */
			bzero((char *)&pv_head_table[rindx + 1], (eindx - rindx - 1) * sizeof(struct pv_rooted_entry));
		}
		eindx = rindx;
	}
	/* Zero everything below the lowest restored entry as well. */
	if (rindx) {
		bzero((char *)&pv_head_table[0], rindx * sizeof(struct pv_rooted_entry));
	}

	HIBLOG("pal_hib_rebuild_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
}
740
741 #endif
742
743 /*
744 * Create pv entries for kernel pages mapped by early startup code.
745 * These have to exist so we can ml_static_mfree() them later.
746 */
747 static void
pmap_pv_fixup(vm_offset_t start_va,vm_offset_t end_va)748 pmap_pv_fixup(vm_offset_t start_va, vm_offset_t end_va)
749 {
750 ppnum_t ppn;
751 pv_rooted_entry_t pv_h;
752 uint32_t pgsz;
753
754 start_va = round_page(start_va);
755 end_va = trunc_page(end_va);
756 while (start_va < end_va) {
757 pgsz = PAGE_SIZE;
758 ppn = pmap_find_phys(kernel_pmap, start_va);
759 if (ppn != 0 && IS_MANAGED_PAGE(ppn)) {
760 pv_h = pai_to_pvh(ppn);
761 assert(pv_h->qlink.next == 0); /* shouldn't be init'd yet */
762 assert(pv_h->pmap == 0);
763 pv_h->va_and_flags = start_va;
764 pv_h->pmap = kernel_pmap;
765 queue_init(&pv_h->qlink);
766 /*
767 * Note that pmap_query_pagesize does not enforce start_va is aligned
768 * on a 2M boundary if it's within a large page
769 */
770 if (pmap_query_pagesize(kernel_pmap, start_va) == I386_LPGBYTES) {
771 pgsz = I386_LPGBYTES;
772 }
773 }
774 if (os_add_overflow(start_va, pgsz, &start_va)) {
775 #if DEVELOPMENT || DEBUG
776 panic("pmap_pv_fixup: Unexpected address wrap (0x%lx after adding 0x%x)", start_va, pgsz);
777 #else
778 start_va = end_va;
779 #endif
780 }
781 }
782 }
783
/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 *
 * Carves one large allocation into the pv_head_table, pv hash table,
 * the two lock tables and the per-page attribute array; marks managed
 * pages; creates the pmap-related zones; and seeds pv entries for
 * early boot mappings.
 */
void
pmap_init(void)
{
	long npages;
	vm_offset_t addr;
	vm_size_t s, vsize;
	vm_map_offset_t vaddr;
	ppnum_t ppn;


	/* Back the kernel's PML4/PDPT/PDE page-table pages with static VM objects. */
	kernel_pmap->pm_obj_pml4 = &kpml4obj_object_store;
	_vm_object_allocate((vm_object_size_t)NPML4PGS * PAGE_SIZE, &kpml4obj_object_store);

	kernel_pmap->pm_obj_pdpt = &kpdptobj_object_store;
	_vm_object_allocate((vm_object_size_t)NPDPTPGS * PAGE_SIZE, &kpdptobj_object_store);

	kernel_pmap->pm_obj = &kptobj_object_store;
	_vm_object_allocate((vm_object_size_t)NPDEPGS * PAGE_SIZE, &kptobj_object_store);

	/*
	 * Allocate memory for the pv_head_table and its lock bits,
	 * the modify bit array, and the pte_page table.
	 */

	/*
	 * zero bias all these arrays now instead of off avail_start
	 * so we cover all memory
	 */

	npages = i386_btop(avail_end);
#if HIBERNATION
	pmap_npages = (uint32_t)npages;
#endif
	/* One allocation holds all of the tables, laid out back to back. */
	s = (vm_size_t) (sizeof(struct pv_rooted_entry) * npages
	    + (sizeof(struct pv_hashed_entry_t *) * (npvhashbuckets))
	    + pv_lock_table_size(npages)
	    + pv_hash_lock_table_size((npvhashbuckets))
	    + npages);
	s = round_page(s);

	kmem_alloc(kernel_map, &addr, s,
	    KMA_NOFAIL | KMA_ZERO | KMA_KOBJECT | KMA_PERMANENT,
	    VM_KERN_MEMORY_PMAP);

	/* Remember the allocation so its pages can be marked below. */
	vaddr = addr;
	vsize = s;

#if PV_DEBUG
	if (0 == npvhashmask) {
		panic("npvhashmask not initialized");
	}
#endif

	/*
	 * Allocate the structures first to preserve word-alignment.
	 */
	pv_head_table = (pv_rooted_entry_t) addr;
	addr = (vm_offset_t) (pv_head_table + npages);

	pv_hash_table = (pv_hashed_entry_t *)addr;
	addr = (vm_offset_t) (pv_hash_table + (npvhashbuckets));

	pv_lock_table = (char *) addr;
	addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));

	pv_hash_lock_table = (char *) addr;
	addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhashbuckets)));

	/* The remainder: one attribute byte per physical page. */
	pmap_phys_attributes = (char *) addr;

	/* Mark every page of EFI conventional memory as managed. */
	ppnum_t last_pn = i386_btop(avail_end);
	unsigned int i;
	pmap_memory_region_t *pmptr = pmap_memory_regions;
	for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
		if (pmptr->type != kEfiConventionalMemory) {
			continue;
		}
		ppnum_t pn;
		for (pn = pmptr->base; pn <= pmptr->end; pn++) {
			if (pn < last_pn) {
				pmap_phys_attributes[pn] |= PHYS_MANAGED;

				if (pn > last_managed_page) {
					last_managed_page = pn;
				}

				/* Pages used by boot structures are never encrypted. */
				if ((pmap_high_used_bottom <= pn && pn <= pmap_high_used_top) ||
				    (pmap_middle_used_bottom <= pn && pn <= pmap_middle_used_top)) {
					pmap_phys_attributes[pn] |= PHYS_NOENCRYPT;
				}
			}
		}
	}
	/* The pages backing the tables allocated above are never encrypted. */
	while (vsize) {
		ppn = pmap_find_phys(kernel_pmap, vaddr);

		pmap_phys_attributes[ppn] |= PHYS_NOENCRYPT;

		vaddr += PAGE_SIZE;
		vsize -= PAGE_SIZE;
	}
	/*
	 * Create the zone of physical maps,
	 * and of the physical-to-virtual entries.
	 */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);

	/* The anchor is required to be page aligned. Zone debugging adds
	 * padding which may violate that requirement. Tell the zone
	 * subsystem that alignment is required.
	 */
	pmap_anchor_zone = zone_create("pagetable anchors", PAGE_SIZE,
	    ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);

	/* TODO: possible general optimisation...pre-allocate via zones commonly created
	 * level3/2 pagetables
	 */
	/* The anchor is required to be page aligned. Zone debugging adds
	 * padding which may violate that requirement. Tell the zone
	 * subsystem that alignment is required.
	 */
	pmap_uanchor_zone = zone_create("pagetable user anchors", PAGE_SIZE,
	    ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);

	pv_hashed_list_zone = zone_create("pv_list", sizeof(struct pv_hashed_entry),
	    ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);

	/*
	 * Create pv entries for kernel pages that might get pmap_remove()ed.
	 *
	 * - very low pages that were identity mapped.
	 * - vm_pages[] entries that might be unused and reclaimed.
	 */
	assert((uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start <= (uintptr_t)vm_page_array_beginning_addr);
	pmap_pv_fixup((uintptr_t)VM_MIN_KERNEL_ADDRESS, (uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start);
	pmap_pv_fixup((uintptr_t)vm_page_array_beginning_addr, (uintptr_t)vm_page_array_ending_addr);

	pmap_initialized = TRUE;

	max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);

	/*
	 * Ensure the kernel's PML4 entry exists for the basement
	 * before this is shared with any user.
	 */
	pmap_expand_pml4(kernel_pmap, KERNEL_BASEMENT, PMAP_EXPAND_OPTIONS_NONE);

#if CONFIG_VMX
	/* Detect EPT accessed/dirty-bit support and precompute EPTP flags. */
	pmap_ept_support_ad = vmx_hv_support() && (VMX_CAP(MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_VMX_EPT_VPID_CAP_AD_SHIFT, 1) ? TRUE : FALSE);
	pmap_eptp_flags = HV_VMX_EPTP_MEMORY_TYPE_WB | HV_VMX_EPTP_WALK_LENGTH(4) | (pmap_ept_support_ad ? HV_VMX_EPTP_ENABLE_AD_FLAGS : 0);
#endif /* CONFIG_VMX */
}
942
/*
 * Walk the page tables of 'npmap' for [sv, sv + nxrosz) and rewrite the
 * protection bits in place: set/clear the no-execute bit per 'NX' and
 * the writable bit per 'ro'.  Handles both 2MB (PS) page-directory
 * mappings and 4KB PTE mappings.  The range must be page aligned; EPT
 * pmaps are not supported.
 */
void
pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolean_t ro)
{
	uint64_t ev, cv = sv;
	pd_entry_t *pdep;
	pt_entry_t *ptep = NULL;

	if (os_add_overflow(sv, nxrosz, &ev)) {
		panic("pmap_mark_range: Unexpected address overflow: start=0x%llx size=0x%llx", sv, nxrosz);
	}

	/* XXX what if nxrosz is 0? we end up marking the page whose address is passed in via sv -- is that kosher? */
	assert(!is_ept_pmap(npmap));

	/* Both the start address and the size must be page aligned. */
	assert(((sv & 0xFFFULL) | (nxrosz & 0xFFFULL)) == 0);

	/*
	 * NOTE(review): the outer loop only re-fetches pdep on the 2MB
	 * (PS) path; after the inner PTE walk advances cv into the next
	 * PDE's region, the PS check on the next iteration reads the
	 * stale pdep -- appears to assume the range doesn't mix large
	 * and small mappings across that boundary; confirm with callers.
	 */
	for (pdep = pmap_pde(npmap, cv); pdep != NULL && (cv < ev);) {
		/* Base address of the 2MB region covered by this PDE. */
		uint64_t pdev = (cv & ~((uint64_t)PDEMASK));

		if (*pdep & INTEL_PTE_PS) {
			/* 2MB large-page mapping: update the PDE itself. */
#ifdef REMAP_DEBUG
			if ((NX ^ !!(*pdep & INTEL_PTE_NX)) || (ro ^ !!!(*pdep & INTEL_PTE_WRITE))) {
				kprintf("WARNING: Remapping PDE for %p from %s%s%s to %s%s%s\n", (void *)cv,
				    (*pdep & INTEL_PTE_VALID) ? "R" : "",
				    (*pdep & INTEL_PTE_WRITE) ? "W" : "",
				    (*pdep & INTEL_PTE_NX) ? "" : "X",
				    "R",
				    ro ? "" : "W",
				    NX ? "" : "X");
			}
#endif

			if (NX) {
				*pdep |= INTEL_PTE_NX;
			} else {
				*pdep &= ~INTEL_PTE_NX;
			}
			if (ro) {
				*pdep &= ~INTEL_PTE_WRITE;
			} else {
				*pdep |= INTEL_PTE_WRITE;
			}

			/* Advance to the next 2MB boundary (stop if the add wraps). */
			if (os_add_overflow(cv, NBPD, &cv)) {
				cv = ev;
			} else {
				cv &= ~((uint64_t) PDEMASK);
				pdep = pmap_pde(npmap, cv);
			}
			continue;
		}

		/* 4KB mappings: update each PTE under this PDE. */
		for (ptep = pmap_pte(npmap, cv); ptep != NULL && (cv < (pdev + NBPD)) && (cv < ev);) {
#ifdef REMAP_DEBUG
			if ((NX ^ !!(*ptep & INTEL_PTE_NX)) || (ro ^ !!!(*ptep & INTEL_PTE_WRITE))) {
				kprintf("WARNING: Remapping PTE for %p from %s%s%s to %s%s%s\n", (void *)cv,
				    (*ptep & INTEL_PTE_VALID) ? "R" : "",
				    (*ptep & INTEL_PTE_WRITE) ? "W" : "",
				    (*ptep & INTEL_PTE_NX) ? "" : "X",
				    "R",
				    ro ? "" : "W",
				    NX ? "" : "X");
			}
#endif
			if (NX) {
				*ptep |= INTEL_PTE_NX;
			} else {
				*ptep &= ~INTEL_PTE_NX;
			}
			if (ro) {
				*ptep &= ~INTEL_PTE_WRITE;
			} else {
				*ptep |= INTEL_PTE_WRITE;
			}
			cv += NBPT;
			ptep = pmap_pte(npmap, cv);
		}
	}
	DPRINTF("%s(0x%llx, 0x%llx, %u, %u): 0x%llx, 0x%llx\n", __FUNCTION__, sv, nxrosz, NX, ro, cv, ptep ? *ptep: 0);
}
1023
1024 /*
1025 * Reclaim memory for early boot 4K page tables that were converted to large page mappings.
1026 * We know this memory is part of the KPTphys[] array that was allocated in Idle_PTs_init(),
1027 * so we can free it using its address in that array.
1028 */
1029 static void
pmap_free_early_PT(ppnum_t ppn,uint32_t cnt)1030 pmap_free_early_PT(ppnum_t ppn, uint32_t cnt)
1031 {
1032 ppnum_t KPTphys_ppn;
1033 vm_offset_t offset;
1034
1035 KPTphys_ppn = pmap_find_phys(kernel_pmap, (uintptr_t)KPTphys);
1036 assert(ppn >= KPTphys_ppn);
1037 assert(ppn + cnt <= KPTphys_ppn + NKPT);
1038 offset = (ppn - KPTphys_ppn) << PAGE_SHIFT;
1039 ml_static_mfree((uintptr_t)KPTphys + offset, PAGE_SIZE * cnt);
1040 }
1041
1042 /*
1043 * Called once VM is fully initialized so that we can release unused
1044 * sections of low memory to the general pool.
1045 * Also complete the set-up of identity-mapped sections of the kernel:
1046 * 1) write-protect kernel text
1047 * 2) map kernel text using large pages if possible
1048 * 3) read and write-protect page zero (for K32)
1049 * 4) map the global page at the appropriate virtual address.
1050 *
1051 * Use of large pages
1052 * ------------------
1053 * To effectively map and write-protect all kernel text pages, the text
1054 * must be 2M-aligned at the base, and the data section above must also be
1055 * 2M-aligned. That is, there's padding below and above. This is achieved
1056 * through linker directives. Large pages are used only if this alignment
 * exists (and not overridden by the -kernel_text_ps_4K boot-arg). The
1058 * memory layout is:
1059 *
1060 * : :
1061 * | __DATA |
1062 * sdata: ================== 2Meg
1063 * | |
1064 * | zero-padding |
1065 * | |
1066 * etext: ------------------
1067 * | |
1068 * : :
1069 * | |
1070 * | __TEXT |
1071 * | |
1072 * : :
1073 * | |
1074 * stext: ================== 2Meg
1075 * | |
1076 * | zero-padding |
1077 * | |
1078 * eHIB: ------------------
1079 * | __HIB |
1080 * : :
1081 *
1082 * Prior to changing the mapping from 4K to 2M, the zero-padding pages
1083 * [eHIB,stext] and [etext,sdata] are ml_static_mfree()'d. Then all the
1084 * 4K pages covering [stext,etext] are coalesced as 2M large pages.
1085 * The now unused level-1 PTE pages are also freed.
1086 */
/* First physical page at or above the (slid) kernel image base. */
extern ppnum_t vm_kernel_base_page;
/* Number of __DATA PTEs marked NX in pmap_lowmem_finalize() below. */
static uint32_t dataptes = 0;
1089
void
pmap_lowmem_finalize(void)
{
	spl_t spl;
	int i;

	/*
	 * Update wired memory statistics for early boot pages
	 */
	PMAP_ZINFO_PALLOC(kernel_pmap, bootstrap_wired_pages * PAGE_SIZE);

	/*
	 * Free pages in pmap regions below the base:
	 * rdar://6332712
	 *	We can't free all the pages to VM that EFI reports available.
	 *	Pages in the range 0xc0000-0xff000 aren't safe over sleep/wake.
	 *	There's also a size miscalculation here: pend is one page less
	 *	than it should be but this is not fixed to be backwards
	 *	compatible.
	 * This is important for KASLR because up to 256*2MB = 512MB of space
	 * has to be released to VM.
	 */
	for (i = 0;
	    pmap_memory_regions[i].end < vm_kernel_base_page;
	    i++) {
		vm_offset_t pbase = i386_ptob(pmap_memory_regions[i].base);
		vm_offset_t pend = i386_ptob(pmap_memory_regions[i].end + 1);

		DBG("pmap region %d [%p..[%p\n",
		    i, (void *) pbase, (void *) pend);

		/* Regions reserved for the kernel are never released. */
		if (pmap_memory_regions[i].attribute & EFI_MEMORY_KERN_RESERVED) {
			continue;
		}
		/*
		 * rdar://6332712
		 * Adjust limits not to free pages in range 0xc0000-0xff000.
		 */
		if (pbase >= 0xc0000 && pend <= 0x100000) {
			continue;
		}
		if (pbase < 0xc0000 && pend > 0x100000) {
			/* page range entirely within region, free lower part */
			DBG("- ml_static_mfree(%p,%p)\n",
			    (void *) ml_static_ptovirt(pbase),
			    (void *) (0xc0000 - pbase));
			ml_static_mfree(ml_static_ptovirt(pbase), 0xc0000 - pbase);
			pbase = 0x100000;
		}
		if (pbase < 0xc0000) {
			pend = MIN(pend, 0xc0000);
		}
		if (pend > 0x100000) {
			pbase = MAX(pbase, 0x100000);
		}
		DBG("- ml_static_mfree(%p,%p)\n",
		    (void *) ml_static_ptovirt(pbase),
		    (void *) (pend - pbase));
		ml_static_mfree(ml_static_ptovirt(pbase), pend - pbase);
	}

	/* A final pass to get rid of all initial identity mappings to
	 * low pages.
	 */
	DPRINTF("%s: Removing mappings from 0->0x%lx\n", __FUNCTION__, vm_kernel_base);

	/*
	 * Remove all mappings past the boot-cpu descriptor aliases and low globals.
	 * Non-boot-cpu GDT aliases will be remapped later as needed.
	 */
	pmap_remove(kernel_pmap, LOWGLOBAL_ALIAS + PAGE_SIZE, vm_kernel_base);

	/*
	 * Release any memory for early boot 4K page table pages that got replaced
	 * with large page mappings for vm_pages[]. We know this memory is part of
	 * the KPTphys[] array that was allocated in Idle_PTs_init(), so we can free
	 * it using that address.
	 */
	pmap_free_early_PT(released_PT_ppn, released_PT_cnt);

	/*
	 * If text and data are both 2MB-aligned,
	 * we can map text with large-pages,
	 * unless the -kernel_text_ps_4K boot-arg overrides.
	 */
	if ((stext & I386_LPGMASK) == 0 && (sdata & I386_LPGMASK) == 0) {
		kprintf("Kernel text is 2MB aligned");
		kernel_text_ps_4K = FALSE;
		if (PE_parse_boot_argn("-kernel_text_ps_4K",
		    &kernel_text_ps_4K,
		    sizeof(kernel_text_ps_4K))) {
			kprintf(" but will be mapped with 4K pages\n");
		} else {
			kprintf(" and will be mapped with 2M pages\n");
		}
	}
#if DEVELOPMENT || DEBUG
	/* "wpkernel" boot-arg can disable text write-protection on dev kernels. */
	(void) PE_parse_boot_argn("wpkernel", &wpkernel, sizeof(wpkernel));
#endif
	if (wpkernel) {
		kprintf("Kernel text %p-%p to be write-protected\n",
		    (void *) stext, (void *) etext);
	}

	spl = splhigh();

	/*
	 * Scan over text if mappings are to be changed:
	 * - Remap kernel text readonly unless the "wpkernel" boot-arg is 0
	 * - Change to large-pages if possible and not overridden.
	 */
	if (kernel_text_ps_4K && wpkernel) {
		vm_offset_t myva;
		for (myva = stext; myva < etext; myva += PAGE_SIZE) {
			pt_entry_t *ptep;

			ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
			if (ptep) {
				/* Clear the write bit on each 4K text PTE. */
				pmap_store_pte(FALSE, ptep, *ptep & ~INTEL_PTE_WRITE);
			}
		}
	}

	if (!kernel_text_ps_4K) {
		vm_offset_t myva;

		/*
		 * Release zero-filled page padding used for 2M-alignment.
		 */
		DBG("ml_static_mfree(%p,%p) for padding below text\n",
		    (void *) eHIB, (void *) (stext - eHIB));
		ml_static_mfree(eHIB, stext - eHIB);
		DBG("ml_static_mfree(%p,%p) for padding above text\n",
		    (void *) etext, (void *) (sdata - etext));
		ml_static_mfree(etext, sdata - etext);

		/*
		 * Coalesce text pages into large pages.
		 */
		for (myva = stext; myva < sdata; myva += I386_LPGBYTES) {
			pt_entry_t *ptep;
			vm_offset_t pte_phys;
			pt_entry_t *pdep;
			pt_entry_t pde;
			ppnum_t KPT_ppn;

			pdep = pmap_pde(kernel_pmap, (vm_map_offset_t)myva);
			/* Remember the 4K page table backing this PDE so it can be freed. */
			KPT_ppn = (ppnum_t)((*pdep & PG_FRAME) >> PAGE_SHIFT);
			ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
			DBG("myva: %p pdep: %p ptep: %p\n",
			    (void *) myva, (void *) pdep, (void *) ptep);
			if ((*ptep & INTEL_PTE_VALID) == 0) {
				continue;
			}
			pte_phys = (vm_offset_t)(*ptep & PG_FRAME);
			pde = *pdep & PTMASK;   /* page attributes from pde */
			pde |= INTEL_PTE_PS;    /* make it a 2M entry */
			pde |= pte_phys;        /* take page frame from pte */

			if (wpkernel) {
				pde &= ~INTEL_PTE_WRITE;
			}
			DBG("pmap_store_pte(%p,0x%llx)\n",
			    (void *)pdep, pde);
			pmap_store_pte(FALSE, pdep, pde);

			/*
			 * Free the now-unused level-1 pte.
			 */
			pmap_free_early_PT(KPT_ppn, 1);
		}

		/* Change variable read by sysctl machdep.pmap */
		pmap_kernel_text_ps = I386_LPGBYTES;
	}

	vm_offset_t dva;

	/* Mark every __DATA page no-execute. */
	for (dva = sdata; dva < edata; dva += I386_PGBYTES) {
		assert(((sdata | edata) & PAGE_MASK) == 0);
		pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva);

		dpte = *dptep;
		assert((dpte & INTEL_PTE_VALID));
		dpte |= INTEL_PTE_NX;
		pmap_store_pte(FALSE, dptep, dpte);
		dataptes++;
	}
	assert(dataptes > 0);

	kernel_segment_command_t * seg;
	kernel_section_t * sec;
	kc_format_t kc_format;

	PE_get_primary_kc_format(&kc_format);

	/*
	 * Apply permissions to the remaining Mach-O segments.  __TEXT and
	 * __DATA were handled above; __HIB gets per-section treatment.
	 */
	for (seg = firstseg(); seg != NULL; seg = nextsegfromheader(&_mh_execute_header, seg)) {
		if (!strcmp(seg->segname, "__TEXT") ||
		    !strcmp(seg->segname, "__DATA")) {
			continue;
		}

		/* XXX: FIXME_IN_dyld: This is a workaround (see below) */
		if (kc_format != KCFormatFileset) {
			//XXX
			if (!strcmp(seg->segname, "__KLD")) {
				continue;
			}
		}

		if (!strcmp(seg->segname, "__HIB")) {
			/* __HIB __text stays executable read-only; everything else is NX+writable. */
			for (sec = firstsect(seg); sec != NULL; sec = nextsect(seg, sec)) {
				if (sec->addr & PAGE_MASK) {
					panic("__HIB segment's sections misaligned");
				}
				if (!strcmp(sec->sectname, "__text")) {
					pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), FALSE, TRUE);
				} else {
					pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), TRUE, FALSE);
				}
			}
		} else {
			if (kc_format == KCFormatFileset) {
#if 0
				/*
				 * This block of code is commented out because it may or may not have induced an earlier panic
				 * in ledger init.
				 */


				boolean_t NXbit = !(seg->initprot & VM_PROT_EXECUTE),
				    robit = (seg->initprot & (VM_PROT_READ | VM_PROT_WRITE)) == VM_PROT_READ;

				/*
				 * XXX: FIXME_IN_dyld: This is a workaround for primary KC containing incorrect inaccurate
				 * initprot for segments containing code.
				 */
				if (!strcmp(seg->segname, "__KLD") || !strcmp(seg->segname, "__VECTORS")) {
					NXbit = FALSE;
					robit = FALSE;
				}

				pmap_mark_range(kernel_pmap, seg->vmaddr & ~(uint64_t)PAGE_MASK,
				    round_page_64(seg->vmsize), NXbit, robit);
#endif

				/*
				 * XXX: We are marking *every* segment with rwx permissions as a workaround
				 * XXX: until the primary KC's kernel segments are page-aligned.
				 */
				kprintf("Marking (%p, %p) as rwx\n", (void *)(seg->vmaddr & ~(uint64_t)PAGE_MASK),
				    (void *)((seg->vmaddr & ~(uint64_t)PAGE_MASK) + round_page_64(seg->vmsize)));
				pmap_mark_range(kernel_pmap, seg->vmaddr & ~(uint64_t)PAGE_MASK,
				    round_page_64(seg->vmsize), FALSE, FALSE);
			} else {
				pmap_mark_range(kernel_pmap, seg->vmaddr, round_page_64(seg->vmsize), TRUE, FALSE);
			}
		}
	}

	/*
	 * If we're debugging, map the low global vector page at the fixed
	 * virtual address.  Otherwise, remove the mapping for this.
	 */
	if (debug_boot_arg) {
		pt_entry_t *pte = NULL;
		if (0 == (pte = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS))) {
			panic("lowmem pte");
		}

		/* make sure it is defined on page boundary */
		assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK));
		pmap_store_pte(FALSE, pte, kvtophys((vm_offset_t)&lowGlo)
		    | INTEL_PTE_REF
		    | INTEL_PTE_MOD
		    | INTEL_PTE_WIRED
		    | INTEL_PTE_VALID
		    | INTEL_PTE_WRITE
		    | INTEL_PTE_NX);

#if KASAN
		kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
	} else {
		pmap_remove(kernel_pmap,
		    LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE);
	}
	/* Flush all TLBs once, after the whole batch of permission changes. */
	pmap_tlbi_range(0, ~0ULL, true, 0);
	splx(spl);
}
1380
1381 /*
1382 * Mark the const data segment as read-only, non-executable.
1383 */
1384 void
x86_64_protect_data_const()1385 x86_64_protect_data_const()
1386 {
1387 boolean_t doconstro = TRUE;
1388 #if DEVELOPMENT || DEBUG
1389 (void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro));
1390 #endif
1391 if (doconstro) {
1392 if (sconst & PAGE_MASK) {
1393 panic("CONST segment misaligned 0x%lx 0x%lx",
1394 sconst, econst);
1395 }
1396 kprintf("Marking const DATA read-only\n");
1397 pmap_protect(kernel_pmap, sconst, econst, VM_PROT_READ);
1398 }
1399 }
1400 /*
 * this function is only used for debugging from the vm layer
1402 */
1403 bool
pmap_verify_free(ppnum_t pn)1404 pmap_verify_free(
1405 ppnum_t pn)
1406 {
1407 pv_rooted_entry_t pv_h;
1408 int pai;
1409 bool result;
1410
1411 assert(pn != vm_page_fictitious_addr);
1412
1413 if (!pmap_initialized) {
1414 return true;
1415 }
1416
1417 if (pn == vm_page_guard_addr) {
1418 return true;
1419 }
1420
1421 pai = ppn_to_pai(pn);
1422 if (!IS_MANAGED_PAGE(pai)) {
1423 return false;
1424 }
1425 pv_h = pai_to_pvh(pn);
1426 result = (pv_h->pmap == PMAP_NULL);
1427 return result;
1428 }
1429
#if MACH_ASSERT
/*
 * Panic with diagnostic detail if the given physical page is still mapped.
 * Returns silently when pmap_verify_free() says the page is free.  The
 * panic message includes the page's attributes, the owning pmap's name
 * (or process name), the mapped VA, and the raw PTE when available.
 *
 * NOTE(review): "buffer" is static, so concurrent callers could race on
 * it — presumably acceptable since this path always ends in panic().
 */
void
pmap_assert_free(ppnum_t pn)
{
	int pai;
	pv_rooted_entry_t pv_h = NULL;
	pmap_t pmap = NULL;
	vm_offset_t va = 0;
	static char buffer[32];
	static char *pr_name = "not managed pn";
	uint_t attr;
	pt_entry_t *ptep;
	pt_entry_t pte = -1ull;  /* sentinel: "no PTE found" */

	if (pmap_verify_free(pn)) {
		return;
	}

	/* Beyond the managed range: no attributes or pv data to report. */
	if (pn > last_managed_page) {
		attr = 0xff;
		goto done;
	}

	pai = ppn_to_pai(pn);
	attr = pmap_phys_attributes[pai];
	pv_h = pai_to_pvh(pai);
	va = pv_h->va_and_flags;
	pmap = pv_h->pmap;
	/* Pick the most descriptive name available for the owning pmap. */
	if (pmap == kernel_pmap) {
		pr_name = "kernel";
	} else if (pmap == NULL) {
		pr_name = "pmap NULL";
	} else if (pmap->pmap_procname[0] != 0) {
		pr_name = &pmap->pmap_procname[0];
	} else {
		snprintf(buffer, sizeof(buffer), "pmap %p", pv_h->pmap);
		pr_name = buffer;
	}

	if (pmap != NULL) {
		ptep = pmap_pte(pmap, va);
		if (ptep != NULL) {
			pte = (uintptr_t)*ptep;
		}
	}

done:
	panic("page not FREE page: 0x%lx attr: 0x%x %s va: 0x%lx PTE: 0x%llx",
	    (ulong_t)pn, attr, pr_name, va, pte);
}
#endif /* MACH_ASSERT */
1481
1482 boolean_t
pmap_is_empty(pmap_t pmap,vm_map_offset_t va_start,vm_map_offset_t va_end)1483 pmap_is_empty(
1484 pmap_t pmap,
1485 vm_map_offset_t va_start,
1486 vm_map_offset_t va_end)
1487 {
1488 vm_map_offset_t offset;
1489 ppnum_t phys_page;
1490 ledger_amount_t phys_mem;
1491
1492 if (pmap == PMAP_NULL) {
1493 return TRUE;
1494 }
1495
1496 /*
1497 * Check the ledger's phys_mem value
1498 * - if it's zero, the pmap is completely empty.
1499 * This short-circuit test prevents a virtual address scan which is
1500 * painfully slow for 64-bit spaces.
1501 * This assumes the count is correct
1502 * .. the debug kernel ought to be checking perhaps by page table walk.
1503 */
1504 if (pmap != kernel_pmap) {
1505 ledger_get_balance(pmap->ledger, task_ledgers.phys_mem, &phys_mem);
1506 if (phys_mem == 0) {
1507 return TRUE;
1508 }
1509 }
1510
1511 for (offset = va_start;
1512 offset < va_end;
1513 offset += PAGE_SIZE_64) {
1514 phys_page = pmap_find_phys(pmap, offset);
1515 if (phys_page) {
1516 kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1517 "page %d at 0x%llx\n",
1518 pmap, va_start, va_end, phys_page, offset);
1519 return FALSE;
1520 }
1521 }
1522
1523 return TRUE;
1524 }
1525
1526 void
hv_ept_pmap_create(void ** ept_pmap,void ** eptp)1527 hv_ept_pmap_create(void **ept_pmap, void **eptp)
1528 {
1529 pmap_t p;
1530
1531 if ((ept_pmap == NULL) || (eptp == NULL)) {
1532 return;
1533 }
1534
1535 p = pmap_create_options(get_task_ledger(current_task()), 0, (PMAP_CREATE_64BIT | PMAP_CREATE_EPT));
1536 if (p == PMAP_NULL) {
1537 *ept_pmap = NULL;
1538 *eptp = NULL;
1539 return;
1540 }
1541
1542 assert(is_ept_pmap(p));
1543
1544 *ept_pmap = (void*)p;
1545 *eptp = (void*)(p->pm_eptp);
1546 return;
1547 }
1548
1549 /*
1550 * pmap_create() is used by some special, legacy 3rd party kexts.
1551 * In our kernel code, always use pmap_create_options().
1552 */
1553 extern pmap_t pmap_create(ledger_t ledger, vm_map_size_t sz, boolean_t is_64bit);
1554
1555 __attribute__((used))
1556 pmap_t
pmap_create(ledger_t ledger,vm_map_size_t sz,boolean_t is_64bit)1557 pmap_create(
1558 ledger_t ledger,
1559 vm_map_size_t sz,
1560 boolean_t is_64bit)
1561 {
1562 return pmap_create_options(ledger, sz, is_64bit ? PMAP_CREATE_64BIT : 0);
1563 }
1564
1565 /*
1566 * Create and return a physical map.
1567 *
1568 * If the size specified for the map
1569 * is zero, the map is an actual physical
1570 * map, and may be referenced by the
1571 * hardware.
1572 *
1573 * If the size specified is non-zero,
1574 * the map will be used in software only, and
1575 * is bounded by that size.
1576 */
1577
1578 pmap_t
pmap_create_options(ledger_t ledger,vm_map_size_t sz,unsigned int flags)1579 pmap_create_options(
1580 ledger_t ledger,
1581 vm_map_size_t sz,
1582 unsigned int flags)
1583 {
1584 pmap_t p;
1585 vm_size_t size;
1586 pml4_entry_t *pml4;
1587 pml4_entry_t *kpml4;
1588 int i;
1589
1590 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, sz, flags);
1591
1592 size = (vm_size_t) sz;
1593
1594 /*
1595 * A software use-only map doesn't even need a map.
1596 */
1597
1598 if (size != 0) {
1599 return PMAP_NULL;
1600 }
1601
1602 /*
1603 * Return error when unrecognized flags are passed.
1604 */
1605 if (__improbable((flags & ~(PMAP_CREATE_KNOWN_FLAGS)) != 0)) {
1606 return PMAP_NULL;
1607 }
1608
1609 p = zalloc_flags(pmap_zone, Z_WAITOK | Z_ZERO);
1610 if (PMAP_NULL == p) {
1611 panic("pmap_create zalloc");
1612 }
1613
1614 lck_rw_init(&p->pmap_rwl, &pmap_lck_grp, &pmap_lck_rw_attr);
1615 p->pmap_rwl.lck_rw_can_sleep = FALSE;
1616
1617 os_ref_init(&p->ref_count, NULL);
1618 #if DEVELOPMENT || DEBUG
1619 p->nx_enabled = 1;
1620 #endif
1621 p->pm_shared = FALSE;
1622 ledger_reference(ledger);
1623 p->ledger = ledger;
1624
1625 p->pm_task_map = ((flags & PMAP_CREATE_64BIT) ? TASK_MAP_64BIT : TASK_MAP_32BIT);
1626
1627 p->pagezero_accessible = FALSE;
1628 p->pm_vm_map_cs_enforced = FALSE;
1629
1630 if (pmap_pcid_ncpus) {
1631 pmap_pcid_initialize(p);
1632 }
1633
1634 p->pm_pml4 = zalloc(pmap_anchor_zone);
1635 p->pm_upml4 = zalloc(pmap_uanchor_zone); //cleanup for EPT
1636
1637 pmap_assert((((uintptr_t)p->pm_pml4) & PAGE_MASK) == 0);
1638 pmap_assert((((uintptr_t)p->pm_upml4) & PAGE_MASK) == 0);
1639
1640 memset((char *)p->pm_pml4, 0, PAGE_SIZE);
1641 memset((char *)p->pm_upml4, 0, PAGE_SIZE);
1642
1643 if (flags & PMAP_CREATE_EPT) {
1644 p->pm_eptp = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4) | pmap_eptp_flags;
1645 p->pm_cr3 = 0;
1646 } else {
1647 p->pm_eptp = 0;
1648 p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4);
1649 p->pm_ucr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_upml4);
1650 }
1651
1652 /* allocate the vm_objs to hold the pdpt, pde and pte pages */
1653
1654 p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS) *PAGE_SIZE);
1655 if (NULL == p->pm_obj_pml4) {
1656 panic("pmap_create pdpt obj");
1657 }
1658
1659 p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS) *PAGE_SIZE);
1660 if (NULL == p->pm_obj_pdpt) {
1661 panic("pmap_create pdpt obj");
1662 }
1663
1664 p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS) *PAGE_SIZE);
1665 if (NULL == p->pm_obj) {
1666 panic("pmap_create pte obj");
1667 }
1668
1669 if (!(flags & PMAP_CREATE_EPT)) {
1670 /* All host pmaps share the kernel's pml4 */
1671 pml4 = pmap64_pml4(p, 0ULL);
1672 kpml4 = kernel_pmap->pm_pml4;
1673 for (i = KERNEL_PML4_INDEX; i < (KERNEL_PML4_INDEX + KERNEL_PML4_COUNT); i++) {
1674 pml4[i] = kpml4[i];
1675 }
1676 pml4[KERNEL_KEXTS_INDEX] = kpml4[KERNEL_KEXTS_INDEX];
1677 for (i = KERNEL_PHYSMAP_PML4_INDEX; i < (KERNEL_PHYSMAP_PML4_INDEX + KERNEL_PHYSMAP_PML4_COUNT); i++) {
1678 pml4[i] = kpml4[i];
1679 }
1680 pml4[KERNEL_DBLMAP_PML4_INDEX] = kpml4[KERNEL_DBLMAP_PML4_INDEX];
1681 #if KASAN
1682 for (i = KERNEL_KASAN_PML4_FIRST; i <= KERNEL_KASAN_PML4_LAST; i++) {
1683 pml4[i] = kpml4[i];
1684 }
1685 #endif
1686 pml4_entry_t *pml4u = pmap64_user_pml4(p, 0ULL);
1687 pml4u[KERNEL_DBLMAP_PML4_INDEX] = kpml4[KERNEL_DBLMAP_PML4_INDEX];
1688 }
1689
1690 #if MACH_ASSERT
1691 p->pmap_stats_assert = TRUE;
1692 p->pmap_pid = 0;
1693 strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
1694 #endif /* MACH_ASSERT */
1695
1696 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END,
1697 VM_KERNEL_ADDRHIDE(p));
1698
1699 return p;
1700 }
1701
1702 /*
1703 * We maintain stats and ledgers so that a task's physical footprint is:
1704 * phys_footprint = ((internal - alternate_accounting)
1705 * + (internal_compressed - alternate_accounting_compressed)
1706 * + iokit_mapped
1707 * + purgeable_nonvolatile
1708 * + purgeable_nonvolatile_compressed
1709 * + page_table)
1710 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
1711 */
1712
#if MACH_ASSERT
/* Validate the pmap's ledger balances at destroy time (defined later). */
static void pmap_check_ledgers(pmap_t pmap);
#else /* MACH_ASSERT */
/* Ledger checking is compiled out on non-assert kernels. */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
#endif /* MACH_ASSERT */
1721
1722 /*
1723 * Retire the given physical map from service.
1724 * Should only be called if the map contains
1725 * no valid mappings.
1726 */
1727 extern int vm_wired_objects_page_count;
1728
1729 void
pmap_destroy(pmap_t p)1730 pmap_destroy(pmap_t p)
1731 {
1732 os_ref_count_t c;
1733
1734 if (p == PMAP_NULL) {
1735 return;
1736 }
1737
1738 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
1739 VM_KERNEL_ADDRHIDe(p));
1740
1741 PMAP_LOCK_EXCLUSIVE(p);
1742
1743 c = os_ref_release_locked(&p->ref_count);
1744
1745 pmap_assert((current_thread() && (current_thread()->map)) ? (current_thread()->map->pmap != p) : TRUE);
1746
1747 if (c == 0) {
1748 /*
1749 * If some cpu is not using the physical pmap pointer that it
1750 * is supposed to be (see set_dirbase), we might be using the
1751 * pmap that is being destroyed! Make sure we are
1752 * physically on the right pmap:
1753 */
1754 PMAP_UPDATE_TLBS(p, 0x0ULL, 0xFFFFFFFFFFFFF000ULL);
1755 if (pmap_pcid_ncpus) {
1756 pmap_destroy_pcid_sync(p);
1757 }
1758 }
1759
1760 PMAP_UNLOCK_EXCLUSIVE(p);
1761
1762 if (c != 0) {
1763 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
1764 pmap_assert(p == kernel_pmap);
1765 return; /* still in use */
1766 }
1767
1768 /*
1769 * Free the memory maps, then the
1770 * pmap structure.
1771 */
1772 int inuse_ptepages = 0;
1773
1774 zfree(pmap_anchor_zone, p->pm_pml4);
1775 zfree(pmap_uanchor_zone, p->pm_upml4);
1776
1777 inuse_ptepages += p->pm_obj_pml4->resident_page_count;
1778 vm_object_deallocate(p->pm_obj_pml4);
1779
1780 inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
1781 vm_object_deallocate(p->pm_obj_pdpt);
1782
1783 inuse_ptepages += p->pm_obj->resident_page_count;
1784 vm_object_deallocate(p->pm_obj);
1785
1786 OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count);
1787 PMAP_ZINFO_PFREE(p, inuse_ptepages * PAGE_SIZE);
1788
1789 pmap_check_ledgers(p);
1790 ledger_dereference(p->ledger);
1791 lck_rw_destroy(&p->pmap_rwl, &pmap_lck_grp);
1792 zfree(pmap_zone, p);
1793
1794 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
1795 }
1796
1797 /*
1798 * Add a reference to the specified pmap.
1799 */
1800
1801 void
pmap_reference(pmap_t p)1802 pmap_reference(pmap_t p)
1803 {
1804 if (p != PMAP_NULL) {
1805 PMAP_LOCK_EXCLUSIVE(p);
1806 os_ref_retain_locked(&p->ref_count);
1807 PMAP_UNLOCK_EXCLUSIVE(p);
1808 }
1809 }
1810
1811 /*
1812 * Remove phys addr if mapped in specified map
1813 *
1814 */
/*
 * Remove the mapping of physical page "pn" from "map", if present.
 * Intentionally a no-op on this platform.
 */
void
pmap_remove_some_phys(
	__unused pmap_t map,
	__unused ppnum_t pn)
{
	/* Implement to support working set code */
}
1822
1823
/*
 * Set the protection on [sva, eva) of "map" to "prot".  Convenience
 * wrapper around pmap_protect_options() with no options and no
 * deferred-flush context; see that function for the full contract.
 */
void
pmap_protect(
	pmap_t map,
	vm_map_offset_t sva,
	vm_map_offset_t eva,
	vm_prot_t prot)
{
	pmap_protect_options(map, sva, eva, prot, 0, NULL);
}
1833
1834
1835 /*
1836 * Set the physical protection on the
1837 * specified range of this map as requested.
1838 *
1839 * VERY IMPORTANT: Will *NOT* increase permissions.
1840 * pmap_protect_options() should protect the range against any access types
1841 * that are not in "prot" but it should never grant extra access.
1842 * For example, if "prot" is READ|EXECUTE, that means "remove write
1843 * access" but it does *not* mean "add read and execute" access.
1844 * VM relies on getting soft-faults to enforce extra checks (code
1845 * signing, for example), for example.
1846 * New access permissions are granted via pmap_enter() only.
1847 * ***NOTE***:
1848 * The only exception is for EPT pmaps, where we MUST populate all exec
1849 * bits when the protection API is invoked (so that the HV fault handler
1850 * can make decisions based on the exit qualification information, which
1851 * includes the execute bits in the EPT entries. Soft-faulting them
1852 * in would cause a chicken-and-egg problem where the HV fault handler
1853 * would not be able to identify mode-based execute control (MBE) faults.)
1854 */
/*
 * Apply "prot" to every valid mapping in [sva, eva) of "map".
 *
 * options: PMAP_OPTIONS_NOFLUSH defers TLB invalidation into the
 *          pmap_flush_context passed via "arg"; other options are
 *          forwarded to pmap_remove_options() for VM_PROT_NONE.
 *
 * Permissions are only ever reduced, except for EPT pmaps where execute
 * bits must be populated eagerly (see the block comment above).
 * VM_PROT_NONE degenerates to pmap_remove_options().
 */
void
pmap_protect_options(
	pmap_t map,
	vm_map_offset_t sva,
	vm_map_offset_t eva,
	vm_prot_t prot,
	unsigned int options,
	void *arg)
{
	pt_entry_t *pde;
	pt_entry_t *spte, *epte;
	vm_map_offset_t lva;
	vm_map_offset_t orig_sva;
	boolean_t set_NX;
	int num_found = 0;
	boolean_t is_ept;
	uint64_t cur_vaddr;

	pmap_intr_assert();

	if (map == PMAP_NULL) {
		return;
	}

	/* Removing all access is just a removal. */
	if (prot == VM_PROT_NONE) {
		pmap_remove_options(map, sva, eva, options);
		return;
	}

	PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(map), VM_KERNEL_ADDRHIDE(sva),
	    VM_KERNEL_ADDRHIDE(eva));

	is_ept = is_ept_pmap(map);

	/* NX is set only when neither kernel nor (EPT) user execute is requested. */
	if ((prot & VM_PROT_EXECUTE) || __improbable(is_ept && (prot & VM_PROT_UEXEC))) {
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

#if DEVELOPMENT || DEBUG
	if (__improbable(set_NX && (!nx_enabled || !map->nx_enabled))) {
		set_NX = FALSE;
	}
#endif
	PMAP_LOCK_EXCLUSIVE(map);

	orig_sva = sva;
	cur_vaddr = sva;
	/* Process the range one page-directory span at a time. */
	while (sva < eva) {
		uint64_t vaddr_incr;

		/* lva = end of this PDE's span, clamped to eva (and to overflow). */
		if (os_add_overflow(sva, PDE_MAPPED_SIZE, &lva)) {
			lva = eva;
		} else {
			lva &= ~(PDE_MAPPED_SIZE - 1);

			if (lva > eva) {
				lva = eva;
			}
		}

		pde = pmap_pde(map, sva);
		if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
			if (*pde & PTE_PS) {
				/* superpage */
				spte = pde;
				epte = spte + 1; /* excluded */
				vaddr_incr = I386_LPGBYTES;
			} else {
				spte = pmap_pte(map, (sva & ~(PDE_MAPPED_SIZE - 1)));
				spte = &spte[ptenum(sva)];
				epte = &spte[intel_btop(lva - sva)];
				vaddr_incr = I386_PGBYTES;
			}

			for (; spte < epte; spte++) {
				uint64_t clear_bits, set_bits;

				if (!(*spte & PTE_VALID_MASK(is_ept))) {
					continue;
				}

				clear_bits = 0;
				set_bits = 0;

				/* Only EPT entries have a removable read bit. */
				if (is_ept) {
					if (!(prot & VM_PROT_READ)) {
						clear_bits |= PTE_READ(is_ept);
					}
				}
				if (!(prot & VM_PROT_WRITE)) {
					clear_bits |= PTE_WRITE(is_ept);
				}
#if DEVELOPMENT || DEBUG
				else if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) &&
				    map == kernel_pmap) {
					set_bits |= PTE_WRITE(is_ept);
				}
#endif /* DEVELOPMENT || DEBUG */

				if (set_NX) {
					if (!is_ept) {
						set_bits |= INTEL_PTE_NX;
					} else {
						clear_bits |= INTEL_EPT_EX | INTEL_EPT_UEX;
					}
				} else if (is_ept) {
					/* This is the exception to the "Don't add permissions" statement, above */
					set_bits |= ((prot & VM_PROT_EXECUTE) ? INTEL_EPT_EX : 0) |
					    ((prot & VM_PROT_UEXEC) ? INTEL_EPT_UEX : 0);
				}

				pmap_update_pte(is_ept, spte, clear_bits, set_bits, false);

				DTRACE_VM3(set_pte, pmap_t, map, void *, cur_vaddr, uint64_t, *spte);
				cur_vaddr += vaddr_incr;

				num_found++;
			}
		}
		sva = lva;
	}
	/* One flush for the whole range, deferred if the caller asked for it. */
	if (num_found) {
		if (options & PMAP_OPTIONS_NOFLUSH) {
			PMAP_UPDATE_TLBS_DELAYED(map, orig_sva, eva, (pmap_flush_context *)arg);
		} else {
			PMAP_UPDATE_TLBS(map, orig_sva, eva);
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(map);

	PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
}
1991
/*
 * Map a (possibly) autogenned block, taking a physical address rather
 * than a page number: converts "pa" with intel_btop() and defers to
 * pmap_map_block().
 */
kern_return_t
pmap_map_block_addr(
	pmap_t pmap,
	addr64_t va,
	pmap_paddr_t pa,
	uint32_t size,
	vm_prot_t prot,
	int attr,
	unsigned int flags)
{
	return pmap_map_block(pmap, va, intel_btop(pa), size, prot, attr, flags);
}
2005
/*
 * Map "size" pages starting at physical page "pa" to virtual address "va"
 * in "pmap" with protection "prot".
 *
 * attr: if VM_MEM_SUPERPAGE is set, each pmap_enter() covers a superpage;
 *       otherwise one base page at a time.  "size" is in base pages either way.
 *
 * Returns KERN_SUCCESS; any pmap_enter() failure currently panics (the
 * unwind below the panic is intentionally unreachable — see the comment).
 */
kern_return_t
pmap_map_block(
	pmap_t pmap,
	addr64_t va,
	ppnum_t pa,
	uint32_t size,
	vm_prot_t prot,
	int attr,
	__unused unsigned int flags)
{
	kern_return_t kr;
	addr64_t original_va = va;
	uint32_t page;
	int cur_page_size;

	if (attr & VM_MEM_SUPERPAGE) {
		cur_page_size = SUPERPAGE_SIZE;
	} else {
		cur_page_size = PAGE_SIZE;
	}

	/* Each iteration consumes cur_page_size/PAGE_SIZE of the page count. */
	for (page = 0; page < size; page += cur_page_size / PAGE_SIZE) {
		kr = pmap_enter(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE);

		if (kr != KERN_SUCCESS) {
			/*
			 * This will panic for now, as it is unclear that
			 * removing the mappings is correct.
			 */
			panic("%s: failed pmap_enter, "
			    "pmap=%p, va=%#llx, pa=%u, size=%u, prot=%#x, flags=%#x",
			    __FUNCTION__,
			    pmap, va, pa, size, prot, flags);

			/* Unreachable while the panic above remains. */
			pmap_remove(pmap, original_va, va - original_va);
			return kr;
		}

		va += cur_page_size;
		pa += cur_page_size / PAGE_SIZE;
	}

	return KERN_SUCCESS;
}
2050
/*
 * Expand the pmap's paging hierarchy at the top (PML4) level: allocate,
 * zero, and wire a new PDPT page, and install entries for it in both the
 * primary and the user PML4s.
 *
 * Called with the pmap unlocked.  Returns KERN_SUCCESS once a PDPT
 * covering 'vaddr' exists (including when another thread expanded us
 * first), or KERN_RESOURCE_SHORTAGE when PMAP_EXPAND_OPTIONS_NOWAIT is
 * set and no free page is available.
 */
kern_return_t
pmap_expand_pml4(
	pmap_t          map,
	vm_map_offset_t vaddr,
	unsigned int    options)
{
	vm_page_t       m;
	pmap_paddr_t    pa;
	uint64_t        i;
	ppnum_t         pn;
	pml4_entry_t    *pml4p;
	boolean_t       is_ept = is_ept_pmap(map);

	DBG("pmap_expand_pml4(%p,%p)\n", map, (void *)vaddr);

	/* With the exception of the kext "basement", the kernel's level 4
	 * pagetables must not be dynamically expanded.
	 */
	assert(map != kernel_pmap || (vaddr == KERNEL_BASEMENT));
	/*
	 * Allocate a VM page for the pml4 page
	 */
	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
		if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
			return KERN_RESOURCE_SHORTAGE;
		}
		VM_PAGE_WAIT();
	}
	/*
	 * put the page into the pmap's obj list so it
	 * can be found later.
	 */
	pn = VM_PAGE_GET_PHYS_PAGE(m);
	pa = i386_ptob(pn);
	i = pml4idx(map, vaddr);        /* page offset of this entry in pm_obj_pml4 */

	/*
	 * Zero the page.
	 */
	pmap_zero_page(pn);

	vm_page_lockspin_queues();
	vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
	vm_page_unlock_queues();

	OSAddAtomic(1, &inuse_ptepages_count);
	OSAddAtomic64(1, &alloc_ptepages_count);
	PMAP_ZINFO_PALLOC(map, PAGE_SIZE);

	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
	vm_object_lock(map->pm_obj_pml4);

	PMAP_LOCK_EXCLUSIVE(map);
	/*
	 * See if someone else expanded us first
	 */
	if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
		/* Lost the race: undo the allocation and accounting. */
		PMAP_UNLOCK_EXCLUSIVE(map);
		vm_object_unlock(map->pm_obj_pml4);

		VM_PAGE_FREE(m);

		OSAddAtomic(-1, &inuse_ptepages_count);
		PMAP_ZINFO_PFREE(map, PAGE_SIZE);
		return KERN_SUCCESS;
	}

#if 0 /* DEBUG */
	if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE)) {
		panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx",
		    map, map->pm_obj_pml4, vaddr, i);
	}
#endif
	vm_page_insert_wired(m, map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
	vm_object_unlock(map->pm_obj_pml4);

	/*
	 * Set the page directory entry for this page table.
	 */
	pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */

	/*
	 * Note that INTEL_EPT_UEX is unconditionally set (as is INTEL_EPT_EX) for
	 * all intermediate paging levels, from PML4Es to PDEs. Processors with
	 * VT-x implementations that do not support MBE ignore the INTEL_EPT_UEX
	 * bit at all levels of the EPT, so there is no risk of inducing EPT
	 * violation faults.
	 */
	pmap_store_pte(is_ept, pml4p, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));
	pml4_entry_t *upml4p;

	/* Install the same entry in the user copy of the PML4. */
	upml4p = pmap64_user_pml4(map, vaddr);
	pmap_store_pte(is_ept, upml4p, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));

	PMAP_UNLOCK_EXCLUSIVE(map);

	return KERN_SUCCESS;
}
2155
/*
 * Expand the pmap at the PDPT (level 3): allocate, zero, and wire a new
 * page directory page and install its entry in the PDPT, first ensuring
 * the PML4 level covers 'vaddr' (expanding it if necessary).
 *
 * Called with the pmap unlocked.  Returns KERN_SUCCESS once a page
 * directory covering 'vaddr' exists (including when another thread
 * expanded us first), or KERN_RESOURCE_SHORTAGE under
 * PMAP_EXPAND_OPTIONS_NOWAIT.
 */
kern_return_t
pmap_expand_pdpt(pmap_t map, vm_map_offset_t vaddr, unsigned int options)
{
	vm_page_t       m;
	pmap_paddr_t    pa;
	uint64_t        i;
	ppnum_t         pn;
	pdpt_entry_t    *pdptp;
	boolean_t       is_ept = is_ept_pmap(map);

	DBG("pmap_expand_pdpt(%p,%p)\n", map, (void *)vaddr);

	/* Make sure the level above exists before expanding this one. */
	while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
		kern_return_t pep4kr = pmap_expand_pml4(map, vaddr, options);
		if (pep4kr != KERN_SUCCESS) {
			return pep4kr;
		}
	}

	/*
	 * Allocate a VM page for the pdpt page
	 */
	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
		if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
			return KERN_RESOURCE_SHORTAGE;
		}
		VM_PAGE_WAIT();
	}

	/*
	 * put the page into the pmap's obj list so it
	 * can be found later.
	 */
	pn = VM_PAGE_GET_PHYS_PAGE(m);
	pa = i386_ptob(pn);
	i = pdptidx(map, vaddr);        /* page offset of this entry in pm_obj_pdpt */

	/*
	 * Zero the page.
	 */
	pmap_zero_page(pn);

	vm_page_lockspin_queues();
	vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
	vm_page_unlock_queues();

	OSAddAtomic(1, &inuse_ptepages_count);
	OSAddAtomic64(1, &alloc_ptepages_count);
	PMAP_ZINFO_PALLOC(map, PAGE_SIZE);

	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
	vm_object_lock(map->pm_obj_pdpt);

	PMAP_LOCK_EXCLUSIVE(map);
	/*
	 * See if someone else expanded us first
	 */
	if (pmap_pde(map, vaddr) != PD_ENTRY_NULL) {
		/* Lost the race: undo the allocation and accounting. */
		PMAP_UNLOCK_EXCLUSIVE(map);
		vm_object_unlock(map->pm_obj_pdpt);

		VM_PAGE_FREE(m);

		OSAddAtomic(-1, &inuse_ptepages_count);
		PMAP_ZINFO_PFREE(map, PAGE_SIZE);
		return KERN_SUCCESS;
	}

#if 0 /* DEBUG */
	if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE)) {
		panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx",
		    map, map->pm_obj_pdpt, vaddr, i);
	}
#endif
	vm_page_insert_wired(m, map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
	vm_object_unlock(map->pm_obj_pdpt);

	/*
	 * Set the page directory entry for this page table.
	 */
	pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */

	pmap_store_pte(is_ept, pdptp, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));

	PMAP_UNLOCK_EXCLUSIVE(map);

	return KERN_SUCCESS;
}
2247
2248
2249
2250 /*
2251 * Routine: pmap_expand
2252 *
2253 * Expands a pmap to be able to map the specified virtual address.
2254 *
2255 * Allocates new virtual memory for the P0 or P1 portion of the
2256 * pmap, then re-maps the physical pages that were in the old
2257 * pmap to be in the new pmap.
2258 *
2259 * Must be called with the pmap system and the pmap unlocked,
2260 * since these must be unlocked to use vm_allocate or vm_deallocate.
2261 * Thus it must be called in a loop that checks whether the map
2262 * has been expanded enough.
2263 * (We won't loop forever, since page tables aren't shrunk.)
2264 */
kern_return_t
pmap_expand(
	pmap_t          map,
	vm_map_offset_t vaddr,
	unsigned int    options)
{
	pt_entry_t      *pdp;
	vm_page_t       m;
	pmap_paddr_t    pa;
	uint64_t        i;
	ppnum_t         pn;
	boolean_t       is_ept = is_ept_pmap(map);


	/*
	 * For the kernel, the virtual address must be in or above the basement
	 * which is for kexts and is in the 512GB immediately below the kernel.
	 * XXX - should use VM_MIN_KERNEL_AND_KEXT_ADDRESS not KERNEL_BASEMENT
	 */
	if (__improbable(map == kernel_pmap &&
	    !(vaddr >= KERNEL_BASEMENT && vaddr <= VM_MAX_KERNEL_ADDRESS))) {
		if ((options & PMAP_EXPAND_OPTIONS_ALIASMAP) == 0) {
			panic("pmap_expand: bad vaddr 0x%llx for kernel pmap", vaddr);
		}
	}

	/* Ensure the upper levels (PML4/PDPT) cover 'vaddr' first. */
	while ((pdp = pmap_pde(map, vaddr)) == PD_ENTRY_NULL) {
		assert((options & PMAP_EXPAND_OPTIONS_ALIASMAP) == 0);
		kern_return_t pepkr = pmap_expand_pdpt(map, vaddr, options);
		if (pepkr != KERN_SUCCESS) {
			return pepkr;
		}
	}

	/*
	 * Allocate a VM page for the pde entries.
	 */
	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
		if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
			return KERN_RESOURCE_SHORTAGE;
		}
		VM_PAGE_WAIT();
	}

	/*
	 * put the page into the pmap's obj list so it
	 * can be found later.
	 */
	pn = VM_PAGE_GET_PHYS_PAGE(m);
	pa = i386_ptob(pn);
	i = pdeidx(map, vaddr);         /* page offset of this entry in pm_obj */

	/*
	 * Zero the page.
	 */
	pmap_zero_page(pn);

	vm_page_lockspin_queues();
	vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
	vm_page_unlock_queues();

	OSAddAtomic(1, &inuse_ptepages_count);
	OSAddAtomic64(1, &alloc_ptepages_count);
	PMAP_ZINFO_PALLOC(map, PAGE_SIZE);

	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
	vm_object_lock(map->pm_obj);

	PMAP_LOCK_EXCLUSIVE(map);

	/*
	 * See if someone else expanded us first
	 */
	if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
		/* Lost the race: undo the allocation and accounting. */
		PMAP_UNLOCK_EXCLUSIVE(map);
		vm_object_unlock(map->pm_obj);

		VM_PAGE_FREE(m);

		OSAddAtomic(-1, &inuse_ptepages_count); //todo replace all with inlines
		PMAP_ZINFO_PFREE(map, PAGE_SIZE);
		return KERN_SUCCESS;
	}

#if 0 /* DEBUG */
	if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE)) {
		panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx",
		    map, map->pm_obj, vaddr, i);
	}
#endif
	vm_page_insert_wired(m, map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
	vm_object_unlock(map->pm_obj);

	/*
	 * Set the page directory entry for this page table.
	 */
	pdp = pmap_pde(map, vaddr);     /* refetch under lock */

	pmap_store_pte(is_ept, pdp, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));

	PMAP_UNLOCK_EXCLUSIVE(map);

	return KERN_SUCCESS;
}
2372 /*
2373 * Query a pmap to see what size a given virtual address is mapped with.
2374 * If the vaddr is not mapped, returns 0.
2375 */
2376 vm_size_t
pmap_query_pagesize(pmap_t pmap,vm_map_offset_t vaddr)2377 pmap_query_pagesize(
2378 pmap_t pmap,
2379 vm_map_offset_t vaddr)
2380 {
2381 pd_entry_t *pdep;
2382 vm_size_t size = 0;
2383
2384 assert(!is_ept_pmap(pmap));
2385 PMAP_LOCK_EXCLUSIVE(pmap);
2386
2387 pdep = pmap_pde(pmap, vaddr);
2388 if (pdep != PD_ENTRY_NULL) {
2389 if (*pdep & INTEL_PTE_PS) {
2390 size = I386_LPGBYTES;
2391 } else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) {
2392 size = I386_PGBYTES;
2393 }
2394 }
2395
2396 PMAP_UNLOCK_EXCLUSIVE(pmap);
2397
2398 return size;
2399 }
2400
/*
 * Ensure the page table hierarchy is filled in down to the large-page
 * (PDE) level for 'vaddr', allocating PDPT and page directory pages as
 * needed via pmap_next_page_hi() — i.e. from the early-boot pool, since
 * vm_page_grab() may not be available yet.  Panics if that pool is
 * exhausted.
 *
 * Returns KERN_FAILURE if a lower (4K) page table already exists under
 * the PDE; otherwise KERN_SUCCESS.
 *
 * Caller must hold the pmap lock exclusive.
 */
static kern_return_t
pmap_pre_expand_large_internal(
	pmap_t          pmap,
	vm_map_offset_t vaddr)
{
	ppnum_t         pn;
	pt_entry_t      *pte;
	boolean_t       is_ept = is_ept_pmap(pmap);
	kern_return_t   kr = KERN_SUCCESS;

	/* Fill in the PML4 entry (i.e. add a PDPT page) if missing. */
	if (pmap64_pdpt(pmap, vaddr) == PDPT_ENTRY_NULL) {
		if (!pmap_next_page_hi(&pn, FALSE)) {
			panic("pmap_pre_expand_large no PDPT");
		}

		pmap_zero_page(pn);

		pte = pmap64_pml4(pmap, vaddr);

		pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
		    PTE_READ(is_ept) |
		    (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
		    PTE_WRITE(is_ept));

		/* Mirror the entry into the user copy of the PML4. */
		pte = pmap64_user_pml4(pmap, vaddr);

		pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
		    PTE_READ(is_ept) |
		    (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
		    PTE_WRITE(is_ept));
	}

	/* Fill in the PDPT entry (i.e. add a page directory page) if missing. */
	if (pmap_pde(pmap, vaddr) == PD_ENTRY_NULL) {
		if (!pmap_next_page_hi(&pn, FALSE)) {
			panic("pmap_pre_expand_large no PDE");
		}

		pmap_zero_page(pn);

		pte = pmap64_pdpt(pmap, vaddr);

		pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
		    PTE_READ(is_ept) |
		    (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
		    PTE_WRITE(is_ept));
	} else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) {
		/* A 4K page table already hangs off the existing PDE. */
		kr = KERN_FAILURE;
	}

	return kr;
}
2457
2458 /*
2459 * Wrapper that locks the pmap.
2460 */
2461 kern_return_t
pmap_pre_expand_large(pmap_t pmap,vm_map_offset_t vaddr)2462 pmap_pre_expand_large(
2463 pmap_t pmap,
2464 vm_map_offset_t vaddr)
2465 {
2466 kern_return_t kr;
2467
2468 PMAP_LOCK_EXCLUSIVE(pmap);
2469 kr = pmap_pre_expand_large_internal(pmap, vaddr);
2470 PMAP_UNLOCK_EXCLUSIVE(pmap);
2471 return kr;
2472 }
2473
/*
 * On large memory machines, pmap_steal_memory() will allocate past
 * the 1GB of pre-allocated/mapped virtual kernel area. This function
 * expands the kernel's page tables to cover a given vaddr. It uses pages
 * from the same pool that pmap_steal_memory() uses, since vm_page_grab()
 * isn't available yet.
 */
void
pmap_pre_expand(
	pmap_t          pmap,
	vm_map_offset_t vaddr)
{
	ppnum_t         pn;
	pt_entry_t      *pte;
	boolean_t       is_ept = is_ept_pmap(pmap);

	/*
	 * This returns failure if a 4K page table already exists.
	 * Otherwise it fills in the page table hierarchy down
	 * to that level.
	 */
	PMAP_LOCK_EXCLUSIVE(pmap);
	if (pmap_pre_expand_large_internal(pmap, vaddr) == KERN_FAILURE) {
		/* Nothing to do: a 4K table already covers vaddr. */
		PMAP_UNLOCK_EXCLUSIVE(pmap);
		return;
	}

	/* Add the lowest table */
	if (!pmap_next_page_hi(&pn, FALSE)) {
		panic("pmap_pre_expand");
	}

	pmap_zero_page(pn);

	/* Point the (now guaranteed-present) PDE at the new page table. */
	pte = pmap_pde(pmap, vaddr);

	pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
	    PTE_READ(is_ept) |
	    (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
	    PTE_WRITE(is_ept));
	PMAP_UNLOCK_EXCLUSIVE(pmap);
}
2516
2517 /*
2518 * pmap_sync_page_data_phys(ppnum_t pa)
2519 *
2520 * Invalidates all of the instruction cache on a physical page and
2521 * pushes any dirty data from the data cache for the same physical page
2522 * Not required in i386.
2523 */
2524 void
pmap_sync_page_data_phys(__unused ppnum_t pa)2525 pmap_sync_page_data_phys(__unused ppnum_t pa)
2526 {
2527 return;
2528 }
2529
/*
 * pmap_sync_page_attributes_phys(ppnum_t pa)
 *
 * Write back and invalidate all cachelines on a physical page.
 * Delegates to the machine-level physical cache flush.
 */
void
pmap_sync_page_attributes_phys(ppnum_t pa)
{
	cache_flush_page_phys(pa);
}
2540
2541 void
pmap_copy_page(ppnum_t src,ppnum_t dst)2542 pmap_copy_page(ppnum_t src, ppnum_t dst)
2543 {
2544 bcopy_phys((addr64_t)i386_ptob(src),
2545 (addr64_t)i386_ptob(dst),
2546 PAGE_SIZE);
2547 }
2548
2549
/*
 *	Routine:	pmap_pageable
 *	Function:
 *		Make the specified pages (by pmap, offset)
 *		pageable (or not) as requested.
 *
 *		A page which is not pageable may not take
 *		a fault; therefore, its page table entry
 *		must remain valid for the duration.
 *
 *		This routine is merely advisory; pmap_enter
 *		will specify that these pages are to be wired
 *		down (or not) as appropriate.
 *
 *		No-op on this architecture.
 */
void
pmap_pageable(
	__unused pmap_t                 pmap,
	__unused vm_map_offset_t        start_addr,
	__unused vm_map_offset_t        end_addr,
	__unused boolean_t              pageable)
{
#ifdef  lint
	/* silence "unused parameter" lint complaints */
	pmap++; start_addr++; end_addr++; pageable++;
#endif  /* lint */
}
2575
2576 void
invalidate_icache(__unused vm_offset_t addr,__unused unsigned cnt,__unused int phys)2577 invalidate_icache(__unused vm_offset_t addr,
2578 __unused unsigned cnt,
2579 __unused int phys)
2580 {
2581 return;
2582 }
2583
2584 void
flush_dcache(__unused vm_offset_t addr,__unused unsigned count,__unused int phys)2585 flush_dcache(__unused vm_offset_t addr,
2586 __unused unsigned count,
2587 __unused int phys)
2588 {
2589 return;
2590 }
2591
2592 #if CONFIG_DTRACE
2593 /*
2594 * Constrain DTrace copyin/copyout actions
2595 */
2596 extern kern_return_t dtrace_copyio_preflight(addr64_t);
2597 extern kern_return_t dtrace_copyio_postflight(addr64_t);
2598
2599 kern_return_t
dtrace_copyio_preflight(__unused addr64_t va)2600 dtrace_copyio_preflight(__unused addr64_t va)
2601 {
2602 thread_t thread = current_thread();
2603 uint64_t ccr3;
2604 if (current_map() == kernel_map) {
2605 return KERN_FAILURE;
2606 } else if (((ccr3 = get_cr3_base()) != thread->map->pmap->pm_cr3) && (no_shared_cr3 == FALSE)) {
2607 return KERN_FAILURE;
2608 } else if (no_shared_cr3 && (ccr3 != kernel_pmap->pm_cr3)) {
2609 return KERN_FAILURE;
2610 } else {
2611 return KERN_SUCCESS;
2612 }
2613 }
2614
/*
 * Post-copyio hook for DTrace: no cleanup is required on this
 * architecture, so always succeed.
 */
kern_return_t
dtrace_copyio_postflight(__unused addr64_t va)
{
	return KERN_SUCCESS;
}
2620 #endif /* CONFIG_DTRACE */
2621
2622 #include <mach_vm_debug.h>
2623 #if MACH_VM_DEBUG
2624 #include <vm/vm_debug.h>
2625
/*
 * Debug interface for listing a pmap's resident pages.
 * Unimplemented here — always reports zero pages.
 */
int
pmap_list_resident_pages(
	__unused pmap_t         pmap,
	__unused vm_offset_t    *listp,
	__unused int            space)
{
	return 0;
}
2634 #endif /* MACH_VM_DEBUG */
2635
2636
2637 #if CONFIG_COREDUMP
/* temporary workaround */
/*
 * Report whether the page at 'va' in 'map' may be read for a coredump.
 * The PTE-based check below is disabled; currently always TRUE.
 */
boolean_t
coredumpok(__unused vm_map_t map, __unused mach_vm_offset_t va)
{
#if 0
	/* Disabled: would reject wired, non-cacheable mappings. */
	pt_entry_t *ptep;

	ptep = pmap_pte(map->pmap, va);
	if (0 == ptep) {
		return FALSE;
	}
	return (*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED);
#else
	return TRUE;
#endif
}
2654 #endif
2655
2656 boolean_t
phys_page_exists(ppnum_t pn)2657 phys_page_exists(ppnum_t pn)
2658 {
2659 assert(pn != vm_page_fictitious_addr);
2660
2661 if (!pmap_initialized) {
2662 return TRUE;
2663 }
2664
2665 if (pn == vm_page_guard_addr) {
2666 return FALSE;
2667 }
2668
2669 if (!IS_MANAGED_PAGE(ppn_to_pai(pn))) {
2670 return FALSE;
2671 }
2672
2673 return TRUE;
2674 }
2675
2676
2677
/*
 * Switch the current cpu's address space to 'tpmap' by loading its
 * page-table base via set_dirbase().  Must be called with interrupts
 * disabled.
 */
void
pmap_switch(pmap_t tpmap)
{
	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(tpmap));
	assert(ml_get_interrupts_enabled() == FALSE);
	set_dirbase(tpmap, current_thread(), cpu_number());
	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
2686
2687 void
pmap_require(pmap_t pmap)2688 pmap_require(pmap_t pmap)
2689 {
2690 if (pmap != kernel_pmap) {
2691 zone_id_require(ZONE_ID_PMAP, sizeof(struct pmap), pmap);
2692 }
2693 }
2694
/*
 * disable no-execute capability on
 * the specified pmap
 *
 * Only effective on DEVELOPMENT/DEBUG kernels; a no-op on RELEASE.
 */
void
pmap_disable_NX(__unused pmap_t pmap)
{
#if DEVELOPMENT || DEBUG
	pmap->nx_enabled = 0;
#endif
}
2706
2707 void
pmap_flush_context_init(pmap_flush_context * pfc)2708 pmap_flush_context_init(pmap_flush_context *pfc)
2709 {
2710 pfc->pfc_cpus = 0;
2711 pfc->pfc_invalid_global = 0;
2712 }
2713
/*
 * Determine whether remote cpu 'rcpu' has satisfied a TLB flush request
 * issued by local cpu 'lcpu'.  'ngflush' is true when the request
 * demanded a global flush (only a global-generation advance counts).
 *
 * A cpu has responded when its flush generation counter has advanced
 * past the snapshot 'lcpu' recorded when signalling, or when it
 * observably does not need to flush right now: its invalidation flag is
 * already clear, its CR3 is not active (it will flush lazily before
 * next use), or it is not running.
 */
static bool
pmap_tlbi_response(uint32_t lcpu, uint32_t rcpu, bool ngflush)
{
	bool responded = false;
	/* Has the remote global flush generation moved past our snapshot? */
	bool gflushed = (cpu_datap(rcpu)->cpu_tlb_invalid_global_count !=
	    cpu_datap(lcpu)->cpu_tlb_gen_counts_global[rcpu]);

	if (ngflush) {
		/* Global flush requested: only a global advance satisfies it. */
		if (gflushed) {
			responded = true;
		}
	} else {
		if (gflushed) {
			responded = true;
		} else {
			/* A local-generation advance also suffices here. */
			bool lflushed = (cpu_datap(rcpu)->cpu_tlb_invalid_local_count !=
			    cpu_datap(lcpu)->cpu_tlb_gen_counts_local[rcpu]);
			if (lflushed) {
				responded = true;
			}
		}
	}

	if (responded == false) {
		/* The cpu may not need to respond at all. */
		if ((cpu_datap(rcpu)->cpu_tlb_invalid == 0) ||
		    !CPU_CR3_IS_ACTIVE(rcpu) ||
		    !cpu_is_running(rcpu)) {
			responded = true;
		}
	}
	return responded;
}
2746
2747 extern uint64_t TLBTimeOut;
/*
 * Issue the TLB flushes previously deferred into 'pfc' by
 * pmap_flush_tlbs(..., PMAP_DELAY_TLB_FLUSH, pfc): mark and signal
 * every recorded cpu, flush the local TLB if it is among them, then
 * wait (bounded by TLBTimeOut, or LockTimeOut when unset) for the
 * others to acknowledge.  On timeout with TLBTimeOut configured, NMIs
 * the unresponsive cpus and panics.
 */
void
pmap_flush(
	pmap_flush_context *pfc)
{
	unsigned int    my_cpu;
	unsigned int    cpu;
	cpumask_t       cpu_bit;
	cpumask_t       cpus_to_respond = 0;
	cpumask_t       cpus_to_signal = 0;
	cpumask_t       cpus_signaled = 0;
	boolean_t       flush_self = FALSE;
	uint64_t        deadline;
	bool            need_global_flush = false;

	mp_disable_preemption();

	my_cpu = cpu_number();
	cpus_to_signal = pfc->pfc_cpus;

	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_START,
	    NULL, cpus_to_signal);

	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus && cpus_to_signal; cpu++, cpu_bit <<= 1) {
		if (cpus_to_signal & cpu_bit) {
			cpus_to_signal &= ~cpu_bit;

			if (!cpu_is_running(cpu)) {
				continue;
			}

			/* Mark the target's pending-invalidation state. */
			if (pfc->pfc_invalid_global & cpu_bit) {
				cpu_datap(cpu)->cpu_tlb_invalid_global = 1;
				need_global_flush = true;
			} else {
				cpu_datap(cpu)->cpu_tlb_invalid_local = 1;
			}
			/* Snapshot the target's generation counts so
			 * pmap_tlbi_response() can detect its progress. */
			cpu_datap(my_cpu)->cpu_tlb_gen_counts_global[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_global_count;
			cpu_datap(my_cpu)->cpu_tlb_gen_counts_local[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_local_count;
			mfence();       /* order flag/count stores before the IPI */

			if (cpu == my_cpu) {
				flush_self = TRUE;
				continue;
			}
			if (CPU_CR3_IS_ACTIVE(cpu)) {
				cpus_to_respond |= cpu_bit;
				i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
			}
		}
	}
	cpus_signaled = cpus_to_respond;

	/*
	 * Flush local tlb if required.
	 * Do this now to overlap with other processors responding.
	 */
	if (flush_self) {
		process_pmap_updates(NULL, (pfc->pfc_invalid_global != 0), 0ULL, ~0ULL);
	}

	if (cpus_to_respond) {
		deadline = mach_absolute_time() +
		    (TLBTimeOut ? TLBTimeOut : LockTimeOut);
		boolean_t is_timeout_traced = FALSE;

		/*
		 * Wait for those other cpus to acknowledge
		 */
		while (cpus_to_respond != 0) {
			long orig_acks = 0;

			for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
				bool responded = false;
				if ((cpus_to_respond & cpu_bit) != 0) {
					responded = pmap_tlbi_response(my_cpu, cpu, need_global_flush);
					if (responded) {
						cpus_to_respond &= ~cpu_bit;
					}
					cpu_pause();
				}

				if (cpus_to_respond == 0) {
					break;
				}
			}
			if (cpus_to_respond && (mach_absolute_time() > deadline)) {
				if (machine_timeout_suspended()) {
					continue;
				}
				if (TLBTimeOut == 0) {
					/* Default timeout: trace once and keep waiting. */
					if (is_timeout_traced) {
						continue;
					}

					PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
					    NULL, cpus_to_signal, cpus_to_respond);

					is_timeout_traced = TRUE;
					continue;
				}
				/* Explicit timeout configured: NMI stragglers and panic. */
				orig_acks = NMIPI_acks;
				NMIPI_panic(cpus_to_respond, TLB_FLUSH_TIMEOUT);
				panic("Uninterruptible processor(s): CPU bitmap: 0x%llx, NMIPI acks: 0x%lx, now: 0x%lx, deadline: %llu",
				    cpus_to_respond, orig_acks, NMIPI_acks, deadline);
			}
		}
	}

	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_END,
	    NULL, cpus_signaled, flush_self);

	mp_enable_preemption();
}
2861
2862
/*
 * Invalidate EPT-derived translations for the single EPT context 'eptp'
 * on the calling cpu, via the INVEPT instruction.  The descriptor must
 * be 16-byte aligned per the instruction's requirements.
 */
static void
invept(void *eptp)
{
	struct {
		uint64_t eptp;
		uint64_t reserved;      /* must be zero */
	} __attribute__((aligned(16), packed)) invept_descriptor = {(uint64_t)eptp, 0};

	/* %rcx = invalidation type, (%rax) = descriptor address */
	__asm__ volatile ("invept (%%rax), %%rcx"
                : : "c" (PMAP_INVEPT_SINGLE_CONTEXT), "a" (&invept_descriptor)
                : "cc", "memory");
}
2875
2876 /*
2877 * Called with pmap locked, we:
2878 * - scan through per-cpu data to see which other cpus need to flush
2879 * - send an IPI to each non-idle cpu to be flushed
2880 * - wait for all to signal back that they are inactive or we see that
2881 * they are at a safe point (idle).
2882 * - flush the local tlb if active for this pmap
2883 * - return ... the caller will unlock the pmap
2884 */
2885
/* Flush TLBs on all cpus using 'pmap' for the VA range [startv, endv).
 * With PMAP_DELAY_TLB_FLUSH in 'options', only record the target cpus
 * in 'pfc' for a later pmap_flush().  See the block comment above for
 * the overall protocol. */
void
pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv, int options, pmap_flush_context *pfc)
{
	unsigned int    cpu;
	cpumask_t       cpu_bit;
	cpumask_t       cpus_to_signal = 0;
	unsigned int    my_cpu = cpu_number();
	pmap_paddr_t    pmap_cr3 = pmap->pm_cr3;
	boolean_t       flush_self = FALSE;
	uint64_t        deadline;
	boolean_t       pmap_is_shared = (pmap->pm_shared || (pmap == kernel_pmap));
	bool            need_global_flush = false;
	uint32_t        event_code = 0;
	vm_map_offset_t event_startv = 0, event_endv = 0;
	boolean_t       is_ept = is_ept_pmap(pmap);

	assert((processor_avail_count < 2) ||
	    (ml_get_interrupts_enabled() && get_preemption_level() != 0));

	/* The range must be page-aligned and at least one page. */
	assert((endv - startv) >= PAGE_SIZE);
	assert(((endv | startv) & PAGE_MASK) == 0);

	if (__improbable(kdebug_enable)) {
		/* Select the trace event and address encoding for kdebug. */
		if (pmap == kernel_pmap) {
			event_code = PMAP_CODE(PMAP__FLUSH_KERN_TLBS);
			event_startv = VM_KERNEL_UNSLIDE_OR_PERM(startv);
			event_endv = VM_KERNEL_UNSLIDE_OR_PERM(endv);
		} else if (__improbable(is_ept)) {
			event_code = PMAP_CODE(PMAP__FLUSH_EPT);
			event_startv = startv;
			event_endv = endv;
		} else {
			event_code = PMAP_CODE(PMAP__FLUSH_TLBS);
			event_startv = startv;
			event_endv = endv;
		}
	}

	PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(pmap), options,
	    event_startv, event_endv);

	if (__improbable(is_ept)) {
		/* EPT: invalidate this EPT context on every cpu and finish. */
		mp_cpus_call(CPUMASK_ALL, ASYNC, invept, (void*)pmap->pm_eptp);
		goto out;
	}

	/*
	 * Scan other cpus for matching active or task CR3.
	 * For idle cpus (with no active map) we mark them invalid but
	 * don't signal -- they'll check as they go busy.
	 */
	if (pmap_pcid_ncpus) {
		if (pmap_is_shared) {
			need_global_flush = true;
		}
		pmap_pcid_invalidate_all_cpus(pmap);
		mfence();       /* order PCID invalidation before per-cpu scan */
	}

	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
		if (!cpu_is_running(cpu)) {
			continue;
		}
		uint64_t cpu_active_cr3 = CPU_GET_ACTIVE_CR3(cpu);
		uint64_t cpu_task_cr3 = CPU_GET_TASK_CR3(cpu);

		if ((pmap_cr3 == cpu_task_cr3) ||
		    (pmap_cr3 == cpu_active_cr3) ||
		    (pmap_is_shared)) {
			if (options & PMAP_DELAY_TLB_FLUSH) {
				/* Deferred mode: record the target in 'pfc' only. */
				if (need_global_flush == true) {
					pfc->pfc_invalid_global |= cpu_bit;
				}
				pfc->pfc_cpus |= cpu_bit;

				continue;
			}
			/* Snapshot the target's generation count and mark it
			 * pending so pmap_tlbi_response() can track progress. */
			if (need_global_flush == true) {
				cpu_datap(my_cpu)->cpu_tlb_gen_counts_global[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_global_count;
				cpu_datap(cpu)->cpu_tlb_invalid_global = 1;
			} else {
				cpu_datap(my_cpu)->cpu_tlb_gen_counts_local[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_local_count;
				cpu_datap(cpu)->cpu_tlb_invalid_local = 1;
			}

			if (cpu == my_cpu) {
				flush_self = TRUE;
				continue;
			}

			mfence();

			/*
			 * We don't need to signal processors which will flush
			 * lazily at the idle state or kernel boundary.
			 * For example, if we're invalidating the kernel pmap,
			 * processors currently in userspace don't need to flush
			 * their TLBs until the next time they enter the kernel.
			 * Alterations to the address space of a task active
			 * on a remote processor result in a signal, to
			 * account for copy operations. (There may be room
			 * for optimization in such cases).
			 * The order of the loads below with respect
			 * to the store to the "cpu_tlb_invalid" field above
			 * is important--hence the barrier.
			 */
			if (CPU_CR3_IS_ACTIVE(cpu) &&
			    (pmap_cr3 == CPU_GET_ACTIVE_CR3(cpu) ||
			    pmap->pm_shared ||
			    (pmap_cr3 == CPU_GET_TASK_CR3(cpu)))) {
				cpus_to_signal |= cpu_bit;
				i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
			}
		}
	}

	if ((options & PMAP_DELAY_TLB_FLUSH)) {
		goto out;
	}

	/*
	 * Flush local tlb if required.
	 * Do this now to overlap with other processors responding.
	 */
	if (flush_self) {
		process_pmap_updates(pmap, pmap_is_shared, startv, endv);
	}

	if (cpus_to_signal) {
		cpumask_t cpus_to_respond = cpus_to_signal;

		deadline = mach_absolute_time() +
		    (TLBTimeOut ? TLBTimeOut : LockTimeOut);
		boolean_t is_timeout_traced = FALSE;

		/*
		 * Wait for those other cpus to acknowledge
		 */
		while (cpus_to_respond != 0) {
			long orig_acks = 0;

			for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
				bool responded = false;
				if ((cpus_to_respond & cpu_bit) != 0) {
					responded = pmap_tlbi_response(my_cpu, cpu, need_global_flush);
					if (responded) {
						cpus_to_respond &= ~cpu_bit;
					}
					cpu_pause();
				}
				if (cpus_to_respond == 0) {
					break;
				}
			}
			if (cpus_to_respond && (mach_absolute_time() > deadline)) {
				if (machine_timeout_suspended()) {
					continue;
				}
				if (TLBTimeOut == 0) {
					/* cut tracepoint but don't panic */
					if (is_timeout_traced) {
						continue;
					}

					PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
					    VM_KERNEL_UNSLIDE_OR_PERM(pmap),
					    cpus_to_signal,
					    cpus_to_respond);

					is_timeout_traced = TRUE;
					continue;
				}
				/* Explicit timeout configured: NMI stragglers and panic. */
				orig_acks = NMIPI_acks;
				uint64_t tstamp1 = mach_absolute_time();
				NMIPI_panic(cpus_to_respond, TLB_FLUSH_TIMEOUT);
				uint64_t tstamp2 = mach_absolute_time();
				panic("IPI timeout, unresponsive CPU bitmap: 0x%llx, NMIPI acks: 0x%lx, now: 0x%lx, deadline: %llu, pre-NMIPI time: 0x%llx, current: 0x%llx, global: %d",
				    cpus_to_respond, orig_acks, NMIPI_acks, deadline, tstamp1, tstamp2, need_global_flush);
			}
		}
	}

	/* The kernel pmap is active everywhere, so the local flush must
	 * always have been taken. */
	if (__improbable((pmap == kernel_pmap) && (flush_self != TRUE))) {
		panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, pmap_cr3: 0x%llx, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, pmap_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map);
	}

out:
	PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(pmap), cpus_to_signal,
	    event_startv, event_endv);
}
3078
/*
 * Perform a pending TLB invalidation on the calling cpu for pmap 'p'
 * (NULL means no specific pmap) over the VA range [istart, iend).
 * 'pshared' forces a global invalidation.  Advances this cpu's flush
 * generation counters so waiters in pmap_flush()/pmap_flush_tlbs() can
 * observe completion.  Runs with interrupts off or preemption disabled.
 */
static void
process_pmap_updates(pmap_t p, bool pshared, addr64_t istart, addr64_t iend)
{
	int ccpu = cpu_number();
	bool gtlbf = false;

	pmap_assert(ml_get_interrupts_enabled() == 0 ||
	    get_preemption_level() != 0);

	/* Acknowledge the request: bump the matching generation count and
	 * clear the pending flag. */
	if (cpu_datap(ccpu)->cpu_tlb_invalid_global) {
		cpu_datap(ccpu)->cpu_tlb_invalid_global_count++;
		cpu_datap(ccpu)->cpu_tlb_invalid = 0;
		gtlbf = true;
	} else {
		cpu_datap(ccpu)->cpu_tlb_invalid_local_count++;
		cpu_datap(ccpu)->cpu_tlb_invalid_local = 0;
	}

	if (pmap_pcid_ncpus) {
		if (p) {
			/* TODO global generation count to
			 * avoid potentially redundant
			 * csw invalidations post-global invalidation
			 */
			pmap_pcid_validate_cpu(p, ccpu);
			pmap_tlbi_range(istart, iend, (pshared || gtlbf), p->pmap_pcid_cpus[ccpu]);
		} else {
			pmap_pcid_validate_current();
			pmap_tlbi_range(istart, iend, true, 0);
		}
	} else {
		/* No PCID support: flush everything, globally. */
		pmap_tlbi_range(0, ~0ULL, true, 0);
	}
}
3113
/*
 * Handler for the MP_TLB_FLUSH IPI: service any TLB invalidation
 * pending on this cpu with a full-range global flush.
 */
void
pmap_update_interrupt(void)
{
	PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START);

	if (current_cpu_datap()->cpu_tlb_invalid) {
		process_pmap_updates(NULL, true, 0ULL, ~0ULL);
	}

	PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END);
}
3125
3126 #include <mach/mach_vm.h> /* mach_vm_region_recurse() */
3127 /* Scan kernel pmap for W+X PTEs, scan kernel VM map for W+X map entries
3128 * and identify ranges with mismatched VM permissions and PTE permissions
3129 */
3130 kern_return_t
pmap_permissions_verify(pmap_t ipmap,vm_map_t ivmmap,vm_offset_t sv,vm_offset_t ev)3131 pmap_permissions_verify(pmap_t ipmap, vm_map_t ivmmap, vm_offset_t sv, vm_offset_t ev)
3132 {
3133 vm_offset_t cv = sv;
3134 kern_return_t rv = KERN_SUCCESS;
3135 uint64_t skip4 = 0, skip2 = 0;
3136
3137 assert(!is_ept_pmap(ipmap));
3138
3139 sv &= ~PAGE_MASK_64;
3140 ev &= ~PAGE_MASK_64;
3141 while (cv < ev) {
3142 if (__improbable((cv > 0x00007FFFFFFFFFFFULL) &&
3143 (cv < 0xFFFF800000000000ULL))) {
3144 cv = 0xFFFF800000000000ULL;
3145 }
3146 /* Potential inconsistencies from not holding pmap lock
3147 * but harmless for the moment.
3148 */
3149 if (((cv & PML4MASK) == 0) && (pmap64_pml4(ipmap, cv) == 0)) {
3150 if ((cv + NBPML4) > cv) {
3151 cv += NBPML4;
3152 } else {
3153 break;
3154 }
3155 skip4++;
3156 continue;
3157 }
3158 if (((cv & PDMASK) == 0) && (pmap_pde(ipmap, cv) == 0)) {
3159 if ((cv + NBPD) > cv) {
3160 cv += NBPD;
3161 } else {
3162 break;
3163 }
3164 skip2++;
3165 continue;
3166 }
3167
3168 pt_entry_t *ptep = pmap_pte(ipmap, cv);
3169 if (ptep && (*ptep & INTEL_PTE_VALID)) {
3170 if (*ptep & INTEL_PTE_WRITE) {
3171 if (!(*ptep & INTEL_PTE_NX)) {
3172 kprintf("W+X PTE at 0x%lx, P4: 0x%llx, P3: 0x%llx, P2: 0x%llx, PT: 0x%llx, VP: %u\n", cv, *pmap64_pml4(ipmap, cv), *pmap64_pdpt(ipmap, cv), *pmap_pde(ipmap, cv), *ptep, pmap_valid_page((ppnum_t)(i386_btop(pte_to_pa(*ptep)))));
3173 rv = KERN_FAILURE;
3174 }
3175 }
3176 }
3177 cv += PAGE_SIZE;
3178 }
3179 kprintf("Completed pmap scan\n");
3180 cv = sv;
3181
3182 struct vm_region_submap_info_64 vbr;
3183 mach_msg_type_number_t vbrcount = 0;
3184 mach_vm_size_t vmsize;
3185 vm_prot_t prot;
3186 uint32_t nesting_depth = 0;
3187 kern_return_t kret;
3188
3189 while (cv < ev) {
3190 for (;;) {
3191 vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64;
3192 if ((kret = mach_vm_region_recurse(ivmmap,
3193 (mach_vm_address_t *) &cv, &vmsize, &nesting_depth,
3194 (vm_region_recurse_info_t)&vbr,
3195 &vbrcount)) != KERN_SUCCESS) {
3196 break;
3197 }
3198
3199 if (vbr.is_submap) {
3200 nesting_depth++;
3201 continue;
3202 } else {
3203 break;
3204 }
3205 }
3206
3207 if (kret != KERN_SUCCESS) {
3208 break;
3209 }
3210
3211 prot = vbr.protection;
3212
3213 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) {
3214 kprintf("W+X map entry at address 0x%lx\n", cv);
3215 rv = KERN_FAILURE;
3216 }
3217
3218 if (prot) {
3219 vm_offset_t pcv;
3220 for (pcv = cv; pcv < cv + vmsize; pcv += PAGE_SIZE) {
3221 pt_entry_t *ptep = pmap_pte(ipmap, pcv);
3222 vm_prot_t tprot;
3223
3224 if ((ptep == NULL) || !(*ptep & INTEL_PTE_VALID)) {
3225 continue;
3226 }
3227 tprot = VM_PROT_READ;
3228 if (*ptep & INTEL_PTE_WRITE) {
3229 tprot |= VM_PROT_WRITE;
3230 }
3231 if ((*ptep & INTEL_PTE_NX) == 0) {
3232 tprot |= VM_PROT_EXECUTE;
3233 }
3234 if (tprot != prot) {
3235 kprintf("PTE/map entry permissions mismatch at address 0x%lx, pte: 0x%llx, protection: 0x%x\n", pcv, *ptep, prot);
3236 rv = KERN_FAILURE;
3237 }
3238 }
3239 }
3240 cv += vmsize;
3241 }
3242 return rv;
3243 }
3244
3245 #if MACH_ASSERT
3246 extern int pmap_ledgers_panic;
3247 extern int pmap_ledgers_panic_leeway;
3248
3249 static void
pmap_check_ledgers(pmap_t pmap)3250 pmap_check_ledgers(
3251 pmap_t pmap)
3252 {
3253 int pid;
3254 char *procname;
3255
3256 if (pmap->pmap_pid == 0) {
3257 /*
3258 * This pmap was not or is no longer fully associated
3259 * with a task (e.g. the old pmap after a fork()/exec() or
3260 * spawn()). Its "ledger" still points at a task that is
3261 * now using a different (and active) address space, so
3262 * we can't check that all the pmap ledgers are balanced here.
3263 *
3264 * If the "pid" is set, that means that we went through
3265 * pmap_set_process() in task_terminate_internal(), so
3266 * this task's ledger should not have been re-used and
3267 * all the pmap ledgers should be back to 0.
3268 */
3269 return;
3270 }
3271
3272 pid = pmap->pmap_pid;
3273 procname = pmap->pmap_procname;
3274
3275 vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
3276 }
3277
3278 void
pmap_set_process(pmap_t pmap,int pid,char * procname)3279 pmap_set_process(
3280 pmap_t pmap,
3281 int pid,
3282 char *procname)
3283 {
3284 if (pmap == NULL) {
3285 return;
3286 }
3287
3288 pmap->pmap_pid = pid;
3289 strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
3290 if (pmap_ledgers_panic_leeway) {
3291 /*
3292 * XXX FBDP
3293 * Some processes somehow trigger some issues that make
3294 * the pmap stats and ledgers go off track, causing
3295 * some assertion failures and ledger panics.
3296 * Turn off the sanity checks if we allow some ledger leeway
3297 * because of that. We'll still do a final check in
3298 * pmap_check_ledgers() for discrepancies larger than the
3299 * allowed leeway after the address space has been fully
3300 * cleaned up.
3301 */
3302 pmap->pmap_stats_assert = FALSE;
3303 ledger_disable_panic_on_negative(pmap->ledger,
3304 task_ledgers.phys_footprint);
3305 ledger_disable_panic_on_negative(pmap->ledger,
3306 task_ledgers.internal);
3307 ledger_disable_panic_on_negative(pmap->ledger,
3308 task_ledgers.internal_compressed);
3309 ledger_disable_panic_on_negative(pmap->ledger,
3310 task_ledgers.iokit_mapped);
3311 ledger_disable_panic_on_negative(pmap->ledger,
3312 task_ledgers.alternate_accounting);
3313 ledger_disable_panic_on_negative(pmap->ledger,
3314 task_ledgers.alternate_accounting_compressed);
3315 }
3316 }
3317 #endif /* MACH_ASSERT */
3318
3319
3320 #if DEVELOPMENT || DEBUG
3321 int pmap_pagezero_mitigation = 1;
3322 #endif
3323
3324 void
pmap_advise_pagezero_range(pmap_t lpmap,uint64_t low_bound)3325 pmap_advise_pagezero_range(pmap_t lpmap, uint64_t low_bound)
3326 {
3327 #if DEVELOPMENT || DEBUG
3328 if (pmap_pagezero_mitigation == 0) {
3329 lpmap->pagezero_accessible = FALSE;
3330 return;
3331 }
3332 #endif
3333 lpmap->pagezero_accessible = ((pmap_smap_enabled == FALSE) && (low_bound < 0x1000));
3334 if (lpmap == current_pmap()) {
3335 mp_disable_preemption();
3336 current_cpu_datap()->cpu_pagezero_mapped = lpmap->pagezero_accessible;
3337 mp_enable_preemption();
3338 }
3339 }
3340
3341 uintptr_t
pmap_verify_noncacheable(uintptr_t vaddr)3342 pmap_verify_noncacheable(uintptr_t vaddr)
3343 {
3344 pt_entry_t *ptep = NULL;
3345 ptep = pmap_pte(kernel_pmap, vaddr);
3346 if (ptep == NULL) {
3347 panic("pmap_verify_noncacheable: no translation for 0x%lx", vaddr);
3348 }
3349 /* Non-cacheable OK */
3350 if (*ptep & (INTEL_PTE_NCACHE)) {
3351 return pte_to_pa(*ptep) | (vaddr & INTEL_OFFMASK);
3352 }
3353 /* Write-combined OK */
3354 if (*ptep & (INTEL_PTE_PAT)) {
3355 return pte_to_pa(*ptep) | (vaddr & INTEL_OFFMASK);
3356 }
3357 panic("pmap_verify_noncacheable: IO read from a cacheable address? address: 0x%lx, PTE: %p, *PTE: 0x%llx", vaddr, ptep, *ptep);
3358 /*NOTREACHED*/
3359 return 0;
3360 }
3361
/* Stub: trust caches are not supported on this architecture; nothing to initialize. */
void
trust_cache_init(void)
{
	// Unsupported on this architecture.
}
3367
/* Stub: legacy trust caches are not supported on this architecture. */
kern_return_t
pmap_load_legacy_trust_cache(struct pmap_legacy_trust_cache __unused *trust_cache,
    const vm_size_t __unused trust_cache_len)
{
	// Unsupported on this architecture.
	return KERN_NOT_SUPPORTED;
}
3375
/* Stub: image4 trust caches are not supported on this architecture. */
pmap_tc_ret_t
pmap_load_image4_trust_cache(struct pmap_image4_trust_cache __unused *trust_cache,
    const vm_size_t __unused trust_cache_len,
    uint8_t const * __unused img4_manifest,
    const vm_size_t __unused img4_manifest_buffer_len,
    const vm_size_t __unused img4_manifest_actual_len,
    bool __unused dry_run)
{
	// Unsupported on this architecture.
	return PMAP_TC_UNKNOWN_FORMAT;
}
3387
3388
/* Stub: trust caches are not supported here, so no cache is ever loaded. */
bool
pmap_is_trust_cache_loaded(const uuid_t __unused uuid)
{
	// Unsupported on this architecture.
	return false;
}
3395
/* Stub: trust caches are not supported here, so no cdhash can match. */
bool
pmap_lookup_in_loaded_trust_caches(const uint8_t __unused cdhash[20])
{
	// Unsupported on this architecture.
	return false;
}
3402
3403 uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t __unused cdhash[20])3404 pmap_lookup_in_static_trust_cache(const uint8_t __unused cdhash[20])
3405 {
3406 // Unsupported on this architecture.
3407 return false;
3408 }
3409
/* Stub: PMAP_CS is not supported here; report an empty configuration. */
int
pmap_cs_configuration(void)
{
	// Unsupported on this architecture.
	return 0;
}
3416
3417 SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
3418 uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
3419
3420 void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])3421 pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
3422 {
3423 simple_lock(&pmap_compilation_service_cdhash_lock, LCK_GRP_NULL);
3424 memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
3425 simple_unlock(&pmap_compilation_service_cdhash_lock);
3426
3427 #if DEVELOPMENT || DEBUG
3428 printf("Added Compilation Service CDHash through the PMAP: 0x%02X 0x%02X 0x%02X 0x%02X\n", cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
3429 #endif
3430 }
3431
3432 bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])3433 pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
3434 {
3435 bool match = false;
3436
3437 simple_lock(&pmap_compilation_service_cdhash_lock, LCK_GRP_NULL);
3438 if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
3439 match = true;
3440 }
3441 simple_unlock(&pmap_compilation_service_cdhash_lock);
3442
3443 #if DEVELOPMENT || DEBUG
3444 if (match) {
3445 printf("Matched Compilation Service CDHash through the PMAP\n");
3446 }
3447 #endif
3448
3449 return match;
3450 }
3451
3452 static bool pmap_local_signing_public_key_set = false;
3453 static uint8_t pmap_local_signing_public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE] = { 0 };
3454
/* Atomically observe whether the local signing public key has been published. */
static bool
pmap_local_signing_public_key_is_set(void)
{
	return os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
}
3460
3461 void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE])3462 pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE])
3463 {
3464 bool key_set = false;
3465
3466 /*
3467 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
3468 * a successful exchange means that the local signing public key has _not_ been
3469 * set. In case the key has been set, we panic as we would never expect the
3470 * kernel to attempt to set the key more than once.
3471 */
3472 key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
3473
3474 if (key_set) {
3475 panic("attempted to set the local signing public key multiple times");
3476 }
3477
3478 memcpy(pmap_local_signing_public_key, public_key, PMAP_ECC_P384_PUBLIC_KEY_SIZE);
3479
3480 #if DEVELOPMENT || DEBUG
3481 printf("Set local signing public key\n");
3482 #endif
3483 }
3484
3485 uint8_t*
pmap_get_local_signing_public_key(void)3486 pmap_get_local_signing_public_key(void)
3487 {
3488 if (pmap_local_signing_public_key_is_set()) {
3489 return pmap_local_signing_public_key;
3490 }
3491 return NULL;
3492 }
3493
/* Stub: local-signing restrictions are not enforced on this architecture. */
void
pmap_unrestrict_local_signing(
	__unused const uint8_t cdhash[CS_CDHASH_LEN])
{
	// TODO: Once all changes across XNU and AMFI have been submitted, panic.
}
3500
/*
 * Entitlement queries through the pmap are not available on this platform;
 * any call is a programming error and panics unconditionally (so no return
 * value is ever produced).  The compile-time guard catches callers that did
 * not check PMAP_SUPPORTS_ENTITLEMENT_CHECKS first.
 */
bool
pmap_query_entitlements(
	__unused pmap_t pmap,
	__unused CEQuery_t query,
	__unused size_t queryLength,
	__unused CEQueryContext_t finalContext)
{
#if !PMAP_SUPPORTS_ENTITLEMENT_CHECKS
	panic("PMAP_CS: do not use this API without checking for \'#if PMAP_SUPPORTS_ENTITLEMENT_CHECKS\'");
#endif

	panic("PMAP_SUPPORTS_ENTITLEMENT_CHECKS should not be defined on this platform");
}
3514
/* PMAP_CS code-signing enforcement is never enabled on this architecture. */
bool
pmap_cs_enabled(void)
{
	return false;
}
3520
/* There is no PPL on this architecture, so execution is never inside it. */
bool
pmap_in_ppl(void)
{
	// Nonexistent on this architecture.
	return false;
}
3527
/* This architecture has no Page Protection Layer. */
bool
pmap_has_ppl(void)
{
	// Not supported on this architecture.
	return false;
}
3534
/* Stub: pmap-held image4 data does not exist here; always panics. */
void* __attribute__((noreturn))
pmap_image4_pmap_data(
	__unused size_t *allocated_size)
{
	panic("PMAP_IMG4: image4 data not available on this architecture");
}
3541
/* Stub: the image4 nonce-set API is unavailable here; always panics. */
void __attribute__((noreturn))
pmap_image4_set_nonce(
	__unused const img4_nonce_domain_index_t ndi,
	__unused const img4_nonce_t *nonce)
{
	panic("PMAP_IMG4: set nonce API not supported on this architecture");
}
3549
/* Stub: the image4 nonce-roll API is unavailable here; always panics. */
void __attribute__((noreturn))
pmap_image4_roll_nonce(
	__unused const img4_nonce_domain_index_t ndi)
{
	panic("PMAP_IMG4: roll nonce API not supported on this architecture");
}
3556
/* Stub: the image4 nonce-copy API is unavailable here; always panics. */
errno_t __attribute__((noreturn))
pmap_image4_copy_nonce(
	__unused const img4_nonce_domain_index_t ndi,
	__unused img4_nonce_t *nonce_out
	)
{
	panic("PMAP_IMG4: copy nonce API not supported on this architecture");
}
3565
/* Stub: image4 object execution is unavailable here; always panics. */
errno_t __attribute__((noreturn))
pmap_image4_execute_object(
	__unused img4_runtime_object_spec_index_t obj_spec_index,
	__unused const img4_buff_t *payload,
	__unused const img4_buff_t *_Nullable manifest)
{
	panic("PMAP_IMG4: execute object API not supported on this architecture");
}
3574
/* Stub: image4 object copy-out is unavailable here; always panics. */
errno_t __attribute__((noreturn))
pmap_image4_copy_object(
	__unused img4_runtime_object_spec_index_t obj_spec_index,
	__unused vm_address_t object_out,
	__unused size_t *object_length)
{
	panic("PMAP_IMG4: copy object API not supported on this architecture");
}
3583
/* Stub: image4 slab lockdown is a PPL feature absent on this architecture. */
void
pmap_lockdown_image4_slab(__unused vm_offset_t slab, __unused vm_size_t slab_len, __unused uint64_t flags)
{
	// Unsupported on this architecture.
}
3589
/* Stub: late image4 slab lockdown is a PPL feature absent on this architecture. */
void
pmap_lockdown_image4_late_slab(__unused vm_offset_t slab, __unused vm_size_t slab_len, __unused uint64_t flags)
{
	// Unsupported on this architecture.
}
3595
/* Stub: PMAP_CS is absent here, so allowing invalid pages trivially succeeds. */
kern_return_t
pmap_cs_allow_invalid(__unused pmap_t pmap)
{
	// Unsupported on this architecture.
	return KERN_SUCCESS;
}
3602
/* Stub: no PPL page reserve exists on this architecture. */
void *
pmap_claim_reserved_ppl_page(void)
{
	// Unsupported on this architecture.
	return NULL;
}
3609
/* Stub: no PPL page reserve exists on this architecture; nothing to free. */
void
pmap_free_reserved_ppl_page(void __unused *kva)
{
	// Unsupported on this architecture.
}
3615
/* Stub: with PMAP_CS disabled on x86_64, fork preparation trivially succeeds. */
kern_return_t
pmap_cs_fork_prepare(__unused pmap_t old_pmap, __unused pmap_t new_pmap)
{
	// PMAP_CS isn't enabled for x86_64.
	return KERN_SUCCESS;
}
3622
3623 #if DEVELOPMENT || DEBUG
3624 /*
3625 * Used for unit testing recovery from text corruptions.
3626 */
/*
 * Deliberately corrupt the text page at physical address pa by writing a
 * UD2 instruction (0x0f 0x0b) at its start, for testing text-corruption
 * recovery.  Fails for pages the pmap layer does not manage.
 */
kern_return_t
pmap_test_text_corruption(pmap_paddr_t pa)
{
	int pai;
	uint8_t *va;

	pai = ppn_to_pai(atop(pa));
	/* Refuse to scribble on pages outside the managed range. */
	if (!IS_MANAGED_PAGE(pai)) {
		return KERN_FAILURE;
	}

	/* Write through the physical map window rather than any VA alias. */
	va = (uint8_t *)PHYSMAP_PTOV(pa);
	va[0] = 0x0f; /* opcode for UD2 */
	va[1] = 0x0b;

	return KERN_SUCCESS;
}
3644 #endif /* DEVELOPMENT || DEBUG */
3645