1 /*
2 * Copyright (c) 2000-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58
59 /*
60 * File: pmap.c
61 * Author: Avadis Tevanian, Jr., Michael Wayne Young
62 * (These guys wrote the Vax version)
63 *
64 * Physical Map management code for Intel i386, i486, and i860.
65 *
66 * Manages physical address maps.
67 *
68 * In addition to hardware address maps, this
69 * module is called upon to provide software-use-only
70 * maps which may or may not be stored in the same
71 * form as hardware maps. These pseudo-maps are
72 * used to store intermediate results from copy
73 * operations to and from address spaces.
74 *
75 * Since the information managed by this module is
76 * also stored by the logical address mapping module,
77 * this module may throw away valid virtual-to-physical
78 * mappings at almost any time. However, invalidations
79 * of virtual-to-physical mappings must be done as
80 * requested.
81 *
82 * In order to cope with hardware architectures which
83 * make virtual-to-physical map invalidates expensive,
84 * this module may delay invalidate or reduced protection
85 * operations until such time as they are actually
86 * necessary. This module is given full information as
87 * to which processors are currently using which maps,
88 * and to when physical maps must be made correct.
89 */
90
91 #include <string.h>
92 #include <mach_ldebug.h>
93
94 #include <libkern/OSAtomic.h>
95
96 #include <mach/machine/vm_types.h>
97
98 #include <mach/boolean.h>
99 #include <kern/thread.h>
100 #include <kern/zalloc.h>
101 #include <kern/zalloc_internal.h>
102 #include <kern/queue.h>
103 #include <kern/ledger.h>
104 #include <kern/mach_param.h>
105
106 #include <kern/spl.h>
107
108 #include <vm/pmap.h>
109 #include <vm/pmap_cs.h>
110 #include <vm/vm_map.h>
111 #include <vm/vm_kern.h>
112 #include <mach/vm_param.h>
113 #include <mach/vm_prot.h>
114 #include <vm/vm_object.h>
115 #include <vm/vm_page.h>
116
117 #include <mach/machine/vm_param.h>
118 #include <machine/thread.h>
119
120 #include <kern/misc_protos.h> /* prototyping */
121 #include <i386/misc_protos.h>
122 #include <i386/i386_lowmem.h>
123 #include <x86_64/lowglobals.h>
124
125 #include <i386/cpuid.h>
126 #include <i386/cpu_data.h>
127 #include <i386/cpu_number.h>
128 #include <i386/machine_cpu.h>
129 #include <i386/seg.h>
130 #include <i386/serial_io.h>
131 #include <i386/cpu_capabilities.h>
132 #include <i386/machine_routines.h>
133 #include <i386/proc_reg.h>
134 #include <i386/tsc.h>
135 #include <i386/pmap_internal.h>
136 #include <i386/pmap_pcid.h>
137 #if CONFIG_VMX
138 #include <i386/vmx/vmx_cpu.h>
139 #endif
140
141 #include <vm/vm_protos.h>
142 #include <san/kasan.h>
143
144 #include <i386/mp.h>
145 #include <i386/mp_desc.h>
146 #include <libkern/kernel_mach_header.h>
147
148 #include <pexpert/i386/efi.h>
149 #include <libkern/section_keywords.h>
150 #if MACH_ASSERT
151 int pmap_stats_assert = 1;
152 #endif /* MACH_ASSERT */
153
154 #ifdef IWANTTODEBUG
155 #undef DEBUG
156 #define DEBUG 1
157 #define POSTCODE_DELAY 1
158 #include <i386/postcode.h>
159 #endif /* IWANTTODEBUG */
160
161 #ifdef PMAP_DEBUG
162 #define DBG(x...) kprintf("DBG: " x)
163 #else
164 #define DBG(x...)
165 #endif
166 /* Compile time assert to ensure adjacency/alignment of per-CPU data fields used
167 * in the trampolines for kernel/user boundary TLB coherency.
168 */
169 char pmap_cpu_data_assert[(((offsetof(cpu_data_t, cpu_tlb_invalid) - offsetof(cpu_data_t, cpu_active_cr3)) == 8) && (offsetof(cpu_data_t, cpu_active_cr3) % 64 == 0)) ? 1 : -1];
170 boolean_t pmap_trace = FALSE;
171
172 boolean_t no_shared_cr3 = DEBUG; /* TRUE for DEBUG by default */
173
174 #if DEVELOPMENT || DEBUG
175 int nx_enabled = 1; /* enable no-execute protection -- set during boot */
176 #else
177 const int nx_enabled = 1;
178 #endif
179
180 #if DEBUG || DEVELOPMENT
181 int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
182 int allow_stack_exec = 0; /* No apps may execute from the stack by default */
183 #else /* DEBUG || DEVELOPMENT */
184 const int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
185 const int allow_stack_exec = 0; /* No apps may execute from the stack by default */
186 #endif /* DEBUG || DEVELOPMENT */
187
188 uint64_t max_preemption_latency_tsc = 0;
189
190 pv_hashed_entry_t *pv_hash_table; /* hash lists */
191
192 uint32_t npvhashmask = 0, npvhashbuckets = 0;
193
194 pv_hashed_entry_t pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
195 pv_hashed_entry_t pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
196 SIMPLE_LOCK_DECLARE(pv_hashed_free_list_lock, 0);
197 SIMPLE_LOCK_DECLARE(pv_hashed_kern_free_list_lock, 0);
198 SIMPLE_LOCK_DECLARE(pv_hash_table_lock, 0);
199 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
200
201 SECURITY_READ_ONLY_LATE(zone_t) pv_hashed_list_zone; /* zone of pv_hashed_entry structures */
202
203 /*
204 * First and last physical addresses that we maintain any information
205 * for. Initialized to zero so that pmap operations done before
206 * pmap_init won't touch any non-existent structures.
207 */
208 boolean_t pmap_initialized = FALSE;/* Has pmap_init completed? */
209
210 static struct vm_object kptobj_object_store VM_PAGE_PACKED_ALIGNED;
211 static struct vm_object kpml4obj_object_store VM_PAGE_PACKED_ALIGNED;
212 static struct vm_object kpdptobj_object_store VM_PAGE_PACKED_ALIGNED;
213
214 /*
 * Array of physical page attributes for managed pages.
216 * One byte per physical page.
217 */
218 char *pmap_phys_attributes;
219 ppnum_t last_managed_page = 0;
220
221 unsigned pmap_memory_region_count;
222 unsigned pmap_memory_region_current;
223
224 pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
225
226 /*
227 * Other useful macros.
228 */
229 #define current_pmap() (vm_map_pmap(current_thread()->map))
230
231 struct pmap kernel_pmap_store;
232 const pmap_t kernel_pmap = &kernel_pmap_store;
233 SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */
234 SECURITY_READ_ONLY_LATE(zone_t) pmap_anchor_zone;
235 SECURITY_READ_ONLY_LATE(zone_t) pmap_uanchor_zone;
236 int pmap_debug = 0; /* flag for debugging prints */
237
238 unsigned int inuse_ptepages_count = 0;
239 long long alloc_ptepages_count __attribute__((aligned(8))) = 0; /* aligned for atomic access */
240 unsigned int bootstrap_wired_pages = 0;
241
242 extern long NMIPI_acks;
243
244 SECURITY_READ_ONLY_LATE(boolean_t) kernel_text_ps_4K = TRUE;
245
246 extern char end;
247
248 static int nkpt;
249
250 #if DEVELOPMENT || DEBUG
251 SECURITY_READ_ONLY_LATE(boolean_t) pmap_disable_kheap_nx = FALSE;
252 SECURITY_READ_ONLY_LATE(boolean_t) pmap_disable_kstack_nx = FALSE;
253 SECURITY_READ_ONLY_LATE(boolean_t) wpkernel = TRUE;
254 #else
255 const boolean_t wpkernel = TRUE;
256 #endif
257
258 extern long __stack_chk_guard[];
259
260 static uint64_t pmap_eptp_flags = 0;
261 boolean_t pmap_ept_support_ad = FALSE;
262
263 static void process_pmap_updates(pmap_t, bool, addr64_t, addr64_t);
264 /*
265 * Map memory at initialization. The physical addresses being
266 * mapped are not managed and are never unmapped.
267 *
268 * For now, VM is already on, we only need to map the
269 * specified memory.
270 */
271 vm_offset_t
pmap_map(vm_offset_t virt,vm_map_offset_t start_addr,vm_map_offset_t end_addr,vm_prot_t prot,unsigned int flags)272 pmap_map(
273 vm_offset_t virt,
274 vm_map_offset_t start_addr,
275 vm_map_offset_t end_addr,
276 vm_prot_t prot,
277 unsigned int flags)
278 {
279 kern_return_t kr;
280 int ps;
281
282 ps = PAGE_SIZE;
283 while (start_addr < end_addr) {
284 kr = pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
285 (ppnum_t) i386_btop(start_addr), prot, VM_PROT_NONE, flags, TRUE);
286
287 if (kr != KERN_SUCCESS) {
288 panic("%s: failed pmap_enter, "
289 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
290 __FUNCTION__,
291 (void *)virt, (void *)start_addr, (void *)end_addr, prot, flags);
292 }
293
294 virt += ps;
295 start_addr += ps;
296 }
297 return virt;
298 }
299
300 extern char *first_avail;
301 extern vm_offset_t virtual_avail, virtual_end;
302 extern pmap_paddr_t avail_start, avail_end;
303 extern vm_offset_t sHIB;
304 extern vm_offset_t eHIB;
305 extern vm_offset_t stext;
306 extern vm_offset_t etext;
307 extern vm_offset_t sdata, edata;
308 extern vm_offset_t sconst, econst;
309
310 extern void *KPTphys;
311
312 boolean_t pmap_smep_enabled = FALSE;
313 boolean_t pmap_smap_enabled = FALSE;
314
/*
 * Per-CPU pmap initialization, called on each processor during bringup.
 * Programs the CPU's paging control state (CR4.PGE, and CR4.SMEP/SMAP
 * where the CPUID leaf-7 features advertise support), seeds the per-cpu
 * TLB/cr3 bookkeeping used by the kernel/user trampolines, and
 * configures PCID support.
 */
void
pmap_cpu_init(void)
{
	cpu_data_t      *cdp = current_cpu_datap();

	/* Enable global pages: kernel TLB entries survive CR3 reloads. */
	set_cr4(get_cr4() | CR4_PGE);

	/*
	 * Initialize the per-cpu, TLB-related fields.
	 */
	cdp->cpu_kernel_cr3 = kernel_pmap->pm_cr3;
	/* Mirror into the shadow per-cpu area referenced by the trampolines. */
	cpu_shadowp(cdp->cpu_number)->cpu_kernel_cr3 = cdp->cpu_kernel_cr3;
	cdp->cpu_active_cr3 = kernel_pmap->pm_cr3;
	cdp->cpu_tlb_invalid = 0;
	cdp->cpu_task_map = TASK_MAP_64BIT;

	pmap_pcid_configure();
	if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMEP) {
		pmap_smep_enabled = TRUE;
#if DEVELOPMENT || DEBUG
		boolean_t nsmep;
		/* Boot-arg override: allow disabling SMEP on DEBUG/DEVELOPMENT kernels. */
		if (PE_parse_boot_argn("-pmap_smep_disable", &nsmep, sizeof(nsmep))) {
			pmap_smep_enabled = FALSE;
		}
#endif
		if (pmap_smep_enabled) {
			set_cr4(get_cr4() | CR4_SMEP);
		}
	}
	if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMAP) {
		pmap_smap_enabled = TRUE;
#if DEVELOPMENT || DEBUG
		boolean_t nsmap;
		/* Boot-arg override: allow disabling SMAP on DEBUG/DEVELOPMENT kernels. */
		if (PE_parse_boot_argn("-pmap_smap_disable", &nsmap, sizeof(nsmap))) {
			pmap_smap_enabled = FALSE;
		}
#endif
		if (pmap_smap_enabled) {
			set_cr4(get_cr4() | CR4_SMAP);
		}
	}

#if !MONOTONIC
	/* Re-enable fixed performance counters if they were in use on this CPU. */
	if (cdp->cpu_fixed_pmcs_enabled) {
		boolean_t enable = TRUE;
		cpu_pmc_control(&enable);
	}
#endif /* !MONOTONIC */
}
364
365 static void
pmap_ro_zone_validate_element_dst(zone_id_t zid,vm_offset_t va,vm_offset_t offset,vm_size_t new_data_size)366 pmap_ro_zone_validate_element_dst(
367 zone_id_t zid,
368 vm_offset_t va,
369 vm_offset_t offset,
370 vm_size_t new_data_size)
371 {
372 if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
373 panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
374 ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
375 }
376
377 vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
378
379 /* Check element is from correct zone and properly aligned */
380 zone_require_ro(zid, elem_size, (void*)va);
381
382 if (__improbable(new_data_size > (elem_size - offset))) {
383 panic("%s: New data size %lu too large for elem size %lu at addr %p",
384 __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
385 }
386 if (__improbable(offset >= elem_size)) {
387 panic("%s: Offset %lu too large for elem size %lu at addr %p",
388 __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
389 }
390 }
391
392 static void
pmap_ro_zone_validate_element(zone_id_t zid,vm_offset_t va,vm_offset_t offset,const vm_offset_t new_data,vm_size_t new_data_size)393 pmap_ro_zone_validate_element(
394 zone_id_t zid,
395 vm_offset_t va,
396 vm_offset_t offset,
397 const vm_offset_t new_data,
398 vm_size_t new_data_size)
399 {
400 vm_offset_t sum = 0;
401
402 if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
403 panic("%s: Integer addition overflow %p + %lu = %lu",
404 __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
405 }
406
407 pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
408 }
409
410 void
pmap_ro_zone_memcpy(zone_id_t zid,vm_offset_t va,vm_offset_t offset,const vm_offset_t new_data,vm_size_t new_data_size)411 pmap_ro_zone_memcpy(
412 zone_id_t zid,
413 vm_offset_t va,
414 vm_offset_t offset,
415 const vm_offset_t new_data,
416 vm_size_t new_data_size)
417 {
418 const pmap_paddr_t pa = kvtophys(va + offset);
419
420 if (!new_data || new_data_size == 0) {
421 return;
422 }
423
424 pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
425 /* Write through Physical Aperture */
426 memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
427 }
428
429 uint64_t
pmap_ro_zone_atomic_op(zone_id_t zid,vm_offset_t va,vm_offset_t offset,zro_atomic_op_t op,uint64_t value)430 pmap_ro_zone_atomic_op(
431 zone_id_t zid,
432 vm_offset_t va,
433 vm_offset_t offset,
434 zro_atomic_op_t op,
435 uint64_t value)
436 {
437 const pmap_paddr_t pa = kvtophys(va + offset);
438 vm_size_t value_size = op & 0xf;
439
440 pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
441 /* Write through Physical Aperture */
442 return __zalloc_ro_mut_atomic(phystokv(pa), op, value);
443 }
444
445 void
pmap_ro_zone_bzero(zone_id_t zid,vm_offset_t va,vm_offset_t offset,vm_size_t size)446 pmap_ro_zone_bzero(
447 zone_id_t zid,
448 vm_offset_t va,
449 vm_offset_t offset,
450 vm_size_t size)
451 {
452 const pmap_paddr_t pa = kvtophys(va + offset);
453 pmap_ro_zone_validate_element(zid, va, offset, 0, size);
454 bzero((void*)phystokv(pa), size);
455 }
456
457 static uint32_t
pmap_scale_shift(void)458 pmap_scale_shift(void)
459 {
460 uint32_t scale = 0;
461
462 if (sane_size <= 8 * GB) {
463 scale = (uint32_t)(sane_size / (2 * GB));
464 } else if (sane_size <= 32 * GB) {
465 scale = 4 + (uint32_t)((sane_size - (8 * GB)) / (4 * GB));
466 } else {
467 scale = 10 + (uint32_t)MIN(4, ((sane_size - (32 * GB)) / (8 * GB)));
468 }
469 return scale;
470 }
471
472 LCK_GRP_DECLARE(pmap_lck_grp, "pmap");
473 LCK_ATTR_DECLARE(pmap_lck_rw_attr, 0, LCK_ATTR_DEBUG);
474
475 /*
476 * Bootstrap the system enough to run with virtual memory.
477 * Map the kernel's code and data, and allocate the system page table.
478 * Called with mapping OFF. Page_size must already be set.
479 */
480
void
pmap_bootstrap(
	__unused vm_offset_t    load_start,
	__unused boolean_t      IA32e)
{
	/* Only 64-bit (IA-32e) boot is supported. */
	assert(IA32e);

	vm_last_addr = VM_MAX_KERNEL_ADDRESS;   /* Set the highest address
	                                         * known to VM */
	/*
	 * The kernel's pmap is statically allocated so we don't
	 * have to use pmap_create, which is unlikely to work
	 * correctly at this part of the boot sequence.
	 */

	os_ref_init(&kernel_pmap->ref_count, NULL);
#if DEVELOPMENT || DEBUG
	kernel_pmap->nx_enabled = TRUE;
#endif
	kernel_pmap->pm_task_map = TASK_MAP_64BIT;
	kernel_pmap->pm_obj = (vm_object_t) NULL;
	/* Both kernel and user top-level tables point at the boot IdlePML4. */
	kernel_pmap->pm_pml4 = IdlePML4;
	kernel_pmap->pm_upml4 = IdlePML4;
	kernel_pmap->pm_cr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
	kernel_pmap->pm_ucr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
	kernel_pmap->pm_eptp = 0;

	pmap_pcid_initialize_kernel(kernel_pmap);

	/* Seed this CPU's (and its shadow area's) kernel cr3. */
	current_cpu_datap()->cpu_kernel_cr3 = cpu_shadowp(cpu_number())->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;

	/* Account for the NKPT bootstrap page-table pages. */
	nkpt = NKPT;
	OSAddAtomic(NKPT, &inuse_ptepages_count);
	OSAddAtomic64(NKPT, &alloc_ptepages_count);
	bootstrap_wired_pages = NKPT;

	virtual_avail = (vm_offset_t)(VM_MIN_KERNEL_ADDRESS) + (vm_offset_t)first_avail;
	virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);

	/*
	 * Size the PV hash: boot-arg "npvhash" overrides; otherwise scale
	 * the default bucket count with physical memory size.
	 */
	if (!PE_parse_boot_argn("npvhash", &npvhashmask, sizeof(npvhashmask))) {
		npvhashmask = ((NPVHASHBUCKETS) << pmap_scale_shift()) - 1;
	}

	npvhashbuckets = npvhashmask + 1;

	/* The mask must be of the form (2^N)-1 for the hash to work. */
	if (0 != ((npvhashbuckets) & npvhashmask)) {
		panic("invalid hash %d, must be ((2^N)-1), "
		    "using default %d\n", npvhashmask, NPVHASHMASK);
	}

	lck_rw_init(&kernel_pmap->pmap_rwl, &pmap_lck_grp, &pmap_lck_rw_attr);
	kernel_pmap->pmap_rwl.lck_rw_can_sleep = FALSE;

	/* Per-CPU setup for the boot processor (CR4, PCID, SMEP/SMAP). */
	pmap_cpu_init();

	if (pmap_pcid_ncpus) {
		printf("PMAP: PCID enabled\n");
	}

	if (pmap_smep_enabled) {
		printf("PMAP: Supervisor Mode Execute Protection enabled\n");
	}
	if (pmap_smap_enabled) {
		printf("PMAP: Supervisor Mode Access Protection enabled\n");
	}

#if DEBUG
	printf("Stack canary: 0x%lx\n", __stack_chk_guard[0]);
	printf("early_random(): 0x%qx\n", early_random());
#endif
#if DEVELOPMENT || DEBUG
	boolean_t ptmp;
	/* Check if the user has requested disabling stack or heap no-execute
	 * enforcement. These are "const" variables; that qualifier is cast away
	 * when altering them. The TEXT/DATA const sections are marked
	 * write protected later in the kernel startup sequence, so altering
	 * them is possible at this point, in pmap_bootstrap().
	 */
	if (PE_parse_boot_argn("-pmap_disable_kheap_nx", &ptmp, sizeof(ptmp))) {
		boolean_t *pdknxp = (boolean_t *) &pmap_disable_kheap_nx;
		*pdknxp = TRUE;
	}

	if (PE_parse_boot_argn("-pmap_disable_kstack_nx", &ptmp, sizeof(ptmp))) {
		boolean_t *pdknhp = (boolean_t *) &pmap_disable_kstack_nx;
		*pdknhp = TRUE;
	}
#endif /* DEVELOPMENT || DEBUG */

	boot_args *args = (boot_args *)PE_state.bootArgs;
	/* 32-bit EFI firmware limits addressable kernel virtual space. */
	if (args->efiMode == kBootArgsEfiMode32) {
		printf("EFI32: kernel virtual space limited to 4GB\n");
		virtual_end = VM_MAX_KERNEL_ADDRESS_EFI32;
	}
	kprintf("Kernel virtual space from 0x%lx to 0x%lx.\n",
	    (long)KERNEL_BASE, (long)virtual_end);
	kprintf("Available physical space from 0x%llx to 0x%llx\n",
	    avail_start, avail_end);

	/*
	 * The -no_shared_cr3 boot-arg is a debugging feature (set by default
	 * in the DEBUG kernel) to force the kernel to switch to its own map
	 * (and cr3) when control is in kernelspace. The kernel's map does not
	 * include (i.e. share) userspace so wild references will cause
	 * a panic. Only copyin and copyout are exempt from this.
	 */
	(void) PE_parse_boot_argn("-no_shared_cr3",
	    &no_shared_cr3, sizeof(no_shared_cr3));
	if (no_shared_cr3) {
		kprintf("Kernel not sharing user map\n");
	}

#ifdef PMAP_TRACES
	if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof(pmap_trace))) {
		kprintf("Kernel traces for pmap operations enabled\n");
	}
#endif /* PMAP_TRACES */

#if MACH_ASSERT
	PE_parse_boot_argn("pmap_asserts", &pmap_asserts_enabled, sizeof(pmap_asserts_enabled));
	PE_parse_boot_argn("pmap_stats_assert",
	    &pmap_stats_assert,
	    sizeof(pmap_stats_assert));
#endif /* MACH_ASSERT */
}
606
607 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)608 pmap_virtual_space(
609 vm_offset_t *startp,
610 vm_offset_t *endp)
611 {
612 *startp = virtual_avail;
613 *endp = virtual_end;
614 }
615
616
617
618
619 #if HIBERNATION
620
621 #include <IOKit/IOHibernatePrivate.h>
622 #include <machine/pal_hibernate.h>
623
624 int32_t pmap_npages;
625 int32_t pmap_teardown_last_valid_compact_indx = -1;
626
627 void pmap_pack_index(uint32_t);
628 int32_t pmap_unpack_index(pv_rooted_entry_t);
629
630 int32_t
pmap_unpack_index(pv_rooted_entry_t pv_h)631 pmap_unpack_index(pv_rooted_entry_t pv_h)
632 {
633 int32_t indx = 0;
634
635 indx = (int32_t)(*((uint64_t *)(&pv_h->qlink.next)) >> 48);
636 indx = indx << 16;
637 indx |= (int32_t)(*((uint64_t *)(&pv_h->qlink.prev)) >> 48);
638
639 *((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)0xffff << 48);
640 *((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)0xffff << 48);
641
642 return indx;
643 }
644
645
646 void
pmap_pack_index(uint32_t indx)647 pmap_pack_index(uint32_t indx)
648 {
649 pv_rooted_entry_t pv_h;
650
651 pv_h = &pv_head_table[indx];
652
653 *((uint64_t *)(&pv_h->qlink.next)) &= ~((uint64_t)0xffff << 48);
654 *((uint64_t *)(&pv_h->qlink.prev)) &= ~((uint64_t)0xffff << 48);
655
656 *((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)(indx >> 16)) << 48;
657 *((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)(indx & 0xffff)) << 48;
658 }
659
660
/*
 * Hibernation teardown: compact the sparse pv_head_table in place so the
 * hibernation image need only preserve its live entries.  Each live entry
 * has its original index packed into the top bits of its queue links
 * (pmap_pack_index) so pal_hib_rebuild_pmap_structs() can move it back on
 * resume.  On return, [*unneeded_start, *unneeded_end] spans the tail of
 * the table that need not be written into the image.
 */
void
pal_hib_teardown_pmap_structs(addr64_t *unneeded_start, addr64_t *unneeded_end)
{
	int32_t i;
	int32_t compact_target_indx;

	/* compact_target_indx tracks the lowest-known free slot (hole). */
	compact_target_indx = 0;

	for (i = 0; i < pmap_npages; i++) {
		if (pv_head_table[i].pmap == PMAP_NULL) {
			/* Unused slot: adopt it as the fill target unless we
			 * already hold an earlier hole. */
			if (pv_head_table[compact_target_indx].pmap != PMAP_NULL) {
				compact_target_indx = i;
			}
		} else {
			/* Live entry: stash its index for the rebuild pass. */
			pmap_pack_index((uint32_t)i);

			if (pv_head_table[compact_target_indx].pmap == PMAP_NULL) {
				/*
				 * we've got a hole to fill, so
				 * move this pv_rooted_entry_t to its new home
				 */
				pv_head_table[compact_target_indx] = pv_head_table[i];
				pv_head_table[i].pmap = PMAP_NULL;

				pmap_teardown_last_valid_compact_indx = compact_target_indx;
				compact_target_indx++;
			} else {
				/* No hole yet; entry stays where it is. */
				pmap_teardown_last_valid_compact_indx = i;
			}
		}
	}
	/* Everything past the last compacted entry is dead weight. */
	*unneeded_start = (addr64_t)&pv_head_table[pmap_teardown_last_valid_compact_indx + 1];
	*unneeded_end = (addr64_t)&pv_head_table[pmap_npages - 1];

	HIBLOG("pal_hib_teardown_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
}
697
698
/*
 * Hibernation resume: undo the compaction done by
 * pal_hib_teardown_pmap_structs().  Walks the compacted entries from
 * highest to lowest, recovers each entry's original index from its packed
 * queue links (pmap_unpack_index), moves it back into place, and
 * zero-fills the gaps between restored entries.
 */
void
pal_hib_rebuild_pmap_structs(void)
{
	int32_t cindx, eindx, rindx = 0;
	pv_rooted_entry_t pv_h;

	/* eindx: index just above the region that still needs zero-fill. */
	eindx = (int32_t)pmap_npages;

	for (cindx = pmap_teardown_last_valid_compact_indx; cindx >= 0; cindx--) {
		pv_h = &pv_head_table[cindx];

		/* Recover the entry's pre-compaction index. */
		rindx = pmap_unpack_index(pv_h);
		assert(rindx < pmap_npages);

		if (rindx != cindx) {
			/*
			 * this pv_rooted_entry_t was moved by pal_hib_teardown_pmap_structs,
			 * so move it back to its real location
			 */
			pv_head_table[rindx] = pv_head_table[cindx];
		}
		if (rindx + 1 != eindx) {
			/*
			 * the 'hole' between this vm_rooted_entry_t and the previous
			 * vm_rooted_entry_t we moved needs to be initialized as
			 * a range of zero'd vm_rooted_entry_t's
			 */
			bzero((char *)&pv_head_table[rindx + 1], (eindx - rindx - 1) * sizeof(struct pv_rooted_entry));
		}
		eindx = rindx;
	}
	/* Zero everything below the lowest restored entry. */
	if (rindx) {
		bzero((char *)&pv_head_table[0], rindx * sizeof(struct pv_rooted_entry));
	}

	HIBLOG("pal_hib_rebuild_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
}
736
737 #endif
738
739 /*
740 * Create pv entries for kernel pages mapped by early startup code.
741 * These have to exist so we can ml_static_mfree() them later.
742 */
/*
 * Walk [start_va, end_va) (rounded inward to page boundaries) in the
 * kernel pmap and initialize a root pv entry for every managed page
 * found, so that those early-boot mappings can later be removed/freed.
 * Advances by 2MB when the VA is covered by a large-page mapping.
 */
static void
pmap_pv_fixup(vm_offset_t start_va, vm_offset_t end_va)
{
	ppnum_t ppn;
	pv_rooted_entry_t pv_h;
	uint32_t pgsz;

	start_va = round_page(start_va);
	end_va = trunc_page(end_va);
	while (start_va < end_va) {
		pgsz = PAGE_SIZE;
		ppn = pmap_find_phys(kernel_pmap, start_va);
		/* Only managed pages get pv bookkeeping. */
		if (ppn != 0 && IS_MANAGED_PAGE(ppn)) {
			pv_h = pai_to_pvh(ppn);
			assert(pv_h->qlink.next == 0); /* shouldn't be init'd yet */
			assert(pv_h->pmap == 0);
			pv_h->va_and_flags = start_va;
			pv_h->pmap = kernel_pmap;
			queue_init(&pv_h->qlink);
			/*
			 * Note that pmap_query_pagesize does not enforce start_va is aligned
			 * on a 2M boundary if it's within a large page
			 */
			if (pmap_query_pagesize(kernel_pmap, start_va) == I386_LPGBYTES) {
				pgsz = I386_LPGBYTES;
			}
		}
		/* Advance by the mapping size, guarding against VA wrap. */
		if (os_add_overflow(start_va, pgsz, &start_va)) {
#if DEVELOPMENT || DEBUG
			panic("pmap_pv_fixup: Unexpected address wrap (0x%lx after adding 0x%x)", start_va, pgsz);
#else
			/* Release kernels: silently terminate the walk. */
			start_va = end_va;
#endif
		}
	}
}
779
780 /*
781 * Initialize the pmap module.
782 * Called by vm_init, to initialize any structures that the pmap
783 * system needs to map virtual memory.
784 */
void
pmap_init(void)
{
	long            npages;
	vm_offset_t     addr;
	vm_size_t       s, vsize;
	vm_map_offset_t vaddr;
	ppnum_t         ppn;


	/* Back the kernel's PML4/PDPT/PDE page-table pages with the
	 * statically allocated VM objects. */
	kernel_pmap->pm_obj_pml4 = &kpml4obj_object_store;
	_vm_object_allocate((vm_object_size_t)NPML4PGS * PAGE_SIZE, &kpml4obj_object_store);

	kernel_pmap->pm_obj_pdpt = &kpdptobj_object_store;
	_vm_object_allocate((vm_object_size_t)NPDPTPGS * PAGE_SIZE, &kpdptobj_object_store);

	kernel_pmap->pm_obj = &kptobj_object_store;
	_vm_object_allocate((vm_object_size_t)NPDEPGS * PAGE_SIZE, &kptobj_object_store);

	/*
	 * Allocate memory for the pv_head_table and its lock bits,
	 * the modify bit array, and the pte_page table.
	 */

	/*
	 * zero bias all these arrays now instead of off avail_start
	 * so we cover all memory
	 */

	npages = i386_btop(avail_end);
#if HIBERNATION
	pmap_npages = (uint32_t)npages;
#endif
	/* NOTE(review): 'struct pv_hashed_entry_t *' is a pointer-to-pointer
	 * spelling (pv_hashed_entry_t is itself a pointer typedef); the size
	 * is a pointer size either way, so the arithmetic is correct, but
	 * the spelling is misleading — consider sizeof(pv_hashed_entry_t). */
	s = (vm_size_t) (sizeof(struct pv_rooted_entry) * npages
	    + (sizeof(struct pv_hashed_entry_t *) * (npvhashbuckets))
	    + pv_lock_table_size(npages)
	    + pv_hash_lock_table_size((npvhashbuckets))
	    + npages);
	s = round_page(s);

	/* One permanent, zero-filled allocation carved up below. */
	kmem_alloc(kernel_map, &addr, s,
	    KMA_NOFAIL | KMA_ZERO | KMA_KOBJECT | KMA_PERMANENT,
	    VM_KERN_MEMORY_PMAP);

	vaddr = addr;
	vsize = s;

#if PV_DEBUG
	if (0 == npvhashmask) {
		panic("npvhashmask not initialized");
	}
#endif

	/*
	 * Allocate the structures first to preserve word-alignment.
	 */
	pv_head_table = (pv_rooted_entry_t) addr;
	addr = (vm_offset_t) (pv_head_table + npages);

	pv_hash_table = (pv_hashed_entry_t *)addr;
	addr = (vm_offset_t) (pv_hash_table + (npvhashbuckets));

	pv_lock_table = (char *) addr;
	addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));

	pv_hash_lock_table = (char *) addr;
	addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhashbuckets)));

	/* One attribute byte per physical page (the trailing 'npages'). */
	pmap_phys_attributes = (char *) addr;

	/* Mark all conventional-memory (RAM) pages as managed, and flag
	 * the kernel's own load regions as not-to-be-encrypted. */
	ppnum_t last_pn = i386_btop(avail_end);
	unsigned int i;
	pmap_memory_region_t *pmptr = pmap_memory_regions;
	for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
		if (pmptr->type != kEfiConventionalMemory) {
			continue;
		}
		ppnum_t pn;
		for (pn = pmptr->base; pn <= pmptr->end; pn++) {
			if (pn < last_pn) {
				pmap_phys_attributes[pn] |= PHYS_MANAGED;

				if (pn > last_managed_page) {
					last_managed_page = pn;
				}

				if ((pmap_high_used_bottom <= pn && pn <= pmap_high_used_top) ||
				    (pmap_middle_used_bottom <= pn && pn <= pmap_middle_used_top)) {
					pmap_phys_attributes[pn] |= PHYS_NOENCRYPT;
				}
			}
		}
	}
	/* The pv bookkeeping arrays just carved out above must themselves
	 * stay unencrypted across hibernation. */
	while (vsize) {
		ppn = pmap_find_phys(kernel_pmap, vaddr);

		pmap_phys_attributes[ppn] |= PHYS_NOENCRYPT;

		vaddr += PAGE_SIZE;
		vsize -= PAGE_SIZE;
	}
	/*
	 * Create the zone of physical maps,
	 * and of the physical-to-virtual entries.
	 */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);

	/* The anchor is required to be page aligned. Zone debugging adds
	 * padding which may violate that requirement. Tell the zone
	 * subsystem that alignment is required.
	 */
	pmap_anchor_zone = zone_create("pagetable anchors", PAGE_SIZE,
	    ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);

	/* TODO: possible general optimisation...pre-allocate via zones commonly created
	 * level3/2 pagetables
	 */
	/* The anchor is required to be page aligned. Zone debugging adds
	 * padding which may violate that requirement. Tell the zone
	 * subsystem that alignment is required.
	 */
	pmap_uanchor_zone = zone_create("pagetable user anchors", PAGE_SIZE,
	    ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);

	pv_hashed_list_zone = zone_create("pv_list", sizeof(struct pv_hashed_entry),
	    ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);

	/*
	 * Create pv entries for kernel pages that might get pmap_remove()ed.
	 *
	 * - very low pages that were identity mapped.
	 * - vm_pages[] entries that might be unused and reclaimed.
	 */
	assert((uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start <= (uintptr_t)vm_page_array_beginning_addr);
	pmap_pv_fixup((uintptr_t)VM_MIN_KERNEL_ADDRESS, (uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start);
	pmap_pv_fixup((uintptr_t)vm_page_array_beginning_addr, (uintptr_t)vm_page_array_ending_addr);

	pmap_initialized = TRUE;

	max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);

	/*
	 * Ensure the kernel's PML4 entry exists for the basement
	 * before this is shared with any user.
	 */
	pmap_expand_pml4(kernel_pmap, KERNEL_BASEMENT, PMAP_EXPAND_OPTIONS_NONE);

#if CONFIG_VMX
	/* Probe VMX EPT accessed/dirty-bit support and precompute EPTP flags. */
	pmap_ept_support_ad = vmx_hv_support() && (VMX_CAP(MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_VMX_EPT_VPID_CAP_AD_SHIFT, 1) ? TRUE : FALSE);
	pmap_eptp_flags = HV_VMX_EPTP_MEMORY_TYPE_WB | HV_VMX_EPTP_WALK_LENGTH(4) | (pmap_ept_support_ad ? HV_VMX_EPTP_ENABLE_AD_FLAGS : 0);
#endif /* CONFIG_VMX */
}
938
/*
 * Rewrite the NX and write-protect bits on every mapping covering
 * [sv, sv + nxrosz) in pmap 'npmap': sets/clears INTEL_PTE_NX per 'NX'
 * and clears/sets INTEL_PTE_WRITE per 'ro'.  Handles both 2MB (PS) PDE
 * mappings and 4KB PTEs.  Both address and size must be page-aligned;
 * EPT pmaps are not supported.
 * NOTE(review): no TLB invalidation is visible here — presumably
 * callers flush after remapping; confirm before relying on it.
 */
void
pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolean_t ro)
{
	uint64_t ev, cv = sv;
	pd_entry_t *pdep;
	pt_entry_t *ptep = NULL;

	if (os_add_overflow(sv, nxrosz, &ev)) {
		panic("pmap_mark_range: Unexpected address overflow: start=0x%llx size=0x%llx", sv, nxrosz);
	}

	/* XXX what if nxrosz is 0? we end up marking the page whose address is passed in via sv -- is that kosher? */
	assert(!is_ept_pmap(npmap));

	assert(((sv & 0xFFFULL) | (nxrosz & 0xFFFULL)) == 0);

	for (pdep = pmap_pde(npmap, cv); pdep != NULL && (cv < ev);) {
		/* pdev: base VA of the 2MB region this PDE covers. */
		uint64_t pdev = (cv & ~((uint64_t)PDEMASK));

		if (*pdep & INTEL_PTE_PS) {
			/* 2MB large-page mapping: adjust the PDE directly. */
#ifdef REMAP_DEBUG
			if ((NX ^ !!(*pdep & INTEL_PTE_NX)) || (ro ^ !!!(*pdep & INTEL_PTE_WRITE))) {
				kprintf("WARNING: Remapping PDE for %p from %s%s%s to %s%s%s\n", (void *)cv,
				    (*pdep & INTEL_PTE_VALID) ? "R" : "",
				    (*pdep & INTEL_PTE_WRITE) ? "W" : "",
				    (*pdep & INTEL_PTE_NX) ? "" : "X",
				    "R",
				    ro ? "" : "W",
				    NX ? "" : "X");
			}
#endif

			if (NX) {
				*pdep |= INTEL_PTE_NX;
			} else {
				*pdep &= ~INTEL_PTE_NX;
			}
			if (ro) {
				*pdep &= ~INTEL_PTE_WRITE;
			} else {
				*pdep |= INTEL_PTE_WRITE;
			}

			/* Advance to the next 2MB-aligned VA; stop on wrap. */
			if (os_add_overflow(cv, NBPD, &cv)) {
				cv = ev;
			} else {
				cv &= ~((uint64_t) PDEMASK);
				pdep = pmap_pde(npmap, cv);
			}
			continue;
		}

		/* 4KB mappings: walk the PTEs within this PDE's 2MB span. */
		for (ptep = pmap_pte(npmap, cv); ptep != NULL && (cv < (pdev + NBPD)) && (cv < ev);) {
#ifdef REMAP_DEBUG
			if ((NX ^ !!(*ptep & INTEL_PTE_NX)) || (ro ^ !!!(*ptep & INTEL_PTE_WRITE))) {
				kprintf("WARNING: Remapping PTE for %p from %s%s%s to %s%s%s\n", (void *)cv,
				    (*ptep & INTEL_PTE_VALID) ? "R" : "",
				    (*ptep & INTEL_PTE_WRITE) ? "W" : "",
				    (*ptep & INTEL_PTE_NX) ? "" : "X",
				    "R",
				    ro ? "" : "W",
				    NX ? "" : "X");
			}
#endif
			if (NX) {
				*ptep |= INTEL_PTE_NX;
			} else {
				*ptep &= ~INTEL_PTE_NX;
			}
			if (ro) {
				*ptep &= ~INTEL_PTE_WRITE;
			} else {
				*ptep |= INTEL_PTE_WRITE;
			}
			cv += NBPT;
			ptep = pmap_pte(npmap, cv);
		}
	}
	DPRINTF("%s(0x%llx, 0x%llx, %u, %u): 0x%llx, 0x%llx\n", __FUNCTION__, sv, nxrosz, NX, ro, cv, ptep ? *ptep: 0);
}
1019
1020 /*
1021 * Reclaim memory for early boot 4K page tables that were converted to large page mappings.
1022 * We know this memory is part of the KPTphys[] array that was allocated in Idle_PTs_init(),
1023 * so we can free it using its address in that array.
1024 */
1025 static void
pmap_free_early_PT(ppnum_t ppn,uint32_t cnt)1026 pmap_free_early_PT(ppnum_t ppn, uint32_t cnt)
1027 {
1028 ppnum_t KPTphys_ppn;
1029 vm_offset_t offset;
1030
1031 KPTphys_ppn = pmap_find_phys(kernel_pmap, (uintptr_t)KPTphys);
1032 assert(ppn >= KPTphys_ppn);
1033 assert(ppn + cnt <= KPTphys_ppn + NKPT);
1034 offset = (ppn - KPTphys_ppn) << PAGE_SHIFT;
1035 ml_static_mfree((uintptr_t)KPTphys + offset, PAGE_SIZE * cnt);
1036 }
1037
1038 /*
1039 * Called once VM is fully initialized so that we can release unused
1040 * sections of low memory to the general pool.
1041 * Also complete the set-up of identity-mapped sections of the kernel:
1042 * 1) write-protect kernel text
1043 * 2) map kernel text using large pages if possible
1044 * 3) read and write-protect page zero (for K32)
1045 * 4) map the global page at the appropriate virtual address.
1046 *
1047 * Use of large pages
1048 * ------------------
1049 * To effectively map and write-protect all kernel text pages, the text
1050 * must be 2M-aligned at the base, and the data section above must also be
1051 * 2M-aligned. That is, there's padding below and above. This is achieved
1052 * through linker directives. Large pages are used only if this alignment
 * exists (and not overridden by the -kernel_text_ps_4K boot-arg). The
1054 * memory layout is:
1055 *
1056 * : :
1057 * | __DATA |
1058 * sdata: ================== 2Meg
1059 * | |
1060 * | zero-padding |
1061 * | |
1062 * etext: ------------------
1063 * | |
1064 * : :
1065 * | |
1066 * | __TEXT |
1067 * | |
1068 * : :
1069 * | |
1070 * stext: ================== 2Meg
1071 * | |
1072 * | zero-padding |
1073 * | |
1074 * eHIB: ------------------
1075 * | __HIB |
1076 * : :
1077 *
1078 * Prior to changing the mapping from 4K to 2M, the zero-padding pages
1079 * [eHIB,stext] and [etext,sdata] are ml_static_mfree()'d. Then all the
1080 * 4K pages covering [stext,etext] are coalesced as 2M large pages.
1081 * The now unused level-1 PTE pages are also freed.
1082 */
1083 extern ppnum_t vm_kernel_base_page;
1084 static uint32_t dataptes = 0;
1085
/*
 * See the block comment above: releases unused low memory to the VM,
 * finalizes identity-mapped kernel sections (write-protect text, optional
 * 2M coalescing, NX data), applies per-segment protections, and maps or
 * removes the low-global alias page. Runs once, after VM init.
 */
void
pmap_lowmem_finalize(void)
{
	spl_t           spl;
	int             i;

	/*
	 * Update wired memory statistics for early boot pages
	 */
	PMAP_ZINFO_PALLOC(kernel_pmap, bootstrap_wired_pages * PAGE_SIZE);

	/*
	 * Free pages in pmap regions below the base:
	 * rdar://6332712
	 * We can't free all the pages to VM that EFI reports available.
	 * Pages in the range 0xc0000-0xff000 aren't safe over sleep/wake.
	 * There's also a size miscalculation here: pend is one page less
	 * than it should be but this is not fixed to be backwards
	 * compatible.
	 * This is important for KASLR because up to 256*2MB = 512MB of space
	 * has to be released to VM.
	 */
	for (i = 0;
	    pmap_memory_regions[i].end < vm_kernel_base_page;
	    i++) {
		vm_offset_t pbase = i386_ptob(pmap_memory_regions[i].base);
		vm_offset_t pend = i386_ptob(pmap_memory_regions[i].end + 1);

		DBG("pmap region %d [%p..[%p\n",
		    i, (void *) pbase, (void *) pend);

		/* Regions reserved by EFI must stay wired; never free them. */
		if (pmap_memory_regions[i].attribute & EFI_MEMORY_KERN_RESERVED) {
			continue;
		}
		/*
		 * rdar://6332712
		 * Adjust limits not to free pages in range 0xc0000-0xff000.
		 */
		if (pbase >= 0xc0000 && pend <= 0x100000) {
			continue;
		}
		if (pbase < 0xc0000 && pend > 0x100000) {
			/* page range entirely within region, free lower part */
			DBG("- ml_static_mfree(%p,%p)\n",
			    (void *) ml_static_ptovirt(pbase),
			    (void *) (0xc0000 - pbase));
			ml_static_mfree(ml_static_ptovirt(pbase), 0xc0000 - pbase);
			pbase = 0x100000;
		}
		if (pbase < 0xc0000) {
			pend = MIN(pend, 0xc0000);
		}
		if (pend > 0x100000) {
			pbase = MAX(pbase, 0x100000);
		}
		DBG("- ml_static_mfree(%p,%p)\n",
		    (void *) ml_static_ptovirt(pbase),
		    (void *) (pend - pbase));
		ml_static_mfree(ml_static_ptovirt(pbase), pend - pbase);
	}

	/* A final pass to get rid of all initial identity mappings to
	 * low pages.
	 */
	DPRINTF("%s: Removing mappings from 0->0x%lx\n", __FUNCTION__, vm_kernel_base);

	/*
	 * Remove all mappings past the boot-cpu descriptor aliases and low globals.
	 * Non-boot-cpu GDT aliases will be remapped later as needed.
	 */
	pmap_remove(kernel_pmap, LOWGLOBAL_ALIAS + PAGE_SIZE, vm_kernel_base);

	/*
	 * Release any memory for early boot 4K page table pages that got replaced
	 * with large page mappings for vm_pages[]. We know this memory is part of
	 * the KPTphys[] array that was allocated in Idle_PTs_init(), so we can free
	 * it using that address.
	 */
	pmap_free_early_PT(released_PT_ppn, released_PT_cnt);

	/*
	 * If text and data are both 2MB-aligned,
	 * we can map text with large-pages,
	 * unless the -kernel_text_ps_4K boot-arg overrides.
	 */
	if ((stext & I386_LPGMASK) == 0 && (sdata & I386_LPGMASK) == 0) {
		kprintf("Kernel text is 2MB aligned");
		kernel_text_ps_4K = FALSE;
		if (PE_parse_boot_argn("-kernel_text_ps_4K",
		    &kernel_text_ps_4K,
		    sizeof(kernel_text_ps_4K))) {
			kprintf(" but will be mapped with 4K pages\n");
		} else {
			kprintf(" and will be mapped with 2M pages\n");
		}
	}
#if DEVELOPMENT || DEBUG
	(void) PE_parse_boot_argn("wpkernel", &wpkernel, sizeof(wpkernel));
#endif
	if (wpkernel) {
		kprintf("Kernel text %p-%p to be write-protected\n",
		    (void *) stext, (void *) etext);
	}

	spl = splhigh();

	/*
	 * Scan over text if mappings are to be changed:
	 * - Remap kernel text readonly unless the "wpkernel" boot-arg is 0
	 * - Change to large-pages if possible and not overridden.
	 */
	if (kernel_text_ps_4K && wpkernel) {
		vm_offset_t myva;
		/* Clear the write bit on every 4K text PTE. */
		for (myva = stext; myva < etext; myva += PAGE_SIZE) {
			pt_entry_t *ptep;

			ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
			if (ptep) {
				pmap_store_pte(FALSE, ptep, *ptep & ~INTEL_PTE_WRITE);
			}
		}
	}

	if (!kernel_text_ps_4K) {
		vm_offset_t myva;

		/*
		 * Release zero-filled page padding used for 2M-alignment.
		 */
		DBG("ml_static_mfree(%p,%p) for padding below text\n",
		    (void *) eHIB, (void *) (stext - eHIB));
		ml_static_mfree(eHIB, stext - eHIB);
		DBG("ml_static_mfree(%p,%p) for padding above text\n",
		    (void *) etext, (void *) (sdata - etext));
		ml_static_mfree(etext, sdata - etext);

		/*
		 * Coalesce text pages into large pages.
		 */
		for (myva = stext; myva < sdata; myva += I386_LPGBYTES) {
			pt_entry_t *ptep;
			vm_offset_t pte_phys;
			pt_entry_t *pdep;
			pt_entry_t pde;
			ppnum_t KPT_ppn;

			pdep = pmap_pde(kernel_pmap, (vm_map_offset_t)myva);
			/* Remember the 4K page-table page this PDE points at, to free below. */
			KPT_ppn = (ppnum_t)((*pdep & PG_FRAME) >> PAGE_SHIFT);
			ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
			DBG("myva: %p pdep: %p ptep: %p\n",
			    (void *) myva, (void *) pdep, (void *) ptep);
			if ((*ptep & INTEL_PTE_VALID) == 0) {
				continue;
			}
			pte_phys = (vm_offset_t)(*ptep & PG_FRAME);
			pde = *pdep & PTMASK;   /* page attributes from pde */
			pde |= INTEL_PTE_PS;    /* make it a 2M entry */
			pde |= pte_phys;        /* take page frame from pte */

			if (wpkernel) {
				pde &= ~INTEL_PTE_WRITE;
			}
			DBG("pmap_store_pte(%p,0x%llx)\n",
			    (void *)pdep, pde);
			pmap_store_pte(FALSE, pdep, pde);

			/*
			 * Free the now-unused level-1 pte.
			 */
			pmap_free_early_PT(KPT_ppn, 1);
		}

		/* Change variable read by sysctl machdep.pmap */
		pmap_kernel_text_ps = I386_LPGBYTES;
	}

	vm_offset_t dva;

	/* Mark all of __DATA (4K pages) non-executable. */
	for (dva = sdata; dva < edata; dva += I386_PGBYTES) {
		assert(((sdata | edata) & PAGE_MASK) == 0);
		pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva);

		dpte = *dptep;
		assert((dpte & INTEL_PTE_VALID));
		dpte |= INTEL_PTE_NX;
		pmap_store_pte(FALSE, dptep, dpte);
		dataptes++;
	}
	assert(dataptes > 0);

	kernel_segment_command_t * seg;
	kernel_section_t * sec;
	kc_format_t kc_format;

	PE_get_primary_kc_format(&kc_format);

	/* Apply per-segment protections to all remaining Mach-O segments. */
	for (seg = firstseg(); seg != NULL; seg = nextsegfromheader(&_mh_execute_header, seg)) {
		/* __TEXT and __DATA were handled above. */
		if (!strcmp(seg->segname, "__TEXT") ||
		    !strcmp(seg->segname, "__DATA")) {
			continue;
		}

		/* XXX: FIXME_IN_dyld: This is a workaround (see below) */
		if (kc_format != KCFormatFileset) {
			//XXX
			if (!strcmp(seg->segname, "__KLD")) {
				continue;
			}
		}

		if (!strcmp(seg->segname, "__HIB")) {
			/* __HIB: executable text is RX; everything else RW+NX. */
			for (sec = firstsect(seg); sec != NULL; sec = nextsect(seg, sec)) {
				if (sec->addr & PAGE_MASK) {
					panic("__HIB segment's sections misaligned");
				}
				if (!strcmp(sec->sectname, "__text")) {
					pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), FALSE, TRUE);
				} else {
					pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), TRUE, FALSE);
				}
			}
		} else {
			if (kc_format == KCFormatFileset) {
#if 0
				/*
				 * This block of code is commented out because it may or may not have induced an earlier panic
				 * in ledger init.
				 */


				boolean_t NXbit = !(seg->initprot & VM_PROT_EXECUTE),
				    robit = (seg->initprot & (VM_PROT_READ | VM_PROT_WRITE)) == VM_PROT_READ;

				/*
				 * XXX: FIXME_IN_dyld: This is a workaround for primary KC containing incorrect inaccurate
				 * initprot for segments containing code.
				 */
				if (!strcmp(seg->segname, "__KLD") || !strcmp(seg->segname, "__VECTORS")) {
					NXbit = FALSE;
					robit = FALSE;
				}

				pmap_mark_range(kernel_pmap, seg->vmaddr & ~(uint64_t)PAGE_MASK,
				    round_page_64(seg->vmsize), NXbit, robit);
#endif

				/*
				 * XXX: We are marking *every* segment with rwx permissions as a workaround
				 * XXX: until the primary KC's kernel segments are page-aligned.
				 */
				kprintf("Marking (%p, %p) as rwx\n", (void *)(seg->vmaddr & ~(uint64_t)PAGE_MASK),
				    (void *)((seg->vmaddr & ~(uint64_t)PAGE_MASK) + round_page_64(seg->vmsize)));
				pmap_mark_range(kernel_pmap, seg->vmaddr & ~(uint64_t)PAGE_MASK,
				    round_page_64(seg->vmsize), FALSE, FALSE);
			} else {
				pmap_mark_range(kernel_pmap, seg->vmaddr, round_page_64(seg->vmsize), TRUE, FALSE);
			}
		}
	}

	/*
	 * If we're debugging, map the low global vector page at the fixed
	 * virtual address. Otherwise, remove the mapping for this.
	 */
	if (debug_boot_arg) {
		pt_entry_t *pte = NULL;
		if (0 == (pte = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS))) {
			panic("lowmem pte");
		}

		/* make sure it is defined on page boundary */
		assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK));
		pmap_store_pte(FALSE, pte, kvtophys((vm_offset_t)&lowGlo)
		    | INTEL_PTE_REF
		    | INTEL_PTE_MOD
		    | INTEL_PTE_WIRED
		    | INTEL_PTE_VALID
		    | INTEL_PTE_WRITE
		    | INTEL_PTE_NX);

#if KASAN
		kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
	} else {
		pmap_remove(kernel_pmap,
		    LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE);
	}
	/* Flush all stale translations before re-enabling interrupts. */
	pmap_tlbi_range(0, ~0ULL, true, 0);
	splx(spl);
}
1376
1377 /*
1378 * Mark the const data segment as read-only, non-executable.
1379 */
1380 void
x86_64_protect_data_const()1381 x86_64_protect_data_const()
1382 {
1383 boolean_t doconstro = TRUE;
1384 #if DEVELOPMENT || DEBUG
1385 (void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro));
1386 #endif
1387 if (doconstro) {
1388 if (sconst & PAGE_MASK) {
1389 panic("CONST segment misaligned 0x%lx 0x%lx",
1390 sconst, econst);
1391 }
1392 kprintf("Marking const DATA read-only\n");
1393 pmap_protect(kernel_pmap, sconst, econst, VM_PROT_READ);
1394 }
1395 }
1396 /*
 * this function is only used for debugging from the vm layer
1398 */
1399 bool
pmap_verify_free(ppnum_t pn)1400 pmap_verify_free(
1401 ppnum_t pn)
1402 {
1403 pv_rooted_entry_t pv_h;
1404 int pai;
1405 bool result;
1406
1407 assert(pn != vm_page_fictitious_addr);
1408
1409 if (!pmap_initialized) {
1410 return true;
1411 }
1412
1413 if (pn == vm_page_guard_addr) {
1414 return true;
1415 }
1416
1417 pai = ppn_to_pai(pn);
1418 if (!IS_MANAGED_PAGE(pai)) {
1419 return false;
1420 }
1421 pv_h = pai_to_pvh(pn);
1422 result = (pv_h->pmap == PMAP_NULL);
1423 return result;
1424 }
1425
1426 #if MACH_ASSERT
/*
 * Debug-only (MACH_ASSERT): panic with diagnostic detail if the given
 * physical page still has a pv mapping (pmap_verify_free() says it is
 * not free). Gathers the owning pmap's name, the mapped VA and the PTE
 * contents for the panic message; returns silently if the page is free.
 */
void
pmap_assert_free(ppnum_t pn)
{
	int pai;
	pv_rooted_entry_t pv_h = NULL;
	pmap_t pmap = NULL;
	vm_offset_t va = 0;
	/* Static scratch is fine: this path ends in panic(). */
	static char buffer[32];
	static char *pr_name = "not managed pn";
	uint_t attr;
	pt_entry_t *ptep;
	pt_entry_t pte = -1ull;

	if (pmap_verify_free(pn)) {
		return;
	}

	/* Page beyond the managed range: no attributes to read, report 0xff. */
	if (pn > last_managed_page) {
		attr = 0xff;
		goto done;
	}

	pai = ppn_to_pai(pn);
	attr = pmap_phys_attributes[pai];
	pv_h = pai_to_pvh(pai);
	va = pv_h->va_and_flags;
	pmap = pv_h->pmap;
	/* Pick the most descriptive name available for the owning pmap. */
	if (pmap == kernel_pmap) {
		pr_name = "kernel";
	} else if (pmap == NULL) {
		pr_name = "pmap NULL";
	} else if (pmap->pmap_procname[0] != 0) {
		pr_name = &pmap->pmap_procname[0];
	} else {
		snprintf(buffer, sizeof(buffer), "pmap %p", pv_h->pmap);
		pr_name = buffer;
	}

	/* Fetch the PTE for the offending VA, if one exists. */
	if (pmap != NULL) {
		ptep = pmap_pte(pmap, va);
		if (ptep != NULL) {
			pte = (uintptr_t)*ptep;
		}
	}

done:
	panic("page not FREE page: 0x%lx attr: 0x%x %s va: 0x%lx PTE: 0x%llx",
	    (ulong_t)pn, attr, pr_name, va, pte);
}
1476 #endif /* MACH_ASSERT */
1477
1478 boolean_t
pmap_is_empty(pmap_t pmap,vm_map_offset_t va_start,vm_map_offset_t va_end)1479 pmap_is_empty(
1480 pmap_t pmap,
1481 vm_map_offset_t va_start,
1482 vm_map_offset_t va_end)
1483 {
1484 vm_map_offset_t offset;
1485 ppnum_t phys_page;
1486 ledger_amount_t phys_mem;
1487
1488 if (pmap == PMAP_NULL) {
1489 return TRUE;
1490 }
1491
1492 /*
1493 * Check the ledger's phys_mem value
1494 * - if it's zero, the pmap is completely empty.
1495 * This short-circuit test prevents a virtual address scan which is
1496 * painfully slow for 64-bit spaces.
1497 * This assumes the count is correct
1498 * .. the debug kernel ought to be checking perhaps by page table walk.
1499 */
1500 if (pmap != kernel_pmap) {
1501 ledger_get_balance(pmap->ledger, task_ledgers.phys_mem, &phys_mem);
1502 if (phys_mem == 0) {
1503 return TRUE;
1504 }
1505 }
1506
1507 for (offset = va_start;
1508 offset < va_end;
1509 offset += PAGE_SIZE_64) {
1510 phys_page = pmap_find_phys(pmap, offset);
1511 if (phys_page) {
1512 kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1513 "page %d at 0x%llx\n",
1514 pmap, va_start, va_end, phys_page, offset);
1515 return FALSE;
1516 }
1517 }
1518
1519 return TRUE;
1520 }
1521
1522 void
hv_ept_pmap_create(void ** ept_pmap,void ** eptp)1523 hv_ept_pmap_create(void **ept_pmap, void **eptp)
1524 {
1525 pmap_t p;
1526
1527 if ((ept_pmap == NULL) || (eptp == NULL)) {
1528 return;
1529 }
1530
1531 p = pmap_create_options(get_task_ledger(current_task()), 0, (PMAP_CREATE_64BIT | PMAP_CREATE_EPT));
1532 if (p == PMAP_NULL) {
1533 *ept_pmap = NULL;
1534 *eptp = NULL;
1535 return;
1536 }
1537
1538 assert(is_ept_pmap(p));
1539
1540 *ept_pmap = (void*)p;
1541 *eptp = (void*)(p->pm_eptp);
1542 return;
1543 }
1544
1545 /*
1546 * pmap_create() is used by some special, legacy 3rd party kexts.
1547 * In our kernel code, always use pmap_create_options().
1548 */
1549 extern pmap_t pmap_create(ledger_t ledger, vm_map_size_t sz, boolean_t is_64bit);
1550
1551 __attribute__((used))
1552 pmap_t
pmap_create(ledger_t ledger,vm_map_size_t sz,boolean_t is_64bit)1553 pmap_create(
1554 ledger_t ledger,
1555 vm_map_size_t sz,
1556 boolean_t is_64bit)
1557 {
1558 return pmap_create_options(ledger, sz, is_64bit ? PMAP_CREATE_64BIT : 0);
1559 }
1560
1561 /*
1562 * Create and return a physical map.
1563 *
1564 * If the size specified for the map
1565 * is zero, the map is an actual physical
1566 * map, and may be referenced by the
1567 * hardware.
1568 *
1569 * If the size specified is non-zero,
1570 * the map will be used in software only, and
1571 * is bounded by that size.
1572 */
1573
/*
 * Allocate and initialize a pmap (see block comment above): zone-allocate
 * the structure, set up locks/refcount/ledger, allocate and zero the
 * kernel- and user-mode PML4 pages, compute CR3/EPTP, allocate the VM
 * objects that will hold intermediate page-table pages, and (for non-EPT
 * pmaps) share the kernel's global PML4 entries.
 * Returns PMAP_NULL for sz != 0 or unknown flags.
 */
pmap_t
pmap_create_options(
	ledger_t ledger,
	vm_map_size_t sz,
	unsigned int flags)
{
	pmap_t p;
	vm_size_t size;
	pml4_entry_t *pml4;
	pml4_entry_t *kpml4;
	int i;

	PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, sz, flags);

	size = (vm_size_t) sz;

	/*
	 * A software use-only map doesn't even need a map.
	 */

	if (size != 0) {
		return PMAP_NULL;
	}

	/*
	 * Return error when unrecognized flags are passed.
	 */
	if (__improbable((flags & ~(PMAP_CREATE_KNOWN_FLAGS)) != 0)) {
		return PMAP_NULL;
	}

	p = zalloc_flags(pmap_zone, Z_WAITOK | Z_ZERO);
	if (PMAP_NULL == p) {
		panic("pmap_create zalloc");
	}

	/* Per-pmap reader/writer lock; configured as non-sleeping (spin). */
	lck_rw_init(&p->pmap_rwl, &pmap_lck_grp, &pmap_lck_rw_attr);
	p->pmap_rwl.lck_rw_can_sleep = FALSE;

	os_ref_init(&p->ref_count, NULL);
#if DEVELOPMENT || DEBUG
	p->nx_enabled = 1;
#endif
	p->pm_shared = FALSE;
	/* The pmap holds a reference on its ledger for its lifetime. */
	ledger_reference(ledger);
	p->ledger = ledger;

	p->pm_task_map = ((flags & PMAP_CREATE_64BIT) ? TASK_MAP_64BIT : TASK_MAP_32BIT);

	p->pagezero_accessible = FALSE;
	p->pm_vm_map_cs_enforced = FALSE;

	if (pmap_pcid_ncpus) {
		pmap_pcid_initialize(p);
	}

	/* Top-level (PML4) pages: one kernel-mode, one user-mode. */
	p->pm_pml4 = zalloc(pmap_anchor_zone);
	p->pm_upml4 = zalloc(pmap_uanchor_zone); //cleanup for EPT

	pmap_assert((((uintptr_t)p->pm_pml4) & PAGE_MASK) == 0);
	pmap_assert((((uintptr_t)p->pm_upml4) & PAGE_MASK) == 0);

	memset((char *)p->pm_pml4, 0, PAGE_SIZE);
	memset((char *)p->pm_upml4, 0, PAGE_SIZE);

	/* EPT pmaps use an EPTP root; ordinary pmaps use CR3/UCR3 roots. */
	if (flags & PMAP_CREATE_EPT) {
		p->pm_eptp = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4) | pmap_eptp_flags;
		p->pm_cr3 = 0;
	} else {
		p->pm_eptp = 0;
		p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4);
		p->pm_ucr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_upml4);
	}

	/* allocate the vm_objs to hold the pdpt, pde and pte pages */

	p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS) *PAGE_SIZE);
	if (NULL == p->pm_obj_pml4) {
		panic("pmap_create pdpt obj");
	}

	p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS) *PAGE_SIZE);
	if (NULL == p->pm_obj_pdpt) {
		panic("pmap_create pdpt obj");
	}

	p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS) *PAGE_SIZE);
	if (NULL == p->pm_obj) {
		panic("pmap_create pte obj");
	}

	if (!(flags & PMAP_CREATE_EPT)) {
		/* All host pmaps share the kernel's pml4 */
		pml4 = pmap64_pml4(p, 0ULL);
		kpml4 = kernel_pmap->pm_pml4;
		for (i = KERNEL_PML4_INDEX; i < (KERNEL_PML4_INDEX + KERNEL_PML4_COUNT); i++) {
			pml4[i] = kpml4[i];
		}
		pml4[KERNEL_KEXTS_INDEX] = kpml4[KERNEL_KEXTS_INDEX];
		for (i = KERNEL_PHYSMAP_PML4_INDEX; i < (KERNEL_PHYSMAP_PML4_INDEX + KERNEL_PHYSMAP_PML4_COUNT); i++) {
			pml4[i] = kpml4[i];
		}
		pml4[KERNEL_DBLMAP_PML4_INDEX] = kpml4[KERNEL_DBLMAP_PML4_INDEX];
#if KASAN
		for (i = KERNEL_KASAN_PML4_FIRST; i <= KERNEL_KASAN_PML4_LAST; i++) {
			pml4[i] = kpml4[i];
		}
#endif
		/* The user-mode PML4 only needs the double-map entry. */
		pml4_entry_t *pml4u = pmap64_user_pml4(p, 0ULL);
		pml4u[KERNEL_DBLMAP_PML4_INDEX] = kpml4[KERNEL_DBLMAP_PML4_INDEX];
	}

#if MACH_ASSERT
	p->pmap_stats_assert = TRUE;
	p->pmap_pid = 0;
	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
#endif /* MACH_ASSERT */

	PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END,
	    VM_KERNEL_ADDRHIDE(p));

	return p;
}
1697
1698 /*
1699 * We maintain stats and ledgers so that a task's physical footprint is:
1700 * phys_footprint = ((internal - alternate_accounting)
1701 * + (internal_compressed - alternate_accounting_compressed)
1702 * + iokit_mapped
1703 * + purgeable_nonvolatile
1704 * + purgeable_nonvolatile_compressed
1705 * + page_table)
1706 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
1707 */
1708
#if MACH_ASSERT
static void pmap_check_ledgers(pmap_t pmap);
#else /* MACH_ASSERT */
/* Ledger sanity checking is a MACH_ASSERT-only facility; no-op otherwise. */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
#endif /* MACH_ASSERT */
1717
1718 /*
1719 * Retire the given physical map from service.
1720 * Should only be called if the map contains
1721 * no valid mappings.
1722 */
1723 extern int vm_wired_objects_page_count;
1724
1725 void
pmap_destroy(pmap_t p)1726 pmap_destroy(pmap_t p)
1727 {
1728 os_ref_count_t c;
1729
1730 if (p == PMAP_NULL) {
1731 return;
1732 }
1733
1734 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
1735 VM_KERNEL_ADDRHIDe(p));
1736
1737 PMAP_LOCK_EXCLUSIVE(p);
1738
1739 c = os_ref_release_locked(&p->ref_count);
1740
1741 pmap_assert((current_thread() && (current_thread()->map)) ? (current_thread()->map->pmap != p) : TRUE);
1742
1743 if (c == 0) {
1744 /*
1745 * If some cpu is not using the physical pmap pointer that it
1746 * is supposed to be (see set_dirbase), we might be using the
1747 * pmap that is being destroyed! Make sure we are
1748 * physically on the right pmap:
1749 */
1750 PMAP_UPDATE_TLBS(p, 0x0ULL, 0xFFFFFFFFFFFFF000ULL);
1751 if (pmap_pcid_ncpus) {
1752 pmap_destroy_pcid_sync(p);
1753 }
1754 }
1755
1756 PMAP_UNLOCK_EXCLUSIVE(p);
1757
1758 if (c != 0) {
1759 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
1760 pmap_assert(p == kernel_pmap);
1761 return; /* still in use */
1762 }
1763
1764 /*
1765 * Free the memory maps, then the
1766 * pmap structure.
1767 */
1768 int inuse_ptepages = 0;
1769
1770 zfree(pmap_anchor_zone, p->pm_pml4);
1771 zfree(pmap_uanchor_zone, p->pm_upml4);
1772
1773 inuse_ptepages += p->pm_obj_pml4->resident_page_count;
1774 vm_object_deallocate(p->pm_obj_pml4);
1775
1776 inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
1777 vm_object_deallocate(p->pm_obj_pdpt);
1778
1779 inuse_ptepages += p->pm_obj->resident_page_count;
1780 vm_object_deallocate(p->pm_obj);
1781
1782 OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count);
1783 PMAP_ZINFO_PFREE(p, inuse_ptepages * PAGE_SIZE);
1784
1785 pmap_check_ledgers(p);
1786 ledger_dereference(p->ledger);
1787 lck_rw_destroy(&p->pmap_rwl, &pmap_lck_grp);
1788 zfree(pmap_zone, p);
1789
1790 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
1791 }
1792
1793 /*
1794 * Add a reference to the specified pmap.
1795 */
1796
1797 void
pmap_reference(pmap_t p)1798 pmap_reference(pmap_t p)
1799 {
1800 if (p != PMAP_NULL) {
1801 PMAP_LOCK_EXCLUSIVE(p);
1802 os_ref_retain_locked(&p->ref_count);
1803 PMAP_UNLOCK_EXCLUSIVE(p);
1804 }
1805 }
1806
1807 /*
1808 * Remove phys addr if mapped in specified map
1809 *
1810 */
/*
 * Remove the mapping of a physical page from the given map, if present.
 * Intentionally unimplemented on this architecture (stub).
 */
void
pmap_remove_some_phys(
	__unused pmap_t map,
	__unused ppnum_t pn)
{
	/* Implement to support working set code */
}
1818
1819
/*
 * Restrict access permissions on [sva, eva) in 'map' to at most 'prot'.
 * Convenience wrapper: delegates to pmap_protect_options() with no
 * options and no flush context.
 */
void
pmap_protect(
	pmap_t map,
	vm_map_offset_t sva,
	vm_map_offset_t eva,
	vm_prot_t prot)
{
	pmap_protect_options(map, sva, eva, prot, 0, NULL);
}
1829
1830
1831 /*
1832 * Set the physical protection on the
1833 * specified range of this map as requested.
1834 *
1835 * VERY IMPORTANT: Will *NOT* increase permissions.
1836 * pmap_protect_options() should protect the range against any access types
1837 * that are not in "prot" but it should never grant extra access.
1838 * For example, if "prot" is READ|EXECUTE, that means "remove write
1839 * access" but it does *not* mean "add read and execute" access.
1840 * VM relies on getting soft-faults to enforce extra checks (code
 * signing, for example).
1842 * New access permissions are granted via pmap_enter() only.
1843 * ***NOTE***:
1844 * The only exception is for EPT pmaps, where we MUST populate all exec
1845 * bits when the protection API is invoked (so that the HV fault handler
1846 * can make decisions based on the exit qualification information, which
1847 * includes the execute bits in the EPT entries. Soft-faulting them
1848 * in would cause a chicken-and-egg problem where the HV fault handler
1849 * would not be able to identify mode-based execute control (MBE) faults.)
1850 */
/*
 * See the block comment above: reduce (never add, except EPT execute
 * bits) the access permissions of every present mapping in [sva, eva).
 * VM_PROT_NONE degenerates to pmap_remove_options(). TLBs are flushed at
 * the end unless PMAP_OPTIONS_NOFLUSH defers them via 'arg'.
 */
void
pmap_protect_options(
	pmap_t map,
	vm_map_offset_t sva,
	vm_map_offset_t eva,
	vm_prot_t prot,
	unsigned int options,
	void *arg)
{
	pt_entry_t *pde;
	pt_entry_t *spte, *epte;
	vm_map_offset_t lva;
	vm_map_offset_t orig_sva;
	boolean_t set_NX;
	int num_found = 0;
	boolean_t is_ept;
	uint64_t cur_vaddr;

	pmap_intr_assert();

	if (map == PMAP_NULL) {
		return;
	}

	/* Removing all access is a removal, not a protection change. */
	if (prot == VM_PROT_NONE) {
		pmap_remove_options(map, sva, eva, options);
		return;
	}

	PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(map), VM_KERNEL_ADDRHIDE(sva),
	    VM_KERNEL_ADDRHIDE(eva));

	is_ept = is_ept_pmap(map);

	/* Execute requested (incl. EPT user-execute) -> don't set NX. */
	if ((prot & VM_PROT_EXECUTE) || __improbable(is_ept && (prot & VM_PROT_UEXEC))) {
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

#if DEVELOPMENT || DEBUG
	if (__improbable(set_NX && (!nx_enabled || !map->nx_enabled))) {
		set_NX = FALSE;
	}
#endif
	PMAP_LOCK_EXCLUSIVE(map);

	orig_sva = sva;
	cur_vaddr = sva;
	/* Process the range one PDE-sized (2M) chunk at a time. */
	while (sva < eva) {
		uint64_t vaddr_incr;

		/* 'lva': end of the PDE-aligned chunk containing sva, clipped to eva. */
		if (os_add_overflow(sva, PDE_MAPPED_SIZE, &lva)) {
			lva = eva;
		} else {
			lva &= ~(PDE_MAPPED_SIZE - 1);

			if (lva > eva) {
				lva = eva;
			}
		}

		pde = pmap_pde(map, sva);
		if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
			if (*pde & PTE_PS) {
				/* superpage */
				spte = pde;
				epte = spte + 1; /* excluded */
				vaddr_incr = I386_LPGBYTES;
			} else {
				/* 4K mappings: span the PTEs covering [sva, lva). */
				spte = pmap_pte(map, (sva & ~(PDE_MAPPED_SIZE - 1)));
				spte = &spte[ptenum(sva)];
				epte = &spte[intel_btop(lva - sva)];
				vaddr_incr = I386_PGBYTES;
			}

			for (; spte < epte; spte++) {
				uint64_t clear_bits, set_bits;

				if (!(*spte & PTE_VALID_MASK(is_ept))) {
					continue;
				}

				clear_bits = 0;
				set_bits = 0;

				/* Only EPT entries have a removable read bit. */
				if (is_ept) {
					if (!(prot & VM_PROT_READ)) {
						clear_bits |= PTE_READ(is_ept);
					}
				}
				if (!(prot & VM_PROT_WRITE)) {
					clear_bits |= PTE_WRITE(is_ept);
				}
#if DEVELOPMENT || DEBUG
				else if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) &&
				    map == kernel_pmap) {
					set_bits |= PTE_WRITE(is_ept);
				}
#endif /* DEVELOPMENT || DEBUG */

				if (set_NX) {
					if (!is_ept) {
						set_bits |= INTEL_PTE_NX;
					} else {
						clear_bits |= INTEL_EPT_EX | INTEL_EPT_UEX;
					}
				} else if (is_ept) {
					/* This is the exception to the "Don't add permissions" statement, above */
					set_bits |= ((prot & VM_PROT_EXECUTE) ? INTEL_EPT_EX : 0) |
					    ((prot & VM_PROT_UEXEC) ? INTEL_EPT_UEX : 0);
				}

				pmap_update_pte(is_ept, spte, clear_bits, set_bits, false);

				DTRACE_VM3(set_pte, pmap_t, map, void *, cur_vaddr, uint64_t, *spte);
				cur_vaddr += vaddr_incr;

				num_found++;
			}
		}
		sva = lva;
	}
	/* Only invalidate TLBs if at least one entry was actually updated. */
	if (num_found) {
		if (options & PMAP_OPTIONS_NOFLUSH) {
			PMAP_UPDATE_TLBS_DELAYED(map, orig_sva, eva, (pmap_flush_context *)arg);
		} else {
			PMAP_UPDATE_TLBS(map, orig_sva, eva);
		}
	}

	PMAP_UNLOCK_EXCLUSIVE(map);

	PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
}
1987
/*
 * Map a (possibly) autogenned block, taking a physical *address* rather
 * than a page number: converts 'pa' to a ppn with intel_btop() and
 * delegates to pmap_map_block().
 */
kern_return_t
pmap_map_block_addr(
	pmap_t pmap,
	addr64_t va,
	pmap_paddr_t pa,
	uint32_t size,
	vm_prot_t prot,
	int attr,
	unsigned int flags)
{
	return pmap_map_block(pmap, va, intel_btop(pa), size, prot, attr, flags);
}
2001
/*
 * Map 'size' (in 4K-page units) of contiguous physical pages starting at
 * ppn 'pa' to virtual address 'va', one pmap_enter() at a time. The
 * VM_MEM_SUPERPAGE attribute selects 2M steps instead of 4K.
 * Currently panics on any pmap_enter() failure (see comment below).
 */
kern_return_t
pmap_map_block(
	pmap_t pmap,
	addr64_t va,
	ppnum_t pa,
	uint32_t size,
	vm_prot_t prot,
	int attr,
	__unused unsigned int flags)
{
	kern_return_t kr;
	addr64_t original_va = va;
	uint32_t page;
	int cur_page_size;

	/* Step size: 2M superpages when requested, 4K pages otherwise. */
	if (attr & VM_MEM_SUPERPAGE) {
		cur_page_size = SUPERPAGE_SIZE;
	} else {
		cur_page_size = PAGE_SIZE;
	}

	/* 'page' counts in PAGE_SIZE units regardless of step size. */
	for (page = 0; page < size; page += cur_page_size / PAGE_SIZE) {
		kr = pmap_enter(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE);

		if (kr != KERN_SUCCESS) {
			/*
			 * This will panic for now, as it is unclear that
			 * removing the mappings is correct.
			 * NOTE(review): the pmap_remove()/return below is
			 * unreachable while the panic remains in place.
			 */
			panic("%s: failed pmap_enter, "
			    "pmap=%p, va=%#llx, pa=%u, size=%u, prot=%#x, flags=%#x",
			    __FUNCTION__,
			    pmap, va, pa, size, prot, flags);

			pmap_remove(pmap, original_va, va - original_va);
			return kr;
		}

		va += cur_page_size;
		pa += cur_page_size / PAGE_SIZE;
	}

	return KERN_SUCCESS;
}
2046
/*
 * Expand the top (PML4) level of `map` so it covers `vaddr`: allocate
 * and zero a page to serve as the new PDPT, wire it into the pmap's
 * pm_obj_pml4 VM object so it can be found later, and install it in
 * both the primary and the user-visible PML4 slots.
 *
 * Returns KERN_SUCCESS (including the case where another thread raced
 * us and expanded first), or KERN_RESOURCE_SHORTAGE when no page is
 * available and PMAP_EXPAND_OPTIONS_NOWAIT was requested.
 */
kern_return_t
pmap_expand_pml4(
    pmap_t          map,
    vm_map_offset_t vaddr,
    unsigned int    options)
{
    vm_page_t     m;
    pmap_paddr_t  pa;
    uint64_t      i;
    ppnum_t       pn;
    pml4_entry_t *pml4p;
    boolean_t     is_ept = is_ept_pmap(map);

    DBG("pmap_expand_pml4(%p,%p)\n", map, (void *)vaddr);

    /* With the exception of the kext "basement", the kernel's level 4
     * pagetables must not be dynamically expanded.
     */
    assert(map != kernel_pmap || (vaddr == KERNEL_BASEMENT));
    /*
     * Allocate a VM page for the pml4 page
     */
    while ((m = vm_page_grab()) == VM_PAGE_NULL) {
        if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
            return KERN_RESOURCE_SHORTAGE;
        }
        VM_PAGE_WAIT();
    }
    /*
     * put the page into the pmap's obj list so it
     * can be found later.
     */
    pn = VM_PAGE_GET_PHYS_PAGE(m);
    pa = i386_ptob(pn);
    i = pml4idx(map, vaddr);

    /*
     * Zero the page.
     */
    pmap_zero_page(pn);

    vm_page_lockspin_queues();
    vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
    vm_page_unlock_queues();

    OSAddAtomic(1, &inuse_ptepages_count);
    OSAddAtomic64(1, &alloc_ptepages_count);
    PMAP_ZINFO_PALLOC(map, PAGE_SIZE);

    /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
    vm_object_lock(map->pm_obj_pml4);

    PMAP_LOCK_EXCLUSIVE(map);
    /*
     * See if someone else expanded us first
     */
    if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
        /* Lost the race: undo the allocation and accounting, succeed anyway. */
        PMAP_UNLOCK_EXCLUSIVE(map);
        vm_object_unlock(map->pm_obj_pml4);

        VM_PAGE_FREE(m);

        OSAddAtomic(-1, &inuse_ptepages_count);
        PMAP_ZINFO_PFREE(map, PAGE_SIZE);
        return KERN_SUCCESS;
    }

#if 0 /* DEBUG */
    if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE)) {
        panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx",
            map, map->pm_obj_pml4, vaddr, i);
    }
#endif
    vm_page_insert_wired(m, map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
    vm_object_unlock(map->pm_obj_pml4);

    /*
     * Set the page directory entry for this page table.
     */
    pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */

    /*
     * Note that INTEL_EPT_UEX is unconditionally set (as is INTEL_EPT_EX) for
     * all intermediate paging levels, from PML4Es to PDEs. Processors with
     * VT-x implementations that do not support MBE ignore the INTEL_EPT_UEX
     * bit at all levels of the EPT, so there is no risk of inducing EPT
     * violation faults.
     */
    pmap_store_pte(is_ept, pml4p, pa_to_pte(pa)
        | PTE_READ(is_ept)
        | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
        | PTE_WRITE(is_ept));
    pml4_entry_t *upml4p;

    /* Mirror the entry into the user PML4 (dual-PML4 / PCID scheme). */
    upml4p = pmap64_user_pml4(map, vaddr);
    pmap_store_pte(is_ept, upml4p, pa_to_pte(pa)
        | PTE_READ(is_ept)
        | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
        | PTE_WRITE(is_ept));

    PMAP_UNLOCK_EXCLUSIVE(map);

    return KERN_SUCCESS;
}
2151
/*
 * Expand the PDPT level of `map` to cover `vaddr`: ensure a PML4 entry
 * exists (expanding the PML4 if necessary), then allocate, zero, and
 * wire a page for the new page directory and install it in the PDPT.
 *
 * Returns KERN_SUCCESS (also when another thread expanded first), a
 * failure from pmap_expand_pml4(), or KERN_RESOURCE_SHORTAGE under
 * PMAP_EXPAND_OPTIONS_NOWAIT.
 */
kern_return_t
pmap_expand_pdpt(pmap_t map, vm_map_offset_t vaddr, unsigned int options)
{
    vm_page_t     m;
    pmap_paddr_t  pa;
    uint64_t      i;
    ppnum_t       pn;
    pdpt_entry_t *pdptp;
    boolean_t     is_ept = is_ept_pmap(map);

    DBG("pmap_expand_pdpt(%p,%p)\n", map, (void *)vaddr);

    /* Make sure the level above us exists first. */
    while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
        kern_return_t pep4kr = pmap_expand_pml4(map, vaddr, options);
        if (pep4kr != KERN_SUCCESS) {
            return pep4kr;
        }
    }

    /*
     * Allocate a VM page for the pdpt page
     */
    while ((m = vm_page_grab()) == VM_PAGE_NULL) {
        if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
            return KERN_RESOURCE_SHORTAGE;
        }
        VM_PAGE_WAIT();
    }

    /*
     * put the page into the pmap's obj list so it
     * can be found later.
     */
    pn = VM_PAGE_GET_PHYS_PAGE(m);
    pa = i386_ptob(pn);
    i = pdptidx(map, vaddr);

    /*
     * Zero the page.
     */
    pmap_zero_page(pn);

    vm_page_lockspin_queues();
    vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
    vm_page_unlock_queues();

    OSAddAtomic(1, &inuse_ptepages_count);
    OSAddAtomic64(1, &alloc_ptepages_count);
    PMAP_ZINFO_PALLOC(map, PAGE_SIZE);

    /* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
    vm_object_lock(map->pm_obj_pdpt);

    PMAP_LOCK_EXCLUSIVE(map);
    /*
     * See if someone else expanded us first
     */
    if (pmap_pde(map, vaddr) != PD_ENTRY_NULL) {
        /* Lost the race: undo the allocation and accounting, succeed anyway. */
        PMAP_UNLOCK_EXCLUSIVE(map);
        vm_object_unlock(map->pm_obj_pdpt);

        VM_PAGE_FREE(m);

        OSAddAtomic(-1, &inuse_ptepages_count);
        PMAP_ZINFO_PFREE(map, PAGE_SIZE);
        return KERN_SUCCESS;
    }

#if 0 /* DEBUG */
    if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE)) {
        panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx",
            map, map->pm_obj_pdpt, vaddr, i);
    }
#endif
    vm_page_insert_wired(m, map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
    vm_object_unlock(map->pm_obj_pdpt);

    /*
     * Set the page directory entry for this page table.
     */
    pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */

    pmap_store_pte(is_ept, pdptp, pa_to_pte(pa)
        | PTE_READ(is_ept)
        | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
        | PTE_WRITE(is_ept));

    PMAP_UNLOCK_EXCLUSIVE(map);

    return KERN_SUCCESS;
}
2243
2244
2245
2246 /*
2247 * Routine: pmap_expand
2248 *
2249 * Expands a pmap to be able to map the specified virtual address.
2250 *
2251 * Allocates new virtual memory for the P0 or P1 portion of the
2252 * pmap, then re-maps the physical pages that were in the old
2253 * pmap to be in the new pmap.
2254 *
2255 * Must be called with the pmap system and the pmap unlocked,
2256 * since these must be unlocked to use vm_allocate or vm_deallocate.
2257 * Thus it must be called in a loop that checks whether the map
2258 * has been expanded enough.
2259 * (We won't loop forever, since page tables aren't shrunk.)
2260 */
2261 kern_return_t
pmap_expand(pmap_t map,vm_map_offset_t vaddr,unsigned int options)2262 pmap_expand(
2263 pmap_t map,
2264 vm_map_offset_t vaddr,
2265 unsigned int options)
2266 {
2267 pt_entry_t *pdp;
2268 vm_page_t m;
2269 pmap_paddr_t pa;
2270 uint64_t i;
2271 ppnum_t pn;
2272 boolean_t is_ept = is_ept_pmap(map);
2273
2274
2275 /*
2276 * For the kernel, the virtual address must be in or above the basement
2277 * which is for kexts and is in the 512GB immediately below the kernel..
2278 * XXX - should use VM_MIN_KERNEL_AND_KEXT_ADDRESS not KERNEL_BASEMENT
2279 */
2280 if (__improbable(map == kernel_pmap &&
2281 !(vaddr >= KERNEL_BASEMENT && vaddr <= VM_MAX_KERNEL_ADDRESS))) {
2282 if ((options & PMAP_EXPAND_OPTIONS_ALIASMAP) == 0) {
2283 panic("pmap_expand: bad vaddr 0x%llx for kernel pmap", vaddr);
2284 }
2285 }
2286
2287 while ((pdp = pmap_pde(map, vaddr)) == PD_ENTRY_NULL) {
2288 assert((options & PMAP_EXPAND_OPTIONS_ALIASMAP) == 0);
2289 kern_return_t pepkr = pmap_expand_pdpt(map, vaddr, options);
2290 if (pepkr != KERN_SUCCESS) {
2291 return pepkr;
2292 }
2293 }
2294
2295 /*
2296 * Allocate a VM page for the pde entries.
2297 */
2298 while ((m = vm_page_grab()) == VM_PAGE_NULL) {
2299 if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
2300 return KERN_RESOURCE_SHORTAGE;
2301 }
2302 VM_PAGE_WAIT();
2303 }
2304
2305 /*
2306 * put the page into the pmap's obj list so it
2307 * can be found later.
2308 */
2309 pn = VM_PAGE_GET_PHYS_PAGE(m);
2310 pa = i386_ptob(pn);
2311 i = pdeidx(map, vaddr);
2312
2313 /*
2314 * Zero the page.
2315 */
2316 pmap_zero_page(pn);
2317
2318 vm_page_lockspin_queues();
2319 vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
2320 vm_page_unlock_queues();
2321
2322 OSAddAtomic(1, &inuse_ptepages_count);
2323 OSAddAtomic64(1, &alloc_ptepages_count);
2324 PMAP_ZINFO_PALLOC(map, PAGE_SIZE);
2325
2326 /* Take the oject lock (mutex) before the PMAP_LOCK (spinlock) */
2327 vm_object_lock(map->pm_obj);
2328
2329 PMAP_LOCK_EXCLUSIVE(map);
2330
2331 /*
2332 * See if someone else expanded us first
2333 */
2334 if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
2335 PMAP_UNLOCK_EXCLUSIVE(map);
2336 vm_object_unlock(map->pm_obj);
2337
2338 VM_PAGE_FREE(m);
2339
2340 OSAddAtomic(-1, &inuse_ptepages_count); //todo replace all with inlines
2341 PMAP_ZINFO_PFREE(map, PAGE_SIZE);
2342 return KERN_SUCCESS;
2343 }
2344
2345 #if 0 /* DEBUG */
2346 if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE)) {
2347 panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx",
2348 map, map->pm_obj, vaddr, i);
2349 }
2350 #endif
2351 vm_page_insert_wired(m, map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
2352 vm_object_unlock(map->pm_obj);
2353
2354 /*
2355 * Set the page directory entry for this page table.
2356 */
2357 pdp = pmap_pde(map, vaddr);
2358
2359 pmap_store_pte(is_ept, pdp, pa_to_pte(pa)
2360 | PTE_READ(is_ept)
2361 | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
2362 | PTE_WRITE(is_ept));
2363
2364 PMAP_UNLOCK_EXCLUSIVE(map);
2365
2366 return KERN_SUCCESS;
2367 }
2368 /*
2369 * Query a pmap to see what size a given virtual address is mapped with.
2370 * If the vaddr is not mapped, returns 0.
2371 */
2372 vm_size_t
pmap_query_pagesize(pmap_t pmap,vm_map_offset_t vaddr)2373 pmap_query_pagesize(
2374 pmap_t pmap,
2375 vm_map_offset_t vaddr)
2376 {
2377 pd_entry_t *pdep;
2378 vm_size_t size = 0;
2379
2380 assert(!is_ept_pmap(pmap));
2381 PMAP_LOCK_EXCLUSIVE(pmap);
2382
2383 pdep = pmap_pde(pmap, vaddr);
2384 if (pdep != PD_ENTRY_NULL) {
2385 if (*pdep & INTEL_PTE_PS) {
2386 size = I386_LPGBYTES;
2387 } else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) {
2388 size = I386_PGBYTES;
2389 }
2390 }
2391
2392 PMAP_UNLOCK_EXCLUSIVE(pmap);
2393
2394 return size;
2395 }
2396
2397 /*
2398 * Ensure the page table hierarchy is filled in down to
2399 * the large page level. Additionally returns FAILURE if
2400 * a lower page table already exists.
2401 */
2402 static kern_return_t
pmap_pre_expand_large_internal(pmap_t pmap,vm_map_offset_t vaddr)2403 pmap_pre_expand_large_internal(
2404 pmap_t pmap,
2405 vm_map_offset_t vaddr)
2406 {
2407 ppnum_t pn;
2408 pt_entry_t *pte;
2409 boolean_t is_ept = is_ept_pmap(pmap);
2410 kern_return_t kr = KERN_SUCCESS;
2411
2412 if (pmap64_pdpt(pmap, vaddr) == PDPT_ENTRY_NULL) {
2413 if (!pmap_next_page_hi(&pn, FALSE)) {
2414 panic("pmap_pre_expand_large no PDPT");
2415 }
2416
2417 pmap_zero_page(pn);
2418
2419 pte = pmap64_pml4(pmap, vaddr);
2420
2421 pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
2422 PTE_READ(is_ept) |
2423 (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
2424 PTE_WRITE(is_ept));
2425
2426 pte = pmap64_user_pml4(pmap, vaddr);
2427
2428 pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
2429 PTE_READ(is_ept) |
2430 (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
2431 PTE_WRITE(is_ept));
2432 }
2433
2434 if (pmap_pde(pmap, vaddr) == PD_ENTRY_NULL) {
2435 if (!pmap_next_page_hi(&pn, FALSE)) {
2436 panic("pmap_pre_expand_large no PDE");
2437 }
2438
2439 pmap_zero_page(pn);
2440
2441 pte = pmap64_pdpt(pmap, vaddr);
2442
2443 pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
2444 PTE_READ(is_ept) |
2445 (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
2446 PTE_WRITE(is_ept));
2447 } else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) {
2448 kr = KERN_FAILURE;
2449 }
2450
2451 return kr;
2452 }
2453
2454 /*
2455 * Wrapper that locks the pmap.
2456 */
2457 kern_return_t
pmap_pre_expand_large(pmap_t pmap,vm_map_offset_t vaddr)2458 pmap_pre_expand_large(
2459 pmap_t pmap,
2460 vm_map_offset_t vaddr)
2461 {
2462 kern_return_t kr;
2463
2464 PMAP_LOCK_EXCLUSIVE(pmap);
2465 kr = pmap_pre_expand_large_internal(pmap, vaddr);
2466 PMAP_UNLOCK_EXCLUSIVE(pmap);
2467 return kr;
2468 }
2469
2470 /*
2471 * On large memory machines, pmap_steal_memory() will allocate past
2472 * the 1GB of pre-allocated/mapped virtual kernel area. This function
2473 * expands kernel the page tables to cover a given vaddr. It uses pages
2474 * from the same pool that pmap_steal_memory() uses, since vm_page_grab()
2475 * isn't available yet.
2476 */
2477 void
pmap_pre_expand(pmap_t pmap,vm_map_offset_t vaddr)2478 pmap_pre_expand(
2479 pmap_t pmap,
2480 vm_map_offset_t vaddr)
2481 {
2482 ppnum_t pn;
2483 pt_entry_t *pte;
2484 boolean_t is_ept = is_ept_pmap(pmap);
2485
2486 /*
2487 * This returns failure if a 4K page table already exists.
2488 * Othewise it fills in the page table hierarchy down
2489 * to that level.
2490 */
2491 PMAP_LOCK_EXCLUSIVE(pmap);
2492 if (pmap_pre_expand_large_internal(pmap, vaddr) == KERN_FAILURE) {
2493 PMAP_UNLOCK_EXCLUSIVE(pmap);
2494 return;
2495 }
2496
2497 /* Add the lowest table */
2498 if (!pmap_next_page_hi(&pn, FALSE)) {
2499 panic("pmap_pre_expand");
2500 }
2501
2502 pmap_zero_page(pn);
2503
2504 pte = pmap_pde(pmap, vaddr);
2505
2506 pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
2507 PTE_READ(is_ept) |
2508 (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
2509 PTE_WRITE(is_ept));
2510 PMAP_UNLOCK_EXCLUSIVE(pmap);
2511 }
2512
2513 /*
2514 * pmap_sync_page_data_phys(ppnum_t pa)
2515 *
2516 * Invalidates all of the instruction cache on a physical page and
2517 * pushes any dirty data from the data cache for the same physical page
2518 * Not required in i386.
2519 */
2520 void
pmap_sync_page_data_phys(__unused ppnum_t pa)2521 pmap_sync_page_data_phys(__unused ppnum_t pa)
2522 {
2523 return;
2524 }
2525
2526 /*
2527 * pmap_sync_page_attributes_phys(ppnum_t pa)
2528 *
2529 * Write back and invalidate all cachelines on a physical page.
2530 */
2531 void
pmap_sync_page_attributes_phys(ppnum_t pa)2532 pmap_sync_page_attributes_phys(ppnum_t pa)
2533 {
2534 cache_flush_page_phys(pa);
2535 }
2536
2537 void
pmap_copy_page(ppnum_t src,ppnum_t dst)2538 pmap_copy_page(ppnum_t src, ppnum_t dst)
2539 {
2540 bcopy_phys((addr64_t)i386_ptob(src),
2541 (addr64_t)i386_ptob(dst),
2542 PAGE_SIZE);
2543 }
2544
2545
2546 /*
2547 * Routine: pmap_pageable
2548 * Function:
2549 * Make the specified pages (by pmap, offset)
2550 * pageable (or not) as requested.
2551 *
2552 * A page which is not pageable may not take
2553 * a fault; therefore, its page table entry
2554 * must remain valid for the duration.
2555 *
2556 * This routine is merely advisory; pmap_enter
2557 * will specify that these pages are to be wired
2558 * down (or not) as appropriate.
2559 */
2560 void
pmap_pageable(__unused pmap_t pmap,__unused vm_map_offset_t start_addr,__unused vm_map_offset_t end_addr,__unused boolean_t pageable)2561 pmap_pageable(
2562 __unused pmap_t pmap,
2563 __unused vm_map_offset_t start_addr,
2564 __unused vm_map_offset_t end_addr,
2565 __unused boolean_t pageable)
2566 {
2567 #ifdef lint
2568 pmap++; start_addr++; end_addr++; pageable++;
2569 #endif /* lint */
2570 }
2571
2572 void
invalidate_icache(__unused vm_offset_t addr,__unused unsigned cnt,__unused int phys)2573 invalidate_icache(__unused vm_offset_t addr,
2574 __unused unsigned cnt,
2575 __unused int phys)
2576 {
2577 return;
2578 }
2579
2580 void
flush_dcache(__unused vm_offset_t addr,__unused unsigned count,__unused int phys)2581 flush_dcache(__unused vm_offset_t addr,
2582 __unused unsigned count,
2583 __unused int phys)
2584 {
2585 return;
2586 }
2587
2588 #if CONFIG_DTRACE
2589 /*
2590 * Constrain DTrace copyin/copyout actions
2591 */
2592 extern kern_return_t dtrace_copyio_preflight(addr64_t);
2593 extern kern_return_t dtrace_copyio_postflight(addr64_t);
2594
2595 kern_return_t
dtrace_copyio_preflight(__unused addr64_t va)2596 dtrace_copyio_preflight(__unused addr64_t va)
2597 {
2598 thread_t thread = current_thread();
2599 uint64_t ccr3;
2600 if (current_map() == kernel_map) {
2601 return KERN_FAILURE;
2602 } else if (((ccr3 = get_cr3_base()) != thread->map->pmap->pm_cr3) && (no_shared_cr3 == FALSE)) {
2603 return KERN_FAILURE;
2604 } else if (no_shared_cr3 && (ccr3 != kernel_pmap->pm_cr3)) {
2605 return KERN_FAILURE;
2606 } else {
2607 return KERN_SUCCESS;
2608 }
2609 }
2610
2611 kern_return_t
dtrace_copyio_postflight(__unused addr64_t va)2612 dtrace_copyio_postflight(__unused addr64_t va)
2613 {
2614 return KERN_SUCCESS;
2615 }
2616 #endif /* CONFIG_DTRACE */
2617
2618 #include <mach_vm_debug.h>
2619 #if MACH_VM_DEBUG
2620 #include <vm/vm_debug.h>
2621
2622 int
pmap_list_resident_pages(__unused pmap_t pmap,__unused vm_offset_t * listp,__unused int space)2623 pmap_list_resident_pages(
2624 __unused pmap_t pmap,
2625 __unused vm_offset_t *listp,
2626 __unused int space)
2627 {
2628 return 0;
2629 }
2630 #endif /* MACH_VM_DEBUG */
2631
2632
2633 #if CONFIG_COREDUMP
/* temporary workaround */
/*
 * Returns TRUE when the page at `va` in `map` is safe to include in a
 * core dump.  Current implementation only excludes device-pager-backed
 * entries; the disabled (#if 0) variant instead checked PTE bits for
 * wired, uncached mappings.
 */
boolean_t
coredumpok(vm_map_t map, mach_vm_offset_t va)
{
#if 0
    pt_entry_t *ptep;

    ptep = pmap_pte(map->pmap, va);
    if (0 == ptep) {
        return FALSE;
    }
    return (*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED);
#else
    if (vm_map_entry_has_device_pager(map, va)) {
        return FALSE;
    }
    return TRUE;
#endif
}
2653 #endif
2654
2655 boolean_t
phys_page_exists(ppnum_t pn)2656 phys_page_exists(ppnum_t pn)
2657 {
2658 assert(pn != vm_page_fictitious_addr);
2659
2660 if (!pmap_initialized) {
2661 return TRUE;
2662 }
2663
2664 if (pn == vm_page_guard_addr) {
2665 return FALSE;
2666 }
2667
2668 if (!IS_MANAGED_PAGE(ppn_to_pai(pn))) {
2669 return FALSE;
2670 }
2671
2672 return TRUE;
2673 }
2674
2675
2676
/*
 * Switch the current CPU to address space `tpmap` by loading its
 * directory base via set_dirbase().  Must be called with interrupts
 * disabled (asserted); bracketed with PMAP__SWITCH trace events.
 */
void
pmap_switch(pmap_t tpmap)
{
    PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(tpmap));
    assert(ml_get_interrupts_enabled() == FALSE);
    set_dirbase(tpmap, current_thread(), cpu_number());
    PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
2685
2686 void
pmap_require(pmap_t pmap)2687 pmap_require(pmap_t pmap)
2688 {
2689 if (pmap != kernel_pmap) {
2690 zone_id_require(ZONE_ID_PMAP, sizeof(struct pmap), pmap);
2691 }
2692 }
2693
2694 /*
2695 * disable no-execute capability on
2696 * the specified pmap
2697 */
2698 void
pmap_disable_NX(__unused pmap_t pmap)2699 pmap_disable_NX(__unused pmap_t pmap)
2700 {
2701 #if DEVELOPMENT || DEBUG
2702 pmap->nx_enabled = 0;
2703 #endif
2704 }
2705
2706 void
pmap_flush_context_init(pmap_flush_context * pfc)2707 pmap_flush_context_init(pmap_flush_context *pfc)
2708 {
2709 pfc->pfc_cpus = 0;
2710 pfc->pfc_invalid_global = 0;
2711 }
2712
2713 static bool
pmap_tlbi_response(uint32_t lcpu,uint32_t rcpu,bool ngflush)2714 pmap_tlbi_response(uint32_t lcpu, uint32_t rcpu, bool ngflush)
2715 {
2716 bool responded = false;
2717 bool gflushed = (cpu_datap(rcpu)->cpu_tlb_invalid_global_count !=
2718 cpu_datap(lcpu)->cpu_tlb_gen_counts_global[rcpu]);
2719
2720 if (ngflush) {
2721 if (gflushed) {
2722 responded = true;
2723 }
2724 } else {
2725 if (gflushed) {
2726 responded = true;
2727 } else {
2728 bool lflushed = (cpu_datap(rcpu)->cpu_tlb_invalid_local_count !=
2729 cpu_datap(lcpu)->cpu_tlb_gen_counts_local[rcpu]);
2730 if (lflushed) {
2731 responded = true;
2732 }
2733 }
2734 }
2735
2736 if (responded == false) {
2737 if ((cpu_datap(rcpu)->cpu_tlb_invalid == 0) ||
2738 !CPU_CR3_IS_ACTIVE(rcpu) ||
2739 !cpu_is_running(rcpu)) {
2740 responded = true;
2741 }
2742 }
2743 return responded;
2744 }
2745
extern uint64_t TLBTimeOut;

/*
 * Drain a deferred-TLB-flush context built up under
 * PMAP_DELAY_TLB_FLUSH: mark each recorded CPU's TLB invalid (global
 * or local as recorded), IPI the remote ones whose CR3 is active,
 * flush our own TLB if we were recorded, and then spin (with NMI
 * escalation on timeout) until every signaled CPU acknowledges.
 */
void
pmap_flush(
    pmap_flush_context *pfc)
{
    unsigned int my_cpu;
    unsigned int cpu;
    cpumask_t    cpu_bit;
    cpumask_t    cpus_to_respond = 0;
    cpumask_t    cpus_to_signal = 0;
    cpumask_t    cpus_signaled = 0;
    boolean_t    flush_self = FALSE;
    uint64_t     deadline;
    bool         need_global_flush = false;

    mp_disable_preemption();

    my_cpu = cpu_number();
    cpus_to_signal = pfc->pfc_cpus;

    PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_START,
        NULL, cpus_to_signal);

    for (cpu = 0, cpu_bit = 1; cpu < real_ncpus && cpus_to_signal; cpu++, cpu_bit <<= 1) {
        if (cpus_to_signal & cpu_bit) {
            cpus_to_signal &= ~cpu_bit;

            if (!cpu_is_running(cpu)) {
                continue;
            }

            /* Mark the target's TLB stale; record its generation counts
             * so pmap_tlbi_response() can detect when it flushed. */
            if (pfc->pfc_invalid_global & cpu_bit) {
                cpu_datap(cpu)->cpu_tlb_invalid_global = 1;
                need_global_flush = true;
            } else {
                cpu_datap(cpu)->cpu_tlb_invalid_local = 1;
            }
            cpu_datap(my_cpu)->cpu_tlb_gen_counts_global[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_global_count;
            cpu_datap(my_cpu)->cpu_tlb_gen_counts_local[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_local_count;
            mfence();   /* order the stores above before signaling/inspection */

            if (cpu == my_cpu) {
                flush_self = TRUE;
                continue;
            }
            if (CPU_CR3_IS_ACTIVE(cpu)) {
                cpus_to_respond |= cpu_bit;
                i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
            }
        }
    }
    cpus_signaled = cpus_to_respond;

    /*
     * Flush local tlb if required.
     * Do this now to overlap with other processors responding.
     */
    if (flush_self) {
        process_pmap_updates(NULL, (pfc->pfc_invalid_global != 0), 0ULL, ~0ULL);
    }

    if (cpus_to_respond) {
        deadline = mach_absolute_time() +
            (TLBTimeOut ? TLBTimeOut : LockTimeOut);
        boolean_t is_timeout_traced = FALSE;

        /*
         * Wait for those other cpus to acknowledge
         */
        while (cpus_to_respond != 0) {
            long orig_acks = 0;

            for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
                bool responded = false;
                if ((cpus_to_respond & cpu_bit) != 0) {
                    responded = pmap_tlbi_response(my_cpu, cpu, need_global_flush);
                    if (responded) {
                        cpus_to_respond &= ~cpu_bit;
                    }
                    cpu_pause();
                }

                if (cpus_to_respond == 0) {
                    break;
                }
            }
            if (cpus_to_respond && (mach_absolute_time() > deadline)) {
                if (machine_timeout_suspended()) {
                    continue;
                }
                if (TLBTimeOut == 0) {
                    /* Cut a tracepoint once, but don't panic. */
                    if (is_timeout_traced) {
                        continue;
                    }

                    PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
                        NULL, cpus_to_signal, cpus_to_respond);

                    is_timeout_traced = TRUE;
                    continue;
                }
                /* Escalate: NMI the stragglers, then panic with diagnostics. */
                orig_acks = NMIPI_acks;
                NMIPI_panic(cpus_to_respond, TLB_FLUSH_TIMEOUT);
                panic("Uninterruptible processor(s): CPU bitmap: 0x%llx, NMIPI acks: 0x%lx, now: 0x%lx, deadline: %llu",
                    cpus_to_respond, orig_acks, NMIPI_acks, deadline);
            }
        }
    }

    PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_END,
        NULL, cpus_signaled, flush_self);

    mp_enable_preemption();
}
2860
2861
/*
 * Issue a single-context INVEPT for the given EPT pointer: builds the
 * required 128-bit, 16-byte-aligned descriptor {eptp, reserved} and
 * executes the INVEPT instruction with the single-context invalidation
 * type in RCX.  Intended for mp_cpus_call() fan-out.
 */
static void
invept(void *eptp)
{
    struct {
        uint64_t eptp;
        uint64_t reserved;
    } __attribute__((aligned(16), packed)) invept_descriptor = {(uint64_t)eptp, 0};

    __asm__ volatile ("invept (%%rax), %%rcx"
        : : "c" (PMAP_INVEPT_SINGLE_CONTEXT), "a" (&invept_descriptor)
        : "cc", "memory");
}
2874
2875 /*
2876 * Called with pmap locked, we:
2877 * - scan through per-cpu data to see which other cpus need to flush
2878 * - send an IPI to each non-idle cpu to be flushed
2879 * - wait for all to signal back that they are inactive or we see that
2880 * they are at a safe point (idle).
2881 * - flush the local tlb if active for this pmap
2882 * - return ... the caller will unlock the pmap
2883 */
2884
2885 void
pmap_flush_tlbs(pmap_t pmap,vm_map_offset_t startv,vm_map_offset_t endv,int options,pmap_flush_context * pfc)2886 pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv, int options, pmap_flush_context *pfc)
2887 {
2888 unsigned int cpu;
2889 cpumask_t cpu_bit;
2890 cpumask_t cpus_to_signal = 0;
2891 unsigned int my_cpu = cpu_number();
2892 pmap_paddr_t pmap_cr3 = pmap->pm_cr3;
2893 boolean_t flush_self = FALSE;
2894 uint64_t deadline;
2895 boolean_t pmap_is_shared = (pmap->pm_shared || (pmap == kernel_pmap));
2896 bool need_global_flush = false;
2897 uint32_t event_code = 0;
2898 vm_map_offset_t event_startv = 0, event_endv = 0;
2899 boolean_t is_ept = is_ept_pmap(pmap);
2900
2901 assert((processor_avail_count < 2) ||
2902 (ml_get_interrupts_enabled() && get_preemption_level() != 0));
2903
2904 assert((endv - startv) >= PAGE_SIZE);
2905 assert(((endv | startv) & PAGE_MASK) == 0);
2906
2907 if (__improbable(kdebug_enable)) {
2908 if (pmap == kernel_pmap) {
2909 event_code = PMAP_CODE(PMAP__FLUSH_KERN_TLBS);
2910 event_startv = VM_KERNEL_UNSLIDE_OR_PERM(startv);
2911 event_endv = VM_KERNEL_UNSLIDE_OR_PERM(endv);
2912 } else if (__improbable(is_ept)) {
2913 event_code = PMAP_CODE(PMAP__FLUSH_EPT);
2914 event_startv = startv;
2915 event_endv = endv;
2916 } else {
2917 event_code = PMAP_CODE(PMAP__FLUSH_TLBS);
2918 event_startv = startv;
2919 event_endv = endv;
2920 }
2921 }
2922
2923 PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_START,
2924 VM_KERNEL_UNSLIDE_OR_PERM(pmap), options,
2925 event_startv, event_endv);
2926
2927 if (__improbable(is_ept)) {
2928 mp_cpus_call(CPUMASK_ALL, ASYNC, invept, (void*)pmap->pm_eptp);
2929 goto out;
2930 }
2931
2932 /*
2933 * Scan other cpus for matching active or task CR3.
2934 * For idle cpus (with no active map) we mark them invalid but
2935 * don't signal -- they'll check as they go busy.
2936 */
2937 if (pmap_pcid_ncpus) {
2938 if (pmap_is_shared) {
2939 need_global_flush = true;
2940 }
2941 pmap_pcid_invalidate_all_cpus(pmap);
2942 mfence();
2943 }
2944
2945 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
2946 if (!cpu_is_running(cpu)) {
2947 continue;
2948 }
2949 uint64_t cpu_active_cr3 = CPU_GET_ACTIVE_CR3(cpu);
2950 uint64_t cpu_task_cr3 = CPU_GET_TASK_CR3(cpu);
2951
2952 if ((pmap_cr3 == cpu_task_cr3) ||
2953 (pmap_cr3 == cpu_active_cr3) ||
2954 (pmap_is_shared)) {
2955 if (options & PMAP_DELAY_TLB_FLUSH) {
2956 if (need_global_flush == true) {
2957 pfc->pfc_invalid_global |= cpu_bit;
2958 }
2959 pfc->pfc_cpus |= cpu_bit;
2960
2961 continue;
2962 }
2963 if (need_global_flush == true) {
2964 cpu_datap(my_cpu)->cpu_tlb_gen_counts_global[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_global_count;
2965 cpu_datap(cpu)->cpu_tlb_invalid_global = 1;
2966 } else {
2967 cpu_datap(my_cpu)->cpu_tlb_gen_counts_local[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_local_count;
2968 cpu_datap(cpu)->cpu_tlb_invalid_local = 1;
2969 }
2970
2971 if (cpu == my_cpu) {
2972 flush_self = TRUE;
2973 continue;
2974 }
2975
2976 mfence();
2977
2978 /*
2979 * We don't need to signal processors which will flush
2980 * lazily at the idle state or kernel boundary.
2981 * For example, if we're invalidating the kernel pmap,
2982 * processors currently in userspace don't need to flush
2983 * their TLBs until the next time they enter the kernel.
2984 * Alterations to the address space of a task active
2985 * on a remote processor result in a signal, to
2986 * account for copy operations. (There may be room
2987 * for optimization in such cases).
2988 * The order of the loads below with respect
2989 * to the store to the "cpu_tlb_invalid" field above
2990 * is important--hence the barrier.
2991 */
2992 if (CPU_CR3_IS_ACTIVE(cpu) &&
2993 (pmap_cr3 == CPU_GET_ACTIVE_CR3(cpu) ||
2994 pmap->pm_shared ||
2995 (pmap_cr3 == CPU_GET_TASK_CR3(cpu)))) {
2996 cpus_to_signal |= cpu_bit;
2997 i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
2998 }
2999 }
3000 }
3001
3002 if ((options & PMAP_DELAY_TLB_FLUSH)) {
3003 goto out;
3004 }
3005
3006 /*
3007 * Flush local tlb if required.
3008 * Do this now to overlap with other processors responding.
3009 */
3010 if (flush_self) {
3011 process_pmap_updates(pmap, pmap_is_shared, startv, endv);
3012 }
3013
3014 if (cpus_to_signal) {
3015 cpumask_t cpus_to_respond = cpus_to_signal;
3016
3017 deadline = mach_absolute_time() +
3018 (TLBTimeOut ? TLBTimeOut : LockTimeOut);
3019 boolean_t is_timeout_traced = FALSE;
3020
3021 /*
3022 * Wait for those other cpus to acknowledge
3023 */
3024 while (cpus_to_respond != 0) {
3025 long orig_acks = 0;
3026
3027 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
3028 bool responded = false;
3029 if ((cpus_to_respond & cpu_bit) != 0) {
3030 responded = pmap_tlbi_response(my_cpu, cpu, need_global_flush);
3031 if (responded) {
3032 cpus_to_respond &= ~cpu_bit;
3033 }
3034 cpu_pause();
3035 }
3036 if (cpus_to_respond == 0) {
3037 break;
3038 }
3039 }
3040 if (cpus_to_respond && (mach_absolute_time() > deadline)) {
3041 if (machine_timeout_suspended()) {
3042 continue;
3043 }
3044 if (TLBTimeOut == 0) {
3045 /* cut tracepoint but don't panic */
3046 if (is_timeout_traced) {
3047 continue;
3048 }
3049
3050 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
3051 VM_KERNEL_UNSLIDE_OR_PERM(pmap),
3052 cpus_to_signal,
3053 cpus_to_respond);
3054
3055 is_timeout_traced = TRUE;
3056 continue;
3057 }
3058 orig_acks = NMIPI_acks;
3059 uint64_t tstamp1 = mach_absolute_time();
3060 NMIPI_panic(cpus_to_respond, TLB_FLUSH_TIMEOUT);
3061 uint64_t tstamp2 = mach_absolute_time();
3062 panic("IPI timeout, unresponsive CPU bitmap: 0x%llx, NMIPI acks: 0x%lx, now: 0x%lx, deadline: %llu, pre-NMIPI time: 0x%llx, current: 0x%llx, global: %d",
3063 cpus_to_respond, orig_acks, NMIPI_acks, deadline, tstamp1, tstamp2, need_global_flush);
3064 }
3065 }
3066 }
3067
3068 if (__improbable((pmap == kernel_pmap) && (flush_self != TRUE))) {
3069 panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, pmap_cr3: 0x%llx, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, pmap_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map);
3070 }
3071
3072 out:
3073 PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_END,
3074 VM_KERNEL_UNSLIDE_OR_PERM(pmap), cpus_to_signal,
3075 event_startv, event_endv);
3076 }
3077
/*
 * Perform this CPU's share of a TLB shootdown: bump the appropriate
 * generation counter (global or local), clear the pending-invalid
 * flag, and invalidate the TLB range [istart, iend) — via PCID-aware
 * invalidation when PCIDs are in use, otherwise a full flush.
 * `p` may be NULL, meaning "current/any pmap" (forces a broad flush).
 */
static void
process_pmap_updates(pmap_t p, bool pshared, addr64_t istart, addr64_t iend)
{
    int ccpu = cpu_number();
    bool gtlbf = false;

    pmap_assert(ml_get_interrupts_enabled() == 0 ||
        get_preemption_level() != 0);

    /* Acknowledge the request by advancing the matching generation count. */
    if (cpu_datap(ccpu)->cpu_tlb_invalid_global) {
        cpu_datap(ccpu)->cpu_tlb_invalid_global_count++;
        cpu_datap(ccpu)->cpu_tlb_invalid = 0;
        gtlbf = true;
    } else {
        cpu_datap(ccpu)->cpu_tlb_invalid_local_count++;
        cpu_datap(ccpu)->cpu_tlb_invalid_local = 0;
    }

    if (pmap_pcid_ncpus) {
        if (p) {
            /* TODO global generation count to
             * avoid potentially redundant
             * csw invalidations post-global invalidation
             */
            pmap_pcid_validate_cpu(p, ccpu);
            pmap_tlbi_range(istart, iend, (pshared || gtlbf), p->pmap_pcid_cpus[ccpu]);
        } else {
            pmap_pcid_validate_current();
            pmap_tlbi_range(istart, iend, true, 0);
        }
    } else {
        /* No PCID support: invalidate everything. */
        pmap_tlbi_range(0, ~0ULL, true, 0);
    }
}
3112
/*
 * TLB-shootdown interrupt handler: if this CPU has a pending
 * invalidation request, service it with a full-range flush.
 * Bracketed with PMAP__UPDATE_INTERRUPT trace events.
 */
void
pmap_update_interrupt(void)
{
    PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START);

    if (current_cpu_datap()->cpu_tlb_invalid) {
        process_pmap_updates(NULL, true, 0ULL, ~0ULL);
    }

    PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END);
}
3124
3125 #include <mach/mach_vm.h> /* mach_vm_region_recurse() */
3126 /* Scan kernel pmap for W+X PTEs, scan kernel VM map for W+X map entries
3127 * and identify ranges with mismatched VM permissions and PTE permissions
3128 */
kern_return_t
pmap_permissions_verify(pmap_t ipmap, vm_map_t ivmmap, vm_offset_t sv, vm_offset_t ev)
{
	vm_offset_t cv = sv;
	kern_return_t rv = KERN_SUCCESS;
	/* Counters for absent PML4/PD ranges skipped during the PTE scan. */
	uint64_t skip4 = 0, skip2 = 0;

	/* The scan below interprets PTE bits in x86 format, not EPT format. */
	assert(!is_ept_pmap(ipmap));

	/* Page-align the bounds. NOTE(review): cv retains the unaligned sv. */
	sv &= ~PAGE_MASK_64;
	ev &= ~PAGE_MASK_64;
	while (cv < ev) {
		/* Jump over the non-canonical x86_64 address hole. */
		if (__improbable((cv > 0x00007FFFFFFFFFFFULL) &&
		    (cv < 0xFFFF800000000000ULL))) {
			cv = 0xFFFF800000000000ULL;
		}
		/* Potential inconsistencies from not holding pmap lock
		 * but harmless for the moment.
		 */
		/* At a PML4 boundary with no PML4 entry present, skip the whole
		 * NBPML4-byte span (the overflow check guards address wrap). */
		if (((cv & PML4MASK) == 0) && (pmap64_pml4(ipmap, cv) == 0)) {
			if ((cv + NBPML4) > cv) {
				cv += NBPML4;
			} else {
				break;
			}
			skip4++;
			continue;
		}
		/* Likewise skip NBPD-byte spans with no page-directory entry. */
		if (((cv & PDMASK) == 0) && (pmap_pde(ipmap, cv) == 0)) {
			if ((cv + NBPD) > cv) {
				cv += NBPD;
			} else {
				break;
			}
			skip2++;
			continue;
		}

		/* Flag any PTE that is valid, writable, and executable (NX clear). */
		pt_entry_t *ptep = pmap_pte(ipmap, cv);
		if (ptep && (*ptep & INTEL_PTE_VALID)) {
			if (*ptep & INTEL_PTE_WRITE) {
				if (!(*ptep & INTEL_PTE_NX)) {
					kprintf("W+X PTE at 0x%lx, P4: 0x%llx, P3: 0x%llx, P2: 0x%llx, PT: 0x%llx, VP: %u\n", cv, *pmap64_pml4(ipmap, cv), *pmap64_pdpt(ipmap, cv), *pmap_pde(ipmap, cv), *ptep, pmap_valid_page((ppnum_t)(i386_btop(pte_to_pa(*ptep)))));
					rv = KERN_FAILURE;
				}
			}
		}
		cv += PAGE_SIZE;
	}
	kprintf("Completed pmap scan\n");
	cv = sv;

	/* Second pass: walk the VM map's entries and compare each entry's
	 * protection against the protection encoded in the underlying PTEs. */
	struct vm_region_submap_info_64 vbr;
	mach_msg_type_number_t vbrcount = 0;
	mach_vm_size_t vmsize;
	vm_prot_t prot;
	uint32_t nesting_depth = 0;
	kern_return_t kret;

	while (cv < ev) {
		/* Recurse through submaps until a leaf map entry is found. */
		for (;;) {
			vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64;
			if ((kret = mach_vm_region_recurse(ivmmap,
			    (mach_vm_address_t *) &cv, &vmsize, &nesting_depth,
			    (vm_region_recurse_info_t)&vbr,
			    &vbrcount)) != KERN_SUCCESS) {
				break;
			}

			if (vbr.is_submap) {
				nesting_depth++;
				continue;
			} else {
				break;
			}
		}

		if (kret != KERN_SUCCESS) {
			break;
		}

		prot = vbr.protection;

		/* A map entry granting both write and execute is itself a finding. */
		if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) {
			kprintf("W+X map entry at address 0x%lx\n", cv);
			rv = KERN_FAILURE;
		}

		if (prot) {
			vm_offset_t pcv;
			for (pcv = cv; pcv < cv + vmsize; pcv += PAGE_SIZE) {
				pt_entry_t *ptep = pmap_pte(ipmap, pcv);
				vm_prot_t tprot;

				/* Unmapped pages within the entry cannot mismatch. */
				if ((ptep == NULL) || !(*ptep & INTEL_PTE_VALID)) {
					continue;
				}
				/* Reconstruct the effective protection from the PTE bits. */
				tprot = VM_PROT_READ;
				if (*ptep & INTEL_PTE_WRITE) {
					tprot |= VM_PROT_WRITE;
				}
				if ((*ptep & INTEL_PTE_NX) == 0) {
					tprot |= VM_PROT_EXECUTE;
				}
				if (tprot != prot) {
					kprintf("PTE/map entry permissions mismatch at address 0x%lx, pte: 0x%llx, protection: 0x%x\n", pcv, *ptep, prot);
					rv = KERN_FAILURE;
				}
			}
		}
		cv += vmsize;
	}
	return rv;
}
3243
3244 #if MACH_ASSERT
3245 extern int pmap_ledgers_panic;
3246 extern int pmap_ledgers_panic_leeway;
3247
/*
 * Verify that the ledgers associated with this pmap are balanced
 * (delegated to vm_map_pmap_check_ledgers), identifying the owner by
 * pid/procname recorded via pmap_set_process(). Skipped for pmaps no
 * longer fully associated with a task — see the comment below.
 * NOTE(review): presumably invoked during pmap destruction — confirm
 * at the call site (outside this view).
 */
static void
pmap_check_ledgers(
	pmap_t pmap)
{
	int pid;
	char *procname;

	if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
		/*
		 * This pmap was not or is no longer fully associated
		 * with a task (e.g. the old pmap after a fork()/exec() or
		 * spawn()). Its "ledger" still points at a task that is
		 * now using a different (and active) address space, so
		 * we can't check that all the pmap ledgers are balanced here.
		 *
		 * If the "pid" is set, that means that we went through
		 * pmap_set_process() in task_terminate_internal(), so
		 * this task's ledger should not have been re-used and
		 * all the pmap ledgers should be back to 0.
		 */
		return;
	}

	pid = pmap->pmap_pid;
	procname = pmap->pmap_procname;

	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
}
3276
/*
 * Associate a pmap with its owning process (pid and name) for ledger
 * diagnostics; optionally relax ledger sanity checking when a leeway
 * is configured (see comment below).
 * NOTE(review): the pmap_pid == -1 early-out presumably marks a pmap
 * already dissociated from its task — confirm where -1 is assigned.
 */
void
pmap_set_process(
	pmap_t pmap,
	int pid,
	char *procname)
{
	if (pmap == NULL || pmap->pmap_pid == -1) {
		return;
	}

	pmap->pmap_pid = pid;
	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
	if (pmap_ledgers_panic_leeway) {
		/*
		 * XXX FBDP
		 * Some processes somehow trigger some issues that make
		 * the pmap stats and ledgers go off track, causing
		 * some assertion failures and ledger panics.
		 * Turn off the sanity checks if we allow some ledger leeway
		 * because of that. We'll still do a final check in
		 * pmap_check_ledgers() for discrepancies larger than the
		 * allowed leeway after the address space has been fully
		 * cleaned up.
		 */
		pmap->pmap_stats_assert = FALSE;
		ledger_disable_panic_on_negative(pmap->ledger,
		    task_ledgers.phys_footprint);
		ledger_disable_panic_on_negative(pmap->ledger,
		    task_ledgers.internal);
		ledger_disable_panic_on_negative(pmap->ledger,
		    task_ledgers.internal_compressed);
		ledger_disable_panic_on_negative(pmap->ledger,
		    task_ledgers.iokit_mapped);
		ledger_disable_panic_on_negative(pmap->ledger,
		    task_ledgers.alternate_accounting);
		ledger_disable_panic_on_negative(pmap->ledger,
		    task_ledgers.alternate_accounting_compressed);
	}
}
3316 #endif /* MACH_ASSERT */
3317
3318
#if DEVELOPMENT || DEBUG
/* Tunable: when 0, pmap_advise_pagezero_range() unconditionally reports
 * page zero as inaccessible, bypassing the SMAP/low_bound computation. */
int pmap_pagezero_mitigation = 1;
#endif
3322
3323 void
pmap_advise_pagezero_range(pmap_t lpmap,uint64_t low_bound)3324 pmap_advise_pagezero_range(pmap_t lpmap, uint64_t low_bound)
3325 {
3326 #if DEVELOPMENT || DEBUG
3327 if (pmap_pagezero_mitigation == 0) {
3328 lpmap->pagezero_accessible = FALSE;
3329 return;
3330 }
3331 #endif
3332 lpmap->pagezero_accessible = ((pmap_smap_enabled == FALSE) && (low_bound < 0x1000));
3333 if (lpmap == current_pmap()) {
3334 mp_disable_preemption();
3335 current_cpu_datap()->cpu_pagezero_mapped = lpmap->pagezero_accessible;
3336 mp_enable_preemption();
3337 }
3338 }
3339
3340 uintptr_t
pmap_verify_noncacheable(uintptr_t vaddr)3341 pmap_verify_noncacheable(uintptr_t vaddr)
3342 {
3343 pt_entry_t *ptep = NULL;
3344 ptep = pmap_pte(kernel_pmap, vaddr);
3345 if (ptep == NULL) {
3346 panic("pmap_verify_noncacheable: no translation for 0x%lx", vaddr);
3347 }
3348 /* Non-cacheable OK */
3349 if (*ptep & (INTEL_PTE_NCACHE)) {
3350 return pte_to_pa(*ptep) | (vaddr & INTEL_OFFMASK);
3351 }
3352 /* Write-combined OK */
3353 if (*ptep & (INTEL_PTE_PAT)) {
3354 return pte_to_pa(*ptep) | (vaddr & INTEL_OFFMASK);
3355 }
3356 panic("pmap_verify_noncacheable: IO read from a cacheable address? address: 0x%lx, PTE: %p, *PTE: 0x%llx", vaddr, ptep, *ptep);
3357 /*NOTREACHED*/
3358 return 0;
3359 }
3360
3361 bool
pmap_lookup_in_loaded_trust_caches(const uint8_t __unused cdhash[20])3362 pmap_lookup_in_loaded_trust_caches(const uint8_t __unused cdhash[20])
3363 {
3364 // Unsupported on this architecture.
3365 return false;
3366 }
3367
3368 uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t __unused cdhash[20])3369 pmap_lookup_in_static_trust_cache(const uint8_t __unused cdhash[20])
3370 {
3371 // Unsupported on this architecture.
3372 return false;
3373 }
3374
/*
 * Report the PMAP_CS configuration flags. PMAP_CS does not exist on
 * this architecture, so the configuration is always empty.
 */
int
pmap_cs_configuration(void)
{
	return 0;
}
3381
/* Serializes readers/writers of pmap_compilation_service_cdhash below. */
SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
/* CDHash of the compilation service; all-zero until set via
 * pmap_set_compilation_service_cdhash(). */
uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
3384
3385 void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])3386 pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
3387 {
3388 simple_lock(&pmap_compilation_service_cdhash_lock, LCK_GRP_NULL);
3389 memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
3390 simple_unlock(&pmap_compilation_service_cdhash_lock);
3391
3392 #if DEVELOPMENT || DEBUG
3393 printf("Added Compilation Service CDHash through the PMAP: 0x%02X 0x%02X 0x%02X 0x%02X\n", cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
3394 #endif
3395 }
3396
3397 bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])3398 pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
3399 {
3400 bool match = false;
3401
3402 simple_lock(&pmap_compilation_service_cdhash_lock, LCK_GRP_NULL);
3403 if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
3404 match = true;
3405 }
3406 simple_unlock(&pmap_compilation_service_cdhash_lock);
3407
3408 #if DEVELOPMENT || DEBUG
3409 if (match) {
3410 printf("Matched Compilation Service CDHash through the PMAP\n");
3411 }
3412 #endif
3413
3414 return match;
3415 }
3416
/* Set exactly once by pmap_set_local_signing_public_key(); read with
 * relaxed atomics. */
static bool pmap_local_signing_public_key_set = false;
/* ECC P-384 public key for local signing; contents are valid only once
 * the flag above is true. */
static uint8_t pmap_local_signing_public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE] = { 0 };
3419
/*
 * Returns true once pmap_set_local_signing_public_key() has installed
 * a key (relaxed load of the publication flag).
 */
static bool
pmap_local_signing_public_key_is_set(void)
{
	return os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
}
3425
3426 void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE])3427 pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE])
3428 {
3429 bool key_set = false;
3430
3431 /*
3432 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
3433 * a successful exchange means that the local signing public key has _not_ been
3434 * set. In case the key has been set, we panic as we would never expect the
3435 * kernel to attempt to set the key more than once.
3436 */
3437 key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
3438
3439 if (key_set) {
3440 panic("attempted to set the local signing public key multiple times");
3441 }
3442
3443 memcpy(pmap_local_signing_public_key, public_key, PMAP_ECC_P384_PUBLIC_KEY_SIZE);
3444
3445 #if DEVELOPMENT || DEBUG
3446 printf("Set local signing public key\n");
3447 #endif
3448 }
3449
3450 uint8_t*
pmap_get_local_signing_public_key(void)3451 pmap_get_local_signing_public_key(void)
3452 {
3453 if (pmap_local_signing_public_key_is_set()) {
3454 return pmap_local_signing_public_key;
3455 }
3456 return NULL;
3457 }
3458
/*
 * Lift local-signing restrictions for the given CDHash. Not implemented
 * on this architecture; currently a deliberate no-op.
 */
void
pmap_unrestrict_local_signing(
	__unused const uint8_t cdhash[CS_CDHASH_LEN])
{
	// TODO: Once all changes across XNU and AMFI have been submitted, panic.
}
3465
/*
 * Query entitlements of a pmap's signed code. Only meaningful when
 * PMAP_SUPPORTS_ENTITLEMENT_CHECKS is defined, which it never is on
 * this platform: every call path ends in a panic, so callers must gate
 * on the feature macro before calling.
 */
bool
pmap_query_entitlements(
	__unused pmap_t pmap,
	__unused CEQuery_t query,
	__unused size_t queryLength,
	__unused CEQueryContext_t finalContext)
{
#if !PMAP_SUPPORTS_ENTITLEMENT_CHECKS
	panic("PMAP_CS: do not use this API without checking for \'#if PMAP_SUPPORTS_ENTITLEMENT_CHECKS\'");
#endif

	panic("PMAP_SUPPORTS_ENTITLEMENT_CHECKS should not be defined on this platform");
}
3479
/*
 * Whether PMAP_CS code-signing enforcement is active. It does not
 * exist on this architecture.
 */
bool
pmap_cs_enabled(void)
{
	return false;
}
3485
/*
 * Whether execution is currently inside the Page Protection Layer.
 * There is no PPL on this architecture.
 */
bool
pmap_in_ppl(void)
{
	return false;
}
3492
/*
 * Whether this system has a Page Protection Layer at all. Never true
 * on this architecture.
 */
bool
pmap_has_ppl(void)
{
	return false;
}
3499
/*
 * Whether the I/O-filter protected write facility is available; it is
 * not supported on this architecture. Fixed: the definition used an
 * empty K&R-style parameter list `()` (unspecified arguments prior to
 * C23); `(void)` matches every sibling prototype in this file.
 */
bool
pmap_has_iofilter_protected_write(void)
{
	// Not supported on this architecture.
	return false;
}
3506
/*
 * Perform a protected register write through the I/O filter. The
 * facility does not exist here, so any call is fatal; callers should
 * check pmap_has_iofilter_protected_write() first.
 */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
3513
/*
 * Obtain the pmap-owned image4 data buffer. Not available on this
 * architecture; any call is fatal.
 */
void* __attribute__((noreturn))
pmap_image4_pmap_data(
	__unused size_t *allocated_size)
{
	panic("PMAP_IMG4: image4 data not available on this architecture");
}
3520
/*
 * Set the nonce for an image4 nonce domain. Unsupported on this
 * architecture; any call is fatal.
 */
void __attribute__((noreturn))
pmap_image4_set_nonce(
	__unused const img4_nonce_domain_index_t ndi,
	__unused const img4_nonce_t *nonce)
{
	panic("PMAP_IMG4: set nonce API not supported on this architecture");
}
3528
/*
 * Roll (invalidate) the nonce for an image4 nonce domain. Unsupported
 * on this architecture; any call is fatal.
 */
void __attribute__((noreturn))
pmap_image4_roll_nonce(
	__unused const img4_nonce_domain_index_t ndi)
{
	panic("PMAP_IMG4: roll nonce API not supported on this architecture");
}
3535
/*
 * Copy out the nonce for an image4 nonce domain. Unsupported on this
 * architecture; any call is fatal (the errno_t return is never produced).
 */
errno_t __attribute__((noreturn))
pmap_image4_copy_nonce(
	__unused const img4_nonce_domain_index_t ndi,
	__unused img4_nonce_t *nonce_out
	)
{
	panic("PMAP_IMG4: copy nonce API not supported on this architecture");
}
3544
/*
 * Validate and execute an image4 runtime object. Unsupported on this
 * architecture; any call is fatal.
 */
errno_t __attribute__((noreturn))
pmap_image4_execute_object(
	__unused img4_runtime_object_spec_index_t obj_spec_index,
	__unused const img4_buff_t *payload,
	__unused const img4_buff_t *_Nullable manifest)
{
	panic("PMAP_IMG4: execute object API not supported on this architecture");
}
3553
/*
 * Copy out a previously executed image4 runtime object. Unsupported on
 * this architecture; any call is fatal.
 */
errno_t __attribute__((noreturn))
pmap_image4_copy_object(
	__unused img4_runtime_object_spec_index_t obj_spec_index,
	__unused vm_address_t object_out,
	__unused size_t *object_length)
{
	panic("PMAP_IMG4: copy object API not supported on this architecture");
}
3562
/*
 * Permit a pmap to execute invalid (unsigned) code. PMAP_CS is not
 * enforced on this architecture, so there is nothing to relax and the
 * call trivially succeeds.
 */
kern_return_t
pmap_cs_allow_invalid(__unused pmap_t pmap)
{
	// Unsupported on this architecture.
	return KERN_SUCCESS;
}
3569
/*
 * Claim a page from the PPL's reserved pool. No PPL exists on this
 * architecture, so there is never a page to hand out.
 */
void *
pmap_claim_reserved_ppl_page(void)
{
	// Unsupported on this architecture.
	return NULL;
}
3576
/*
 * Return a page to the PPL's reserved pool; counterpart of
 * pmap_claim_reserved_ppl_page(). No-op on this architecture.
 */
void
pmap_free_reserved_ppl_page(void __unused *kva)
{
	// Unsupported on this architecture.
}
3582
/*
 * Prepare PMAP_CS state for a fork (parent pmap -> child pmap). With
 * PMAP_CS disabled on x86_64 there is nothing to carry over, so this
 * always succeeds.
 */
kern_return_t
pmap_cs_fork_prepare(__unused pmap_t old_pmap, __unused pmap_t new_pmap)
{
	// PMAP_CS isn't enabled for x86_64.
	return KERN_SUCCESS;
}
3589
3590 #if DEVELOPMENT || DEBUG
3591 /*
3592 * Used for unit testing recovery from text corruptions.
3593 */
/*
 * Deliberately corrupt the text page at physical address 'pa' by writing
 * a UD2 instruction (0x0f 0x0b) through the physmap; executing it faults,
 * exercising text-corruption recovery. Fails for unmanaged pages.
 */
kern_return_t
pmap_test_text_corruption(pmap_paddr_t pa)
{
	int pai;
	uint8_t *va;

	/* Only managed pages are eligible for this test. */
	pai = ppn_to_pai(atop(pa));
	if (!IS_MANAGED_PAGE(pai)) {
		return KERN_FAILURE;
	}

	/* Write through the kernel physical map (physmap) alias. */
	va = (uint8_t *)PHYSMAP_PTOV(pa);
	va[0] = 0x0f; /* opcode for UD2 */
	va[1] = 0x0b;

	return KERN_SUCCESS;
}
3611 #endif /* DEVELOPMENT || DEBUG */
3612