1 /*
2 * Copyright (c) 2000-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58
59 /*
60 * File: pmap.c
61 * Author: Avadis Tevanian, Jr., Michael Wayne Young
62 * (These guys wrote the Vax version)
63 *
64 * Physical Map management code for Intel i386, i486, and i860.
65 *
66 * Manages physical address maps.
67 *
68 * In addition to hardware address maps, this
69 * module is called upon to provide software-use-only
70 * maps which may or may not be stored in the same
71 * form as hardware maps. These pseudo-maps are
72 * used to store intermediate results from copy
73 * operations to and from address spaces.
74 *
75 * Since the information managed by this module is
76 * also stored by the logical address mapping module,
77 * this module may throw away valid virtual-to-physical
78 * mappings at almost any time. However, invalidations
79 * of virtual-to-physical mappings must be done as
80 * requested.
81 *
82 * In order to cope with hardware architectures which
83 * make virtual-to-physical map invalidates expensive,
84 * this module may delay invalidate or reduced protection
85 * operations until such time as they are actually
86 * necessary. This module is given full information as
87 * to which processors are currently using which maps,
88 * and to when physical maps must be made correct.
89 */
90
91 #include <string.h>
92 #include <mach_ldebug.h>
93
94 #include <libkern/OSAtomic.h>
95
96 #include <mach/machine/vm_types.h>
97
98 #include <mach/boolean.h>
99 #include <kern/thread.h>
100 #include <kern/zalloc.h>
101 #include <kern/zalloc_internal.h>
102 #include <kern/queue.h>
103 #include <kern/ledger.h>
104 #include <kern/mach_param.h>
105
106 #include <kern/spl.h>
107
108 #include <vm/pmap.h>
109 #include <vm/pmap_cs.h>
110 #include <vm/vm_map.h>
111 #include <vm/vm_kern.h>
112 #include <mach/vm_param.h>
113 #include <mach/vm_prot.h>
114 #include <vm/vm_object.h>
115 #include <vm/vm_page.h>
116
117 #include <mach/machine/vm_param.h>
118 #include <machine/thread.h>
119
120 #include <kern/misc_protos.h> /* prototyping */
121 #include <i386/misc_protos.h>
122 #include <i386/i386_lowmem.h>
123 #include <x86_64/lowglobals.h>
124
125 #include <i386/cpuid.h>
126 #include <i386/cpu_data.h>
127 #include <i386/cpu_number.h>
128 #include <i386/machine_cpu.h>
129 #include <i386/seg.h>
130 #include <i386/serial_io.h>
131 #include <i386/cpu_capabilities.h>
132 #include <i386/machine_routines.h>
133 #include <i386/proc_reg.h>
134 #include <i386/tsc.h>
135 #include <i386/pmap_internal.h>
136 #include <i386/pmap_pcid.h>
137 #if CONFIG_VMX
138 #include <i386/vmx/vmx_cpu.h>
139 #endif
140
141 #include <vm/vm_protos.h>
142 #include <san/kasan.h>
143
144 #include <i386/mp.h>
145 #include <i386/mp_desc.h>
146 #include <libkern/kernel_mach_header.h>
147
148 #include <pexpert/i386/efi.h>
149 #include <libkern/section_keywords.h>
#if MACH_ASSERT
int pmap_stats_assert = 1; /* runtime toggle for pmap stats assertions; overridable via "pmap_stats_assert" boot-arg (see pmap_bootstrap) */
#endif /* MACH_ASSERT */

#ifdef IWANTTODEBUG
#undef DEBUG
#define DEBUG 1
#define POSTCODE_DELAY 1
#include <i386/postcode.h>
#endif /* IWANTTODEBUG */

/* DBG() compiles to kprintf only on PMAP_DEBUG builds; otherwise it is a no-op. */
#ifdef PMAP_DEBUG
#define DBG(x...) kprintf("DBG: " x)
#else
#define DBG(x...)
#endif
/* Compile time assert to ensure adjacency/alignment of per-CPU data fields used
 * in the trampolines for kernel/user boundary TLB coherency.
 */
char pmap_cpu_data_assert[(((offsetof(cpu_data_t, cpu_tlb_invalid) - offsetof(cpu_data_t, cpu_active_cr3)) == 8) && (offsetof(cpu_data_t, cpu_active_cr3) % 64 == 0)) ? 1 : -1];
boolean_t pmap_trace = FALSE; /* kernel traces for pmap operations; set via "-pmap_trace" boot-arg on PMAP_TRACES builds */

boolean_t no_shared_cr3 = DEBUG; /* TRUE for DEBUG by default */

/*
 * NX / exec-permission policy knobs.  On DEVELOPMENT/DEBUG kernels these are
 * mutable; release kernels compile them as const.
 */
#if DEVELOPMENT || DEBUG
int nx_enabled = 1; /* enable no-execute protection -- set during boot */
#else
const int nx_enabled = 1;
#endif

#if DEBUG || DEVELOPMENT
int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
int allow_stack_exec = 0; /* No apps may execute from the stack by default */
#else /* DEBUG || DEVELOPMENT */
const int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
const int allow_stack_exec = 0; /* No apps may execute from the stack by default */
#endif /* DEBUG || DEVELOPMENT */

uint64_t max_preemption_latency_tsc = 0; /* in TSC ticks; converted from MAX_PREEMPTION_LATENCY_NS in pmap_init() */

pv_hashed_entry_t *pv_hash_table; /* hash lists */

uint32_t npvhashmask = 0, npvhashbuckets = 0; /* PV hash sizing; bucket count is mask + 1, enforced power-of-two in pmap_bootstrap() */

/* Free lists of PV hash entries and the locks guarding them and the hash table. */
pv_hashed_entry_t pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
pv_hashed_entry_t pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
SIMPLE_LOCK_DECLARE(pv_hashed_free_list_lock, 0);
SIMPLE_LOCK_DECLARE(pv_hashed_kern_free_list_lock, 0);
SIMPLE_LOCK_DECLARE(pv_hash_table_lock, 0);
SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
200
SECURITY_READ_ONLY_LATE(zone_t) pv_hashed_list_zone; /* zone of pv_hashed_entry structures */

/*
 * First and last physical addresses that we maintain any information
 * for. Initialized to zero so that pmap operations done before
 * pmap_init won't touch any non-existent structures.
 */
boolean_t pmap_initialized = FALSE;/* Has pmap_init completed? */

/* Static backing stores for the kernel pmap's page-table VM objects
 * (wired up in pmap_init()). */
static struct vm_object kptobj_object_store VM_PAGE_PACKED_ALIGNED;
static struct vm_object kpml4obj_object_store VM_PAGE_PACKED_ALIGNED;
static struct vm_object kpdptobj_object_store VM_PAGE_PACKED_ALIGNED;

/*
 * Array of physical page attributes for managed pages.
 * One byte per physical page.
 */
char *pmap_phys_attributes;
ppnum_t last_managed_page = 0;

unsigned pmap_memory_region_count;
unsigned pmap_memory_region_current;

pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];

/*
 * Other useful macros.
 */
#define current_pmap() (vm_map_pmap(current_thread()->map))

/* The kernel pmap is statically allocated (see pmap_bootstrap()). */
struct pmap kernel_pmap_store;
const pmap_t kernel_pmap = &kernel_pmap_store;
SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */
SECURITY_READ_ONLY_LATE(zone_t) pmap_anchor_zone;
SECURITY_READ_ONLY_LATE(zone_t) pmap_uanchor_zone;
int pmap_debug = 0; /* flag for debugging prints */

/* Page-table page accounting. */
unsigned int inuse_ptepages_count = 0;
long long alloc_ptepages_count __attribute__((aligned(8))) = 0; /* aligned for atomic access */
unsigned int bootstrap_wired_pages = 0;

extern long NMIPI_acks;

SECURITY_READ_ONLY_LATE(boolean_t) kernel_text_ps_4K = TRUE;

extern char end;

static int nkpt; /* number of bootstrap kernel page-table pages (NKPT) */

/* NX-enforcement opt-outs; mutable only on DEVELOPMENT/DEBUG kernels
 * (flipped via boot-args in pmap_bootstrap()). */
#if DEVELOPMENT || DEBUG
SECURITY_READ_ONLY_LATE(boolean_t) pmap_disable_kheap_nx = FALSE;
SECURITY_READ_ONLY_LATE(boolean_t) pmap_disable_kstack_nx = FALSE;
SECURITY_READ_ONLY_LATE(boolean_t) wpkernel = TRUE;
#else
const boolean_t wpkernel = TRUE;
#endif

extern long __stack_chk_guard[];

/* EPT (VMX nested paging) configuration; set in pmap_init() under CONFIG_VMX. */
static uint64_t pmap_eptp_flags = 0;
boolean_t pmap_ept_support_ad = FALSE;

static void process_pmap_updates(pmap_t, bool, addr64_t, addr64_t);
264 /*
265 * Map memory at initialization. The physical addresses being
266 * mapped are not managed and are never unmapped.
267 *
268 * For now, VM is already on, we only need to map the
269 * specified memory.
270 */
271 vm_offset_t
pmap_map(vm_offset_t virt,vm_map_offset_t start_addr,vm_map_offset_t end_addr,vm_prot_t prot,unsigned int flags)272 pmap_map(
273 vm_offset_t virt,
274 vm_map_offset_t start_addr,
275 vm_map_offset_t end_addr,
276 vm_prot_t prot,
277 unsigned int flags)
278 {
279 kern_return_t kr;
280 int ps;
281
282 ps = PAGE_SIZE;
283 while (start_addr < end_addr) {
284 kr = pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
285 (ppnum_t) i386_btop(start_addr), prot, VM_PROT_NONE, flags, TRUE);
286
287 if (kr != KERN_SUCCESS) {
288 panic("%s: failed pmap_enter, "
289 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
290 __FUNCTION__,
291 (void *)virt, (void *)start_addr, (void *)end_addr, prot, flags);
292 }
293
294 virt += ps;
295 start_addr += ps;
296 }
297 return virt;
298 }
299
/* Boot-time memory-layout symbols established by early startup / the linker. */
extern char *first_avail; /* first usable address beyond boot data -- added to VM_MIN_KERNEL_ADDRESS in pmap_bootstrap() */
extern vm_offset_t virtual_avail, virtual_end; /* kernel VA range still available (reported by pmap_virtual_space()) */
extern pmap_paddr_t avail_start, avail_end; /* usable physical address range */
extern vm_offset_t sHIB; /* start of the HIB (hibernation) section */
extern vm_offset_t eHIB; /* end of the HIB section */
extern vm_offset_t stext; /* start of kernel text */
extern vm_offset_t etext; /* end of kernel text */
extern vm_offset_t sdata, edata; /* kernel data section bounds */
extern vm_offset_t sconst, econst; /* kernel const section bounds */

extern void *KPTphys; /* early boot 4K kernel page tables; partially reclaimed by pmap_free_early_PT() */

boolean_t pmap_smep_enabled = FALSE; /* Supervisor Mode Execute Protection active (see pmap_cpu_init()) */
boolean_t pmap_smap_enabled = FALSE; /* Supervisor Mode Access Protection active (see pmap_cpu_init()) */
314
/*
 * Per-CPU pmap initialization.  Invoked on each processor (and from
 * pmap_bootstrap() for the boot CPU) to program CR4 features and the
 * per-cpu CR3/TLB bookkeeping used by the kernel/user trampolines.
 */
void
pmap_cpu_init(void)
{
	cpu_data_t *cdp = current_cpu_datap();

	/* Enable global pages so kernel TLB entries survive CR3 reloads. */
	set_cr4(get_cr4() | CR4_PGE);

	/*
	 * Initialize the per-cpu, TLB-related fields.
	 */
	cdp->cpu_kernel_cr3 = kernel_pmap->pm_cr3;
	/* Mirror into the shadow per-cpu area used by the trampolines. */
	cpu_shadowp(cdp->cpu_number)->cpu_kernel_cr3 = cdp->cpu_kernel_cr3;
	cdp->cpu_active_cr3 = kernel_pmap->pm_cr3;
	cdp->cpu_tlb_invalid = 0;
	cdp->cpu_task_map = TASK_MAP_64BIT;

	pmap_pcid_configure();
	/* Enable SMEP (Supervisor Mode Execute Protection) if the CPU has it. */
	if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMEP) {
		pmap_smep_enabled = TRUE;
#if DEVELOPMENT || DEBUG
		boolean_t nsmep;
		/* "-pmap_smep_disable" boot-arg overrides, DEV/DEBUG kernels only. */
		if (PE_parse_boot_argn("-pmap_smep_disable", &nsmep, sizeof(nsmep))) {
			pmap_smep_enabled = FALSE;
		}
#endif
		if (pmap_smep_enabled) {
			set_cr4(get_cr4() | CR4_SMEP);
		}
	}
	/* Likewise enable SMAP (Supervisor Mode Access Protection) if present. */
	if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMAP) {
		pmap_smap_enabled = TRUE;
#if DEVELOPMENT || DEBUG
		boolean_t nsmap;
		/* "-pmap_smap_disable" boot-arg overrides, DEV/DEBUG kernels only. */
		if (PE_parse_boot_argn("-pmap_smap_disable", &nsmap, sizeof(nsmap))) {
			pmap_smap_enabled = FALSE;
		}
#endif
		if (pmap_smap_enabled) {
			set_cr4(get_cr4() | CR4_SMAP);
		}
	}

#if !MONOTONIC
	/* Re-enable fixed-function PMCs on this CPU if they were in use. */
	if (cdp->cpu_fixed_pmcs_enabled) {
		boolean_t enable = TRUE;
		cpu_pmc_control(&enable);
	}
#endif /* !MONOTONIC */
}
364
365 static void
pmap_ro_zone_validate_element_dst(zone_id_t zid,vm_offset_t va,vm_offset_t offset,vm_size_t new_data_size)366 pmap_ro_zone_validate_element_dst(
367 zone_id_t zid,
368 vm_offset_t va,
369 vm_offset_t offset,
370 vm_size_t new_data_size)
371 {
372 vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
373
374 /* Check element is from correct zone and properly aligned */
375 zone_require_ro(zid, elem_size, (void*)va);
376
377 if (__improbable(new_data_size > (elem_size - offset))) {
378 panic("%s: New data size %lu too large for elem size %lu at addr %p",
379 __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
380 }
381 if (__improbable(offset >= elem_size)) {
382 panic("%s: Offset %lu too large for elem size %lu at addr %p",
383 __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
384 }
385 }
386
387 static void
pmap_ro_zone_validate_element(zone_id_t zid,vm_offset_t va,vm_offset_t offset,const vm_offset_t new_data,vm_size_t new_data_size)388 pmap_ro_zone_validate_element(
389 zone_id_t zid,
390 vm_offset_t va,
391 vm_offset_t offset,
392 const vm_offset_t new_data,
393 vm_size_t new_data_size)
394 {
395 vm_offset_t sum = 0;
396
397 if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
398 panic("%s: Integer addition overflow %p + %lu = %lu",
399 __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
400 }
401
402 pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
403 }
404
405 void
pmap_ro_zone_memcpy(zone_id_t zid,vm_offset_t va,vm_offset_t offset,const vm_offset_t new_data,vm_size_t new_data_size)406 pmap_ro_zone_memcpy(
407 zone_id_t zid,
408 vm_offset_t va,
409 vm_offset_t offset,
410 const vm_offset_t new_data,
411 vm_size_t new_data_size)
412 {
413 const pmap_paddr_t pa = kvtophys(va + offset);
414
415 if (!new_data || new_data_size == 0) {
416 return;
417 }
418
419 pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
420 /* Write through Physical Aperture */
421 memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
422 }
423
424 uint64_t
pmap_ro_zone_atomic_op(zone_id_t zid,vm_offset_t va,vm_offset_t offset,zro_atomic_op_t op,uint64_t value)425 pmap_ro_zone_atomic_op(
426 zone_id_t zid,
427 vm_offset_t va,
428 vm_offset_t offset,
429 zro_atomic_op_t op,
430 uint64_t value)
431 {
432 const pmap_paddr_t pa = kvtophys(va + offset);
433 vm_size_t value_size = op & 0xf;
434
435 pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
436 /* Write through Physical Aperture */
437 return __zalloc_ro_mut_atomic(phystokv(pa), op, value);
438 }
439
440 void
pmap_ro_zone_bzero(zone_id_t zid,vm_offset_t va,vm_offset_t offset,vm_size_t size)441 pmap_ro_zone_bzero(
442 zone_id_t zid,
443 vm_offset_t va,
444 vm_offset_t offset,
445 vm_size_t size)
446 {
447 const pmap_paddr_t pa = kvtophys(va + offset);
448 pmap_ro_zone_validate_element(zid, va, offset, 0, size);
449 bzero((void*)phystokv(pa), size);
450 }
451
452 static uint32_t
pmap_scale_shift(void)453 pmap_scale_shift(void)
454 {
455 uint32_t scale = 0;
456
457 if (sane_size <= 8 * GB) {
458 scale = (uint32_t)(sane_size / (2 * GB));
459 } else if (sane_size <= 32 * GB) {
460 scale = 4 + (uint32_t)((sane_size - (8 * GB)) / (4 * GB));
461 } else {
462 scale = 10 + (uint32_t)MIN(4, ((sane_size - (32 * GB)) / (8 * GB)));
463 }
464 return scale;
465 }
466
/* Lock group and attributes shared by pmap locks (e.g. kernel_pmap->pmap_rwl). */
LCK_GRP_DECLARE(pmap_lck_grp, "pmap");
LCK_ATTR_DECLARE(pmap_lck_rw_attr, 0, LCK_ATTR_DEBUG);
469
470 /*
471 * Bootstrap the system enough to run with virtual memory.
472 * Map the kernel's code and data, and allocate the system page table.
473 * Called with mapping OFF. Page_size must already be set.
474 */
475
void
pmap_bootstrap(
	__unused vm_offset_t load_start,
	__unused boolean_t IA32e)
{
	assert(IA32e);

	vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Set the highest address
	                                       * known to VM */
	/*
	 * The kernel's pmap is statically allocated so we don't
	 * have to use pmap_create, which is unlikely to work
	 * correctly at this part of the boot sequence.
	 */

	os_ref_init(&kernel_pmap->ref_count, NULL);
#if DEVELOPMENT || DEBUG
	kernel_pmap->nx_enabled = TRUE;
#endif
	kernel_pmap->pm_task_map = TASK_MAP_64BIT;
	kernel_pmap->pm_obj = (vm_object_t) NULL;
	/* Both kernel and user CR3 views initially reference the Idle PML4
	 * built by early boot. */
	kernel_pmap->pm_pml4 = IdlePML4;
	kernel_pmap->pm_upml4 = IdlePML4;
	kernel_pmap->pm_cr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
	kernel_pmap->pm_ucr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
	kernel_pmap->pm_eptp = 0;

	pmap_pcid_initialize_kernel(kernel_pmap);

	current_cpu_datap()->cpu_kernel_cr3 = cpu_shadowp(cpu_number())->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;

	/* Account for the NKPT bootstrap page-table pages. */
	nkpt = NKPT;
	OSAddAtomic(NKPT, &inuse_ptepages_count);
	OSAddAtomic64(NKPT, &alloc_ptepages_count);
	bootstrap_wired_pages = NKPT;

	virtual_avail = (vm_offset_t)(VM_MIN_KERNEL_ADDRESS) + (vm_offset_t)first_avail;
	virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);

	/* Size the PV hash from physical memory unless overridden by the
	 * "npvhash" boot-arg. */
	if (!PE_parse_boot_argn("npvhash", &npvhashmask, sizeof(npvhashmask))) {
		npvhashmask = ((NPVHASHBUCKETS) << pmap_scale_shift()) - 1;
	}

	npvhashbuckets = npvhashmask + 1;

	/* The bucket count must be a power of two (i.e. mask == 2^N - 1). */
	if (0 != ((npvhashbuckets) & npvhashmask)) {
		panic("invalid hash %d, must be ((2^N)-1), "
		    "using default %d\n", npvhashmask, NPVHASHMASK);
	}

	lck_rw_init(&kernel_pmap->pmap_rwl, &pmap_lck_grp, &pmap_lck_rw_attr);
	kernel_pmap->pmap_rwl.lck_rw_can_sleep = FALSE;

	/* Program CR4 / SMEP / SMAP / PCID state on the boot CPU. */
	pmap_cpu_init();

	if (pmap_pcid_ncpus) {
		printf("PMAP: PCID enabled\n");
	}

	if (pmap_smep_enabled) {
		printf("PMAP: Supervisor Mode Execute Protection enabled\n");
	}
	if (pmap_smap_enabled) {
		printf("PMAP: Supervisor Mode Access Protection enabled\n");
	}

#if DEBUG
	printf("Stack canary: 0x%lx\n", __stack_chk_guard[0]);
	printf("early_random(): 0x%qx\n", early_random());
#endif
#if DEVELOPMENT || DEBUG
	boolean_t ptmp;
	/* Check if the user has requested disabling stack or heap no-execute
	 * enforcement. These are "const" variables; that qualifier is cast away
	 * when altering them. The TEXT/DATA const sections are marked
	 * write protected later in the kernel startup sequence, so altering
	 * them is possible at this point, in pmap_bootstrap().
	 */
	if (PE_parse_boot_argn("-pmap_disable_kheap_nx", &ptmp, sizeof(ptmp))) {
		boolean_t *pdknxp = (boolean_t *) &pmap_disable_kheap_nx;
		*pdknxp = TRUE;
	}

	if (PE_parse_boot_argn("-pmap_disable_kstack_nx", &ptmp, sizeof(ptmp))) {
		boolean_t *pdknhp = (boolean_t *) &pmap_disable_kstack_nx;
		*pdknhp = TRUE;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* 32-bit EFI firmware limits the usable kernel virtual range. */
	boot_args *args = (boot_args *)PE_state.bootArgs;
	if (args->efiMode == kBootArgsEfiMode32) {
		printf("EFI32: kernel virtual space limited to 4GB\n");
		virtual_end = VM_MAX_KERNEL_ADDRESS_EFI32;
	}
	kprintf("Kernel virtual space from 0x%lx to 0x%lx.\n",
	    (long)KERNEL_BASE, (long)virtual_end);
	kprintf("Available physical space from 0x%llx to 0x%llx\n",
	    avail_start, avail_end);

	/*
	 * The -no_shared_cr3 boot-arg is a debugging feature (set by default
	 * in the DEBUG kernel) to force the kernel to switch to its own map
	 * (and cr3) when control is in kernelspace. The kernel's map does not
	 * include (i.e. share) userspace so wild references will cause
	 * a panic. Only copyin and copyout are exempt from this.
	 */
	(void) PE_parse_boot_argn("-no_shared_cr3",
	    &no_shared_cr3, sizeof(no_shared_cr3));
	if (no_shared_cr3) {
		kprintf("Kernel not sharing user map\n");
	}

#ifdef PMAP_TRACES
	if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof(pmap_trace))) {
		kprintf("Kernel traces for pmap operations enabled\n");
	}
#endif /* PMAP_TRACES */

#if MACH_ASSERT
	PE_parse_boot_argn("pmap_asserts", &pmap_asserts_enabled, sizeof(pmap_asserts_enabled));
	PE_parse_boot_argn("pmap_stats_assert",
	    &pmap_stats_assert,
	    sizeof(pmap_stats_assert));
#endif /* MACH_ASSERT */
}
601
/*
 * Report the kernel virtual address range still available for allocation,
 * as established by pmap_bootstrap() (virtual_avail .. virtual_end).
 */
void
pmap_virtual_space(
	vm_offset_t *startp,
	vm_offset_t *endp)
{
	*startp = virtual_avail;
	*endp = virtual_end;
}
610
611
612
613
614 #if HIBERNATION
615
616 #include <IOKit/IOHibernatePrivate.h>
617 #include <machine/pal_hibernate.h>
618
int32_t pmap_npages; /* physical pages covered by pv_head_table; set from i386_btop(avail_end) in pmap_init() */
int32_t pmap_teardown_last_valid_compact_indx = -1; /* highest live slot after teardown compaction */

/* Pack/unpack a pv_head_table index into an entry's qlink pointer high bits. */
void pmap_pack_index(uint32_t);
int32_t pmap_unpack_index(pv_rooted_entry_t);
624
/*
 * Recover the original pv_head_table index that pmap_pack_index() stashed
 * in the top 16 bits of this entry's qlink.next (index high half) and
 * qlink.prev (index low half), then set those pointer bits back to
 * all-ones (presumably the canonical high bits of a kernel pointer --
 * NOTE(review): confirm against the queue-pointer representation).
 */
int32_t
pmap_unpack_index(pv_rooted_entry_t pv_h)
{
	int32_t indx = 0;

	/* High 16 bits of the index come from qlink.next... */
	indx = (int32_t)(*((uint64_t *)(&pv_h->qlink.next)) >> 48);
	indx = indx << 16;
	/* ...and the low 16 bits from qlink.prev. */
	indx |= (int32_t)(*((uint64_t *)(&pv_h->qlink.prev)) >> 48);

	/* Restore the pointer bits that held the packed index. */
	*((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)0xffff << 48);
	*((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)0xffff << 48);

	return indx;
}
639
640
/*
 * Stash the 32-bit pv_head_table index `indx` in the top 16 bits of the
 * entry's qlink.next (index high half) and qlink.prev (index low half),
 * clobbering those pointer bits.  Reversed by pmap_unpack_index().
 */
void
pmap_pack_index(uint32_t indx)
{
	pv_rooted_entry_t pv_h;

	pv_h = &pv_head_table[indx];

	/* Clear the top 16 bits of both queue pointers... */
	*((uint64_t *)(&pv_h->qlink.next)) &= ~((uint64_t)0xffff << 48);
	*((uint64_t *)(&pv_h->qlink.prev)) &= ~((uint64_t)0xffff << 48);

	/* ...and store the index halves there. */
	*((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)(indx >> 16)) << 48;
	*((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)(indx & 0xffff)) << 48;
}
654
655
/*
 * Compact pv_head_table in place before hibernation: each in-use entry is
 * tagged with its original index (pmap_pack_index()) and slid down into
 * the lowest free slot, leaving a dense prefix of live entries.  The
 * now-unneeded tail of the table is reported via
 * [*unneeded_start, *unneeded_end] so it can be skipped in the image.
 * Undone by pal_hib_rebuild_pmap_structs().
 */
void
pal_hib_teardown_pmap_structs(addr64_t *unneeded_start, addr64_t *unneeded_end)
{
	int32_t i;
	int32_t compact_target_indx; /* candidate destination (hole) for the next live entry */

	compact_target_indx = 0;

	for (i = 0; i < pmap_npages; i++) {
		if (pv_head_table[i].pmap == PMAP_NULL) {
			/* Entry i is free: adopt it as the hole if we don't have one. */
			if (pv_head_table[compact_target_indx].pmap != PMAP_NULL) {
				compact_target_indx = i;
			}
		} else {
			/* Live entry: remember its original index in its qlink bits. */
			pmap_pack_index((uint32_t)i);

			if (pv_head_table[compact_target_indx].pmap == PMAP_NULL) {
				/*
				 * we've got a hole to fill, so
				 * move this pv_rooted_entry_t to it's new home
				 */
				pv_head_table[compact_target_indx] = pv_head_table[i];
				pv_head_table[i].pmap = PMAP_NULL;

				pmap_teardown_last_valid_compact_indx = compact_target_indx;
				compact_target_indx++;
			} else {
				/* No hole yet: the entry stays in place. */
				pmap_teardown_last_valid_compact_indx = i;
			}
		}
	}
	/* Everything past the last live compacted slot is droppable. */
	*unneeded_start = (addr64_t)&pv_head_table[pmap_teardown_last_valid_compact_indx + 1];
	*unneeded_end = (addr64_t)&pv_head_table[pmap_npages - 1];

	HIBLOG("pal_hib_teardown_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
}
692
693
/*
 * Undo pal_hib_teardown_pmap_structs() after resume: walk the compacted
 * prefix backwards, unpack each entry's original index, move it back to
 * its true slot, and zero-fill the gaps between restored entries.
 */
void
pal_hib_rebuild_pmap_structs(void)
{
	int32_t cindx, eindx, rindx = 0;
	pv_rooted_entry_t pv_h;

	/* eindx tracks the slot just above the region already restored. */
	eindx = (int32_t)pmap_npages;

	for (cindx = pmap_teardown_last_valid_compact_indx; cindx >= 0; cindx--) {
		pv_h = &pv_head_table[cindx];

		/* Recover the entry's pre-compaction index (restores qlink bits). */
		rindx = pmap_unpack_index(pv_h);
		assert(rindx < pmap_npages);

		if (rindx != cindx) {
			/*
			 * this pv_rooted_entry_t was moved by pal_hib_teardown_pmap_structs,
			 * so move it back to its real location
			 */
			pv_head_table[rindx] = pv_head_table[cindx];
		}
		if (rindx + 1 != eindx) {
			/*
			 * the 'hole' between this vm_rooted_entry_t and the previous
			 * vm_rooted_entry_t we moved needs to be initialized as
			 * a range of zero'd vm_rooted_entry_t's
			 */
			bzero((char *)&pv_head_table[rindx + 1], (eindx - rindx - 1) * sizeof(struct pv_rooted_entry));
		}
		eindx = rindx;
	}
	/* Zero any remaining slots below the lowest restored entry. */
	if (rindx) {
		bzero((char *)&pv_head_table[0], rindx * sizeof(struct pv_rooted_entry));
	}

	HIBLOG("pal_hib_rebuild_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
}
731
732 #endif
733
734 /*
735 * Create pv entries for kernel pages mapped by early startup code.
736 * These have to exist so we can ml_static_mfree() them later.
737 */
/*
 * Walk [start_va, end_va) in the kernel pmap and initialize a root PV
 * entry for every managed physical page found, so those early-startup
 * mappings can later be removed via ml_static_mfree().  Advances by 2MB
 * when the VA is covered by a large page, 4K otherwise.
 */
static void
pmap_pv_fixup(vm_offset_t start_va, vm_offset_t end_va)
{
	ppnum_t ppn;
	pv_rooted_entry_t pv_h;
	uint32_t pgsz;

	start_va = round_page(start_va);
	end_va = trunc_page(end_va);
	while (start_va < end_va) {
		pgsz = PAGE_SIZE;
		ppn = pmap_find_phys(kernel_pmap, start_va);
		/* Only managed pages get PV entries; skip unmapped/unmanaged VAs. */
		if (ppn != 0 && IS_MANAGED_PAGE(ppn)) {
			pv_h = pai_to_pvh(ppn);
			assert(pv_h->qlink.next == 0); /* shouldn't be init'd yet */
			assert(pv_h->pmap == 0);
			pv_h->va_and_flags = start_va;
			pv_h->pmap = kernel_pmap;
			queue_init(&pv_h->qlink);
			/*
			 * Note that pmap_query_pagesize does not enforce start_va is aligned
			 * on a 2M boundary if it's within a large page
			 */
			if (pmap_query_pagesize(kernel_pmap, start_va) == I386_LPGBYTES) {
				pgsz = I386_LPGBYTES;
			}
		}
		/* Step to the next page, bailing out if the VA would wrap. */
		if (os_add_overflow(start_va, pgsz, &start_va)) {
#if DEVELOPMENT || DEBUG
			panic("pmap_pv_fixup: Unexpected address wrap (0x%lx after adding 0x%x)", start_va, pgsz);
#else
			start_va = end_va;
#endif
		}
	}
}
774
775 /*
776 * Initialize the pmap module.
777 * Called by vm_init, to initialize any structures that the pmap
778 * system needs to map virtual memory.
779 */
void
pmap_init(void)
{
	long npages;
	vm_offset_t addr;
	vm_size_t s, vsize;
	vm_map_offset_t vaddr;
	ppnum_t ppn;


	/* Back the kernel pmap's PML4/PDPT/PDE page-table pages with the
	 * statically-allocated VM objects. */
	kernel_pmap->pm_obj_pml4 = &kpml4obj_object_store;
	_vm_object_allocate((vm_object_size_t)NPML4PGS * PAGE_SIZE, &kpml4obj_object_store);

	kernel_pmap->pm_obj_pdpt = &kpdptobj_object_store;
	_vm_object_allocate((vm_object_size_t)NPDPTPGS * PAGE_SIZE, &kpdptobj_object_store);

	kernel_pmap->pm_obj = &kptobj_object_store;
	_vm_object_allocate((vm_object_size_t)NPDEPGS * PAGE_SIZE, &kptobj_object_store);

	/*
	 * Allocate memory for the pv_head_table and its lock bits,
	 * the modify bit array, and the pte_page table.
	 */

	/*
	 * zero bias all these arrays now instead of off avail_start
	 * so we cover all memory
	 */

	npages = i386_btop(avail_end);
#if HIBERNATION
	pmap_npages = (uint32_t)npages;
#endif
	/* One contiguous allocation carved into: pv_head_table, pv_hash_table,
	 * the two lock tables, and one attribute byte per physical page. */
	s = (vm_size_t) (sizeof(struct pv_rooted_entry) * npages
	    + (sizeof(struct pv_hashed_entry_t *) * (npvhashbuckets))
	    + pv_lock_table_size(npages)
	    + pv_hash_lock_table_size((npvhashbuckets))
	    + npages);
	s = round_page(s);

	kmem_alloc(kernel_map, &addr, s,
	    KMA_NOFAIL | KMA_ZERO | KMA_KOBJECT | KMA_PERMANENT,
	    VM_KERN_MEMORY_PMAP);

	vaddr = addr;
	vsize = s;

#if PV_DEBUG
	if (0 == npvhashmask) {
		panic("npvhashmask not initialized");
	}
#endif

	/*
	 * Allocate the structures first to preserve word-alignment.
	 */
	pv_head_table = (pv_rooted_entry_t) addr;
	addr = (vm_offset_t) (pv_head_table + npages);

	pv_hash_table = (pv_hashed_entry_t *)addr;
	addr = (vm_offset_t) (pv_hash_table + (npvhashbuckets));

	pv_lock_table = (char *) addr;
	addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));

	pv_hash_lock_table = (char *) addr;
	addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhashbuckets)));

	pmap_phys_attributes = (char *) addr;

	/* Mark every conventional-memory page as managed, and flag the
	 * pmap-critical page ranges as not-to-be-encrypted. */
	ppnum_t last_pn = i386_btop(avail_end);
	unsigned int i;
	pmap_memory_region_t *pmptr = pmap_memory_regions;
	for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
		if (pmptr->type != kEfiConventionalMemory) {
			continue;
		}
		ppnum_t pn;
		for (pn = pmptr->base; pn <= pmptr->end; pn++) {
			if (pn < last_pn) {
				pmap_phys_attributes[pn] |= PHYS_MANAGED;

				if (pn > last_managed_page) {
					last_managed_page = pn;
				}

				if ((pmap_high_used_bottom <= pn && pn <= pmap_high_used_top) ||
				    (pmap_middle_used_bottom <= pn && pn <= pmap_middle_used_top)) {
					pmap_phys_attributes[pn] |= PHYS_NOENCRYPT;
				}
			}
		}
	}
	/* The pages backing the tables we just carved up must not be
	 * encrypted either. */
	while (vsize) {
		ppn = pmap_find_phys(kernel_pmap, vaddr);

		pmap_phys_attributes[ppn] |= PHYS_NOENCRYPT;

		vaddr += PAGE_SIZE;
		vsize -= PAGE_SIZE;
	}
	/*
	 * Create the zone of physical maps,
	 * and of the physical-to-virtual entries.
	 */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);

	/* The anchor is required to be page aligned. Zone debugging adds
	 * padding which may violate that requirement. Tell the zone
	 * subsystem that alignment is required.
	 */
	pmap_anchor_zone = zone_create("pagetable anchors", PAGE_SIZE,
	    ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);

	/* TODO: possible general optimisation...pre-allocate via zones commonly created
	 * level3/2 pagetables
	 */
	/* The anchor is required to be page aligned. Zone debugging adds
	 * padding which may violate that requirement. Tell the zone
	 * subsystem that alignment is required.
	 */
	pmap_uanchor_zone = zone_create("pagetable user anchors", PAGE_SIZE,
	    ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);

	pv_hashed_list_zone = zone_create("pv_list", sizeof(struct pv_hashed_entry),
	    ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);

	/*
	 * Create pv entries for kernel pages that might get pmap_remove()ed.
	 *
	 * - very low pages that were identity mapped.
	 * - vm_pages[] entries that might be unused and reclaimed.
	 */
	assert((uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start <= (uintptr_t)vm_page_array_beginning_addr);
	pmap_pv_fixup((uintptr_t)VM_MIN_KERNEL_ADDRESS, (uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start);
	pmap_pv_fixup((uintptr_t)vm_page_array_beginning_addr, (uintptr_t)vm_page_array_ending_addr);

	pmap_initialized = TRUE;

	/* Convert the preemption-latency bound from nanoseconds to TSC ticks. */
	max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);

	/*
	 * Ensure the kernel's PML4 entry exists for the basement
	 * before this is shared with any user.
	 */
	pmap_expand_pml4(kernel_pmap, KERNEL_BASEMENT, PMAP_EXPAND_OPTIONS_NONE);

#if CONFIG_VMX
	/* Probe EPT accessed/dirty-bit support and precompute EPTP flags. */
	pmap_ept_support_ad = vmx_hv_support() && (VMX_CAP(MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_VMX_EPT_VPID_CAP_AD_SHIFT, 1) ? TRUE : FALSE);
	pmap_eptp_flags = HV_VMX_EPTP_MEMORY_TYPE_WB | HV_VMX_EPTP_WALK_LENGTH(4) | (pmap_ept_support_ad ? HV_VMX_EPTP_ENABLE_AD_FLAGS : 0);
#endif /* CONFIG_VMX */
}
933
/*
 * Update the NX and write-protect permissions on the 4K-aligned range
 * [sv, sv + nxrosz) of pmap npmap, in place.  NX==TRUE clears execute
 * permission; ro==TRUE clears write permission.  Both 2MB (PS-bit PDE)
 * and 4KB PTE mappings are handled.  Not valid for EPT pmaps.
 */
void
pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolean_t ro)
{
	uint64_t ev, cv = sv;
	pd_entry_t *pdep;
	pt_entry_t *ptep = NULL;

	if (os_add_overflow(sv, nxrosz, &ev)) {
		panic("pmap_mark_range: Unexpected address overflow: start=0x%llx size=0x%llx", sv, nxrosz);
	}

	/* XXX what if nxrosz is 0? we end up marking the page whose address is passed in via sv -- is that kosher? */
	assert(!is_ept_pmap(npmap));

	/* Both the start address and the size must be 4K aligned. */
	assert(((sv & 0xFFFULL) | (nxrosz & 0xFFFULL)) == 0);

	for (pdep = pmap_pde(npmap, cv); pdep != NULL && (cv < ev);) {
		/* Base of the 2MB region containing cv. */
		uint64_t pdev = (cv & ~((uint64_t)PDEMASK));

		if (*pdep & INTEL_PTE_PS) {
			/* 2MB superpage: rewrite permissions directly in the PDE. */
#ifdef REMAP_DEBUG
			if ((NX ^ !!(*pdep & INTEL_PTE_NX)) || (ro ^ !!!(*pdep & INTEL_PTE_WRITE))) {
				kprintf("WARNING: Remapping PDE for %p from %s%s%s to %s%s%s\n", (void *)cv,
				    (*pdep & INTEL_PTE_VALID) ? "R" : "",
				    (*pdep & INTEL_PTE_WRITE) ? "W" : "",
				    (*pdep & INTEL_PTE_NX) ? "" : "X",
				    "R",
				    ro ? "" : "W",
				    NX ? "" : "X");
			}
#endif

			if (NX) {
				*pdep |= INTEL_PTE_NX;
			} else {
				*pdep &= ~INTEL_PTE_NX;
			}
			if (ro) {
				*pdep &= ~INTEL_PTE_WRITE;
			} else {
				*pdep |= INTEL_PTE_WRITE;
			}

			/* Advance to the next 2MB boundary (or stop on VA wrap). */
			if (os_add_overflow(cv, NBPD, &cv)) {
				cv = ev;
			} else {
				cv &= ~((uint64_t) PDEMASK);
				pdep = pmap_pde(npmap, cv);
			}
			continue;
		}

		/* 4KB mappings: walk each PTE within this 2MB region. */
		for (ptep = pmap_pte(npmap, cv); ptep != NULL && (cv < (pdev + NBPD)) && (cv < ev);) {
#ifdef REMAP_DEBUG
			if ((NX ^ !!(*ptep & INTEL_PTE_NX)) || (ro ^ !!!(*ptep & INTEL_PTE_WRITE))) {
				kprintf("WARNING: Remapping PTE for %p from %s%s%s to %s%s%s\n", (void *)cv,
				    (*ptep & INTEL_PTE_VALID) ? "R" : "",
				    (*ptep & INTEL_PTE_WRITE) ? "W" : "",
				    (*ptep & INTEL_PTE_NX) ? "" : "X",
				    "R",
				    ro ? "" : "W",
				    NX ? "" : "X");
			}
#endif
			if (NX) {
				*ptep |= INTEL_PTE_NX;
			} else {
				*ptep &= ~INTEL_PTE_NX;
			}
			if (ro) {
				*ptep &= ~INTEL_PTE_WRITE;
			} else {
				*ptep |= INTEL_PTE_WRITE;
			}
			cv += NBPT;
			ptep = pmap_pte(npmap, cv);
		}
	}
	DPRINTF("%s(0x%llx, 0x%llx, %u, %u): 0x%llx, 0x%llx\n", __FUNCTION__, sv, nxrosz, NX, ro, cv, ptep ? *ptep: 0);
}
1014
1015 /*
1016 * Reclaim memory for early boot 4K page tables that were converted to large page mappings.
1017 * We know this memory is part of the KPTphys[] array that was allocated in Idle_PTs_init(),
1018 * so we can free it using its address in that array.
1019 */
1020 static void
pmap_free_early_PT(ppnum_t ppn,uint32_t cnt)1021 pmap_free_early_PT(ppnum_t ppn, uint32_t cnt)
1022 {
1023 ppnum_t KPTphys_ppn;
1024 vm_offset_t offset;
1025
1026 KPTphys_ppn = pmap_find_phys(kernel_pmap, (uintptr_t)KPTphys);
1027 assert(ppn >= KPTphys_ppn);
1028 assert(ppn + cnt <= KPTphys_ppn + NKPT);
1029 offset = (ppn - KPTphys_ppn) << PAGE_SHIFT;
1030 ml_static_mfree((uintptr_t)KPTphys + offset, PAGE_SIZE * cnt);
1031 }
1032
1033 /*
1034 * Called once VM is fully initialized so that we can release unused
1035 * sections of low memory to the general pool.
1036 * Also complete the set-up of identity-mapped sections of the kernel:
1037 * 1) write-protect kernel text
1038 * 2) map kernel text using large pages if possible
1039 * 3) read and write-protect page zero (for K32)
1040 * 4) map the global page at the appropriate virtual address.
1041 *
1042 * Use of large pages
1043 * ------------------
1044 * To effectively map and write-protect all kernel text pages, the text
1045 * must be 2M-aligned at the base, and the data section above must also be
1046 * 2M-aligned. That is, there's padding below and above. This is achieved
1047 * through linker directives. Large pages are used only if this alignment
 * exists (and not overridden by the -kernel_text_ps_4K boot-arg). The
1049 * memory layout is:
1050 *
1051 * : :
1052 * | __DATA |
1053 * sdata: ================== 2Meg
1054 * | |
1055 * | zero-padding |
1056 * | |
1057 * etext: ------------------
1058 * | |
1059 * : :
1060 * | |
1061 * | __TEXT |
1062 * | |
1063 * : :
1064 * | |
1065 * stext: ================== 2Meg
1066 * | |
1067 * | zero-padding |
1068 * | |
1069 * eHIB: ------------------
1070 * | __HIB |
1071 * : :
1072 *
1073 * Prior to changing the mapping from 4K to 2M, the zero-padding pages
1074 * [eHIB,stext] and [etext,sdata] are ml_static_mfree()'d. Then all the
1075 * 4K pages covering [stext,etext] are coalesced as 2M large pages.
1076 * The now unused level-1 PTE pages are also freed.
1077 */
1078 extern ppnum_t vm_kernel_base_page;
1079 static uint32_t dataptes = 0;
1080
void
pmap_lowmem_finalize(void)
{
	spl_t spl;
	int i;

	/*
	 * Update wired memory statistics for early boot pages
	 */
	PMAP_ZINFO_PALLOC(kernel_pmap, bootstrap_wired_pages * PAGE_SIZE);

	/*
	 * Free pages in pmap regions below the base:
	 * rdar://6332712
	 * We can't free all the pages to VM that EFI reports available.
	 * Pages in the range 0xc0000-0xff000 aren't safe over sleep/wake.
	 * There's also a size miscalculation here: pend is one page less
	 * than it should be but this is not fixed to be backwards
	 * compatible.
	 * This is important for KASLR because up to 256*2MB = 512MB of space
	 * has to be released to VM.
	 */
	for (i = 0;
	    pmap_memory_regions[i].end < vm_kernel_base_page;
	    i++) {
		vm_offset_t pbase = i386_ptob(pmap_memory_regions[i].base);
		vm_offset_t pend = i386_ptob(pmap_memory_regions[i].end + 1);

		DBG("pmap region %d [%p..[%p\n",
		    i, (void *) pbase, (void *) pend);

		if (pmap_memory_regions[i].attribute & EFI_MEMORY_KERN_RESERVED) {
			continue;
		}
		/*
		 * rdar://6332712
		 * Adjust limits not to free pages in range 0xc0000-0xff000.
		 */
		if (pbase >= 0xc0000 && pend <= 0x100000) {
			continue;
		}
		if (pbase < 0xc0000 && pend > 0x100000) {
			/* page range entirely within region, free lower part */
			DBG("- ml_static_mfree(%p,%p)\n",
			    (void *) ml_static_ptovirt(pbase),
			    (void *) (0xc0000 - pbase));
			ml_static_mfree(ml_static_ptovirt(pbase), 0xc0000 - pbase);
			pbase = 0x100000;
		}
		if (pbase < 0xc0000) {
			pend = MIN(pend, 0xc0000);
		}
		if (pend > 0x100000) {
			pbase = MAX(pbase, 0x100000);
		}
		DBG("- ml_static_mfree(%p,%p)\n",
		    (void *) ml_static_ptovirt(pbase),
		    (void *) (pend - pbase));
		ml_static_mfree(ml_static_ptovirt(pbase), pend - pbase);
	}

	/* A final pass to get rid of all initial identity mappings to
	 * low pages.
	 */
	DPRINTF("%s: Removing mappings from 0->0x%lx\n", __FUNCTION__, vm_kernel_base);

	/*
	 * Remove all mappings past the boot-cpu descriptor aliases and low globals.
	 * Non-boot-cpu GDT aliases will be remapped later as needed.
	 */
	pmap_remove(kernel_pmap, LOWGLOBAL_ALIAS + PAGE_SIZE, vm_kernel_base);

	/*
	 * Release any memory for early boot 4K page table pages that got replaced
	 * with large page mappings for vm_pages[]. We know this memory is part of
	 * the KPTphys[] array that was allocated in Idle_PTs_init(), so we can free
	 * it using that address.
	 */
	pmap_free_early_PT(released_PT_ppn, released_PT_cnt);

	/*
	 * If text and data are both 2MB-aligned,
	 * we can map text with large-pages,
	 * unless the -kernel_text_ps_4K boot-arg overrides.
	 */
	if ((stext & I386_LPGMASK) == 0 && (sdata & I386_LPGMASK) == 0) {
		kprintf("Kernel text is 2MB aligned");
		kernel_text_ps_4K = FALSE;
		if (PE_parse_boot_argn("-kernel_text_ps_4K",
		    &kernel_text_ps_4K,
		    sizeof(kernel_text_ps_4K))) {
			kprintf(" but will be mapped with 4K pages\n");
		} else {
			kprintf(" and will be mapped with 2M pages\n");
		}
	}
#if DEVELOPMENT || DEBUG
	/* "wpkernel" boot-arg can disable text write-protection on dev kernels */
	(void) PE_parse_boot_argn("wpkernel", &wpkernel, sizeof(wpkernel));
#endif
	if (wpkernel) {
		kprintf("Kernel text %p-%p to be write-protected\n",
		    (void *) stext, (void *) etext);
	}

	spl = splhigh();

	/*
	 * Scan over text if mappings are to be changed:
	 * - Remap kernel text readonly unless the "wpkernel" boot-arg is 0
	 * - Change to large-pages if possible and not overridden.
	 */
	if (kernel_text_ps_4K && wpkernel) {
		vm_offset_t myva;
		for (myva = stext; myva < etext; myva += PAGE_SIZE) {
			pt_entry_t *ptep;

			ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
			if (ptep) {
				/* clear the write bit on each text PTE */
				pmap_store_pte(FALSE, ptep, *ptep & ~INTEL_PTE_WRITE);
			}
		}
	}

	if (!kernel_text_ps_4K) {
		vm_offset_t myva;

		/*
		 * Release zero-filled page padding used for 2M-alignment.
		 */
		DBG("ml_static_mfree(%p,%p) for padding below text\n",
		    (void *) eHIB, (void *) (stext - eHIB));
		ml_static_mfree(eHIB, stext - eHIB);
		DBG("ml_static_mfree(%p,%p) for padding above text\n",
		    (void *) etext, (void *) (sdata - etext));
		ml_static_mfree(etext, sdata - etext);

		/*
		 * Coalesce text pages into large pages.
		 */
		for (myva = stext; myva < sdata; myva += I386_LPGBYTES) {
			pt_entry_t *ptep;
			vm_offset_t pte_phys;
			pt_entry_t *pdep;
			pt_entry_t pde;
			ppnum_t KPT_ppn;

			pdep = pmap_pde(kernel_pmap, (vm_map_offset_t)myva);
			/* remember the 4K page table this PDE points at, to free below */
			KPT_ppn = (ppnum_t)((*pdep & PG_FRAME) >> PAGE_SHIFT);
			ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
			DBG("myva: %p pdep: %p ptep: %p\n",
			    (void *) myva, (void *) pdep, (void *) ptep);
			if ((*ptep & INTEL_PTE_VALID) == 0) {
				continue;
			}
			pte_phys = (vm_offset_t)(*ptep & PG_FRAME);
			pde = *pdep & PTMASK; /* page attributes from pde */
			pde |= INTEL_PTE_PS; /* make it a 2M entry */
			pde |= pte_phys; /* take page frame from pte */

			if (wpkernel) {
				pde &= ~INTEL_PTE_WRITE;
			}
			DBG("pmap_store_pte(%p,0x%llx)\n",
			    (void *)pdep, pde);
			pmap_store_pte(FALSE, pdep, pde);

			/*
			 * Free the now-unused level-1 pte.
			 */
			pmap_free_early_PT(KPT_ppn, 1);
		}

		/* Change variable read by sysctl machdep.pmap */
		pmap_kernel_text_ps = I386_LPGBYTES;
	}

	vm_offset_t dva;

	/* Mark every page of the kernel __DATA range non-executable */
	for (dva = sdata; dva < edata; dva += I386_PGBYTES) {
		assert(((sdata | edata) & PAGE_MASK) == 0);
		pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva);

		dpte = *dptep;
		assert((dpte & INTEL_PTE_VALID));
		dpte |= INTEL_PTE_NX;
		pmap_store_pte(FALSE, dptep, dpte);
		dataptes++;
	}
	assert(dataptes > 0);

	kernel_segment_command_t * seg;
	kernel_section_t * sec;
	kc_format_t kc_format;

	PE_get_primary_kc_format(&kc_format);

	/* Apply per-segment protections to the remaining Mach-O segments */
	for (seg = firstseg(); seg != NULL; seg = nextsegfromheader(&_mh_execute_header, seg)) {
		if (!strcmp(seg->segname, "__TEXT") ||
		    !strcmp(seg->segname, "__DATA")) {
			continue;
		}

		/* XXX: FIXME_IN_dyld: This is a workaround (see below) */
		if (kc_format != KCFormatFileset) {
			//XXX
			if (!strcmp(seg->segname, "__KLD")) {
				continue;
			}
		}

		if (!strcmp(seg->segname, "__HIB")) {
			for (sec = firstsect(seg); sec != NULL; sec = nextsect(seg, sec)) {
				if (sec->addr & PAGE_MASK) {
					panic("__HIB segment's sections misaligned");
				}
				if (!strcmp(sec->sectname, "__text")) {
					/* __HIB __text: keep executable, make read-only */
					pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), FALSE, TRUE);
				} else {
					/* other __HIB sections: writable but non-executable */
					pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), TRUE, FALSE);
				}
			}
		} else {
			if (kc_format == KCFormatFileset) {
#if 0
				/*
				 * This block of code is commented out because it may or may not have induced an earlier panic
				 * in ledger init.
				 */


				boolean_t NXbit = !(seg->initprot & VM_PROT_EXECUTE),
				    robit = (seg->initprot & (VM_PROT_READ | VM_PROT_WRITE)) == VM_PROT_READ;

				/*
				 * XXX: FIXME_IN_dyld: This is a workaround for primary KC containing incorrect inaccurate
				 * initprot for segments containing code.
				 */
				if (!strcmp(seg->segname, "__KLD") || !strcmp(seg->segname, "__VECTORS")) {
					NXbit = FALSE;
					robit = FALSE;
				}

				pmap_mark_range(kernel_pmap, seg->vmaddr & ~(uint64_t)PAGE_MASK,
				    round_page_64(seg->vmsize), NXbit, robit);
#endif

				/*
				 * XXX: We are marking *every* segment with rwx permissions as a workaround
				 * XXX: until the primary KC's kernel segments are page-aligned.
				 */
				kprintf("Marking (%p, %p) as rwx\n", (void *)(seg->vmaddr & ~(uint64_t)PAGE_MASK),
				    (void *)((seg->vmaddr & ~(uint64_t)PAGE_MASK) + round_page_64(seg->vmsize)));
				pmap_mark_range(kernel_pmap, seg->vmaddr & ~(uint64_t)PAGE_MASK,
				    round_page_64(seg->vmsize), FALSE, FALSE);
			} else {
				pmap_mark_range(kernel_pmap, seg->vmaddr, round_page_64(seg->vmsize), TRUE, FALSE);
			}
		}
	}

	/*
	 * If we're debugging, map the low global vector page at the fixed
	 * virtual address. Otherwise, remove the mapping for this.
	 */
	if (debug_boot_arg) {
		pt_entry_t *pte = NULL;
		if (0 == (pte = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS))) {
			panic("lowmem pte");
		}

		/* make sure it is defined on page boundary */
		assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK));
		pmap_store_pte(FALSE, pte, kvtophys((vm_offset_t)&lowGlo)
		    | INTEL_PTE_REF
		    | INTEL_PTE_MOD
		    | INTEL_PTE_WIRED
		    | INTEL_PTE_VALID
		    | INTEL_PTE_WRITE
		    | INTEL_PTE_NX);

#if KASAN
		kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
	} else {
		pmap_remove(kernel_pmap,
		    LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE);
	}
	/* Flush the TLB so the protection changes above take effect everywhere */
	pmap_tlbi_range(0, ~0ULL, true, 0);
	splx(spl);
}
1371
1372 /*
1373 * Mark the const data segment as read-only, non-executable.
1374 */
1375 void
x86_64_protect_data_const()1376 x86_64_protect_data_const()
1377 {
1378 boolean_t doconstro = TRUE;
1379 #if DEVELOPMENT || DEBUG
1380 (void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro));
1381 #endif
1382 if (doconstro) {
1383 if (sconst & PAGE_MASK) {
1384 panic("CONST segment misaligned 0x%lx 0x%lx",
1385 sconst, econst);
1386 }
1387 kprintf("Marking const DATA read-only\n");
1388 pmap_protect(kernel_pmap, sconst, econst, VM_PROT_READ);
1389 }
1390 }
1391 /*
1392 * this function is only used for debugging fron the vm layer
1393 */
1394 bool
pmap_verify_free(ppnum_t pn)1395 pmap_verify_free(
1396 ppnum_t pn)
1397 {
1398 pv_rooted_entry_t pv_h;
1399 int pai;
1400 bool result;
1401
1402 assert(pn != vm_page_fictitious_addr);
1403
1404 if (!pmap_initialized) {
1405 return true;
1406 }
1407
1408 if (pn == vm_page_guard_addr) {
1409 return true;
1410 }
1411
1412 pai = ppn_to_pai(pn);
1413 if (!IS_MANAGED_PAGE(pai)) {
1414 return false;
1415 }
1416 pv_h = pai_to_pvh(pn);
1417 result = (pv_h->pmap == PMAP_NULL);
1418 return result;
1419 }
1420
#if MACH_ASSERT
/*
 * Debug-only helper: panic with as much diagnostic detail as possible
 * if the given physical page still has a pv mapping (i.e. if
 * pmap_verify_free() fails).  Returns silently when the page is free.
 */
void
pmap_assert_free(ppnum_t pn)
{
	int pai;
	pv_rooted_entry_t pv_h = NULL;
	pmap_t pmap = NULL;
	vm_offset_t va = 0;
	static char buffer[32];
	static char *pr_name = "not managed pn";
	uint_t attr;
	pt_entry_t *ptep;
	pt_entry_t pte = -1ull;

	if (pmap_verify_free(pn)) {
		return;
	}

	/* Pages beyond the managed range have no attributes to report */
	if (pn > last_managed_page) {
		attr = 0xff;
		goto done;
	}

	pai = ppn_to_pai(pn);
	attr = pmap_phys_attributes[pai];
	pv_h = pai_to_pvh(pai);
	va = pv_h->va_and_flags;
	pmap = pv_h->pmap;
	/* Pick the most descriptive name available for the owning pmap */
	if (pmap == kernel_pmap) {
		pr_name = "kernel";
	} else if (pmap == NULL) {
		pr_name = "pmap NULL";
	} else if (pmap->pmap_procname[0] != 0) {
		pr_name = &pmap->pmap_procname[0];
	} else {
		snprintf(buffer, sizeof(buffer), "pmap %p", pv_h->pmap);
		pr_name = buffer;
	}

	if (pmap != NULL) {
		ptep = pmap_pte(pmap, va);
		if (ptep != NULL) {
			pte = (uintptr_t)*ptep;
		}
	}

done:
	panic("page not FREE page: 0x%lx attr: 0x%x %s va: 0x%lx PTE: 0x%llx",
	    (ulong_t)pn, attr, pr_name, va, pte);
}
#endif /* MACH_ASSERT */
1472
1473 boolean_t
pmap_is_empty(pmap_t pmap,vm_map_offset_t va_start,vm_map_offset_t va_end)1474 pmap_is_empty(
1475 pmap_t pmap,
1476 vm_map_offset_t va_start,
1477 vm_map_offset_t va_end)
1478 {
1479 vm_map_offset_t offset;
1480 ppnum_t phys_page;
1481 ledger_amount_t phys_mem;
1482
1483 if (pmap == PMAP_NULL) {
1484 return TRUE;
1485 }
1486
1487 /*
1488 * Check the ledger's phys_mem value
1489 * - if it's zero, the pmap is completely empty.
1490 * This short-circuit test prevents a virtual address scan which is
1491 * painfully slow for 64-bit spaces.
1492 * This assumes the count is correct
1493 * .. the debug kernel ought to be checking perhaps by page table walk.
1494 */
1495 if (pmap != kernel_pmap) {
1496 ledger_get_balance(pmap->ledger, task_ledgers.phys_mem, &phys_mem);
1497 if (phys_mem == 0) {
1498 return TRUE;
1499 }
1500 }
1501
1502 for (offset = va_start;
1503 offset < va_end;
1504 offset += PAGE_SIZE_64) {
1505 phys_page = pmap_find_phys(pmap, offset);
1506 if (phys_page) {
1507 kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1508 "page %d at 0x%llx\n",
1509 pmap, va_start, va_end, phys_page, offset);
1510 return FALSE;
1511 }
1512 }
1513
1514 return TRUE;
1515 }
1516
1517 void
hv_ept_pmap_create(void ** ept_pmap,void ** eptp)1518 hv_ept_pmap_create(void **ept_pmap, void **eptp)
1519 {
1520 pmap_t p;
1521
1522 if ((ept_pmap == NULL) || (eptp == NULL)) {
1523 return;
1524 }
1525
1526 p = pmap_create_options(get_task_ledger(current_task()), 0, (PMAP_CREATE_64BIT | PMAP_CREATE_EPT));
1527 if (p == PMAP_NULL) {
1528 *ept_pmap = NULL;
1529 *eptp = NULL;
1530 return;
1531 }
1532
1533 assert(is_ept_pmap(p));
1534
1535 *ept_pmap = (void*)p;
1536 *eptp = (void*)(p->pm_eptp);
1537 return;
1538 }
1539
1540 /*
1541 * pmap_create() is used by some special, legacy 3rd party kexts.
1542 * In our kernel code, always use pmap_create_options().
1543 */
1544 extern pmap_t pmap_create(ledger_t ledger, vm_map_size_t sz, boolean_t is_64bit);
1545
1546 __attribute__((used))
1547 pmap_t
pmap_create(ledger_t ledger,vm_map_size_t sz,boolean_t is_64bit)1548 pmap_create(
1549 ledger_t ledger,
1550 vm_map_size_t sz,
1551 boolean_t is_64bit)
1552 {
1553 return pmap_create_options(ledger, sz, is_64bit ? PMAP_CREATE_64BIT : 0);
1554 }
1555
1556 /*
1557 * Create and return a physical map.
1558 *
1559 * If the size specified for the map
1560 * is zero, the map is an actual physical
1561 * map, and may be referenced by the
1562 * hardware.
1563 *
1564 * If the size specified is non-zero,
1565 * the map will be used in software only, and
1566 * is bounded by that size.
1567 */
1568
1569 pmap_t
pmap_create_options(ledger_t ledger,vm_map_size_t sz,unsigned int flags)1570 pmap_create_options(
1571 ledger_t ledger,
1572 vm_map_size_t sz,
1573 unsigned int flags)
1574 {
1575 pmap_t p;
1576 vm_size_t size;
1577 pml4_entry_t *pml4;
1578 pml4_entry_t *kpml4;
1579 int i;
1580
1581 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, sz, flags);
1582
1583 size = (vm_size_t) sz;
1584
1585 /*
1586 * A software use-only map doesn't even need a map.
1587 */
1588
1589 if (size != 0) {
1590 return PMAP_NULL;
1591 }
1592
1593 /*
1594 * Return error when unrecognized flags are passed.
1595 */
1596 if (__improbable((flags & ~(PMAP_CREATE_KNOWN_FLAGS)) != 0)) {
1597 return PMAP_NULL;
1598 }
1599
1600 p = zalloc_flags(pmap_zone, Z_WAITOK | Z_ZERO);
1601 if (PMAP_NULL == p) {
1602 panic("pmap_create zalloc");
1603 }
1604
1605 lck_rw_init(&p->pmap_rwl, &pmap_lck_grp, &pmap_lck_rw_attr);
1606 p->pmap_rwl.lck_rw_can_sleep = FALSE;
1607
1608 os_ref_init(&p->ref_count, NULL);
1609 #if DEVELOPMENT || DEBUG
1610 p->nx_enabled = 1;
1611 #endif
1612 p->pm_shared = FALSE;
1613 ledger_reference(ledger);
1614 p->ledger = ledger;
1615
1616 p->pm_task_map = ((flags & PMAP_CREATE_64BIT) ? TASK_MAP_64BIT : TASK_MAP_32BIT);
1617
1618 p->pagezero_accessible = FALSE;
1619 p->pm_vm_map_cs_enforced = FALSE;
1620
1621 if (pmap_pcid_ncpus) {
1622 pmap_pcid_initialize(p);
1623 }
1624
1625 p->pm_pml4 = zalloc(pmap_anchor_zone);
1626 p->pm_upml4 = zalloc(pmap_uanchor_zone); //cleanup for EPT
1627
1628 pmap_assert((((uintptr_t)p->pm_pml4) & PAGE_MASK) == 0);
1629 pmap_assert((((uintptr_t)p->pm_upml4) & PAGE_MASK) == 0);
1630
1631 memset((char *)p->pm_pml4, 0, PAGE_SIZE);
1632 memset((char *)p->pm_upml4, 0, PAGE_SIZE);
1633
1634 if (flags & PMAP_CREATE_EPT) {
1635 p->pm_eptp = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4) | pmap_eptp_flags;
1636 p->pm_cr3 = 0;
1637 } else {
1638 p->pm_eptp = 0;
1639 p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4);
1640 p->pm_ucr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_upml4);
1641 }
1642
1643 /* allocate the vm_objs to hold the pdpt, pde and pte pages */
1644
1645 p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS) *PAGE_SIZE);
1646 if (NULL == p->pm_obj_pml4) {
1647 panic("pmap_create pdpt obj");
1648 }
1649
1650 p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS) *PAGE_SIZE);
1651 if (NULL == p->pm_obj_pdpt) {
1652 panic("pmap_create pdpt obj");
1653 }
1654
1655 p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS) *PAGE_SIZE);
1656 if (NULL == p->pm_obj) {
1657 panic("pmap_create pte obj");
1658 }
1659
1660 if (!(flags & PMAP_CREATE_EPT)) {
1661 /* All host pmaps share the kernel's pml4 */
1662 pml4 = pmap64_pml4(p, 0ULL);
1663 kpml4 = kernel_pmap->pm_pml4;
1664 for (i = KERNEL_PML4_INDEX; i < (KERNEL_PML4_INDEX + KERNEL_PML4_COUNT); i++) {
1665 pml4[i] = kpml4[i];
1666 }
1667 pml4[KERNEL_KEXTS_INDEX] = kpml4[KERNEL_KEXTS_INDEX];
1668 for (i = KERNEL_PHYSMAP_PML4_INDEX; i < (KERNEL_PHYSMAP_PML4_INDEX + KERNEL_PHYSMAP_PML4_COUNT); i++) {
1669 pml4[i] = kpml4[i];
1670 }
1671 pml4[KERNEL_DBLMAP_PML4_INDEX] = kpml4[KERNEL_DBLMAP_PML4_INDEX];
1672 #if KASAN
1673 for (i = KERNEL_KASAN_PML4_FIRST; i <= KERNEL_KASAN_PML4_LAST; i++) {
1674 pml4[i] = kpml4[i];
1675 }
1676 #endif
1677 pml4_entry_t *pml4u = pmap64_user_pml4(p, 0ULL);
1678 pml4u[KERNEL_DBLMAP_PML4_INDEX] = kpml4[KERNEL_DBLMAP_PML4_INDEX];
1679 }
1680
1681 #if MACH_ASSERT
1682 p->pmap_stats_assert = TRUE;
1683 p->pmap_pid = 0;
1684 strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
1685 #endif /* MACH_ASSERT */
1686
1687 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END,
1688 VM_KERNEL_ADDRHIDE(p));
1689
1690 return p;
1691 }
1692
1693 /*
1694 * We maintain stats and ledgers so that a task's physical footprint is:
1695 * phys_footprint = ((internal - alternate_accounting)
1696 * + (internal_compressed - alternate_accounting_compressed)
1697 * + iokit_mapped
1698 * + purgeable_nonvolatile
1699 * + purgeable_nonvolatile_compressed
1700 * + page_table)
1701 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
1702 */
1703
#if MACH_ASSERT
static void pmap_check_ledgers(pmap_t pmap);
#else /* MACH_ASSERT */
/* No-op stand-in so callers need not guard pmap_check_ledgers() calls */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
#endif /* MACH_ASSERT */
1712
1713 /*
1714 * Retire the given physical map from service.
1715 * Should only be called if the map contains
1716 * no valid mappings.
1717 */
1718 extern int vm_wired_objects_page_count;
1719
1720 void
pmap_destroy(pmap_t p)1721 pmap_destroy(pmap_t p)
1722 {
1723 os_ref_count_t c;
1724
1725 if (p == PMAP_NULL) {
1726 return;
1727 }
1728
1729 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
1730 VM_KERNEL_ADDRHIDe(p));
1731
1732 PMAP_LOCK_EXCLUSIVE(p);
1733
1734 c = os_ref_release_locked(&p->ref_count);
1735
1736 pmap_assert((current_thread() && (current_thread()->map)) ? (current_thread()->map->pmap != p) : TRUE);
1737
1738 if (c == 0) {
1739 /*
1740 * If some cpu is not using the physical pmap pointer that it
1741 * is supposed to be (see set_dirbase), we might be using the
1742 * pmap that is being destroyed! Make sure we are
1743 * physically on the right pmap:
1744 */
1745 PMAP_UPDATE_TLBS(p, 0x0ULL, 0xFFFFFFFFFFFFF000ULL);
1746 if (pmap_pcid_ncpus) {
1747 pmap_destroy_pcid_sync(p);
1748 }
1749 }
1750
1751 PMAP_UNLOCK_EXCLUSIVE(p);
1752
1753 if (c != 0) {
1754 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
1755 pmap_assert(p == kernel_pmap);
1756 return; /* still in use */
1757 }
1758
1759 /*
1760 * Free the memory maps, then the
1761 * pmap structure.
1762 */
1763 int inuse_ptepages = 0;
1764
1765 zfree(pmap_anchor_zone, p->pm_pml4);
1766 zfree(pmap_uanchor_zone, p->pm_upml4);
1767
1768 inuse_ptepages += p->pm_obj_pml4->resident_page_count;
1769 vm_object_deallocate(p->pm_obj_pml4);
1770
1771 inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
1772 vm_object_deallocate(p->pm_obj_pdpt);
1773
1774 inuse_ptepages += p->pm_obj->resident_page_count;
1775 vm_object_deallocate(p->pm_obj);
1776
1777 OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count);
1778 PMAP_ZINFO_PFREE(p, inuse_ptepages * PAGE_SIZE);
1779
1780 pmap_check_ledgers(p);
1781 ledger_dereference(p->ledger);
1782 lck_rw_destroy(&p->pmap_rwl, &pmap_lck_grp);
1783 zfree(pmap_zone, p);
1784
1785 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
1786 }
1787
1788 /*
1789 * Add a reference to the specified pmap.
1790 */
1791
1792 void
pmap_reference(pmap_t p)1793 pmap_reference(pmap_t p)
1794 {
1795 if (p != PMAP_NULL) {
1796 PMAP_LOCK_EXCLUSIVE(p);
1797 os_ref_retain_locked(&p->ref_count);
1798 PMAP_UNLOCK_EXCLUSIVE(p);
1799 }
1800 }
1801
1802 /*
1803 * Remove phys addr if mapped in specified map
1804 *
1805 */
1806 void
pmap_remove_some_phys(__unused pmap_t map,__unused ppnum_t pn)1807 pmap_remove_some_phys(
1808 __unused pmap_t map,
1809 __unused ppnum_t pn)
1810 {
1811 /* Implement to support working set code */
1812 }
1813
1814
/*
 * Restrict access on [sva, eva) of "map" to "prot" with default options.
 * Thin convenience wrapper; see pmap_protect_options() for the full
 * semantics (never increases permissions).
 */
void
pmap_protect(
	pmap_t map,
	vm_map_offset_t sva,
	vm_map_offset_t eva,
	vm_prot_t prot)
{
	pmap_protect_options(map, sva, eva, prot, 0, NULL);
}
1824
1825
1826 /*
1827 * Set the physical protection on the
1828 * specified range of this map as requested.
1829 *
1830 * VERY IMPORTANT: Will *NOT* increase permissions.
1831 * pmap_protect_options() should protect the range against any access types
1832 * that are not in "prot" but it should never grant extra access.
1833 * For example, if "prot" is READ|EXECUTE, that means "remove write
1834 * access" but it does *not* mean "add read and execute" access.
1835 * VM relies on getting soft-faults to enforce extra checks (code
1836 * signing, for example), for example.
1837 * New access permissions are granted via pmap_enter() only.
1838 * ***NOTE***:
1839 * The only exception is for EPT pmaps, where we MUST populate all exec
1840 * bits when the protection API is invoked (so that the HV fault handler
1841 * can make decisions based on the exit qualification information, which
1842 * includes the execute bits in the EPT entries. Soft-faulting them
1843 * in would cause a chicken-and-egg problem where the HV fault handler
1844 * would not be able to identify mode-based execute control (MBE) faults.)
1845 */
1846 void
pmap_protect_options(pmap_t map,vm_map_offset_t sva,vm_map_offset_t eva,vm_prot_t prot,unsigned int options,void * arg)1847 pmap_protect_options(
1848 pmap_t map,
1849 vm_map_offset_t sva,
1850 vm_map_offset_t eva,
1851 vm_prot_t prot,
1852 unsigned int options,
1853 void *arg)
1854 {
1855 pt_entry_t *pde;
1856 pt_entry_t *spte, *epte;
1857 vm_map_offset_t lva;
1858 vm_map_offset_t orig_sva;
1859 boolean_t set_NX;
1860 int num_found = 0;
1861 boolean_t is_ept;
1862 uint64_t cur_vaddr;
1863
1864 pmap_intr_assert();
1865
1866 if (map == PMAP_NULL) {
1867 return;
1868 }
1869
1870 if (prot == VM_PROT_NONE) {
1871 pmap_remove_options(map, sva, eva, options);
1872 return;
1873 }
1874
1875 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
1876 VM_KERNEL_ADDRHIDE(map), VM_KERNEL_ADDRHIDE(sva),
1877 VM_KERNEL_ADDRHIDE(eva));
1878
1879 is_ept = is_ept_pmap(map);
1880
1881 if ((prot & VM_PROT_EXECUTE) || __improbable(is_ept && (prot & VM_PROT_UEXEC))) {
1882 set_NX = FALSE;
1883 } else {
1884 set_NX = TRUE;
1885 }
1886
1887 #if DEVELOPMENT || DEBUG
1888 if (__improbable(set_NX && (!nx_enabled || !map->nx_enabled))) {
1889 set_NX = FALSE;
1890 }
1891 #endif
1892 PMAP_LOCK_EXCLUSIVE(map);
1893
1894 orig_sva = sva;
1895 cur_vaddr = sva;
1896 while (sva < eva) {
1897 uint64_t vaddr_incr;
1898
1899 if (os_add_overflow(sva, PDE_MAPPED_SIZE, &lva)) {
1900 lva = eva;
1901 } else {
1902 lva &= ~(PDE_MAPPED_SIZE - 1);
1903
1904 if (lva > eva) {
1905 lva = eva;
1906 }
1907 }
1908
1909 pde = pmap_pde(map, sva);
1910 if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
1911 if (*pde & PTE_PS) {
1912 /* superpage */
1913 spte = pde;
1914 epte = spte + 1; /* excluded */
1915 vaddr_incr = I386_LPGBYTES;
1916 } else {
1917 spte = pmap_pte(map, (sva & ~(PDE_MAPPED_SIZE - 1)));
1918 spte = &spte[ptenum(sva)];
1919 epte = &spte[intel_btop(lva - sva)];
1920 vaddr_incr = I386_PGBYTES;
1921 }
1922
1923 for (; spte < epte; spte++) {
1924 uint64_t clear_bits, set_bits;
1925
1926 if (!(*spte & PTE_VALID_MASK(is_ept))) {
1927 continue;
1928 }
1929
1930 clear_bits = 0;
1931 set_bits = 0;
1932
1933 if (is_ept) {
1934 if (!(prot & VM_PROT_READ)) {
1935 clear_bits |= PTE_READ(is_ept);
1936 }
1937 }
1938 if (!(prot & VM_PROT_WRITE)) {
1939 clear_bits |= PTE_WRITE(is_ept);
1940 }
1941 #if DEVELOPMENT || DEBUG
1942 else if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) &&
1943 map == kernel_pmap) {
1944 set_bits |= PTE_WRITE(is_ept);
1945 }
1946 #endif /* DEVELOPMENT || DEBUG */
1947
1948 if (set_NX) {
1949 if (!is_ept) {
1950 set_bits |= INTEL_PTE_NX;
1951 } else {
1952 clear_bits |= INTEL_EPT_EX | INTEL_EPT_UEX;
1953 }
1954 } else if (is_ept) {
1955 /* This is the exception to the "Don't add permissions" statement, above */
1956 set_bits |= ((prot & VM_PROT_EXECUTE) ? INTEL_EPT_EX : 0) |
1957 ((prot & VM_PROT_UEXEC) ? INTEL_EPT_UEX : 0);
1958 }
1959
1960 pmap_update_pte(is_ept, spte, clear_bits, set_bits, false);
1961
1962 DTRACE_VM3(set_pte, pmap_t, map, void *, cur_vaddr, uint64_t, *spte);
1963 cur_vaddr += vaddr_incr;
1964
1965 num_found++;
1966 }
1967 }
1968 sva = lva;
1969 }
1970 if (num_found) {
1971 if (options & PMAP_OPTIONS_NOFLUSH) {
1972 PMAP_UPDATE_TLBS_DELAYED(map, orig_sva, eva, (pmap_flush_context *)arg);
1973 } else {
1974 PMAP_UPDATE_TLBS(map, orig_sva, eva);
1975 }
1976 }
1977
1978 PMAP_UNLOCK_EXCLUSIVE(map);
1979
1980 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
1981 }
1982
1983 /* Map a (possibly) autogenned block */
1984 kern_return_t
pmap_map_block_addr(pmap_t pmap,addr64_t va,pmap_paddr_t pa,uint32_t size,vm_prot_t prot,int attr,unsigned int flags)1985 pmap_map_block_addr(
1986 pmap_t pmap,
1987 addr64_t va,
1988 pmap_paddr_t pa,
1989 uint32_t size,
1990 vm_prot_t prot,
1991 int attr,
1992 unsigned int flags)
1993 {
1994 return pmap_map_block(pmap, va, intel_btop(pa), size, prot, attr, flags);
1995 }
1996
1997 kern_return_t
pmap_map_block(pmap_t pmap,addr64_t va,ppnum_t pa,uint32_t size,vm_prot_t prot,int attr,__unused unsigned int flags)1998 pmap_map_block(
1999 pmap_t pmap,
2000 addr64_t va,
2001 ppnum_t pa,
2002 uint32_t size,
2003 vm_prot_t prot,
2004 int attr,
2005 __unused unsigned int flags)
2006 {
2007 kern_return_t kr;
2008 addr64_t original_va = va;
2009 uint32_t page;
2010 int cur_page_size;
2011
2012 if (attr & VM_MEM_SUPERPAGE) {
2013 cur_page_size = SUPERPAGE_SIZE;
2014 } else {
2015 cur_page_size = PAGE_SIZE;
2016 }
2017
2018 for (page = 0; page < size; page += cur_page_size / PAGE_SIZE) {
2019 kr = pmap_enter(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE);
2020
2021 if (kr != KERN_SUCCESS) {
2022 /*
2023 * This will panic for now, as it is unclear that
2024 * removing the mappings is correct.
2025 */
2026 panic("%s: failed pmap_enter, "
2027 "pmap=%p, va=%#llx, pa=%u, size=%u, prot=%#x, flags=%#x",
2028 __FUNCTION__,
2029 pmap, va, pa, size, prot, flags);
2030
2031 pmap_remove(pmap, original_va, va - original_va);
2032 return kr;
2033 }
2034
2035 va += cur_page_size;
2036 pa += cur_page_size / PAGE_SIZE;
2037 }
2038
2039 return KERN_SUCCESS;
2040 }
2041
/*
 * Routine: pmap_expand_pml4
 *
 * Allocate, zero, wire and install a new PDPT page table page covering
 * "vaddr", expanding the pmap at the PML4 level.  The page is entered
 * into the pmap's pm_obj_pml4 VM object so it can be found (and later
 * reclaimed), and the new entry is written into both the primary and
 * the user PML4.
 *
 * With PMAP_EXPAND_OPTIONS_NOWAIT, returns KERN_RESOURCE_SHORTAGE
 * rather than blocking for a free page.  Returns KERN_SUCCESS either
 * when this call performed the expansion or when another thread raced
 * us and expanded the level first.
 */
kern_return_t
pmap_expand_pml4(
	pmap_t map,
	vm_map_offset_t vaddr,
	unsigned int options)
{
	vm_page_t m;
	pmap_paddr_t pa;
	uint64_t i;
	ppnum_t pn;
	pml4_entry_t *pml4p;
	boolean_t is_ept = is_ept_pmap(map);

	DBG("pmap_expand_pml4(%p,%p)\n", map, (void *)vaddr);

	/* With the exception of the kext "basement", the kernel's level 4
	 * pagetables must not be dynamically expanded.
	 */
	assert(map != kernel_pmap || (vaddr == KERNEL_BASEMENT));
	/*
	 * Allocate a VM page for the pml4 page
	 */
	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
		if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
			return KERN_RESOURCE_SHORTAGE;
		}
		VM_PAGE_WAIT();
	}
	/*
	 * put the page into the pmap's obj list so it
	 * can be found later.
	 */
	pn = VM_PAGE_GET_PHYS_PAGE(m);
	pa = i386_ptob(pn);
	i = pml4idx(map, vaddr);

	/*
	 * Zero the page.
	 */
	pmap_zero_page(pn);

	vm_page_lockspin_queues();
	vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
	vm_page_unlock_queues();

	OSAddAtomic(1, &inuse_ptepages_count);
	OSAddAtomic64(1, &alloc_ptepages_count);
	PMAP_ZINFO_PALLOC(map, PAGE_SIZE);

	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
	vm_object_lock(map->pm_obj_pml4);

	PMAP_LOCK_EXCLUSIVE(map);
	/*
	 * See if someone else expanded us first
	 */
	if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
		/* Lost the race: undo the allocation and accounting. */
		PMAP_UNLOCK_EXCLUSIVE(map);
		vm_object_unlock(map->pm_obj_pml4);

		VM_PAGE_FREE(m);

		OSAddAtomic(-1, &inuse_ptepages_count);
		PMAP_ZINFO_PFREE(map, PAGE_SIZE);
		return KERN_SUCCESS;
	}

#if 0 /* DEBUG */
	if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE)) {
		panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx",
		    map, map->pm_obj_pml4, vaddr, i);
	}
#endif
	vm_page_insert_wired(m, map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
	vm_object_unlock(map->pm_obj_pml4);

	/*
	 * Set the page directory entry for this page table.
	 */
	pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */

	/*
	 * Note that INTEL_EPT_UEX is unconditionally set (as is INTEL_EPT_EX) for
	 * all intermediate paging levels, from PML4Es to PDEs. Processors with
	 * VT-x implementations that do not support MBE ignore the INTEL_EPT_UEX
	 * bit at all levels of the EPT, so there is no risk of inducing EPT
	 * violation faults.
	 */
	pmap_store_pte(is_ept, pml4p, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));
	pml4_entry_t *upml4p;

	/* Mirror the entry into the user-visible PML4 as well. */
	upml4p = pmap64_user_pml4(map, vaddr);
	pmap_store_pte(is_ept, upml4p, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));

	PMAP_UNLOCK_EXCLUSIVE(map);

	return KERN_SUCCESS;
}
2146
/*
 * Routine: pmap_expand_pdpt
 *
 * Allocate, zero, wire and install a new page-directory page covering
 * "vaddr", expanding the pmap at the PDPT level.  If the PDPT itself
 * does not exist yet, first expands the PML4 level via
 * pmap_expand_pml4().  Mirrors pmap_expand_pml4()'s locking and
 * race-resolution protocol.
 *
 * Honors PMAP_EXPAND_OPTIONS_NOWAIT (may return KERN_RESOURCE_SHORTAGE).
 */
kern_return_t
pmap_expand_pdpt(pmap_t map, vm_map_offset_t vaddr, unsigned int options)
{
	vm_page_t m;
	pmap_paddr_t pa;
	uint64_t i;
	ppnum_t pn;
	pdpt_entry_t *pdptp;
	boolean_t is_ept = is_ept_pmap(map);

	DBG("pmap_expand_pdpt(%p,%p)\n", map, (void *)vaddr);

	/* Ensure the level above (PML4 entry -> PDPT) is present first. */
	while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
		kern_return_t pep4kr = pmap_expand_pml4(map, vaddr, options);
		if (pep4kr != KERN_SUCCESS) {
			return pep4kr;
		}
	}

	/*
	 * Allocate a VM page for the pdpt page
	 */
	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
		if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
			return KERN_RESOURCE_SHORTAGE;
		}
		VM_PAGE_WAIT();
	}

	/*
	 * put the page into the pmap's obj list so it
	 * can be found later.
	 */
	pn = VM_PAGE_GET_PHYS_PAGE(m);
	pa = i386_ptob(pn);
	i = pdptidx(map, vaddr);

	/*
	 * Zero the page.
	 */
	pmap_zero_page(pn);

	vm_page_lockspin_queues();
	vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
	vm_page_unlock_queues();

	OSAddAtomic(1, &inuse_ptepages_count);
	OSAddAtomic64(1, &alloc_ptepages_count);
	PMAP_ZINFO_PALLOC(map, PAGE_SIZE);

	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
	vm_object_lock(map->pm_obj_pdpt);

	PMAP_LOCK_EXCLUSIVE(map);
	/*
	 * See if someone else expanded us first
	 */
	if (pmap_pde(map, vaddr) != PD_ENTRY_NULL) {
		/* Lost the race: undo the allocation and accounting. */
		PMAP_UNLOCK_EXCLUSIVE(map);
		vm_object_unlock(map->pm_obj_pdpt);

		VM_PAGE_FREE(m);

		OSAddAtomic(-1, &inuse_ptepages_count);
		PMAP_ZINFO_PFREE(map, PAGE_SIZE);
		return KERN_SUCCESS;
	}

#if 0 /* DEBUG */
	if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE)) {
		panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx",
		    map, map->pm_obj_pdpt, vaddr, i);
	}
#endif
	vm_page_insert_wired(m, map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
	vm_object_unlock(map->pm_obj_pdpt);

	/*
	 * Set the page directory entry for this page table.
	 */
	pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */

	pmap_store_pte(is_ept, pdptp, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));

	PMAP_UNLOCK_EXCLUSIVE(map);

	return KERN_SUCCESS;
}
2238
2239
2240
2241 /*
2242 * Routine: pmap_expand
2243 *
2244 * Expands a pmap to be able to map the specified virtual address.
2245 *
2246 * Allocates new virtual memory for the P0 or P1 portion of the
2247 * pmap, then re-maps the physical pages that were in the old
2248 * pmap to be in the new pmap.
2249 *
2250 * Must be called with the pmap system and the pmap unlocked,
2251 * since these must be unlocked to use vm_allocate or vm_deallocate.
2252 * Thus it must be called in a loop that checks whether the map
2253 * has been expanded enough.
2254 * (We won't loop forever, since page tables aren't shrunk.)
2255 */
kern_return_t
pmap_expand(
	pmap_t map,
	vm_map_offset_t vaddr,
	unsigned int options)
{
	pt_entry_t *pdp;
	vm_page_t m;
	pmap_paddr_t pa;
	uint64_t i;
	ppnum_t pn;
	boolean_t is_ept = is_ept_pmap(map);


	/*
	 * For the kernel, the virtual address must be in or above the basement
	 * which is for kexts and is in the 512GB immediately below the kernel.
	 * XXX - should use VM_MIN_KERNEL_AND_KEXT_ADDRESS not KERNEL_BASEMENT
	 */
	if (__improbable(map == kernel_pmap &&
	    !(vaddr >= KERNEL_BASEMENT && vaddr <= VM_MAX_KERNEL_ADDRESS))) {
		if ((options & PMAP_EXPAND_OPTIONS_ALIASMAP) == 0) {
			panic("pmap_expand: bad vaddr 0x%llx for kernel pmap", vaddr);
		}
	}

	/* Ensure the level above (PDPT entry -> page directory) exists first. */
	while ((pdp = pmap_pde(map, vaddr)) == PD_ENTRY_NULL) {
		assert((options & PMAP_EXPAND_OPTIONS_ALIASMAP) == 0);
		kern_return_t pepkr = pmap_expand_pdpt(map, vaddr, options);
		if (pepkr != KERN_SUCCESS) {
			return pepkr;
		}
	}

	/*
	 * Allocate a VM page for the pde entries.
	 */
	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
		if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
			return KERN_RESOURCE_SHORTAGE;
		}
		VM_PAGE_WAIT();
	}

	/*
	 * put the page into the pmap's obj list so it
	 * can be found later.
	 */
	pn = VM_PAGE_GET_PHYS_PAGE(m);
	pa = i386_ptob(pn);
	i = pdeidx(map, vaddr);

	/*
	 * Zero the page.
	 */
	pmap_zero_page(pn);

	vm_page_lockspin_queues();
	vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
	vm_page_unlock_queues();

	OSAddAtomic(1, &inuse_ptepages_count);
	OSAddAtomic64(1, &alloc_ptepages_count);
	PMAP_ZINFO_PALLOC(map, PAGE_SIZE);

	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
	vm_object_lock(map->pm_obj);

	PMAP_LOCK_EXCLUSIVE(map);

	/*
	 * See if someone else expanded us first
	 */
	if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
		/* Lost the race: undo the allocation and accounting. */
		PMAP_UNLOCK_EXCLUSIVE(map);
		vm_object_unlock(map->pm_obj);

		VM_PAGE_FREE(m);

		OSAddAtomic(-1, &inuse_ptepages_count); /* TODO: replace all with inlines */
		PMAP_ZINFO_PFREE(map, PAGE_SIZE);
		return KERN_SUCCESS;
	}

#if 0 /* DEBUG */
	if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE)) {
		panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx",
		    map, map->pm_obj, vaddr, i);
	}
#endif
	vm_page_insert_wired(m, map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
	vm_object_unlock(map->pm_obj);

	/*
	 * Set the page directory entry for this page table.
	 */
	pdp = pmap_pde(map, vaddr);	/* refetch under lock */

	pmap_store_pte(is_ept, pdp, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));

	PMAP_UNLOCK_EXCLUSIVE(map);

	return KERN_SUCCESS;
}
2363 /*
2364 * Query a pmap to see what size a given virtual address is mapped with.
2365 * If the vaddr is not mapped, returns 0.
2366 */
2367 vm_size_t
pmap_query_pagesize(pmap_t pmap,vm_map_offset_t vaddr)2368 pmap_query_pagesize(
2369 pmap_t pmap,
2370 vm_map_offset_t vaddr)
2371 {
2372 pd_entry_t *pdep;
2373 vm_size_t size = 0;
2374
2375 assert(!is_ept_pmap(pmap));
2376 PMAP_LOCK_EXCLUSIVE(pmap);
2377
2378 pdep = pmap_pde(pmap, vaddr);
2379 if (pdep != PD_ENTRY_NULL) {
2380 if (*pdep & INTEL_PTE_PS) {
2381 size = I386_LPGBYTES;
2382 } else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) {
2383 size = I386_PGBYTES;
2384 }
2385 }
2386
2387 PMAP_UNLOCK_EXCLUSIVE(pmap);
2388
2389 return size;
2390 }
2391
2392 /*
2393 * Ensure the page table hierarchy is filled in down to
2394 * the large page level. Additionally returns FAILURE if
2395 * a lower page table already exists.
2396 */
/*
 * Ensure the page table hierarchy is filled in down to
 * the large page level. Additionally returns FAILURE if
 * a lower page table already exists.
 *
 * Pages come from pmap_next_page_hi() (the early-boot pool), not
 * vm_page_grab(); panics if that pool is exhausted.
 * Caller must hold the pmap lock exclusive.
 */
static kern_return_t
pmap_pre_expand_large_internal(
	pmap_t pmap,
	vm_map_offset_t vaddr)
{
	ppnum_t pn;
	pt_entry_t *pte;
	boolean_t is_ept = is_ept_pmap(pmap);
	kern_return_t kr = KERN_SUCCESS;

	/* Populate the PDPT level (PML4 entry) if missing. */
	if (pmap64_pdpt(pmap, vaddr) == PDPT_ENTRY_NULL) {
		if (!pmap_next_page_hi(&pn, FALSE)) {
			panic("pmap_pre_expand_large no PDPT");
		}

		pmap_zero_page(pn);

		pte = pmap64_pml4(pmap, vaddr);

		pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
		    PTE_READ(is_ept) |
		    (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
		    PTE_WRITE(is_ept));

		/* Mirror into the user-visible PML4. */
		pte = pmap64_user_pml4(pmap, vaddr);

		pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
		    PTE_READ(is_ept) |
		    (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
		    PTE_WRITE(is_ept));
	}

	/* Populate the page-directory level (PDPT entry) if missing. */
	if (pmap_pde(pmap, vaddr) == PD_ENTRY_NULL) {
		if (!pmap_next_page_hi(&pn, FALSE)) {
			panic("pmap_pre_expand_large no PDE");
		}

		pmap_zero_page(pn);

		pte = pmap64_pdpt(pmap, vaddr);

		pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
		    PTE_READ(is_ept) |
		    (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
		    PTE_WRITE(is_ept));
	} else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) {
		/* A 4K page table already hangs off this PDE. */
		kr = KERN_FAILURE;
	}

	return kr;
}
2448
2449 /*
2450 * Wrapper that locks the pmap.
2451 */
2452 kern_return_t
pmap_pre_expand_large(pmap_t pmap,vm_map_offset_t vaddr)2453 pmap_pre_expand_large(
2454 pmap_t pmap,
2455 vm_map_offset_t vaddr)
2456 {
2457 kern_return_t kr;
2458
2459 PMAP_LOCK_EXCLUSIVE(pmap);
2460 kr = pmap_pre_expand_large_internal(pmap, vaddr);
2461 PMAP_UNLOCK_EXCLUSIVE(pmap);
2462 return kr;
2463 }
2464
2465 /*
2466 * On large memory machines, pmap_steal_memory() will allocate past
2467 * the 1GB of pre-allocated/mapped virtual kernel area. This function
2468 * expands kernel the page tables to cover a given vaddr. It uses pages
2469 * from the same pool that pmap_steal_memory() uses, since vm_page_grab()
2470 * isn't available yet.
2471 */
/*
 * Fill in the kernel page table hierarchy down to the 4K level for
 * "vaddr", using pages from the early-boot pool (see the block comment
 * above).  Silently returns if a 4K page table already covers the
 * address; panics if the early page pool is exhausted.
 */
void
pmap_pre_expand(
	pmap_t pmap,
	vm_map_offset_t vaddr)
{
	ppnum_t pn;
	pt_entry_t *pte;
	boolean_t is_ept = is_ept_pmap(pmap);

	/*
	 * This returns failure if a 4K page table already exists.
	 * Otherwise it fills in the page table hierarchy down
	 * to that level.
	 */
	PMAP_LOCK_EXCLUSIVE(pmap);
	if (pmap_pre_expand_large_internal(pmap, vaddr) == KERN_FAILURE) {
		PMAP_UNLOCK_EXCLUSIVE(pmap);
		return;
	}

	/* Add the lowest table */
	if (!pmap_next_page_hi(&pn, FALSE)) {
		panic("pmap_pre_expand");
	}

	pmap_zero_page(pn);

	pte = pmap_pde(pmap, vaddr);

	pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
	    PTE_READ(is_ept) |
	    (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
	    PTE_WRITE(is_ept));
	PMAP_UNLOCK_EXCLUSIVE(pmap);
}
2507
2508 /*
2509 * pmap_sync_page_data_phys(ppnum_t pa)
2510 *
2511 * Invalidates all of the instruction cache on a physical page and
2512 * pushes any dirty data from the data cache for the same physical page
2513 * Not required in i386.
2514 */
2515 void
pmap_sync_page_data_phys(__unused ppnum_t pa)2516 pmap_sync_page_data_phys(__unused ppnum_t pa)
2517 {
2518 return;
2519 }
2520
2521 /*
2522 * pmap_sync_page_attributes_phys(ppnum_t pa)
2523 *
2524 * Write back and invalidate all cachelines on a physical page.
2525 */
2526 void
pmap_sync_page_attributes_phys(ppnum_t pa)2527 pmap_sync_page_attributes_phys(ppnum_t pa)
2528 {
2529 cache_flush_page_phys(pa);
2530 }
2531
2532 void
pmap_copy_page(ppnum_t src,ppnum_t dst)2533 pmap_copy_page(ppnum_t src, ppnum_t dst)
2534 {
2535 bcopy_phys((addr64_t)i386_ptob(src),
2536 (addr64_t)i386_ptob(dst),
2537 PAGE_SIZE);
2538 }
2539
2540
2541 /*
2542 * Routine: pmap_pageable
2543 * Function:
2544 * Make the specified pages (by pmap, offset)
2545 * pageable (or not) as requested.
2546 *
2547 * A page which is not pageable may not take
2548 * a fault; therefore, its page table entry
2549 * must remain valid for the duration.
2550 *
2551 * This routine is merely advisory; pmap_enter
2552 * will specify that these pages are to be wired
2553 * down (or not) as appropriate.
2554 */
2555 void
pmap_pageable(__unused pmap_t pmap,__unused vm_map_offset_t start_addr,__unused vm_map_offset_t end_addr,__unused boolean_t pageable)2556 pmap_pageable(
2557 __unused pmap_t pmap,
2558 __unused vm_map_offset_t start_addr,
2559 __unused vm_map_offset_t end_addr,
2560 __unused boolean_t pageable)
2561 {
2562 #ifdef lint
2563 pmap++; start_addr++; end_addr++; pageable++;
2564 #endif /* lint */
2565 }
2566
2567 void
invalidate_icache(__unused vm_offset_t addr,__unused unsigned cnt,__unused int phys)2568 invalidate_icache(__unused vm_offset_t addr,
2569 __unused unsigned cnt,
2570 __unused int phys)
2571 {
2572 return;
2573 }
2574
2575 void
flush_dcache(__unused vm_offset_t addr,__unused unsigned count,__unused int phys)2576 flush_dcache(__unused vm_offset_t addr,
2577 __unused unsigned count,
2578 __unused int phys)
2579 {
2580 return;
2581 }
2582
2583 #if CONFIG_DTRACE
2584 /*
2585 * Constrain DTrace copyin/copyout actions
2586 */
2587 extern kern_return_t dtrace_copyio_preflight(addr64_t);
2588 extern kern_return_t dtrace_copyio_postflight(addr64_t);
2589
2590 kern_return_t
dtrace_copyio_preflight(__unused addr64_t va)2591 dtrace_copyio_preflight(__unused addr64_t va)
2592 {
2593 thread_t thread = current_thread();
2594 uint64_t ccr3;
2595 if (current_map() == kernel_map) {
2596 return KERN_FAILURE;
2597 } else if (((ccr3 = get_cr3_base()) != thread->map->pmap->pm_cr3) && (no_shared_cr3 == FALSE)) {
2598 return KERN_FAILURE;
2599 } else if (no_shared_cr3 && (ccr3 != kernel_pmap->pm_cr3)) {
2600 return KERN_FAILURE;
2601 } else {
2602 return KERN_SUCCESS;
2603 }
2604 }
2605
2606 kern_return_t
dtrace_copyio_postflight(__unused addr64_t va)2607 dtrace_copyio_postflight(__unused addr64_t va)
2608 {
2609 return KERN_SUCCESS;
2610 }
2611 #endif /* CONFIG_DTRACE */
2612
2613 #include <mach_vm_debug.h>
2614 #if MACH_VM_DEBUG
2615 #include <vm/vm_debug.h>
2616
2617 int
pmap_list_resident_pages(__unused pmap_t pmap,__unused vm_offset_t * listp,__unused int space)2618 pmap_list_resident_pages(
2619 __unused pmap_t pmap,
2620 __unused vm_offset_t *listp,
2621 __unused int space)
2622 {
2623 return 0;
2624 }
2625 #endif /* MACH_VM_DEBUG */
2626
2627
2628 #if CONFIG_COREDUMP
2629 /* temporary workaround */
2630 boolean_t
coredumpok(vm_map_t map,mach_vm_offset_t va)2631 coredumpok(vm_map_t map, mach_vm_offset_t va)
2632 {
2633 #if 0
2634 pt_entry_t *ptep;
2635
2636 ptep = pmap_pte(map->pmap, va);
2637 if (0 == ptep) {
2638 return FALSE;
2639 }
2640 return (*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED);
2641 #else
2642 if (vm_map_entry_has_device_pager(map, va)) {
2643 return FALSE;
2644 }
2645 return TRUE;
2646 #endif
2647 }
2648 #endif
2649
2650 boolean_t
phys_page_exists(ppnum_t pn)2651 phys_page_exists(ppnum_t pn)
2652 {
2653 assert(pn != vm_page_fictitious_addr);
2654
2655 if (!pmap_initialized) {
2656 return TRUE;
2657 }
2658
2659 if (pn == vm_page_guard_addr) {
2660 return FALSE;
2661 }
2662
2663 if (!IS_MANAGED_PAGE(ppn_to_pai(pn))) {
2664 return FALSE;
2665 }
2666
2667 return TRUE;
2668 }
2669
2670
2671
/*
 * Switch the current CPU to the given pmap's address space by loading
 * its directory base (CR3/PCID state) via set_dirbase().
 * Must be called with interrupts disabled.
 */
void
pmap_switch(pmap_t tpmap)
{
	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(tpmap));
	assert(ml_get_interrupts_enabled() == FALSE);
	set_dirbase(tpmap, current_thread(), cpu_number());
	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
2680
2681 void
pmap_require(pmap_t pmap)2682 pmap_require(pmap_t pmap)
2683 {
2684 if (pmap != kernel_pmap) {
2685 zone_id_require(ZONE_ID_PMAP, sizeof(struct pmap), pmap);
2686 }
2687 }
2688
2689 /*
2690 * disable no-execute capability on
2691 * the specified pmap
2692 */
/*
 * disable no-execute capability on
 * the specified pmap
 *
 * Only honored on DEVELOPMENT/DEBUG kernels; a no-op on RELEASE.
 */
void
pmap_disable_NX(__unused pmap_t pmap)
{
#if DEVELOPMENT || DEBUG
	pmap->nx_enabled = 0;
#endif
}
2700
2701 void
pmap_flush_context_init(pmap_flush_context * pfc)2702 pmap_flush_context_init(pmap_flush_context *pfc)
2703 {
2704 pfc->pfc_cpus = 0;
2705 pfc->pfc_invalid_global = 0;
2706 }
2707
2708 static bool
pmap_tlbi_response(uint32_t lcpu,uint32_t rcpu,bool ngflush)2709 pmap_tlbi_response(uint32_t lcpu, uint32_t rcpu, bool ngflush)
2710 {
2711 bool responded = false;
2712 bool gflushed = (cpu_datap(rcpu)->cpu_tlb_invalid_global_count !=
2713 cpu_datap(lcpu)->cpu_tlb_gen_counts_global[rcpu]);
2714
2715 if (ngflush) {
2716 if (gflushed) {
2717 responded = true;
2718 }
2719 } else {
2720 if (gflushed) {
2721 responded = true;
2722 } else {
2723 bool lflushed = (cpu_datap(rcpu)->cpu_tlb_invalid_local_count !=
2724 cpu_datap(lcpu)->cpu_tlb_gen_counts_local[rcpu]);
2725 if (lflushed) {
2726 responded = true;
2727 }
2728 }
2729 }
2730
2731 if (responded == false) {
2732 if ((cpu_datap(rcpu)->cpu_tlb_invalid == 0) ||
2733 !CPU_CR3_IS_ACTIVE(rcpu) ||
2734 !cpu_is_running(rcpu)) {
2735 responded = true;
2736 }
2737 }
2738 return responded;
2739 }
2740
2741 extern uint64_t TLBTimeOut;
/*
 * Routine: pmap_flush
 *
 * Issue the TLB shootdowns previously deferred into "pfc" by
 * pmap_flush_tlbs(..., PMAP_DELAY_TLB_FLUSH, pfc): post a global or
 * local invalidation to each recorded CPU, IPI the active remote ones,
 * flush the local TLB if this CPU was recorded, then spin until every
 * signalled CPU acknowledges (via its invalidation generation count)
 * or is seen to be idle/offline.  A stuck responder eventually trips
 * a tracepoint (TLBTimeOut == 0) or an NMI-assisted panic.
 */
void
pmap_flush(
	pmap_flush_context *pfc)
{
	unsigned int my_cpu;
	unsigned int cpu;
	cpumask_t cpu_bit;
	cpumask_t cpus_to_respond = 0;
	cpumask_t cpus_to_signal = 0;
	cpumask_t cpus_signaled = 0;
	boolean_t flush_self = FALSE;
	uint64_t deadline;
	bool need_global_flush = false;

	mp_disable_preemption();

	my_cpu = cpu_number();
	cpus_to_signal = pfc->pfc_cpus;

	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_START,
	    NULL, cpus_to_signal);

	/* Post the pending invalidation to each recorded, running CPU. */
	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus && cpus_to_signal; cpu++, cpu_bit <<= 1) {
		if (cpus_to_signal & cpu_bit) {
			cpus_to_signal &= ~cpu_bit;

			if (!cpu_is_running(cpu)) {
				continue;
			}

			if (pfc->pfc_invalid_global & cpu_bit) {
				cpu_datap(cpu)->cpu_tlb_invalid_global = 1;
				need_global_flush = true;
			} else {
				cpu_datap(cpu)->cpu_tlb_invalid_local = 1;
			}
			/*
			 * Snapshot the remote generation counts so
			 * pmap_tlbi_response() can detect completion.
			 */
			cpu_datap(my_cpu)->cpu_tlb_gen_counts_global[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_global_count;
			cpu_datap(my_cpu)->cpu_tlb_gen_counts_local[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_local_count;
			/* Order the stores above before the IPI / remote reads. */
			mfence();

			if (cpu == my_cpu) {
				flush_self = TRUE;
				continue;
			}
			/* Idle CPUs (inactive CR3) will flush lazily; don't signal. */
			if (CPU_CR3_IS_ACTIVE(cpu)) {
				cpus_to_respond |= cpu_bit;
				i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
			}
		}
	}
	cpus_signaled = cpus_to_respond;

	/*
	 * Flush local tlb if required.
	 * Do this now to overlap with other processors responding.
	 */
	if (flush_self) {
		process_pmap_updates(NULL, (pfc->pfc_invalid_global != 0), 0ULL, ~0ULL);
	}

	if (cpus_to_respond) {
		deadline = mach_absolute_time() +
		    (TLBTimeOut ? TLBTimeOut : LockTimeOut);
		boolean_t is_timeout_traced = FALSE;

		/*
		 * Wait for those other cpus to acknowledge
		 */
		while (cpus_to_respond != 0) {
			long orig_acks = 0;

			for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
				bool responded = false;
				if ((cpus_to_respond & cpu_bit) != 0) {
					responded = pmap_tlbi_response(my_cpu, cpu, need_global_flush);
					if (responded) {
						cpus_to_respond &= ~cpu_bit;
					}
					cpu_pause();
				}

				if (cpus_to_respond == 0) {
					break;
				}
			}
			if (cpus_to_respond && (mach_absolute_time() > deadline)) {
				if (machine_timeout_suspended()) {
					continue;
				}
				if (TLBTimeOut == 0) {
					/* Cut a tracepoint (once) but keep waiting. */
					if (is_timeout_traced) {
						continue;
					}

					PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
					    NULL, cpus_to_signal, cpus_to_respond);

					is_timeout_traced = TRUE;
					continue;
				}
				/* Hard timeout: NMI the stragglers and panic. */
				orig_acks = NMIPI_acks;
				NMIPI_panic(cpus_to_respond, TLB_FLUSH_TIMEOUT);
				panic("Uninterruptible processor(s): CPU bitmap: 0x%llx, NMIPI acks: 0x%lx, now: 0x%lx, deadline: %llu",
				    cpus_to_respond, orig_acks, NMIPI_acks, deadline);
			}
		}
	}

	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_END,
	    NULL, cpus_signaled, flush_self);

	mp_enable_preemption();
}
2855
2856
/*
 * Execute INVEPT (single-context) on the local CPU to invalidate cached
 * EPT-derived translations for the given EPT pointer.  Intended to be
 * dispatched on every CPU via mp_cpus_call().
 */
static void
invept(void *eptp)
{
	/* INVEPT descriptor: { EPTP, reserved (must be zero) }, 16-byte aligned. */
	struct {
		uint64_t eptp;
		uint64_t reserved;
	} __attribute__((aligned(16), packed)) invept_descriptor = {(uint64_t)eptp, 0};

	/* rcx = invalidation type, (rax) = descriptor address. */
	__asm__ volatile ("invept (%%rax), %%rcx"
	    : : "c" (PMAP_INVEPT_SINGLE_CONTEXT), "a" (&invept_descriptor)
	    : "cc", "memory");
}
2869
2870 /*
2871 * Called with pmap locked, we:
2872 * - scan through per-cpu data to see which other cpus need to flush
2873 * - send an IPI to each non-idle cpu to be flushed
2874 * - wait for all to signal back that they are inactive or we see that
2875 * they are at a safe point (idle).
2876 * - flush the local tlb if active for this pmap
2877 * - return ... the caller will unlock the pmap
2878 */
2879
void
pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv, int options, pmap_flush_context *pfc)
{
	unsigned int cpu;
	cpumask_t cpu_bit;
	cpumask_t cpus_to_signal = 0;
	unsigned int my_cpu = cpu_number();
	pmap_paddr_t pmap_cr3 = pmap->pm_cr3;
	boolean_t flush_self = FALSE;
	uint64_t deadline;
	boolean_t pmap_is_shared = (pmap->pm_shared || (pmap == kernel_pmap));
	bool need_global_flush = false;
	uint32_t event_code = 0;
	vm_map_offset_t event_startv = 0, event_endv = 0;
	boolean_t is_ept = is_ept_pmap(pmap);

	/* UP machines excepted, we must be preemption-safe but interruptible here. */
	assert((processor_avail_count < 2) ||
	    (ml_get_interrupts_enabled() && get_preemption_level() != 0));

	/* Range must be non-empty and page aligned. */
	assert((endv - startv) >= PAGE_SIZE);
	assert(((endv | startv) & PAGE_MASK) == 0);

	/* Pick the tracepoint flavor only when tracing is on. */
	if (__improbable(kdebug_enable)) {
		if (pmap == kernel_pmap) {
			event_code = PMAP_CODE(PMAP__FLUSH_KERN_TLBS);
			event_startv = VM_KERNEL_UNSLIDE_OR_PERM(startv);
			event_endv = VM_KERNEL_UNSLIDE_OR_PERM(endv);
		} else if (__improbable(is_ept)) {
			event_code = PMAP_CODE(PMAP__FLUSH_EPT);
			event_startv = startv;
			event_endv = endv;
		} else {
			event_code = PMAP_CODE(PMAP__FLUSH_TLBS);
			event_startv = startv;
			event_endv = endv;
		}
	}

	PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(pmap), options,
	    event_startv, event_endv);

	/* EPT pmaps are handled wholesale with an INVEPT broadcast. */
	if (__improbable(is_ept)) {
		mp_cpus_call(CPUMASK_ALL, ASYNC, invept, (void*)pmap->pm_eptp);
		goto out;
	}

	/*
	 * Scan other cpus for matching active or task CR3.
	 * For idle cpus (with no active map) we mark them invalid but
	 * don't signal -- they'll check as they go busy.
	 */
	if (pmap_pcid_ncpus) {
		if (pmap_is_shared) {
			need_global_flush = true;
		}
		pmap_pcid_invalidate_all_cpus(pmap);
		mfence();
	}

	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
		if (!cpu_is_running(cpu)) {
			continue;
		}
		uint64_t cpu_active_cr3 = CPU_GET_ACTIVE_CR3(cpu);
		uint64_t cpu_task_cr3 = CPU_GET_TASK_CR3(cpu);

		if ((pmap_cr3 == cpu_task_cr3) ||
		    (pmap_cr3 == cpu_active_cr3) ||
		    (pmap_is_shared)) {
			/* Deferred mode: record the CPU in "pfc" for a later pmap_flush(). */
			if (options & PMAP_DELAY_TLB_FLUSH) {
				if (need_global_flush == true) {
					pfc->pfc_invalid_global |= cpu_bit;
				}
				pfc->pfc_cpus |= cpu_bit;

				continue;
			}
			/*
			 * Snapshot the generation count, then post the
			 * invalidation request for this CPU.
			 */
			if (need_global_flush == true) {
				cpu_datap(my_cpu)->cpu_tlb_gen_counts_global[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_global_count;
				cpu_datap(cpu)->cpu_tlb_invalid_global = 1;
			} else {
				cpu_datap(my_cpu)->cpu_tlb_gen_counts_local[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_local_count;
				cpu_datap(cpu)->cpu_tlb_invalid_local = 1;
			}

			if (cpu == my_cpu) {
				flush_self = TRUE;
				continue;
			}

			mfence();

			/*
			 * We don't need to signal processors which will flush
			 * lazily at the idle state or kernel boundary.
			 * For example, if we're invalidating the kernel pmap,
			 * processors currently in userspace don't need to flush
			 * their TLBs until the next time they enter the kernel.
			 * Alterations to the address space of a task active
			 * on a remote processor result in a signal, to
			 * account for copy operations. (There may be room
			 * for optimization in such cases).
			 * The order of the loads below with respect
			 * to the store to the "cpu_tlb_invalid" field above
			 * is important--hence the barrier.
			 */
			if (CPU_CR3_IS_ACTIVE(cpu) &&
			    (pmap_cr3 == CPU_GET_ACTIVE_CR3(cpu) ||
			    pmap->pm_shared ||
			    (pmap_cr3 == CPU_GET_TASK_CR3(cpu)))) {
				cpus_to_signal |= cpu_bit;
				i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
			}
		}
	}

	/* Deferred mode: the actual shootdown happens in pmap_flush(). */
	if ((options & PMAP_DELAY_TLB_FLUSH)) {
		goto out;
	}

	/*
	 * Flush local tlb if required.
	 * Do this now to overlap with other processors responding.
	 */
	if (flush_self) {
		process_pmap_updates(pmap, pmap_is_shared, startv, endv);
	}

	if (cpus_to_signal) {
		cpumask_t cpus_to_respond = cpus_to_signal;

		deadline = mach_absolute_time() +
		    (TLBTimeOut ? TLBTimeOut : LockTimeOut);
		boolean_t is_timeout_traced = FALSE;

		/*
		 * Wait for those other cpus to acknowledge
		 */
		while (cpus_to_respond != 0) {
			long orig_acks = 0;

			for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
				bool responded = false;
				if ((cpus_to_respond & cpu_bit) != 0) {
					responded = pmap_tlbi_response(my_cpu, cpu, need_global_flush);
					if (responded) {
						cpus_to_respond &= ~cpu_bit;
					}
					cpu_pause();
				}
				if (cpus_to_respond == 0) {
					break;
				}
			}
			if (cpus_to_respond && (mach_absolute_time() > deadline)) {
				if (machine_timeout_suspended()) {
					continue;
				}
				if (TLBTimeOut == 0) {
					/* cut tracepoint but don't panic */
					if (is_timeout_traced) {
						continue;
					}

					PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
					    VM_KERNEL_UNSLIDE_OR_PERM(pmap),
					    cpus_to_signal,
					    cpus_to_respond);

					is_timeout_traced = TRUE;
					continue;
				}
				/* Hard timeout: NMI the stragglers and panic. */
				orig_acks = NMIPI_acks;
				uint64_t tstamp1 = mach_absolute_time();
				NMIPI_panic(cpus_to_respond, TLB_FLUSH_TIMEOUT);
				uint64_t tstamp2 = mach_absolute_time();
				panic("IPI timeout, unresponsive CPU bitmap: 0x%llx, NMIPI acks: 0x%lx, now: 0x%lx, deadline: %llu, pre-NMIPI time: 0x%llx, current: 0x%llx, global: %d",
				    cpus_to_respond, orig_acks, NMIPI_acks, deadline, tstamp1, tstamp2, need_global_flush);
			}
		}
	}

	/* Invalidating the kernel pmap must always include the local CPU. */
	if (__improbable((pmap == kernel_pmap) && (flush_self != TRUE))) {
		panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, pmap_cr3: 0x%llx, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, pmap_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map);
	}

out:
	PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(pmap), cpus_to_signal,
	    event_startv, event_endv);
}
3072
/*
 * Service a TLB invalidation posted to the current CPU: advance the
 * matching generation count (observed by waiters in pmap_flush() /
 * pmap_flush_tlbs()), revalidate PCID state when PCIDs are in use,
 * and invalidate the requested VA range [istart, iend].
 * p == NULL means no specific pmap: invalidate everything.
 */
static void
process_pmap_updates(pmap_t p, bool pshared, addr64_t istart, addr64_t iend)
{
	int ccpu = cpu_number();
	bool gtlbf = false;

	pmap_assert(ml_get_interrupts_enabled() == 0 ||
	    get_preemption_level() != 0);

	/* Acknowledge the request by bumping the relevant generation count. */
	if (cpu_datap(ccpu)->cpu_tlb_invalid_global) {
		cpu_datap(ccpu)->cpu_tlb_invalid_global_count++;
		cpu_datap(ccpu)->cpu_tlb_invalid = 0;
		gtlbf = true;
	} else {
		cpu_datap(ccpu)->cpu_tlb_invalid_local_count++;
		cpu_datap(ccpu)->cpu_tlb_invalid_local = 0;
	}

	if (pmap_pcid_ncpus) {
		if (p) {
			/* TODO global generation count to
			 * avoid potentially redundant
			 * csw invalidations post-global invalidation
			 */
			pmap_pcid_validate_cpu(p, ccpu);
			pmap_tlbi_range(istart, iend, (pshared || gtlbf), p->pmap_pcid_cpus[ccpu]);
		} else {
			pmap_pcid_validate_current();
			pmap_tlbi_range(istart, iend, true, 0);
		}
	} else {
		/* No PCIDs: any invalidation flushes the whole TLB anyway. */
		pmap_tlbi_range(0, ~0ULL, true, 0);
	}
}
3107
3108 void
pmap_update_interrupt(void)3109 pmap_update_interrupt(void)
3110 {
3111 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START);
3112
3113 if (current_cpu_datap()->cpu_tlb_invalid) {
3114 process_pmap_updates(NULL, true, 0ULL, ~0ULL);
3115 }
3116
3117 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END);
3118 }
3119
3120 #include <mach/mach_vm.h> /* mach_vm_region_recurse() */
3121 /* Scan kernel pmap for W+X PTEs, scan kernel VM map for W+X map entries
3122 * and identify ranges with mismatched VM permissions and PTE permissions
3123 */
3124 kern_return_t
pmap_permissions_verify(pmap_t ipmap,vm_map_t ivmmap,vm_offset_t sv,vm_offset_t ev)3125 pmap_permissions_verify(pmap_t ipmap, vm_map_t ivmmap, vm_offset_t sv, vm_offset_t ev)
3126 {
3127 vm_offset_t cv = sv;
3128 kern_return_t rv = KERN_SUCCESS;
3129 uint64_t skip4 = 0, skip2 = 0;
3130
3131 assert(!is_ept_pmap(ipmap));
3132
3133 sv &= ~PAGE_MASK_64;
3134 ev &= ~PAGE_MASK_64;
3135 while (cv < ev) {
3136 if (__improbable((cv > 0x00007FFFFFFFFFFFULL) &&
3137 (cv < 0xFFFF800000000000ULL))) {
3138 cv = 0xFFFF800000000000ULL;
3139 }
3140 /* Potential inconsistencies from not holding pmap lock
3141 * but harmless for the moment.
3142 */
3143 if (((cv & PML4MASK) == 0) && (pmap64_pml4(ipmap, cv) == 0)) {
3144 if ((cv + NBPML4) > cv) {
3145 cv += NBPML4;
3146 } else {
3147 break;
3148 }
3149 skip4++;
3150 continue;
3151 }
3152 if (((cv & PDMASK) == 0) && (pmap_pde(ipmap, cv) == 0)) {
3153 if ((cv + NBPD) > cv) {
3154 cv += NBPD;
3155 } else {
3156 break;
3157 }
3158 skip2++;
3159 continue;
3160 }
3161
3162 pt_entry_t *ptep = pmap_pte(ipmap, cv);
3163 if (ptep && (*ptep & INTEL_PTE_VALID)) {
3164 if (*ptep & INTEL_PTE_WRITE) {
3165 if (!(*ptep & INTEL_PTE_NX)) {
3166 kprintf("W+X PTE at 0x%lx, P4: 0x%llx, P3: 0x%llx, P2: 0x%llx, PT: 0x%llx, VP: %u\n", cv, *pmap64_pml4(ipmap, cv), *pmap64_pdpt(ipmap, cv), *pmap_pde(ipmap, cv), *ptep, pmap_valid_page((ppnum_t)(i386_btop(pte_to_pa(*ptep)))));
3167 rv = KERN_FAILURE;
3168 }
3169 }
3170 }
3171 cv += PAGE_SIZE;
3172 }
3173 kprintf("Completed pmap scan\n");
3174 cv = sv;
3175
3176 struct vm_region_submap_info_64 vbr;
3177 mach_msg_type_number_t vbrcount = 0;
3178 mach_vm_size_t vmsize;
3179 vm_prot_t prot;
3180 uint32_t nesting_depth = 0;
3181 kern_return_t kret;
3182
3183 while (cv < ev) {
3184 for (;;) {
3185 vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64;
3186 if ((kret = mach_vm_region_recurse(ivmmap,
3187 (mach_vm_address_t *) &cv, &vmsize, &nesting_depth,
3188 (vm_region_recurse_info_t)&vbr,
3189 &vbrcount)) != KERN_SUCCESS) {
3190 break;
3191 }
3192
3193 if (vbr.is_submap) {
3194 nesting_depth++;
3195 continue;
3196 } else {
3197 break;
3198 }
3199 }
3200
3201 if (kret != KERN_SUCCESS) {
3202 break;
3203 }
3204
3205 prot = vbr.protection;
3206
3207 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) {
3208 kprintf("W+X map entry at address 0x%lx\n", cv);
3209 rv = KERN_FAILURE;
3210 }
3211
3212 if (prot) {
3213 vm_offset_t pcv;
3214 for (pcv = cv; pcv < cv + vmsize; pcv += PAGE_SIZE) {
3215 pt_entry_t *ptep = pmap_pte(ipmap, pcv);
3216 vm_prot_t tprot;
3217
3218 if ((ptep == NULL) || !(*ptep & INTEL_PTE_VALID)) {
3219 continue;
3220 }
3221 tprot = VM_PROT_READ;
3222 if (*ptep & INTEL_PTE_WRITE) {
3223 tprot |= VM_PROT_WRITE;
3224 }
3225 if ((*ptep & INTEL_PTE_NX) == 0) {
3226 tprot |= VM_PROT_EXECUTE;
3227 }
3228 if (tprot != prot) {
3229 kprintf("PTE/map entry permissions mismatch at address 0x%lx, pte: 0x%llx, protection: 0x%x\n", pcv, *ptep, prot);
3230 rv = KERN_FAILURE;
3231 }
3232 }
3233 }
3234 cv += vmsize;
3235 }
3236 return rv;
3237 }
3238
3239 #if MACH_ASSERT
3240 extern int pmap_ledgers_panic;
3241 extern int pmap_ledgers_panic_leeway;
3242
3243 static void
pmap_check_ledgers(pmap_t pmap)3244 pmap_check_ledgers(
3245 pmap_t pmap)
3246 {
3247 int pid;
3248 char *procname;
3249
3250 if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
3251 /*
3252 * This pmap was not or is no longer fully associated
3253 * with a task (e.g. the old pmap after a fork()/exec() or
3254 * spawn()). Its "ledger" still points at a task that is
3255 * now using a different (and active) address space, so
3256 * we can't check that all the pmap ledgers are balanced here.
3257 *
3258 * If the "pid" is set, that means that we went through
3259 * pmap_set_process() in task_terminate_internal(), so
3260 * this task's ledger should not have been re-used and
3261 * all the pmap ledgers should be back to 0.
3262 */
3263 return;
3264 }
3265
3266 pid = pmap->pmap_pid;
3267 procname = pmap->pmap_procname;
3268
3269 vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
3270 }
3271
3272 void
pmap_set_process(pmap_t pmap,int pid,char * procname)3273 pmap_set_process(
3274 pmap_t pmap,
3275 int pid,
3276 char *procname)
3277 {
3278 if (pmap == NULL || pmap->pmap_pid == -1) {
3279 return;
3280 }
3281
3282 pmap->pmap_pid = pid;
3283 strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
3284 if (pmap_ledgers_panic_leeway) {
3285 /*
3286 * XXX FBDP
3287 * Some processes somehow trigger some issues that make
3288 * the pmap stats and ledgers go off track, causing
3289 * some assertion failures and ledger panics.
3290 * Turn off the sanity checks if we allow some ledger leeway
3291 * because of that. We'll still do a final check in
3292 * pmap_check_ledgers() for discrepancies larger than the
3293 * allowed leeway after the address space has been fully
3294 * cleaned up.
3295 */
3296 pmap->pmap_stats_assert = FALSE;
3297 ledger_disable_panic_on_negative(pmap->ledger,
3298 task_ledgers.phys_footprint);
3299 ledger_disable_panic_on_negative(pmap->ledger,
3300 task_ledgers.internal);
3301 ledger_disable_panic_on_negative(pmap->ledger,
3302 task_ledgers.internal_compressed);
3303 ledger_disable_panic_on_negative(pmap->ledger,
3304 task_ledgers.iokit_mapped);
3305 ledger_disable_panic_on_negative(pmap->ledger,
3306 task_ledgers.alternate_accounting);
3307 ledger_disable_panic_on_negative(pmap->ledger,
3308 task_ledgers.alternate_accounting_compressed);
3309 }
3310 }
3311 #endif /* MACH_ASSERT */
3312
3313
3314 #if DEVELOPMENT || DEBUG
3315 int pmap_pagezero_mitigation = 1;
3316 #endif
3317
3318 void
pmap_advise_pagezero_range(pmap_t lpmap,uint64_t low_bound)3319 pmap_advise_pagezero_range(pmap_t lpmap, uint64_t low_bound)
3320 {
3321 #if DEVELOPMENT || DEBUG
3322 if (pmap_pagezero_mitigation == 0) {
3323 lpmap->pagezero_accessible = FALSE;
3324 return;
3325 }
3326 #endif
3327 lpmap->pagezero_accessible = ((pmap_smap_enabled == FALSE) && (low_bound < 0x1000));
3328 if (lpmap == current_pmap()) {
3329 mp_disable_preemption();
3330 current_cpu_datap()->cpu_pagezero_mapped = lpmap->pagezero_accessible;
3331 mp_enable_preemption();
3332 }
3333 }
3334
3335 uintptr_t
pmap_verify_noncacheable(uintptr_t vaddr)3336 pmap_verify_noncacheable(uintptr_t vaddr)
3337 {
3338 pt_entry_t *ptep = NULL;
3339 ptep = pmap_pte(kernel_pmap, vaddr);
3340 if (ptep == NULL) {
3341 panic("pmap_verify_noncacheable: no translation for 0x%lx", vaddr);
3342 }
3343 /* Non-cacheable OK */
3344 if (*ptep & (INTEL_PTE_NCACHE)) {
3345 return pte_to_pa(*ptep) | (vaddr & INTEL_OFFMASK);
3346 }
3347 /* Write-combined OK */
3348 if (*ptep & (INTEL_PTE_PAT)) {
3349 return pte_to_pa(*ptep) | (vaddr & INTEL_OFFMASK);
3350 }
3351 panic("pmap_verify_noncacheable: IO read from a cacheable address? address: 0x%lx, PTE: %p, *PTE: 0x%llx", vaddr, ptep, *ptep);
3352 /*NOTREACHED*/
3353 return 0;
3354 }
3355
/*
 * Look up a code directory hash in the runtime-loaded trust caches.
 * Trust caches are not supported on this architecture, so the lookup
 * always fails.
 */
bool
pmap_lookup_in_loaded_trust_caches(const uint8_t __unused cdhash[20])
{
	// Unsupported on this architecture.
	return false;
}
3362
3363 uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t __unused cdhash[20])3364 pmap_lookup_in_static_trust_cache(const uint8_t __unused cdhash[20])
3365 {
3366 // Unsupported on this architecture.
3367 return false;
3368 }
3369
/*
 * Return the PMAP_CS configuration flags.  PMAP_CS does not exist on
 * this architecture, so there is no configuration to report.
 */
int
pmap_cs_configuration(void)
{
	// Unsupported on this architecture.
	return 0;
}
3376
3377 SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
3378 uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
3379
3380 void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])3381 pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
3382 {
3383 simple_lock(&pmap_compilation_service_cdhash_lock, LCK_GRP_NULL);
3384 memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
3385 simple_unlock(&pmap_compilation_service_cdhash_lock);
3386
3387 #if DEVELOPMENT || DEBUG
3388 printf("Added Compilation Service CDHash through the PMAP: 0x%02X 0x%02X 0x%02X 0x%02X\n", cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
3389 #endif
3390 }
3391
3392 bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])3393 pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
3394 {
3395 bool match = false;
3396
3397 simple_lock(&pmap_compilation_service_cdhash_lock, LCK_GRP_NULL);
3398 if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
3399 match = true;
3400 }
3401 simple_unlock(&pmap_compilation_service_cdhash_lock);
3402
3403 #if DEVELOPMENT || DEBUG
3404 if (match) {
3405 printf("Matched Compilation Service CDHash through the PMAP\n");
3406 }
3407 #endif
3408
3409 return match;
3410 }
3411
3412 static bool pmap_local_signing_public_key_set = false;
3413 static uint8_t pmap_local_signing_public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE] = { 0 };
3414
/*
 * Return whether a local signing public key has been registered via
 * pmap_set_local_signing_public_key().  Relaxed atomic load of the flag.
 */
static bool
pmap_local_signing_public_key_is_set(void)
{
	return os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
}
3420
3421 void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE])3422 pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE])
3423 {
3424 bool key_set = false;
3425
3426 /*
3427 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
3428 * a successful exchange means that the local signing public key has _not_ been
3429 * set. In case the key has been set, we panic as we would never expect the
3430 * kernel to attempt to set the key more than once.
3431 */
3432 key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
3433
3434 if (key_set) {
3435 panic("attempted to set the local signing public key multiple times");
3436 }
3437
3438 memcpy(pmap_local_signing_public_key, public_key, PMAP_ECC_P384_PUBLIC_KEY_SIZE);
3439
3440 #if DEVELOPMENT || DEBUG
3441 printf("Set local signing public key\n");
3442 #endif
3443 }
3444
3445 uint8_t*
pmap_get_local_signing_public_key(void)3446 pmap_get_local_signing_public_key(void)
3447 {
3448 if (pmap_local_signing_public_key_is_set()) {
3449 return pmap_local_signing_public_key;
3450 }
3451 return NULL;
3452 }
3453
/*
 * Lift local-signing restrictions for the given code directory hash.
 * No-op on this architecture (no PPL-backed enforcement exists here).
 */
void
pmap_unrestrict_local_signing(
	__unused const uint8_t cdhash[CS_CDHASH_LEN])
{
	// TODO: Once all changes across XNU and AMFI have been submitted, panic.
}
3460
/*
 * Query entitlements through the pmap layer.  This API is only usable
 * on platforms that define PMAP_SUPPORTS_ENTITLEMENT_CHECKS; on this
 * architecture any call is a programming error and panics.  No return
 * statement is needed because panic() does not return.
 */
bool
pmap_query_entitlements(
	__unused pmap_t pmap,
	__unused CEQuery_t query,
	__unused size_t queryLength,
	__unused CEQueryContext_t finalContext)
{
#if !PMAP_SUPPORTS_ENTITLEMENT_CHECKS
	panic("PMAP_CS: do not use this API without checking for \'#if PMAP_SUPPORTS_ENTITLEMENT_CHECKS\'");
#endif

	panic("PMAP_SUPPORTS_ENTITLEMENT_CHECKS should not be defined on this platform");
}
3474
/*
 * Report whether PMAP_CS (pmap-level code-signing enforcement) is
 * active.  Never enabled on this architecture.
 */
bool
pmap_cs_enabled(void)
{
	return false;
}
3480
/*
 * Report whether the caller is currently executing inside the Page
 * Protection Layer.  There is no PPL on this architecture.
 */
bool
pmap_in_ppl(void)
{
	// Nonexistent on this architecture.
	return false;
}
3487
/*
 * Report whether this platform has a Page Protection Layer at all.
 */
bool
pmap_has_ppl(void)
{
	// Not supported on this architecture.
	return false;
}
3494
/*
 * Report whether the platform provides the protected I/O-filter write
 * primitive (see pmap_iofilter_protected_write()).  Not available on
 * this architecture.  (Fixed: declare as (void) rather than an empty,
 * unprototyped parameter list, matching the sibling stubs.)
 */
bool
pmap_has_iofilter_protected_write(void)
{
	// Not supported on this architecture.
	return false;
}
3501
/*
 * Perform a `width`-sized protected write of `value` to a filtered I/O
 * address.  There is no I/O-filter support on this architecture, so any
 * call is a fatal error.
 */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
3508
/* Image4 data accessor: unsupported on this architecture, always panics. */
void* __attribute__((noreturn))
pmap_image4_pmap_data(
	__unused size_t *allocated_size)
{
	panic("PMAP_IMG4: image4 data not available on this architecture");
}
3515
/* Image4 set-nonce entry point: unsupported on this architecture, always panics. */
void __attribute__((noreturn))
pmap_image4_set_nonce(
	__unused const img4_nonce_domain_index_t ndi,
	__unused const img4_nonce_t *nonce)
{
	panic("PMAP_IMG4: set nonce API not supported on this architecture");
}
3523
/* Image4 roll-nonce entry point: unsupported on this architecture, always panics. */
void __attribute__((noreturn))
pmap_image4_roll_nonce(
	__unused const img4_nonce_domain_index_t ndi)
{
	panic("PMAP_IMG4: roll nonce API not supported on this architecture");
}
3530
/* Image4 copy-nonce entry point: unsupported on this architecture, always panics. */
errno_t __attribute__((noreturn))
pmap_image4_copy_nonce(
	__unused const img4_nonce_domain_index_t ndi,
	__unused img4_nonce_t *nonce_out
	)
{
	panic("PMAP_IMG4: copy nonce API not supported on this architecture");
}
3539
/* Image4 execute-object entry point: unsupported on this architecture, always panics. */
errno_t __attribute__((noreturn))
pmap_image4_execute_object(
	__unused img4_runtime_object_spec_index_t obj_spec_index,
	__unused const img4_buff_t *payload,
	__unused const img4_buff_t *_Nullable manifest)
{
	panic("PMAP_IMG4: execute object API not supported on this architecture");
}
3548
/* Image4 copy-object entry point: unsupported on this architecture, always panics. */
errno_t __attribute__((noreturn))
pmap_image4_copy_object(
	__unused img4_runtime_object_spec_index_t obj_spec_index,
	__unused vm_address_t object_out,
	__unused size_t *object_length)
{
	panic("PMAP_IMG4: copy object API not supported on this architecture");
}
3557
/*
 * Allow the given address space to run invalid (non-verified) code.
 * With no PMAP_CS enforcement on this architecture there is nothing to
 * relax, so the request trivially succeeds.
 */
kern_return_t
pmap_cs_allow_invalid(__unused pmap_t pmap)
{
	// Unsupported on this architecture.
	return KERN_SUCCESS;
}
3564
/*
 * Claim a page from the PPL's reserved page pool.  No PPL exists on
 * this architecture, so there is never a page to hand out.
 */
void *
pmap_claim_reserved_ppl_page(void)
{
	// Unsupported on this architecture.
	return NULL;
}
3571
/*
 * Return a previously claimed PPL reserved page.  No-op: no PPL on
 * this architecture, so no such pages exist.
 */
void
pmap_free_reserved_ppl_page(void __unused *kva)
{
	// Unsupported on this architecture.
}
3577
/*
 * Prepare PMAP_CS state when forking: nothing to carry over from the
 * old address space to the new one, since PMAP_CS isn't enabled for
 * x86_64.  Always succeeds.
 */
kern_return_t
pmap_cs_fork_prepare(__unused pmap_t old_pmap, __unused pmap_t new_pmap)
{
	// PMAP_CS isn't enabled for x86_64.
	return KERN_SUCCESS;
}
3584
3585 #if DEVELOPMENT || DEBUG
3586 /*
3587 * Used for unit testing recovery from text corruptions.
3588 */
3589 kern_return_t
pmap_test_text_corruption(pmap_paddr_t pa)3590 pmap_test_text_corruption(pmap_paddr_t pa)
3591 {
3592 int pai;
3593 uint8_t *va;
3594
3595 pai = ppn_to_pai(atop(pa));
3596 if (!IS_MANAGED_PAGE(pai)) {
3597 return KERN_FAILURE;
3598 }
3599
3600 va = (uint8_t *)PHYSMAP_PTOV(pa);
3601 va[0] = 0x0f; /* opcode for UD2 */
3602 va[1] = 0x0b;
3603
3604 return KERN_SUCCESS;
3605 }
3606 #endif /* DEVELOPMENT || DEBUG */
3607