1 /*
2 * Copyright (c) 2000-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58
59 /*
60 * File: pmap.c
61 * Author: Avadis Tevanian, Jr., Michael Wayne Young
62 * (These guys wrote the Vax version)
63 *
64 * Physical Map management code for Intel i386, i486, and i860.
65 *
66 * Manages physical address maps.
67 *
68 * In addition to hardware address maps, this
69 * module is called upon to provide software-use-only
70 * maps which may or may not be stored in the same
71 * form as hardware maps. These pseudo-maps are
72 * used to store intermediate results from copy
73 * operations to and from address spaces.
74 *
75 * Since the information managed by this module is
76 * also stored by the logical address mapping module,
77 * this module may throw away valid virtual-to-physical
78 * mappings at almost any time. However, invalidations
79 * of virtual-to-physical mappings must be done as
80 * requested.
81 *
82 * In order to cope with hardware architectures which
83 * make virtual-to-physical map invalidates expensive,
84 * this module may delay invalidate or reduced protection
85 * operations until such time as they are actually
86 * necessary. This module is given full information as
87 * to which processors are currently using which maps,
88 * and to when physical maps must be made correct.
89 */
90
91 #include <string.h>
92 #include <mach_ldebug.h>
93
94 #include <libkern/OSAtomic.h>
95
96 #include <mach/machine/vm_types.h>
97
98 #include <mach/boolean.h>
99 #include <kern/thread.h>
100 #include <kern/zalloc.h>
101 #include <kern/zalloc_internal.h>
102 #include <kern/queue.h>
103 #include <kern/ledger.h>
104 #include <kern/mach_param.h>
105
106 #include <kern/spl.h>
107
108 #include <vm/pmap.h>
109 #include <vm/pmap_cs.h>
110 #include <vm/vm_map_xnu.h>
111 #include <vm/vm_kern_xnu.h>
112 #include <mach/vm_param.h>
113 #include <mach/vm_prot.h>
114 #include <vm/vm_object_internal.h>
115 #include <vm/vm_page_internal.h>
116
117 #include <mach/machine/vm_param.h>
118 #include <machine/thread.h>
119
120 #include <kern/misc_protos.h> /* prototyping */
121 #include <i386/misc_protos.h>
122 #include <i386/i386_lowmem.h>
123 #include <x86_64/lowglobals.h>
124
125 #include <i386/cpuid.h>
126 #include <i386/cpu_data.h>
127 #include <i386/cpu_number.h>
128 #include <i386/machine_cpu.h>
129 #include <i386/seg.h>
130 #include <i386/serial_io.h>
131 #include <i386/cpu_capabilities.h>
132 #include <i386/machine_routines.h>
133 #include <i386/proc_reg.h>
134 #include <i386/tsc.h>
135 #include <i386/pmap_internal.h>
136 #include <i386/pmap_pcid.h>
137 #if CONFIG_VMX
138 #include <i386/vmx/vmx_cpu.h>
139 #endif
140
141 #include <vm/vm_protos.h>
142 #include <san/kasan.h>
143
144 #include <i386/mp.h>
145 #include <i386/mp_desc.h>
146 #include <libkern/kernel_mach_header.h>
147
148 #include <pexpert/i386/efi.h>
149 #include <libkern/section_keywords.h>
150 #if MACH_ASSERT
151 int pmap_stats_assert = 1;
152 #endif /* MACH_ASSERT */
153
154 #ifdef IWANTTODEBUG
155 #undef DEBUG
156 #define DEBUG 1
157 #define POSTCODE_DELAY 1
158 #include <i386/postcode.h>
159 #endif /* IWANTTODEBUG */
160
161 #ifdef PMAP_DEBUG
162 #define DBG(x...) kprintf("DBG: " x)
163 #else
164 #define DBG(x...)
165 #endif
166 /* Compile time assert to ensure adjacency/alignment of per-CPU data fields used
167 * in the trampolines for kernel/user boundary TLB coherency.
168 */
169 char pmap_cpu_data_assert[(((offsetof(cpu_data_t, cpu_tlb_invalid) - offsetof(cpu_data_t, cpu_active_cr3)) == 8) && (offsetof(cpu_data_t, cpu_active_cr3) % 64 == 0)) ? 1 : -1];
170 boolean_t pmap_trace = FALSE;
171
172 boolean_t no_shared_cr3 = DEBUG; /* TRUE for DEBUG by default */
173
174 #if DEVELOPMENT || DEBUG
175 int nx_enabled = 1; /* enable no-execute protection -- set during boot */
176 #else
177 const int nx_enabled = 1;
178 #endif
179
180 #if DEBUG || DEVELOPMENT
181 int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
182 int allow_stack_exec = 0; /* No apps may execute from the stack by default */
183 #else /* DEBUG || DEVELOPMENT */
184 const int allow_data_exec = VM_ABI_32; /* 32-bit apps may execute data by default, 64-bit apps may not */
185 const int allow_stack_exec = 0; /* No apps may execute from the stack by default */
186 #endif /* DEBUG || DEVELOPMENT */
187
188 uint64_t max_preemption_latency_tsc = 0;
189
190 pv_hashed_entry_t *pv_hash_table; /* hash lists */
191
192 uint32_t npvhashmask = 0, npvhashbuckets = 0;
193
194 pv_hashed_entry_t pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
195 pv_hashed_entry_t pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
196 SIMPLE_LOCK_DECLARE(pv_hashed_free_list_lock, 0);
197 SIMPLE_LOCK_DECLARE(pv_hashed_kern_free_list_lock, 0);
198 SIMPLE_LOCK_DECLARE(pv_hash_table_lock, 0);
199 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
200
201 SECURITY_READ_ONLY_LATE(zone_t) pv_hashed_list_zone; /* zone of pv_hashed_entry structures */
202
203 /*
204 * First and last physical addresses that we maintain any information
205 * for. Initialized to zero so that pmap operations done before
206 * pmap_init won't touch any non-existent structures.
207 */
208 boolean_t pmap_initialized = FALSE;/* Has pmap_init completed? */
209
210 static struct vm_object kptobj_object_store VM_PAGE_PACKED_ALIGNED;
211 static struct vm_object kpml4obj_object_store VM_PAGE_PACKED_ALIGNED;
212 static struct vm_object kpdptobj_object_store VM_PAGE_PACKED_ALIGNED;
213
214 /*
 * Array of physical page attributes for managed pages.
216 * One byte per physical page.
217 */
218 char *pmap_phys_attributes;
219 ppnum_t last_managed_page = 0;
220
221 unsigned pmap_memory_region_count;
222 unsigned pmap_memory_region_current;
223
224 pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
225
226 /*
227 * Other useful macros.
228 */
229 #define current_pmap() (vm_map_pmap(current_thread()->map))
230
231 struct pmap kernel_pmap_store;
232 const pmap_t kernel_pmap = &kernel_pmap_store;
233 SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */
234 SECURITY_READ_ONLY_LATE(zone_t) pmap_anchor_zone;
235 SECURITY_READ_ONLY_LATE(zone_t) pmap_uanchor_zone;
236 int pmap_debug = 0; /* flag for debugging prints */
237
238 unsigned int inuse_ptepages_count = 0;
239 long long alloc_ptepages_count __attribute__((aligned(8))) = 0; /* aligned for atomic access */
240 unsigned int bootstrap_wired_pages = 0;
241
242 extern long NMIPI_acks;
243
244 SECURITY_READ_ONLY_LATE(boolean_t) kernel_text_ps_4K = TRUE;
245
246 extern char end;
247
248 static int nkpt;
249
250 #if DEVELOPMENT || DEBUG
251 SECURITY_READ_ONLY_LATE(boolean_t) pmap_disable_kheap_nx = FALSE;
252 SECURITY_READ_ONLY_LATE(boolean_t) pmap_disable_kstack_nx = FALSE;
253 SECURITY_READ_ONLY_LATE(boolean_t) wpkernel = TRUE;
254 #else
255 const boolean_t wpkernel = TRUE;
256 #endif
257
258 extern long __stack_chk_guard[];
259
260 static uint64_t pmap_eptp_flags = 0;
261 boolean_t pmap_ept_support_ad = FALSE;
262
263 static void process_pmap_updates(pmap_t, bool, addr64_t, addr64_t);
264 /*
265 * Map memory at initialization. The physical addresses being
266 * mapped are not managed and are never unmapped.
267 *
268 * For now, VM is already on, we only need to map the
269 * specified memory.
270 */
271 vm_offset_t
pmap_map(vm_offset_t virt,vm_map_offset_t start_addr,vm_map_offset_t end_addr,vm_prot_t prot,unsigned int flags)272 pmap_map(
273 vm_offset_t virt,
274 vm_map_offset_t start_addr,
275 vm_map_offset_t end_addr,
276 vm_prot_t prot,
277 unsigned int flags)
278 {
279 kern_return_t kr;
280 int ps;
281
282 ps = PAGE_SIZE;
283 while (start_addr < end_addr) {
284 kr = pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
285 (ppnum_t) i386_btop(start_addr), prot, VM_PROT_NONE, flags, TRUE, PMAP_MAPPING_TYPE_INFER);
286
287 if (kr != KERN_SUCCESS) {
288 panic("%s: failed pmap_enter, "
289 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
290 __FUNCTION__,
291 (void *)virt, (void *)start_addr, (void *)end_addr, prot, flags);
292 }
293
294 virt += ps;
295 start_addr += ps;
296 }
297 return virt;
298 }
299
300 extern char *first_avail;
301 extern vm_offset_t virtual_avail, virtual_end;
302 extern pmap_paddr_t avail_start, avail_end;
303 extern vm_offset_t sHIB;
304 extern vm_offset_t eHIB;
305 extern vm_offset_t stext;
306 extern vm_offset_t etext;
307 extern vm_offset_t sdata, edata;
308 extern vm_offset_t sconst, econst;
309
310 extern void *KPTphys;
311
312 boolean_t pmap_smep_enabled = FALSE;
313 boolean_t pmap_smap_enabled = FALSE;
314
/*
 * Per-CPU pmap initialization, run on each processor during bringup.
 * Seeds the CPU's TLB-related per-cpu fields from the kernel pmap and
 * enables the paging-related CR4 features (PGE, and SMEP/SMAP when the
 * CPU advertises them via CPUID leaf 7).
 */
void
pmap_cpu_init(void)
{
	cpu_data_t *cdp = current_cpu_datap();

	/* Enable global pages so kernel TLB entries survive CR3 reloads. */
	set_cr4(get_cr4() | CR4_PGE);

	/*
	 * Initialize the per-cpu, TLB-related fields.
	 */
	cdp->cpu_kernel_cr3 = kernel_pmap->pm_cr3;
	/* Mirror the kernel CR3 into this CPU's shadow copy as well. */
	cpu_shadowp(cdp->cpu_number)->cpu_kernel_cr3 = cdp->cpu_kernel_cr3;
	cdp->cpu_active_cr3 = kernel_pmap->pm_cr3;
	cdp->cpu_tlb_invalid = 0;
	cdp->cpu_task_map = TASK_MAP_64BIT;

	pmap_pcid_configure();
	if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMEP) {
		pmap_smep_enabled = TRUE;
#if DEVELOPMENT || DEBUG
		boolean_t nsmep;
		/* Boot-arg escape hatch: allow disabling SMEP on dev/debug kernels. */
		if (PE_parse_boot_argn("-pmap_smep_disable", &nsmep, sizeof(nsmep))) {
			pmap_smep_enabled = FALSE;
		}
#endif
		if (pmap_smep_enabled) {
			set_cr4(get_cr4() | CR4_SMEP);
		}
	}
	if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMAP) {
		pmap_smap_enabled = TRUE;
#if DEVELOPMENT || DEBUG
		boolean_t nsmap;
		/* Boot-arg escape hatch: allow disabling SMAP on dev/debug kernels. */
		if (PE_parse_boot_argn("-pmap_smap_disable", &nsmap, sizeof(nsmap))) {
			pmap_smap_enabled = FALSE;
		}
#endif
		if (pmap_smap_enabled) {
			set_cr4(get_cr4() | CR4_SMAP);
		}
	}

#if !CONFIG_CPU_COUNTERS
	if (cdp->cpu_fixed_pmcs_enabled) {
		boolean_t enable = TRUE;
		cpu_pmc_control(&enable);
	}
#endif /* !CONFIG_CPU_COUNTERS */
}
364
365 static void
pmap_ro_zone_validate_element_dst(zone_id_t zid,vm_offset_t va,vm_offset_t offset,vm_size_t new_data_size)366 pmap_ro_zone_validate_element_dst(
367 zone_id_t zid,
368 vm_offset_t va,
369 vm_offset_t offset,
370 vm_size_t new_data_size)
371 {
372 if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
373 panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
374 ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
375 }
376
377 vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
378
379 /* Check element is from correct zone and properly aligned */
380 zone_require_ro(zid, elem_size, (void*)va);
381
382 if (__improbable(new_data_size > (elem_size - offset))) {
383 panic("%s: New data size %lu too large for elem size %lu at addr %p",
384 __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
385 }
386 if (__improbable(offset >= elem_size)) {
387 panic("%s: Offset %lu too large for elem size %lu at addr %p",
388 __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
389 }
390 }
391
392 static void
pmap_ro_zone_validate_element(zone_id_t zid,vm_offset_t va,vm_offset_t offset,const vm_offset_t new_data,vm_size_t new_data_size)393 pmap_ro_zone_validate_element(
394 zone_id_t zid,
395 vm_offset_t va,
396 vm_offset_t offset,
397 const vm_offset_t new_data,
398 vm_size_t new_data_size)
399 {
400 vm_offset_t sum = 0;
401
402 if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
403 panic("%s: Integer addition overflow %p + %lu = %lu",
404 __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
405 }
406
407 pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
408 }
409
410 void
pmap_ro_zone_memcpy(zone_id_t zid,vm_offset_t va,vm_offset_t offset,const vm_offset_t new_data,vm_size_t new_data_size)411 pmap_ro_zone_memcpy(
412 zone_id_t zid,
413 vm_offset_t va,
414 vm_offset_t offset,
415 const vm_offset_t new_data,
416 vm_size_t new_data_size)
417 {
418 const pmap_paddr_t pa = kvtophys(va + offset);
419
420 if (!new_data || new_data_size == 0) {
421 return;
422 }
423
424 pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
425 /* Write through Physical Aperture */
426 memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
427 }
428
429 uint64_t
pmap_ro_zone_atomic_op(zone_id_t zid,vm_offset_t va,vm_offset_t offset,zro_atomic_op_t op,uint64_t value)430 pmap_ro_zone_atomic_op(
431 zone_id_t zid,
432 vm_offset_t va,
433 vm_offset_t offset,
434 zro_atomic_op_t op,
435 uint64_t value)
436 {
437 const pmap_paddr_t pa = kvtophys(va + offset);
438 vm_size_t value_size = op & 0xf;
439
440 pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
441 /* Write through Physical Aperture */
442 return __zalloc_ro_mut_atomic(phystokv(pa), op, value);
443 }
444
445 void
pmap_ro_zone_bzero(zone_id_t zid,vm_offset_t va,vm_offset_t offset,vm_size_t size)446 pmap_ro_zone_bzero(
447 zone_id_t zid,
448 vm_offset_t va,
449 vm_offset_t offset,
450 vm_size_t size)
451 {
452 const pmap_paddr_t pa = kvtophys(va + offset);
453 pmap_ro_zone_validate_element(zid, va, offset, 0, size);
454 bzero((void*)phystokv(pa), size);
455 }
456
457 static uint32_t
pmap_scale_shift(void)458 pmap_scale_shift(void)
459 {
460 uint32_t scale = 0;
461
462 if (sane_size <= 8 * GB) {
463 scale = (uint32_t)(sane_size / (2 * GB));
464 } else if (sane_size <= 32 * GB) {
465 scale = 4 + (uint32_t)((sane_size - (8 * GB)) / (4 * GB));
466 } else {
467 scale = 10 + (uint32_t)MIN(4, ((sane_size - (32 * GB)) / (8 * GB)));
468 }
469 return scale;
470 }
471
472 LCK_GRP_DECLARE(pmap_lck_grp, "pmap");
473 LCK_ATTR_DECLARE(pmap_lck_rw_attr, 0, LCK_ATTR_DEBUG);
474
475 /*
476 * Bootstrap the system enough to run with virtual memory.
477 * Map the kernel's code and data, and allocate the system page table.
478 * Called with mapping OFF. Page_size must already be set.
479 */
480
/*
 * Bootstrap the pmap layer: initialize the statically-allocated kernel
 * pmap from the Idle page tables, set up per-CPU TLB state, size the PV
 * hash, and parse the pmap-related boot-args. Runs with the boot page
 * tables already active; the 32-bit (!IA32e) path is no longer supported.
 */
void
pmap_bootstrap(
	__unused vm_offset_t load_start,
	__unused boolean_t IA32e)
{
	assert(IA32e);

	vm_last_addr = VM_MAX_KERNEL_ADDRESS;   /* Set the highest address
	                                         * known to VM */
	/*
	 * The kernel's pmap is statically allocated so we don't
	 * have to use pmap_create, which is unlikely to work
	 * correctly at this part of the boot sequence.
	 */

	os_ref_init(&kernel_pmap->ref_count, NULL);
#if DEVELOPMENT || DEBUG
	kernel_pmap->nx_enabled = TRUE;
#endif
	kernel_pmap->pm_task_map = TASK_MAP_64BIT;
	kernel_pmap->pm_obj = (vm_object_t) NULL;
	/* Both kernel and user top-level tables point at the boot IdlePML4. */
	kernel_pmap->pm_pml4 = IdlePML4;
	kernel_pmap->pm_upml4 = IdlePML4;
	kernel_pmap->pm_cr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
	kernel_pmap->pm_ucr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
	kernel_pmap->pm_eptp = 0;

	pmap_pcid_initialize_kernel(kernel_pmap);

	current_cpu_datap()->cpu_kernel_cr3 = cpu_shadowp(cpu_number())->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;

	/* Account for the NKPT bootstrap page-table pages already wired in. */
	nkpt = NKPT;
	OSAddAtomic(NKPT, &inuse_ptepages_count);
	OSAddAtomic64(NKPT, &alloc_ptepages_count);
	bootstrap_wired_pages = NKPT;

	virtual_avail = (vm_offset_t)(VM_MIN_KERNEL_ADDRESS) + (vm_offset_t)first_avail;
	virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);

	/* PV hash size: boot-arg override, else scaled by physical memory. */
	if (!PE_parse_boot_argn("npvhash", &npvhashmask, sizeof(npvhashmask))) {
		npvhashmask = ((NPVHASHBUCKETS) << pmap_scale_shift()) - 1;
	}

	npvhashbuckets = npvhashmask + 1;

	/* The mask must be of the form (2^N)-1 so bucket count is a power of 2. */
	if (0 != ((npvhashbuckets) & npvhashmask)) {
		panic("invalid hash %d, must be ((2^N)-1), "
		    "using default %d\n", npvhashmask, NPVHASHMASK);
	}

	lck_rw_init(&kernel_pmap->pmap_rwl, &pmap_lck_grp, &pmap_lck_rw_attr);
	kernel_pmap->pmap_rwl.lck_rw_can_sleep = FALSE;

	pmap_cpu_init();

	if (pmap_pcid_ncpus) {
		printf("PMAP: PCID enabled\n");
	}

	if (pmap_smep_enabled) {
		printf("PMAP: Supervisor Mode Execute Protection enabled\n");
	}
	if (pmap_smap_enabled) {
		printf("PMAP: Supervisor Mode Access Protection enabled\n");
	}

#if DEBUG
	printf("Stack canary: 0x%lx\n", __stack_chk_guard[0]);
	printf("early_random(): 0x%qx\n", early_random());
#endif
#if DEVELOPMENT || DEBUG
	boolean_t ptmp;
	/* Check if the user has requested disabling stack or heap no-execute
	 * enforcement. These are "const" variables; that qualifier is cast away
	 * when altering them. The TEXT/DATA const sections are marked
	 * write protected later in the kernel startup sequence, so altering
	 * them is possible at this point, in pmap_bootstrap().
	 */
	if (PE_parse_boot_argn("-pmap_disable_kheap_nx", &ptmp, sizeof(ptmp))) {
		boolean_t *pdknxp = (boolean_t *) &pmap_disable_kheap_nx;
		*pdknxp = TRUE;
	}

	if (PE_parse_boot_argn("-pmap_disable_kstack_nx", &ptmp, sizeof(ptmp))) {
		boolean_t *pdknhp = (boolean_t *) &pmap_disable_kstack_nx;
		*pdknhp = TRUE;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* 32-bit EFI firmware limits the usable kernel virtual space. */
	boot_args *args = (boot_args *)PE_state.bootArgs;
	if (args->efiMode == kBootArgsEfiMode32) {
		printf("EFI32: kernel virtual space limited to 4GB\n");
		virtual_end = VM_MAX_KERNEL_ADDRESS_EFI32;
	}
	kprintf("Kernel virtual space from 0x%lx to 0x%lx.\n",
	    (long)KERNEL_BASE, (long)virtual_end);
	kprintf("Available physical space from 0x%llx to 0x%llx\n",
	    avail_start, avail_end);

	/*
	 * The -no_shared_cr3 boot-arg is a debugging feature (set by default
	 * in the DEBUG kernel) to force the kernel to switch to its own map
	 * (and cr3) when control is in kernelspace. The kernel's map does not
	 * include (i.e. share) userspace so wild references will cause
	 * a panic. Only copyin and copyout are exempt from this.
	 */
	(void) PE_parse_boot_argn("-no_shared_cr3",
	    &no_shared_cr3, sizeof(no_shared_cr3));
	if (no_shared_cr3) {
		kprintf("Kernel not sharing user map\n");
	}

#ifdef  PMAP_TRACES
	if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof(pmap_trace))) {
		kprintf("Kernel traces for pmap operations enabled\n");
	}
#endif  /* PMAP_TRACES */

#if MACH_ASSERT
	PE_parse_boot_argn("pmap_asserts", &pmap_asserts_enabled, sizeof(pmap_asserts_enabled));
	PE_parse_boot_argn("pmap_stats_assert",
	    &pmap_stats_assert,
	    sizeof(pmap_stats_assert));
#endif /* MACH_ASSERT */
}
606
607 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)608 pmap_virtual_space(
609 vm_offset_t *startp,
610 vm_offset_t *endp)
611 {
612 *startp = virtual_avail;
613 *endp = virtual_end;
614 }
615
616
617
618
619 #if HIBERNATION
620
621 #include <IOKit/IOHibernatePrivate.h>
622 #include <machine/pal_hibernate.h>
623
624 int32_t pmap_npages;
625 int32_t pmap_teardown_last_valid_compact_indx = -1;
626
627 void pmap_pack_index(uint32_t);
628 int32_t pmap_unpack_index(pv_rooted_entry_t);
629
630 int32_t
pmap_unpack_index(pv_rooted_entry_t pv_h)631 pmap_unpack_index(pv_rooted_entry_t pv_h)
632 {
633 int32_t indx = 0;
634
635 indx = (int32_t)(*((uint64_t *)(&pv_h->qlink.next)) >> 48);
636 indx = indx << 16;
637 indx |= (int32_t)(*((uint64_t *)(&pv_h->qlink.prev)) >> 48);
638
639 *((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)0xffff << 48);
640 *((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)0xffff << 48);
641
642 return indx;
643 }
644
645
646 void
pmap_pack_index(uint32_t indx)647 pmap_pack_index(uint32_t indx)
648 {
649 pv_rooted_entry_t pv_h;
650
651 pv_h = &pv_head_table[indx];
652
653 *((uint64_t *)(&pv_h->qlink.next)) &= ~((uint64_t)0xffff << 48);
654 *((uint64_t *)(&pv_h->qlink.prev)) &= ~((uint64_t)0xffff << 48);
655
656 *((uint64_t *)(&pv_h->qlink.next)) |= ((uint64_t)(indx >> 16)) << 48;
657 *((uint64_t *)(&pv_h->qlink.prev)) |= ((uint64_t)(indx & 0xffff)) << 48;
658 }
659
660
/*
 * Hibernation teardown: compact pv_head_table in place by sliding every
 * in-use entry down over the PMAP_NULL holes, packing each entry's
 * original index into its queue links (pmap_pack_index) so the move can
 * be undone by pal_hib_rebuild_pmap_structs(). On return,
 * [*unneeded_start, *unneeded_end] is the tail of the table that no
 * longer holds live entries and need not be preserved in the image.
 */
void
pal_hib_teardown_pmap_structs(addr64_t *unneeded_start, addr64_t *unneeded_end)
{
	int32_t i;
	int32_t compact_target_indx;

	compact_target_indx = 0;

	for (i = 0; i < pmap_npages; i++) {
		if (pv_head_table[i].pmap == PMAP_NULL) {
			/* Remember the first hole; only update if we aren't
			 * already tracking an earlier one. */
			if (pv_head_table[compact_target_indx].pmap != PMAP_NULL) {
				compact_target_indx = i;
			}
		} else {
			/* Record this entry's true index inside its own links. */
			pmap_pack_index((uint32_t)i);

			if (pv_head_table[compact_target_indx].pmap == PMAP_NULL) {
				/*
				 * we've got a hole to fill, so
				 * move this pv_rooted_entry_t to its new home
				 */
				pv_head_table[compact_target_indx] = pv_head_table[i];
				pv_head_table[i].pmap = PMAP_NULL;

				pmap_teardown_last_valid_compact_indx = compact_target_indx;
				compact_target_indx++;
			} else {
				/* Entry is already in compact position. */
				pmap_teardown_last_valid_compact_indx = i;
			}
		}
	}
	/* Everything past the last compacted entry is reclaimable. */
	*unneeded_start = (addr64_t)&pv_head_table[pmap_teardown_last_valid_compact_indx + 1];
	*unneeded_end = (addr64_t)&pv_head_table[pmap_npages - 1];

	HIBLOG("pal_hib_teardown_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
}
697
698
/*
 * Hibernation resume: undo the compaction performed by
 * pal_hib_teardown_pmap_structs(). Walks the compacted entries from the
 * end backwards, recovers each entry's real index from its queue links
 * (pmap_unpack_index), moves it back to that slot, and zero-fills the
 * gaps so every unused slot is a cleared pv_rooted_entry.
 */
void
pal_hib_rebuild_pmap_structs(void)
{
	int32_t cindx, eindx, rindx = 0;
	pv_rooted_entry_t pv_h;

	/* eindx tracks the slot just past the region already rebuilt. */
	eindx = (int32_t)pmap_npages;

	for (cindx = pmap_teardown_last_valid_compact_indx; cindx >= 0; cindx--) {
		pv_h = &pv_head_table[cindx];

		/* Also restores the canonical high bits of the queue links. */
		rindx = pmap_unpack_index(pv_h);
		assert(rindx < pmap_npages);

		if (rindx != cindx) {
			/*
			 * this pv_rooted_entry_t was moved by pal_hib_teardown_pmap_structs,
			 * so move it back to its real location
			 */
			pv_head_table[rindx] = pv_head_table[cindx];
		}
		if (rindx + 1 != eindx) {
			/*
			 * the 'hole' between this vm_rooted_entry_t and the previous
			 * vm_rooted_entry_t we moved needs to be initialized as
			 * a range of zero'd vm_rooted_entry_t's
			 */
			bzero((char *)&pv_head_table[rindx + 1], (eindx - rindx - 1) * sizeof(struct pv_rooted_entry));
		}
		eindx = rindx;
	}
	/* Zero any leading slots below the lowest restored entry. */
	if (rindx) {
		bzero((char *)&pv_head_table[0], rindx * sizeof(struct pv_rooted_entry));
	}

	HIBLOG("pal_hib_rebuild_pmap_structs done: last_valid_compact_indx %d\n", pmap_teardown_last_valid_compact_indx);
}
736
737 #endif
738
739 /*
740 * Create pv entries for kernel pages mapped by early startup code.
741 * These have to exist so we can ml_static_mfree() them later.
742 */
/*
 * Create pv entries for kernel pages mapped by early startup code.
 * These have to exist so we can ml_static_mfree() them later.
 *
 * Walks [start_va, end_va) (rounded inward to page boundaries) and, for
 * each managed physical page backing the range, initializes its root pv
 * entry to record the kernel mapping. Steps by 2MB when the mapping is
 * a large page, 4K otherwise.
 */
static void
pmap_pv_fixup(vm_offset_t start_va, vm_offset_t end_va)
{
	ppnum_t ppn;
	pv_rooted_entry_t pv_h;
	uint32_t pgsz;

	start_va = round_page(start_va);
	end_va = trunc_page(end_va);
	while (start_va < end_va) {
		pgsz = PAGE_SIZE;
		ppn = pmap_find_phys(kernel_pmap, start_va);
		/* Only managed pages have pv entries; skip unmapped/unmanaged. */
		if (ppn != 0 && IS_MANAGED_PAGE(ppn)) {
			pv_h = pai_to_pvh(ppn);
			assert(pv_h->qlink.next == 0); /* shouldn't be init'd yet */
			assert(pv_h->pmap == 0);
			pv_h->va_and_flags = start_va;
			pv_h->pmap = kernel_pmap;
			queue_init(&pv_h->qlink);
			/*
			 * Note that pmap_query_pagesize does not enforce start_va is aligned
			 * on a 2M boundary if it's within a large page
			 */
			if (pmap_query_pagesize(kernel_pmap, start_va) == I386_LPGBYTES) {
				pgsz = I386_LPGBYTES;
			}
		}
		if (os_add_overflow(start_va, pgsz, &start_va)) {
			/* Wrap can only happen at the very top of the address space. */
#if DEVELOPMENT || DEBUG
			panic("pmap_pv_fixup: Unexpected address wrap (0x%lx after adding 0x%x)", start_va, pgsz);
#else
			start_va = end_va;
#endif
		}
	}
}
779
/*
 * Startup-time registration of the VA range holding the pmap's bookkeeping
 * arrays (pv_head_table, pv_hash_table, both lock tables, and the per-page
 * attribute bytes). The block computes the total size needed once
 * avail_end is known; pmap_init() later suballocates and carves it up in
 * the same order.
 */
static SECURITY_READ_ONLY_LATE(struct mach_vm_range) pmap_struct_range = {};
static __startup_data vm_map_t pmap_struct_map;
static __startup_data long pmap_npages_early;
static __startup_data vm_map_size_t pmap_struct_size;
KMEM_RANGE_REGISTER_DYNAMIC(pmap_struct, &pmap_struct_range, ^() {
	vm_map_size_t s;

	pmap_npages_early = i386_btop(avail_end);
	/*
	 * NOTE(review): `sizeof(struct pv_hashed_entry_t *)` looks like it was
	 * meant to be `sizeof(pv_hashed_entry_t)` (the hash table stores one
	 * pointer per bucket); both are pointer-sized so the total is the
	 * same — confirm before changing.
	 */
	s = (vm_map_size_t) (sizeof(struct pv_rooted_entry) * pmap_npages_early +
	    (sizeof(struct pv_hashed_entry_t *) * (npvhashbuckets)) +
	    pv_lock_table_size(pmap_npages_early) +
	    pv_hash_lock_table_size((npvhashbuckets)) +
	    pmap_npages_early);   /* one attribute byte per physical page */
	pmap_struct_size = round_page(s);
	return pmap_struct_size;
});
796
797 /*
798 * Initialize the pmap module.
799 * Called by vm_init, to initialize any structures that the pmap
800 * system needs to map virtual memory.
801 */
/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 *
 * Sets up the kernel pmap's page-table backing objects, allocates and
 * carves up the pmap bookkeeping arrays (pv heads, pv hash, lock tables,
 * per-page attributes), marks managed/no-encrypt pages, creates the pmap
 * zones, and creates pv entries for early-boot kernel mappings.
 */
void
pmap_init(void)
{
	long npages;
	vm_offset_t addr;
	vm_size_t vsize;
	vm_map_offset_t vaddr;
	ppnum_t ppn;

	/* Backing VM objects for the kernel's PML4/PDPT/PDE table pages. */
	kernel_pmap->pm_obj_pml4 = &kpml4obj_object_store;
	_vm_object_allocate((vm_object_size_t)NPML4PGS * PAGE_SIZE, &kpml4obj_object_store);

	kernel_pmap->pm_obj_pdpt = &kpdptobj_object_store;
	_vm_object_allocate((vm_object_size_t)NPDPTPGS * PAGE_SIZE, &kpdptobj_object_store);

	kernel_pmap->pm_obj = &kptobj_object_store;
	_vm_object_allocate((vm_object_size_t)NPDEPGS * PAGE_SIZE, &kptobj_object_store);

	/*
	 * Allocate memory for the pv_head_table and its lock bits,
	 * the modify bit array, and the pte_page table.
	 */

	/*
	 * zero bias all these arrays now instead of off avail_start
	 * so we cover all memory
	 */

	npages = pmap_npages_early;
	assert(npages == i386_btop(avail_end));
#if HIBERNATION
	pmap_npages = (uint32_t)npages;
#endif
	/* Carve the registered range out of kernel_map and back it with
	 * zeroed, permanent kernel memory (size computed at startup). */
	vm_map_will_allocate_early_map(&pmap_struct_map);
	pmap_struct_map = kmem_suballoc(kernel_map, &pmap_struct_range.min_address,
	    pmap_struct_size, VM_MAP_CREATE_NEVER_FAULTS,
	    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, KMS_NOFAIL | KMS_PERMANENT,
	    VM_KERN_MEMORY_PMAP).kmr_submap;
	kmem_alloc(pmap_struct_map, &addr, pmap_struct_size,
	    KMA_NOFAIL | KMA_ZERO | KMA_KOBJECT | KMA_PERMANENT,
	    VM_KERN_MEMORY_PMAP);

	vaddr = addr;
	vsize = pmap_struct_size;

#if PV_DEBUG
	if (0 == npvhashmask) {
		panic("npvhashmask not initialized");
	}
#endif

	/*
	 * Allocate the structures first to preserve word-alignment.
	 */
	pv_head_table = (pv_rooted_entry_t) addr;
	addr = (vm_offset_t) (pv_head_table + npages);

	pv_hash_table = (pv_hashed_entry_t *)addr;
	addr = (vm_offset_t) (pv_hash_table + (npvhashbuckets));

	pv_lock_table = (char *) addr;
	addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));

	pv_hash_lock_table = (char *) addr;
	addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhashbuckets)));

	/* One attribute byte per physical page takes the remainder. */
	pmap_phys_attributes = (char *) addr;

	/* Mark every conventional-memory page below avail_end as managed. */
	ppnum_t last_pn = i386_btop(avail_end);
	unsigned int i;
	pmap_memory_region_t *pmptr = pmap_memory_regions;
	for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
		if (pmptr->type != kEfiConventionalMemory) {
			continue;
		}
		ppnum_t pn;
		for (pn = pmptr->base; pn <= pmptr->end; pn++) {
			if (pn < last_pn) {
				pmap_phys_attributes[pn] |= PHYS_MANAGED;

				if (pn > last_managed_page) {
					last_managed_page = pn;
				}

				if ((pmap_high_used_bottom <= pn && pn <= pmap_high_used_top) ||
				    (pmap_middle_used_bottom <= pn && pn <= pmap_middle_used_top)) {
					pmap_phys_attributes[pn] |= PHYS_NOENCRYPT;
				}
			}
		}
	}
	/* The bookkeeping arrays themselves must never be encrypted. */
	while (vsize) {
		ppn = pmap_find_phys(kernel_pmap, vaddr);

		pmap_phys_attributes[ppn] |= PHYS_NOENCRYPT;

		vaddr += PAGE_SIZE;
		vsize -= PAGE_SIZE;
	}
	/*
	 * Create the zone of physical maps,
	 * and of the physical-to-virtual entries.
	 */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);

	/* The anchor is required to be page aligned. Zone debugging adds
	 * padding which may violate that requirement. Tell the zone
	 * subsystem that alignment is required.
	 */
	pmap_anchor_zone = zone_create("pagetable anchors", PAGE_SIZE,
	    ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);

	/* TODO: possible general optimisation...pre-allocate via zones commonly created
	 * level3/2 pagetables
	 */
	/* The anchor is required to be page aligned. Zone debugging adds
	 * padding which may violate that requirement. Tell the zone
	 * subsystem that alignment is required.
	 */
	pmap_uanchor_zone = zone_create("pagetable user anchors", PAGE_SIZE,
	    ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);

	pv_hashed_list_zone = zone_create("pv_list", sizeof(struct pv_hashed_entry),
	    ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED);

	/*
	 * Create pv entries for kernel pages that might get pmap_remove()ed.
	 *
	 * - very low pages that were identity mapped.
	 * - vm_pages[] entries that might be unused and reclaimed.
	 */
	assert((uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start <= (uintptr_t)vm_page_array_beginning_addr);
	pmap_pv_fixup((uintptr_t)VM_MIN_KERNEL_ADDRESS, (uintptr_t)VM_MIN_KERNEL_ADDRESS + avail_start);
	pmap_pv_fixup((uintptr_t)vm_page_array_beginning_addr, (uintptr_t)vm_page_array_ending_addr);

	pmap_initialized = TRUE;

	max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);

	/*
	 * Ensure the kernel's PML4 entry exists for the basement
	 * before this is shared with any user.
	 */
	pmap_expand_pml4(kernel_pmap, KERNEL_BASEMENT, PMAP_EXPAND_OPTIONS_NONE);

#if CONFIG_VMX
	pmap_ept_support_ad = vmx_hv_support() && (VMX_CAP(MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_VMX_EPT_VPID_CAP_AD_SHIFT, 1) ? TRUE : FALSE);
	pmap_eptp_flags = HV_VMX_EPTP_MEMORY_TYPE_WB | HV_VMX_EPTP_WALK_LENGTH(4) | (pmap_ept_support_ad ? HV_VMX_EPTP_ENABLE_AD_FLAGS : 0);
#endif /* CONFIG_VMX */
}
953
954 void
pmap_mark_range(pmap_t npmap,uint64_t sv,uint64_t nxrosz,boolean_t NX,boolean_t ro)955 pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolean_t ro)
956 {
957 uint64_t ev, cv = sv;
958 pd_entry_t *pdep;
959 pt_entry_t *ptep = NULL;
960
961 if (os_add_overflow(sv, nxrosz, &ev)) {
962 panic("pmap_mark_range: Unexpected address overflow: start=0x%llx size=0x%llx", sv, nxrosz);
963 }
964
965 /* XXX what if nxrosz is 0? we end up marking the page whose address is passed in via sv -- is that kosher? */
966 assert(!is_ept_pmap(npmap));
967
968 assert(((sv & 0xFFFULL) | (nxrosz & 0xFFFULL)) == 0);
969
970 for (pdep = pmap_pde(npmap, cv); pdep != NULL && (cv < ev);) {
971 uint64_t pdev = (cv & ~((uint64_t)PDEMASK));
972
973 if (*pdep & INTEL_PTE_PS) {
974 #ifdef REMAP_DEBUG
975 if ((NX ^ !!(*pdep & INTEL_PTE_NX)) || (ro ^ !!!(*pdep & INTEL_PTE_WRITE))) {
976 kprintf("WARNING: Remapping PDE for %p from %s%s%s to %s%s%s\n", (void *)cv,
977 (*pdep & INTEL_PTE_VALID) ? "R" : "",
978 (*pdep & INTEL_PTE_WRITE) ? "W" : "",
979 (*pdep & INTEL_PTE_NX) ? "" : "X",
980 "R",
981 ro ? "" : "W",
982 NX ? "" : "X");
983 }
984 #endif
985
986 if (NX) {
987 *pdep |= INTEL_PTE_NX;
988 } else {
989 *pdep &= ~INTEL_PTE_NX;
990 }
991 if (ro) {
992 *pdep &= ~INTEL_PTE_WRITE;
993 } else {
994 *pdep |= INTEL_PTE_WRITE;
995 }
996
997 if (os_add_overflow(cv, NBPD, &cv)) {
998 cv = ev;
999 } else {
1000 cv &= ~((uint64_t) PDEMASK);
1001 pdep = pmap_pde(npmap, cv);
1002 }
1003 continue;
1004 }
1005
1006 for (ptep = pmap_pte(npmap, cv); ptep != NULL && (cv < (pdev + NBPD)) && (cv < ev);) {
1007 #ifdef REMAP_DEBUG
1008 if ((NX ^ !!(*ptep & INTEL_PTE_NX)) || (ro ^ !!!(*ptep & INTEL_PTE_WRITE))) {
1009 kprintf("WARNING: Remapping PTE for %p from %s%s%s to %s%s%s\n", (void *)cv,
1010 (*ptep & INTEL_PTE_VALID) ? "R" : "",
1011 (*ptep & INTEL_PTE_WRITE) ? "W" : "",
1012 (*ptep & INTEL_PTE_NX) ? "" : "X",
1013 "R",
1014 ro ? "" : "W",
1015 NX ? "" : "X");
1016 }
1017 #endif
1018 if (NX) {
1019 *ptep |= INTEL_PTE_NX;
1020 } else {
1021 *ptep &= ~INTEL_PTE_NX;
1022 }
1023 if (ro) {
1024 *ptep &= ~INTEL_PTE_WRITE;
1025 } else {
1026 *ptep |= INTEL_PTE_WRITE;
1027 }
1028 cv += NBPT;
1029 ptep = pmap_pte(npmap, cv);
1030 }
1031 }
1032 DPRINTF("%s(0x%llx, 0x%llx, %u, %u): 0x%llx, 0x%llx\n", __FUNCTION__, sv, nxrosz, NX, ro, cv, ptep ? *ptep: 0);
1033 }
1034
1035 /*
1036 * Reclaim memory for early boot 4K page tables that were converted to large page mappings.
1037 * We know this memory is part of the KPTphys[] array that was allocated in Idle_PTs_init(),
1038 * so we can free it using its address in that array.
1039 */
1040 static void
pmap_free_early_PT(ppnum_t ppn,uint32_t cnt)1041 pmap_free_early_PT(ppnum_t ppn, uint32_t cnt)
1042 {
1043 ppnum_t KPTphys_ppn;
1044 vm_offset_t offset;
1045
1046 KPTphys_ppn = pmap_find_phys(kernel_pmap, (uintptr_t)KPTphys);
1047 assert(ppn >= KPTphys_ppn);
1048 assert(ppn + cnt <= KPTphys_ppn + NKPT);
1049 offset = (ppn - KPTphys_ppn) << PAGE_SHIFT;
1050 ml_static_mfree((uintptr_t)KPTphys + offset, PAGE_SIZE * cnt);
1051 }
1052
1053 /*
1054 * Called once VM is fully initialized so that we can release unused
1055 * sections of low memory to the general pool.
1056 * Also complete the set-up of identity-mapped sections of the kernel:
1057 * 1) write-protect kernel text
1058 * 2) map kernel text using large pages if possible
1059 * 3) read and write-protect page zero (for K32)
1060 * 4) map the global page at the appropriate virtual address.
1061 *
1062 * Use of large pages
1063 * ------------------
1064 * To effectively map and write-protect all kernel text pages, the text
1065 * must be 2M-aligned at the base, and the data section above must also be
1066 * 2M-aligned. That is, there's padding below and above. This is achieved
1067 * through linker directives. Large pages are used only if this alignment
 * exists (and not overridden by the -kernel_text_ps_4K boot-arg). The
1069 * memory layout is:
1070 *
1071 * : :
1072 * | __DATA |
1073 * sdata: ================== 2Meg
1074 * | |
1075 * | zero-padding |
1076 * | |
1077 * etext: ------------------
1078 * | |
1079 * : :
1080 * | |
1081 * | __TEXT |
1082 * | |
1083 * : :
1084 * | |
1085 * stext: ================== 2Meg
1086 * | |
1087 * | zero-padding |
1088 * | |
1089 * eHIB: ------------------
1090 * | __HIB |
1091 * : :
1092 *
1093 * Prior to changing the mapping from 4K to 2M, the zero-padding pages
1094 * [eHIB,stext] and [etext,sdata] are ml_static_mfree()'d. Then all the
1095 * 4K pages covering [stext,etext] are coalesced as 2M large pages.
1096 * The now unused level-1 PTE pages are also freed.
1097 */
/* First physical page of the kernel proper (pages below it are boot identity mappings) */
extern ppnum_t vm_kernel_base_page;
/* Count of __DATA PTEs marked NX in pmap_lowmem_finalize(); sanity-checked non-zero there */
static uint32_t dataptes = 0;
1100
1101 void
pmap_lowmem_finalize(void)1102 pmap_lowmem_finalize(void)
1103 {
1104 spl_t spl;
1105 int i;
1106
1107 /*
1108 * Update wired memory statistics for early boot pages
1109 */
1110 PMAP_ZINFO_PALLOC(kernel_pmap, bootstrap_wired_pages * PAGE_SIZE);
1111
1112 /*
1113 * Free pages in pmap regions below the base:
1114 * rdar://6332712
1115 * We can't free all the pages to VM that EFI reports available.
1116 * Pages in the range 0xc0000-0xff000 aren't safe over sleep/wake.
1117 * There's also a size miscalculation here: pend is one page less
1118 * than it should be but this is not fixed to be backwards
1119 * compatible.
1120 * This is important for KASLR because up to 256*2MB = 512MB of space
1121 * needs has to be released to VM.
1122 */
1123 for (i = 0;
1124 pmap_memory_regions[i].end < vm_kernel_base_page;
1125 i++) {
1126 vm_offset_t pbase = i386_ptob(pmap_memory_regions[i].base);
1127 vm_offset_t pend = i386_ptob(pmap_memory_regions[i].end + 1);
1128
1129 DBG("pmap region %d [%p..[%p\n",
1130 i, (void *) pbase, (void *) pend);
1131
1132 if (pmap_memory_regions[i].attribute & EFI_MEMORY_KERN_RESERVED) {
1133 continue;
1134 }
1135 /*
1136 * rdar://6332712
1137 * Adjust limits not to free pages in range 0xc0000-0xff000.
1138 */
1139 if (pbase >= 0xc0000 && pend <= 0x100000) {
1140 continue;
1141 }
1142 if (pbase < 0xc0000 && pend > 0x100000) {
1143 /* page range entirely within region, free lower part */
1144 DBG("- ml_static_mfree(%p,%p)\n",
1145 (void *) ml_static_ptovirt(pbase),
1146 (void *) (0xc0000 - pbase));
1147 ml_static_mfree(ml_static_ptovirt(pbase), 0xc0000 - pbase);
1148 pbase = 0x100000;
1149 }
1150 if (pbase < 0xc0000) {
1151 pend = MIN(pend, 0xc0000);
1152 }
1153 if (pend > 0x100000) {
1154 pbase = MAX(pbase, 0x100000);
1155 }
1156 DBG("- ml_static_mfree(%p,%p)\n",
1157 (void *) ml_static_ptovirt(pbase),
1158 (void *) (pend - pbase));
1159 ml_static_mfree(ml_static_ptovirt(pbase), pend - pbase);
1160 }
1161
1162 /* A final pass to get rid of all initial identity mappings to
1163 * low pages.
1164 */
1165 DPRINTF("%s: Removing mappings from 0->0x%lx\n", __FUNCTION__, vm_kernel_base);
1166
1167 /*
1168 * Remove all mappings past the boot-cpu descriptor aliases and low globals.
1169 * Non-boot-cpu GDT aliases will be remapped later as needed.
1170 */
1171 pmap_remove(kernel_pmap, LOWGLOBAL_ALIAS + PAGE_SIZE, vm_kernel_base);
1172
1173 /*
1174 * Release any memory for early boot 4K page table pages that got replaced
1175 * with large page mappings for vm_pages[]. We know this memory is part of
1176 * the KPTphys[] array that was allocated in Idle_PTs_init(), so we can free
1177 * it using that address.
1178 */
1179 pmap_free_early_PT(released_PT_ppn, released_PT_cnt);
1180
1181 /*
1182 * If text and data are both 2MB-aligned,
1183 * we can map text with large-pages,
1184 * unless the -kernel_text_ps_4K boot-arg overrides.
1185 */
1186 if ((stext & I386_LPGMASK) == 0 && (sdata & I386_LPGMASK) == 0) {
1187 kprintf("Kernel text is 2MB aligned");
1188 kernel_text_ps_4K = FALSE;
1189 if (PE_parse_boot_argn("-kernel_text_ps_4K",
1190 &kernel_text_ps_4K,
1191 sizeof(kernel_text_ps_4K))) {
1192 kprintf(" but will be mapped with 4K pages\n");
1193 } else {
1194 kprintf(" and will be mapped with 2M pages\n");
1195 }
1196 }
1197 #if DEVELOPMENT || DEBUG
1198 (void) PE_parse_boot_argn("wpkernel", &wpkernel, sizeof(wpkernel));
1199 #endif
1200 if (wpkernel) {
1201 kprintf("Kernel text %p-%p to be write-protected\n",
1202 (void *) stext, (void *) etext);
1203 }
1204
1205 spl = splhigh();
1206
1207 /*
1208 * Scan over text if mappings are to be changed:
1209 * - Remap kernel text readonly unless the "wpkernel" boot-arg is 0
1210 * - Change to large-pages if possible and not overriden.
1211 */
1212 if (kernel_text_ps_4K && wpkernel) {
1213 vm_offset_t myva;
1214 for (myva = stext; myva < etext; myva += PAGE_SIZE) {
1215 pt_entry_t *ptep;
1216
1217 ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
1218 if (ptep) {
1219 pmap_store_pte(FALSE, ptep, *ptep & ~INTEL_PTE_WRITE);
1220 }
1221 }
1222 }
1223
1224 if (!kernel_text_ps_4K) {
1225 vm_offset_t myva;
1226
1227 /*
1228 * Release zero-filled page padding used for 2M-alignment.
1229 */
1230 DBG("ml_static_mfree(%p,%p) for padding below text\n",
1231 (void *) eHIB, (void *) (stext - eHIB));
1232 ml_static_mfree(eHIB, stext - eHIB);
1233 DBG("ml_static_mfree(%p,%p) for padding above text\n",
1234 (void *) etext, (void *) (sdata - etext));
1235 ml_static_mfree(etext, sdata - etext);
1236
1237 /*
1238 * Coalesce text pages into large pages.
1239 */
1240 for (myva = stext; myva < sdata; myva += I386_LPGBYTES) {
1241 pt_entry_t *ptep;
1242 vm_offset_t pte_phys;
1243 pt_entry_t *pdep;
1244 pt_entry_t pde;
1245 ppnum_t KPT_ppn;
1246
1247 pdep = pmap_pde(kernel_pmap, (vm_map_offset_t)myva);
1248 KPT_ppn = (ppnum_t)((*pdep & PG_FRAME) >> PAGE_SHIFT);
1249 ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
1250 DBG("myva: %p pdep: %p ptep: %p\n",
1251 (void *) myva, (void *) pdep, (void *) ptep);
1252 if ((*ptep & INTEL_PTE_VALID) == 0) {
1253 continue;
1254 }
1255 pte_phys = (vm_offset_t)(*ptep & PG_FRAME);
1256 pde = *pdep & PTMASK; /* page attributes from pde */
1257 pde |= INTEL_PTE_PS; /* make it a 2M entry */
1258 pde |= pte_phys; /* take page frame from pte */
1259
1260 if (wpkernel) {
1261 pde &= ~INTEL_PTE_WRITE;
1262 }
1263 DBG("pmap_store_pte(%p,0x%llx)\n",
1264 (void *)pdep, pde);
1265 pmap_store_pte(FALSE, pdep, pde);
1266
1267 /*
1268 * Free the now-unused level-1 pte.
1269 */
1270 pmap_free_early_PT(KPT_ppn, 1);
1271 }
1272
1273 /* Change variable read by sysctl machdep.pmap */
1274 pmap_kernel_text_ps = I386_LPGBYTES;
1275 }
1276
1277 vm_offset_t dva;
1278
1279 for (dva = sdata; dva < edata; dva += I386_PGBYTES) {
1280 assert(((sdata | edata) & PAGE_MASK) == 0);
1281 pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva);
1282
1283 dpte = *dptep;
1284 assert((dpte & INTEL_PTE_VALID));
1285 dpte |= INTEL_PTE_NX;
1286 pmap_store_pte(FALSE, dptep, dpte);
1287 dataptes++;
1288 }
1289 assert(dataptes > 0);
1290
1291 kernel_segment_command_t * seg;
1292 kernel_section_t * sec;
1293 kc_format_t kc_format;
1294
1295 PE_get_primary_kc_format(&kc_format);
1296
1297 for (seg = firstseg(); seg != NULL; seg = nextsegfromheader(&_mh_execute_header, seg)) {
1298 if (!strcmp(seg->segname, "__TEXT") ||
1299 !strcmp(seg->segname, "__DATA")) {
1300 continue;
1301 }
1302
1303 /* XXX: FIXME_IN_dyld: This is a workaround (see below) */
1304 if (kc_format != KCFormatFileset) {
1305 //XXX
1306 if (!strcmp(seg->segname, "__KLD")) {
1307 continue;
1308 }
1309 }
1310
1311 if (!strcmp(seg->segname, "__HIB")) {
1312 for (sec = firstsect(seg); sec != NULL; sec = nextsect(seg, sec)) {
1313 if (sec->addr & PAGE_MASK) {
1314 panic("__HIB segment's sections misaligned");
1315 }
1316 if (!strcmp(sec->sectname, "__text")) {
1317 pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), FALSE, TRUE);
1318 } else {
1319 pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), TRUE, FALSE);
1320 }
1321 }
1322 } else {
1323 if (kc_format == KCFormatFileset) {
1324 #if 0
1325 /*
1326 * This block of code is commented out because it may or may not have induced an earlier panic
1327 * in ledger init.
1328 */
1329
1330
1331 boolean_t NXbit = !(seg->initprot & VM_PROT_EXECUTE),
1332 robit = (seg->initprot & (VM_PROT_READ | VM_PROT_WRITE)) == VM_PROT_READ;
1333
1334 /*
1335 * XXX: FIXME_IN_dyld: This is a workaround for primary KC containing incorrect inaccurate
1336 * initprot for segments containing code.
1337 */
1338 if (!strcmp(seg->segname, "__KLD") || !strcmp(seg->segname, "__VECTORS")) {
1339 NXbit = FALSE;
1340 robit = FALSE;
1341 }
1342
1343 pmap_mark_range(kernel_pmap, seg->vmaddr & ~(uint64_t)PAGE_MASK,
1344 round_page_64(seg->vmsize), NXbit, robit);
1345 #endif
1346
1347 /*
1348 * XXX: We are marking *every* segment with rwx permissions as a workaround
1349 * XXX: until the primary KC's kernel segments are page-aligned.
1350 */
1351 kprintf("Marking (%p, %p) as rwx\n", (void *)(seg->vmaddr & ~(uint64_t)PAGE_MASK),
1352 (void *)((seg->vmaddr & ~(uint64_t)PAGE_MASK) + round_page_64(seg->vmsize)));
1353 pmap_mark_range(kernel_pmap, seg->vmaddr & ~(uint64_t)PAGE_MASK,
1354 round_page_64(seg->vmsize), FALSE, FALSE);
1355 } else {
1356 pmap_mark_range(kernel_pmap, seg->vmaddr, round_page_64(seg->vmsize), TRUE, FALSE);
1357 }
1358 }
1359 }
1360
1361 /*
1362 * If we're debugging, map the low global vector page at the fixed
1363 * virtual address. Otherwise, remove the mapping for this.
1364 */
1365 if (debug_boot_arg) {
1366 pt_entry_t *pte = NULL;
1367 if (0 == (pte = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS))) {
1368 panic("lowmem pte");
1369 }
1370
1371 /* make sure it is defined on page boundary */
1372 assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK));
1373 pmap_store_pte(FALSE, pte, kvtophys((vm_offset_t)&lowGlo)
1374 | INTEL_PTE_REF
1375 | INTEL_PTE_MOD
1376 | INTEL_PTE_WIRED
1377 | INTEL_PTE_VALID
1378 | INTEL_PTE_WRITE
1379 | INTEL_PTE_NX);
1380
1381 #if KASAN
1382 kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
1383 #endif
1384 } else {
1385 pmap_remove(kernel_pmap,
1386 LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE);
1387 }
1388 pmap_tlbi_range(0, ~0ULL, true, 0);
1389 splx(spl);
1390 }
1391
1392 /*
1393 * Mark the const data segment as read-only, non-executable.
1394 */
1395 void
x86_64_protect_data_const()1396 x86_64_protect_data_const()
1397 {
1398 boolean_t doconstro = TRUE;
1399 #if DEVELOPMENT || DEBUG
1400 (void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro));
1401 #endif
1402 if (doconstro) {
1403 if (sconst & PAGE_MASK) {
1404 panic("CONST segment misaligned 0x%lx 0x%lx",
1405 sconst, econst);
1406 }
1407 kprintf("Marking const DATA read-only\n");
1408 pmap_protect(kernel_pmap, sconst, econst, VM_PROT_READ);
1409 }
1410 }
1411 /*
1412 * this function is only used for debugging fron the vm layer
1413 */
1414 bool
pmap_verify_free(ppnum_t pn)1415 pmap_verify_free(
1416 ppnum_t pn)
1417 {
1418 pv_rooted_entry_t pv_h;
1419 int pai;
1420 bool result;
1421
1422 assert(pn != vm_page_fictitious_addr);
1423
1424 if (!pmap_initialized) {
1425 return true;
1426 }
1427
1428 if (pn == vm_page_guard_addr) {
1429 return true;
1430 }
1431
1432 pai = ppn_to_pai(pn);
1433 if (!IS_MANAGED_PAGE(pai)) {
1434 return false;
1435 }
1436 pv_h = pai_to_pvh(pn);
1437 result = (pv_h->pmap == PMAP_NULL);
1438 return result;
1439 }
1440
1441 #if MACH_ASSERT
1442 void
pmap_assert_free(ppnum_t pn)1443 pmap_assert_free(ppnum_t pn)
1444 {
1445 int pai;
1446 pv_rooted_entry_t pv_h = NULL;
1447 pmap_t pmap = NULL;
1448 vm_offset_t va = 0;
1449 static char buffer[32];
1450 static char *pr_name = "not managed pn";
1451 uint_t attr;
1452 pt_entry_t *ptep;
1453 pt_entry_t pte = -1ull;
1454
1455 if (pmap_verify_free(pn)) {
1456 return;
1457 }
1458
1459 if (pn > last_managed_page) {
1460 attr = 0xff;
1461 goto done;
1462 }
1463
1464 pai = ppn_to_pai(pn);
1465 attr = pmap_phys_attributes[pai];
1466 pv_h = pai_to_pvh(pai);
1467 va = pv_h->va_and_flags;
1468 pmap = pv_h->pmap;
1469 if (pmap == kernel_pmap) {
1470 pr_name = "kernel";
1471 } else if (pmap == NULL) {
1472 pr_name = "pmap NULL";
1473 } else if (pmap->pmap_procname[0] != 0) {
1474 pr_name = &pmap->pmap_procname[0];
1475 } else {
1476 snprintf(buffer, sizeof(buffer), "pmap %p", pv_h->pmap);
1477 pr_name = buffer;
1478 }
1479
1480 if (pmap != NULL) {
1481 ptep = pmap_pte(pmap, va);
1482 if (ptep != NULL) {
1483 pte = (uintptr_t)*ptep;
1484 }
1485 }
1486
1487 done:
1488 panic("page not FREE page: 0x%lx attr: 0x%x %s va: 0x%lx PTE: 0x%llx",
1489 (ulong_t)pn, attr, pr_name, va, pte);
1490 }
1491 #endif /* MACH_ASSERT */
1492
1493 boolean_t
pmap_is_empty(pmap_t pmap,vm_map_offset_t va_start,vm_map_offset_t va_end)1494 pmap_is_empty(
1495 pmap_t pmap,
1496 vm_map_offset_t va_start,
1497 vm_map_offset_t va_end)
1498 {
1499 vm_map_offset_t offset;
1500 ppnum_t phys_page;
1501 ledger_amount_t phys_mem;
1502
1503 if (pmap == PMAP_NULL) {
1504 return TRUE;
1505 }
1506
1507 /*
1508 * Check the ledger's phys_mem value
1509 * - if it's zero, the pmap is completely empty.
1510 * This short-circuit test prevents a virtual address scan which is
1511 * painfully slow for 64-bit spaces.
1512 * This assumes the count is correct
1513 * .. the debug kernel ought to be checking perhaps by page table walk.
1514 */
1515 if (pmap != kernel_pmap) {
1516 ledger_get_balance(pmap->ledger, task_ledgers.phys_mem, &phys_mem);
1517 if (phys_mem == 0) {
1518 return TRUE;
1519 }
1520 }
1521
1522 for (offset = va_start;
1523 offset < va_end;
1524 offset += PAGE_SIZE_64) {
1525 phys_page = pmap_find_phys(pmap, offset);
1526 if (phys_page) {
1527 kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1528 "page %d at 0x%llx\n",
1529 pmap, va_start, va_end, phys_page, offset);
1530 return FALSE;
1531 }
1532 }
1533
1534 return TRUE;
1535 }
1536
1537 void
hv_ept_pmap_create(void ** ept_pmap,void ** eptp)1538 hv_ept_pmap_create(void **ept_pmap, void **eptp)
1539 {
1540 pmap_t p;
1541
1542 if ((ept_pmap == NULL) || (eptp == NULL)) {
1543 return;
1544 }
1545
1546 p = pmap_create_options(get_task_ledger(current_task()), 0, (PMAP_CREATE_64BIT | PMAP_CREATE_EPT));
1547 if (p == PMAP_NULL) {
1548 *ept_pmap = NULL;
1549 *eptp = NULL;
1550 return;
1551 }
1552
1553 assert(is_ept_pmap(p));
1554
1555 *ept_pmap = (void*)p;
1556 *eptp = (void*)(p->pm_eptp);
1557 return;
1558 }
1559
1560 /*
1561 * pmap_create() is used by some special, legacy 3rd party kexts.
1562 * In our kernel code, always use pmap_create_options().
1563 */
1564 extern pmap_t pmap_create(ledger_t ledger, vm_map_size_t sz, boolean_t is_64bit);
1565
1566 __attribute__((used))
1567 pmap_t
pmap_create(ledger_t ledger,vm_map_size_t sz,boolean_t is_64bit)1568 pmap_create(
1569 ledger_t ledger,
1570 vm_map_size_t sz,
1571 boolean_t is_64bit)
1572 {
1573 return pmap_create_options(ledger, sz, is_64bit ? PMAP_CREATE_64BIT : 0);
1574 }
1575
1576 /*
1577 * Create and return a physical map.
1578 *
1579 * If the size specified for the map
1580 * is zero, the map is an actual physical
1581 * map, and may be referenced by the
1582 * hardware.
1583 *
1584 * If the size specified is non-zero,
1585 * the map will be used in software only, and
1586 * is bounded by that size.
1587 */
1588
1589 pmap_t
pmap_create_options(ledger_t ledger,vm_map_size_t sz,unsigned int flags)1590 pmap_create_options(
1591 ledger_t ledger,
1592 vm_map_size_t sz,
1593 unsigned int flags)
1594 {
1595 pmap_t p;
1596 vm_size_t size;
1597 pml4_entry_t *pml4;
1598 pml4_entry_t *kpml4;
1599 int i;
1600
1601 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, sz, flags);
1602
1603 size = (vm_size_t) sz;
1604
1605 /*
1606 * A software use-only map doesn't even need a map.
1607 */
1608
1609 if (size != 0) {
1610 return PMAP_NULL;
1611 }
1612
1613 /*
1614 * Return error when unrecognized flags are passed.
1615 */
1616 if (__improbable((flags & ~(PMAP_CREATE_KNOWN_FLAGS)) != 0)) {
1617 return PMAP_NULL;
1618 }
1619
1620 p = zalloc_flags(pmap_zone, Z_WAITOK | Z_ZERO);
1621 if (PMAP_NULL == p) {
1622 panic("pmap_create zalloc");
1623 }
1624
1625 lck_rw_init(&p->pmap_rwl, &pmap_lck_grp, &pmap_lck_rw_attr);
1626 p->pmap_rwl.lck_rw_can_sleep = FALSE;
1627
1628 os_ref_init(&p->ref_count, NULL);
1629 #if DEVELOPMENT || DEBUG
1630 p->nx_enabled = 1;
1631 #endif
1632 p->pm_shared = FALSE;
1633 ledger_reference(ledger);
1634 p->ledger = ledger;
1635
1636 p->pm_task_map = ((flags & PMAP_CREATE_64BIT) ? TASK_MAP_64BIT : TASK_MAP_32BIT);
1637
1638 p->pagezero_accessible = FALSE;
1639 p->pm_vm_map_cs_enforced = FALSE;
1640
1641 if (pmap_pcid_ncpus) {
1642 pmap_pcid_initialize(p);
1643 }
1644
1645 p->pm_pml4 = zalloc(pmap_anchor_zone);
1646 p->pm_upml4 = zalloc(pmap_uanchor_zone); //cleanup for EPT
1647
1648 pmap_assert((((uintptr_t)p->pm_pml4) & PAGE_MASK) == 0);
1649 pmap_assert((((uintptr_t)p->pm_upml4) & PAGE_MASK) == 0);
1650
1651 memset((char *)p->pm_pml4, 0, PAGE_SIZE);
1652 memset((char *)p->pm_upml4, 0, PAGE_SIZE);
1653
1654 if (flags & PMAP_CREATE_EPT) {
1655 p->pm_eptp = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4) | pmap_eptp_flags;
1656 p->pm_cr3 = 0;
1657 } else {
1658 p->pm_eptp = 0;
1659 p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4);
1660 p->pm_ucr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_upml4);
1661 }
1662
1663 /* allocate the vm_objs to hold the pdpt, pde and pte pages */
1664
1665 p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS) *PAGE_SIZE);
1666 if (NULL == p->pm_obj_pml4) {
1667 panic("pmap_create pdpt obj");
1668 }
1669
1670 p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS) *PAGE_SIZE);
1671 if (NULL == p->pm_obj_pdpt) {
1672 panic("pmap_create pdpt obj");
1673 }
1674
1675 p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS) *PAGE_SIZE);
1676 if (NULL == p->pm_obj) {
1677 panic("pmap_create pte obj");
1678 }
1679
1680 if (!(flags & PMAP_CREATE_EPT)) {
1681 /* All host pmaps share the kernel's pml4 */
1682 pml4 = pmap64_pml4(p, 0ULL);
1683 kpml4 = kernel_pmap->pm_pml4;
1684 for (i = KERNEL_PML4_INDEX; i < (KERNEL_PML4_INDEX + KERNEL_PML4_COUNT); i++) {
1685 pml4[i] = kpml4[i];
1686 }
1687 pml4[KERNEL_KEXTS_INDEX] = kpml4[KERNEL_KEXTS_INDEX];
1688 for (i = KERNEL_PHYSMAP_PML4_INDEX; i < (KERNEL_PHYSMAP_PML4_INDEX + KERNEL_PHYSMAP_PML4_COUNT); i++) {
1689 pml4[i] = kpml4[i];
1690 }
1691 pml4[KERNEL_DBLMAP_PML4_INDEX] = kpml4[KERNEL_DBLMAP_PML4_INDEX];
1692 #if KASAN
1693 for (i = KERNEL_KASAN_PML4_FIRST; i <= KERNEL_KASAN_PML4_LAST; i++) {
1694 pml4[i] = kpml4[i];
1695 }
1696 #endif
1697 pml4_entry_t *pml4u = pmap64_user_pml4(p, 0ULL);
1698 pml4u[KERNEL_DBLMAP_PML4_INDEX] = kpml4[KERNEL_DBLMAP_PML4_INDEX];
1699 }
1700
1701 #if MACH_ASSERT
1702 p->pmap_stats_assert = TRUE;
1703 p->pmap_pid = 0;
1704 strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
1705 #endif /* MACH_ASSERT */
1706
1707 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END,
1708 VM_KERNEL_ADDRHIDE(p));
1709
1710 return p;
1711 }
1712
1713 /*
1714 * We maintain stats and ledgers so that a task's physical footprint is:
1715 * phys_footprint = ((internal - alternate_accounting)
1716 * + (internal_compressed - alternate_accounting_compressed)
1717 * + iokit_mapped
1718 * + purgeable_nonvolatile
1719 * + purgeable_nonvolatile_compressed
1720 * + page_table)
1721 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
1722 */
1723
1724 #if MACH_ASSERT
1725 static void pmap_check_ledgers(pmap_t pmap);
1726 #else /* MACH_ASSERT */
1727 static inline void
pmap_check_ledgers(__unused pmap_t pmap)1728 pmap_check_ledgers(__unused pmap_t pmap)
1729 {
1730 }
1731 #endif /* MACH_ASSERT */
1732
1733 /*
1734 * Retire the given physical map from service.
1735 * Should only be called if the map contains
1736 * no valid mappings.
1737 */
1738 extern int vm_wired_objects_page_count;
1739
1740 void
pmap_destroy(pmap_t p)1741 pmap_destroy(pmap_t p)
1742 {
1743 os_ref_count_t c;
1744
1745 if (p == PMAP_NULL) {
1746 return;
1747 }
1748
1749 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
1750 VM_KERNEL_ADDRHIDe(p));
1751
1752 PMAP_LOCK_EXCLUSIVE(p);
1753
1754 c = os_ref_release_locked(&p->ref_count);
1755
1756 pmap_assert((current_thread() && (current_thread()->map)) ? (current_thread()->map->pmap != p) : TRUE);
1757
1758 if (c == 0) {
1759 /*
1760 * If some cpu is not using the physical pmap pointer that it
1761 * is supposed to be (see set_dirbase), we might be using the
1762 * pmap that is being destroyed! Make sure we are
1763 * physically on the right pmap:
1764 */
1765 PMAP_UPDATE_TLBS(p, 0x0ULL, 0xFFFFFFFFFFFFF000ULL);
1766 if (pmap_pcid_ncpus) {
1767 pmap_destroy_pcid_sync(p);
1768 }
1769 }
1770
1771 PMAP_UNLOCK_EXCLUSIVE(p);
1772
1773 if (c != 0) {
1774 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
1775 pmap_assert(p == kernel_pmap);
1776 return; /* still in use */
1777 }
1778
1779 /*
1780 * Free the memory maps, then the
1781 * pmap structure.
1782 */
1783 int inuse_ptepages = 0;
1784
1785 zfree(pmap_anchor_zone, p->pm_pml4);
1786 zfree(pmap_uanchor_zone, p->pm_upml4);
1787
1788 inuse_ptepages += p->pm_obj_pml4->resident_page_count;
1789 vm_object_deallocate(p->pm_obj_pml4);
1790
1791 inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
1792 vm_object_deallocate(p->pm_obj_pdpt);
1793
1794 inuse_ptepages += p->pm_obj->resident_page_count;
1795 vm_object_deallocate(p->pm_obj);
1796
1797 OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count);
1798 PMAP_ZINFO_PFREE(p, inuse_ptepages * PAGE_SIZE);
1799
1800 pmap_check_ledgers(p);
1801 ledger_dereference(p->ledger);
1802 lck_rw_destroy(&p->pmap_rwl, &pmap_lck_grp);
1803 zfree(pmap_zone, p);
1804
1805 PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
1806 }
1807
1808 /*
1809 * Add a reference to the specified pmap.
1810 */
1811
1812 void
pmap_reference(pmap_t p)1813 pmap_reference(pmap_t p)
1814 {
1815 if (p != PMAP_NULL) {
1816 PMAP_LOCK_EXCLUSIVE(p);
1817 os_ref_retain_locked(&p->ref_count);
1818 PMAP_UNLOCK_EXCLUSIVE(p);
1819 }
1820 }
1821
1822 /*
1823 * Remove phys addr if mapped in specified map
1824 *
1825 */
1826 void
pmap_remove_some_phys(__unused pmap_t map,__unused ppnum_t pn)1827 pmap_remove_some_phys(
1828 __unused pmap_t map,
1829 __unused ppnum_t pn)
1830 {
1831 /* Implement to support working set code */
1832 }
1833
1834
1835 void
pmap_protect(pmap_t map,vm_map_offset_t sva,vm_map_offset_t eva,vm_prot_t prot)1836 pmap_protect(
1837 pmap_t map,
1838 vm_map_offset_t sva,
1839 vm_map_offset_t eva,
1840 vm_prot_t prot)
1841 {
1842 pmap_protect_options(map, sva, eva, prot, 0, NULL);
1843 }
1844
1845
1846 /*
1847 * Set the physical protection on the
1848 * specified range of this map as requested.
1849 *
1850 * VERY IMPORTANT: Will *NOT* increase permissions.
1851 * pmap_protect_options() should protect the range against any access types
1852 * that are not in "prot" but it should never grant extra access.
1853 * For example, if "prot" is READ|EXECUTE, that means "remove write
1854 * access" but it does *not* mean "add read and execute" access.
 * VM relies on getting soft-faults to enforce extra checks (code
 * signing, for example).
1857 * New access permissions are granted via pmap_enter() only.
1858 * ***NOTE***:
1859 * The only exception is for EPT pmaps, where we MUST populate all exec
1860 * bits when the protection API is invoked (so that the HV fault handler
1861 * can make decisions based on the exit qualification information, which
1862 * includes the execute bits in the EPT entries. Soft-faulting them
1863 * in would cause a chicken-and-egg problem where the HV fault handler
1864 * would not be able to identify mode-based execute control (MBE) faults.)
1865 */
1866 void
pmap_protect_options(pmap_t map,vm_map_offset_t sva,vm_map_offset_t eva,vm_prot_t prot,unsigned int options,void * arg)1867 pmap_protect_options(
1868 pmap_t map,
1869 vm_map_offset_t sva,
1870 vm_map_offset_t eva,
1871 vm_prot_t prot,
1872 unsigned int options,
1873 void *arg)
1874 {
1875 pt_entry_t *pde;
1876 pt_entry_t *spte, *epte;
1877 vm_map_offset_t lva;
1878 vm_map_offset_t orig_sva;
1879 boolean_t set_NX;
1880 int num_found = 0;
1881 boolean_t is_ept;
1882 uint64_t cur_vaddr;
1883
1884 pmap_intr_assert();
1885
1886 if (map == PMAP_NULL) {
1887 return;
1888 }
1889
1890 if (prot == VM_PROT_NONE) {
1891 pmap_remove_options(map, sva, eva, options);
1892 return;
1893 }
1894
1895 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
1896 VM_KERNEL_ADDRHIDE(map), VM_KERNEL_ADDRHIDE(sva),
1897 VM_KERNEL_ADDRHIDE(eva));
1898
1899 is_ept = is_ept_pmap(map);
1900
1901 if ((prot & VM_PROT_EXECUTE) || __improbable(is_ept && (prot & VM_PROT_UEXEC))) {
1902 set_NX = FALSE;
1903 } else {
1904 set_NX = TRUE;
1905 }
1906
1907 #if DEVELOPMENT || DEBUG
1908 if (__improbable(set_NX && (!nx_enabled || !map->nx_enabled))) {
1909 set_NX = FALSE;
1910 }
1911 #endif
1912 PMAP_LOCK_EXCLUSIVE(map);
1913
1914 orig_sva = sva;
1915 cur_vaddr = sva;
1916 while (sva < eva) {
1917 uint64_t vaddr_incr;
1918
1919 if (os_add_overflow(sva, PDE_MAPPED_SIZE, &lva)) {
1920 lva = eva;
1921 } else {
1922 lva &= ~(PDE_MAPPED_SIZE - 1);
1923
1924 if (lva > eva) {
1925 lva = eva;
1926 }
1927 }
1928
1929 pde = pmap_pde(map, sva);
1930 if (pde && (*pde & PTE_VALID_MASK(is_ept))) {
1931 if (*pde & PTE_PS) {
1932 /* superpage */
1933 spte = pde;
1934 epte = spte + 1; /* excluded */
1935 vaddr_incr = I386_LPGBYTES;
1936 } else {
1937 spte = pmap_pte(map, (sva & ~(PDE_MAPPED_SIZE - 1)));
1938 spte = &spte[ptenum(sva)];
1939 epte = &spte[intel_btop(lva - sva)];
1940 vaddr_incr = I386_PGBYTES;
1941 }
1942
1943 for (; spte < epte; spte++) {
1944 uint64_t clear_bits, set_bits;
1945
1946 if (!(*spte & PTE_VALID_MASK(is_ept))) {
1947 continue;
1948 }
1949
1950 clear_bits = 0;
1951 set_bits = 0;
1952
1953 if (is_ept) {
1954 if (!(prot & VM_PROT_READ)) {
1955 clear_bits |= PTE_READ(is_ept);
1956 }
1957 }
1958 if (!(prot & VM_PROT_WRITE)) {
1959 clear_bits |= PTE_WRITE(is_ept);
1960 }
1961 #if DEVELOPMENT || DEBUG
1962 else if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) &&
1963 map == kernel_pmap) {
1964 set_bits |= PTE_WRITE(is_ept);
1965 }
1966 #endif /* DEVELOPMENT || DEBUG */
1967
1968 if (set_NX) {
1969 if (!is_ept) {
1970 set_bits |= INTEL_PTE_NX;
1971 } else {
1972 clear_bits |= INTEL_EPT_EX | INTEL_EPT_UEX;
1973 }
1974 } else if (is_ept) {
1975 /* This is the exception to the "Don't add permissions" statement, above */
1976 set_bits |= ((prot & VM_PROT_EXECUTE) ? INTEL_EPT_EX : 0) |
1977 ((prot & VM_PROT_UEXEC) ? INTEL_EPT_UEX : 0);
1978 }
1979
1980 pmap_update_pte(is_ept, spte, clear_bits, set_bits, false);
1981
1982 DTRACE_VM3(set_pte, pmap_t, map, void *, cur_vaddr, uint64_t, *spte);
1983 cur_vaddr += vaddr_incr;
1984
1985 num_found++;
1986 }
1987 }
1988 sva = lva;
1989 }
1990 if (num_found) {
1991 if (options & PMAP_OPTIONS_NOFLUSH) {
1992 PMAP_UPDATE_TLBS_DELAYED(map, orig_sva, eva, (pmap_flush_context *)arg);
1993 } else {
1994 PMAP_UPDATE_TLBS(map, orig_sva, eva);
1995 }
1996 }
1997
1998 PMAP_UNLOCK_EXCLUSIVE(map);
1999
2000 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
2001 }
2002
2003 /* Map a (possibly) autogenned block */
2004 kern_return_t
pmap_map_block_addr(pmap_t pmap,addr64_t va,pmap_paddr_t pa,uint32_t size,vm_prot_t prot,int attr,unsigned int flags)2005 pmap_map_block_addr(
2006 pmap_t pmap,
2007 addr64_t va,
2008 pmap_paddr_t pa,
2009 uint32_t size,
2010 vm_prot_t prot,
2011 int attr,
2012 unsigned int flags)
2013 {
2014 return pmap_map_block(pmap, va, intel_btop(pa), size, prot, attr, flags);
2015 }
2016
/*
 * Map a contiguous run of physical pages starting at page number 'pa'
 * at virtual address 'va' with the given protection and attributes.
 *
 * pmap  - address space to map into
 * va    - starting virtual address
 * pa    - starting physical page number
 * size  - length of the run, in 4K pages
 * prot  - protections for the new mappings
 * attr  - mapping attributes; VM_MEM_SUPERPAGE selects superpage mappings
 * flags - currently unused
 *
 * Returns KERN_SUCCESS, or panics if any pmap_enter() fails (see below).
 */
kern_return_t
pmap_map_block(
	pmap_t pmap,
	addr64_t va,
	ppnum_t pa,
	uint32_t size,
	vm_prot_t prot,
	int attr,
	__unused unsigned int flags)
{
	kern_return_t kr;
	addr64_t original_va = va;   /* remembered for the (currently unreachable) rollback */
	uint32_t page;
	int cur_page_size;

	if (attr & VM_MEM_SUPERPAGE) {
		cur_page_size = SUPERPAGE_SIZE;
	} else {
		cur_page_size = PAGE_SIZE;
	}

	/* 'size' counts 4K pages, so advance by pages-per-mapping each pass. */
	for (page = 0; page < size; page += cur_page_size / PAGE_SIZE) {
		kr = pmap_enter(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE, PMAP_MAPPING_TYPE_INFER);

		if (kr != KERN_SUCCESS) {
			/*
			 * This will panic for now, as it is unclear that
			 * removing the mappings is correct.
			 */
			panic("%s: failed pmap_enter, "
			    "pmap=%p, va=%#llx, pa=%u, size=%u, prot=%#x, flags=%#x",
			    __FUNCTION__,
			    pmap, va, pa, size, prot, flags);

			/* Dead code while the panic above stands: undo the
			 * partial mapping and report the failure. */
			pmap_remove(pmap, original_va, va - original_va);
			return kr;
		}

		va += cur_page_size;
		pa += cur_page_size / PAGE_SIZE;
	}

	return KERN_SUCCESS;
}
2061
/*
 * Fill in the PML4 (top-level) slot covering 'vaddr' by allocating,
 * zeroing, wiring, and installing a new PDPT page.
 *
 * map     - pmap to expand
 * vaddr   - virtual address whose translation needs a PDPT
 * options - PMAP_EXPAND_OPTIONS_NOWAIT: return a shortage error rather
 *           than blocking for a free page
 *
 * Returns KERN_SUCCESS (including when another thread raced us and
 * expanded first), or KERN_RESOURCE_SHORTAGE under NOWAIT.
 */
kern_return_t
pmap_expand_pml4(
	pmap_t map,
	vm_map_offset_t vaddr,
	unsigned int options)
{
	vm_page_t m;
	pmap_paddr_t pa;
	uint64_t i;
	ppnum_t pn;
	pml4_entry_t *pml4p;
	boolean_t is_ept = is_ept_pmap(map);

	DBG("pmap_expand_pml4(%p,%p)\n", map, (void *)vaddr);

	/* With the exception of the kext "basement", the kernel's level 4
	 * pagetables must not be dynamically expanded.
	 */
	assert(map != kernel_pmap || (vaddr == KERNEL_BASEMENT));
	/*
	 * Allocate a VM page for the pml4 page
	 */
	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
		if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
			return KERN_RESOURCE_SHORTAGE;
		}
		VM_PAGE_WAIT();
	}
	/*
	 * put the page into the pmap's obj list so it
	 * can be found later.
	 */
	pn = VM_PAGE_GET_PHYS_PAGE(m);
	pa = i386_ptob(pn);
	i = pml4idx(map, vaddr);

	/*
	 * Zero the page.
	 */
	pmap_zero_page(pn);

	vm_page_lockspin_queues();
	vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
	vm_page_unlock_queues();

	OSAddAtomic(1, &inuse_ptepages_count);
	OSAddAtomic64(1, &alloc_ptepages_count);
	PMAP_ZINFO_PALLOC(map, PAGE_SIZE);

	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
	vm_object_lock(map->pm_obj_pml4);

	PMAP_LOCK_EXCLUSIVE(map);
	/*
	 * See if someone else expanded us first
	 */
	if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
		/* Lost the race: release everything and discard our page. */
		PMAP_UNLOCK_EXCLUSIVE(map);
		vm_object_unlock(map->pm_obj_pml4);

		VM_PAGE_FREE(m);

		OSAddAtomic(-1, &inuse_ptepages_count);
		PMAP_ZINFO_PFREE(map, PAGE_SIZE);
		return KERN_SUCCESS;
	}

#if 0 /* DEBUG */
	if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE)) {
		panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx",
		    map, map->pm_obj_pml4, vaddr, i);
	}
#endif
	vm_page_insert_wired(m, map->pm_obj_pml4, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
	vm_object_unlock(map->pm_obj_pml4);

	/*
	 * Set the page directory entry for this page table.
	 */
	pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */

	/*
	 * Note that INTEL_EPT_UEX is unconditionally set (as is INTEL_EPT_EX) for
	 * all intermediate paging levels, from PML4Es to PDEs. Processors with
	 * VT-x implementations that do not support MBE ignore the INTEL_EPT_UEX
	 * bit at all levels of the EPT, so there is no risk of inducing EPT
	 * violation faults.
	 */
	pmap_store_pte(is_ept, pml4p, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));
	pml4_entry_t *upml4p;

	/* Install the same entry in the user copy of the PML4 as well;
	 * the two top-level tables must stay in agreement. */
	upml4p = pmap64_user_pml4(map, vaddr);
	pmap_store_pte(is_ept, upml4p, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));

	PMAP_UNLOCK_EXCLUSIVE(map);

	return KERN_SUCCESS;
}
2166
/*
 * Fill in the PDPT slot covering 'vaddr' by allocating, zeroing,
 * wiring, and installing a new page-directory page.  Expands the PML4
 * level first if necessary.
 *
 * map     - pmap to expand
 * vaddr   - virtual address whose translation needs a page directory
 * options - PMAP_EXPAND_OPTIONS_NOWAIT: return a shortage error rather
 *           than blocking for a free page
 *
 * Returns KERN_SUCCESS (including when another thread raced us and
 * expanded first), or KERN_RESOURCE_SHORTAGE under NOWAIT.
 */
kern_return_t
pmap_expand_pdpt(pmap_t map, vm_map_offset_t vaddr, unsigned int options)
{
	vm_page_t m;
	pmap_paddr_t pa;
	uint64_t i;
	ppnum_t pn;
	pdpt_entry_t *pdptp;
	boolean_t is_ept = is_ept_pmap(map);

	DBG("pmap_expand_pdpt(%p,%p)\n", map, (void *)vaddr);

	/* Make sure the level above us exists before we install into it. */
	while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
		kern_return_t pep4kr = pmap_expand_pml4(map, vaddr, options);
		if (pep4kr != KERN_SUCCESS) {
			return pep4kr;
		}
	}

	/*
	 * Allocate a VM page for the pdpt page
	 */
	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
		if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
			return KERN_RESOURCE_SHORTAGE;
		}
		VM_PAGE_WAIT();
	}

	/*
	 * put the page into the pmap's obj list so it
	 * can be found later.
	 */
	pn = VM_PAGE_GET_PHYS_PAGE(m);
	pa = i386_ptob(pn);
	i = pdptidx(map, vaddr);

	/*
	 * Zero the page.
	 */
	pmap_zero_page(pn);

	vm_page_lockspin_queues();
	vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
	vm_page_unlock_queues();

	OSAddAtomic(1, &inuse_ptepages_count);
	OSAddAtomic64(1, &alloc_ptepages_count);
	PMAP_ZINFO_PALLOC(map, PAGE_SIZE);

	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
	vm_object_lock(map->pm_obj_pdpt);

	PMAP_LOCK_EXCLUSIVE(map);
	/*
	 * See if someone else expanded us first
	 */
	if (pmap_pde(map, vaddr) != PD_ENTRY_NULL) {
		/* Lost the race: release everything and discard our page. */
		PMAP_UNLOCK_EXCLUSIVE(map);
		vm_object_unlock(map->pm_obj_pdpt);

		VM_PAGE_FREE(m);

		OSAddAtomic(-1, &inuse_ptepages_count);
		PMAP_ZINFO_PFREE(map, PAGE_SIZE);
		return KERN_SUCCESS;
	}

#if 0 /* DEBUG */
	if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE)) {
		panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx",
		    map, map->pm_obj_pdpt, vaddr, i);
	}
#endif
	vm_page_insert_wired(m, map->pm_obj_pdpt, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
	vm_object_unlock(map->pm_obj_pdpt);

	/*
	 * Set the page directory entry for this page table.
	 */
	pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */

	pmap_store_pte(is_ept, pdptp, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));

	PMAP_UNLOCK_EXCLUSIVE(map);

	return KERN_SUCCESS;
}
2258
2259
2260
2261 /*
2262 * Routine: pmap_expand
2263 *
2264 * Expands a pmap to be able to map the specified virtual address.
2265 *
2266 * Allocates new virtual memory for the P0 or P1 portion of the
2267 * pmap, then re-maps the physical pages that were in the old
2268 * pmap to be in the new pmap.
2269 *
2270 * Must be called with the pmap system and the pmap unlocked,
2271 * since these must be unlocked to use vm_allocate or vm_deallocate.
2272 * Thus it must be called in a loop that checks whether the map
2273 * has been expanded enough.
2274 * (We won't loop forever, since page tables aren't shrunk.)
2275 */
/*
 * Expand 'map' so it can translate 'vaddr': allocate, zero, wire, and
 * install a new page-table page, expanding the PDPT level first if
 * necessary.  See the block comment above for the historical contract.
 *
 * options - PMAP_EXPAND_OPTIONS_NOWAIT: fail rather than wait for a page
 *           PMAP_EXPAND_OPTIONS_ALIASMAP: tolerate out-of-range kernel
 *           vaddrs (and assert the upper levels already exist)
 *
 * Returns KERN_SUCCESS (including when another thread raced us and
 * expanded first), or KERN_RESOURCE_SHORTAGE under NOWAIT.
 */
kern_return_t
pmap_expand(
	pmap_t map,
	vm_map_offset_t vaddr,
	unsigned int options)
{
	pt_entry_t *pdp;
	vm_page_t m;
	pmap_paddr_t pa;
	uint64_t i;
	ppnum_t pn;
	boolean_t is_ept = is_ept_pmap(map);


	/*
	 * For the kernel, the virtual address must be in or above the basement
	 * which is for kexts and is in the 512GB immediately below the kernel..
	 * XXX - should use VM_MIN_KERNEL_AND_KEXT_ADDRESS not KERNEL_BASEMENT
	 */
	if (__improbable(map == kernel_pmap &&
	    !(vaddr >= KERNEL_BASEMENT && vaddr <= VM_MAX_KERNEL_ADDRESS))) {
		if ((options & PMAP_EXPAND_OPTIONS_ALIASMAP) == 0) {
			panic("pmap_expand: bad vaddr 0x%llx for kernel pmap", vaddr);
		}
	}

	/* Make sure the level above us exists before we install into it. */
	while ((pdp = pmap_pde(map, vaddr)) == PD_ENTRY_NULL) {
		assert((options & PMAP_EXPAND_OPTIONS_ALIASMAP) == 0);
		kern_return_t pepkr = pmap_expand_pdpt(map, vaddr, options);
		if (pepkr != KERN_SUCCESS) {
			return pepkr;
		}
	}

	/*
	 * Allocate a VM page for the pde entries.
	 */
	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
		if (options & PMAP_EXPAND_OPTIONS_NOWAIT) {
			return KERN_RESOURCE_SHORTAGE;
		}
		VM_PAGE_WAIT();
	}

	/*
	 * put the page into the pmap's obj list so it
	 * can be found later.
	 */
	pn = VM_PAGE_GET_PHYS_PAGE(m);
	pa = i386_ptob(pn);
	i = pdeidx(map, vaddr);

	/*
	 * Zero the page.
	 */
	pmap_zero_page(pn);

	vm_page_lockspin_queues();
	vm_page_wire(m, VM_KERN_MEMORY_PTE, TRUE);
	vm_page_unlock_queues();

	OSAddAtomic(1, &inuse_ptepages_count);
	OSAddAtomic64(1, &alloc_ptepages_count);
	PMAP_ZINFO_PALLOC(map, PAGE_SIZE);

	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
	vm_object_lock(map->pm_obj);

	PMAP_LOCK_EXCLUSIVE(map);

	/*
	 * See if someone else expanded us first
	 */
	if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
		/* Lost the race: release everything and discard our page. */
		PMAP_UNLOCK_EXCLUSIVE(map);
		vm_object_unlock(map->pm_obj);

		VM_PAGE_FREE(m);

		OSAddAtomic(-1, &inuse_ptepages_count); //todo replace all with inlines
		PMAP_ZINFO_PFREE(map, PAGE_SIZE);
		return KERN_SUCCESS;
	}

#if 0 /* DEBUG */
	if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE)) {
		panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx",
		    map, map->pm_obj, vaddr, i);
	}
#endif
	vm_page_insert_wired(m, map->pm_obj, (vm_object_offset_t)i * PAGE_SIZE, VM_KERN_MEMORY_PTE);
	vm_object_unlock(map->pm_obj);

	/*
	 * Set the page directory entry for this page table.
	 */
	pdp = pmap_pde(map, vaddr);

	pmap_store_pte(is_ept, pdp, pa_to_pte(pa)
	    | PTE_READ(is_ept)
	    | (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER)
	    | PTE_WRITE(is_ept));

	PMAP_UNLOCK_EXCLUSIVE(map);

	return KERN_SUCCESS;
}
2383 /*
2384 * Query a pmap to see what size a given virtual address is mapped with.
2385 * If the vaddr is not mapped, returns 0.
2386 */
2387 vm_size_t
pmap_query_pagesize(pmap_t pmap,vm_map_offset_t vaddr)2388 pmap_query_pagesize(
2389 pmap_t pmap,
2390 vm_map_offset_t vaddr)
2391 {
2392 pd_entry_t *pdep;
2393 vm_size_t size = 0;
2394
2395 assert(!is_ept_pmap(pmap));
2396 PMAP_LOCK_EXCLUSIVE(pmap);
2397
2398 pdep = pmap_pde(pmap, vaddr);
2399 if (pdep != PD_ENTRY_NULL) {
2400 if (*pdep & INTEL_PTE_PS) {
2401 size = I386_LPGBYTES;
2402 } else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) {
2403 size = I386_PGBYTES;
2404 }
2405 }
2406
2407 PMAP_UNLOCK_EXCLUSIVE(pmap);
2408
2409 return size;
2410 }
2411
2412 uint32_t
pmap_user_va_bits(pmap_t pmap __unused)2413 pmap_user_va_bits(pmap_t pmap __unused)
2414 {
2415 /* x86 has constant set of bits based on 4 level paging. */
2416 return 48;
2417 }
2418
2419 uint32_t
pmap_kernel_va_bits(void)2420 pmap_kernel_va_bits(void)
2421 {
2422 /* x86 has constant set of bits based on 4 level paging. */
2423 return 48;
2424 }
2425
2426 /*
2427 * Ensure the page table hierarchy is filled in down to
2428 * the large page level. Additionally returns FAILURE if
2429 * a lower page table already exists.
2430 */
2431 static kern_return_t
pmap_pre_expand_large_internal(pmap_t pmap,vm_map_offset_t vaddr)2432 pmap_pre_expand_large_internal(
2433 pmap_t pmap,
2434 vm_map_offset_t vaddr)
2435 {
2436 ppnum_t pn;
2437 pt_entry_t *pte;
2438 boolean_t is_ept = is_ept_pmap(pmap);
2439 kern_return_t kr = KERN_SUCCESS;
2440
2441 if (pmap64_pdpt(pmap, vaddr) == PDPT_ENTRY_NULL) {
2442 if (!pmap_next_page_hi(&pn, FALSE)) {
2443 panic("pmap_pre_expand_large no PDPT");
2444 }
2445
2446 pmap_zero_page(pn);
2447
2448 pte = pmap64_pml4(pmap, vaddr);
2449
2450 pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
2451 PTE_READ(is_ept) |
2452 (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
2453 PTE_WRITE(is_ept));
2454
2455 pte = pmap64_user_pml4(pmap, vaddr);
2456
2457 pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
2458 PTE_READ(is_ept) |
2459 (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
2460 PTE_WRITE(is_ept));
2461 }
2462
2463 if (pmap_pde(pmap, vaddr) == PD_ENTRY_NULL) {
2464 if (!pmap_next_page_hi(&pn, FALSE)) {
2465 panic("pmap_pre_expand_large no PDE");
2466 }
2467
2468 pmap_zero_page(pn);
2469
2470 pte = pmap64_pdpt(pmap, vaddr);
2471
2472 pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
2473 PTE_READ(is_ept) |
2474 (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
2475 PTE_WRITE(is_ept));
2476 } else if (pmap_pte(pmap, vaddr) != PT_ENTRY_NULL) {
2477 kr = KERN_FAILURE;
2478 }
2479
2480 return kr;
2481 }
2482
2483 /*
2484 * Wrapper that locks the pmap.
2485 */
2486 kern_return_t
pmap_pre_expand_large(pmap_t pmap,vm_map_offset_t vaddr)2487 pmap_pre_expand_large(
2488 pmap_t pmap,
2489 vm_map_offset_t vaddr)
2490 {
2491 kern_return_t kr;
2492
2493 PMAP_LOCK_EXCLUSIVE(pmap);
2494 kr = pmap_pre_expand_large_internal(pmap, vaddr);
2495 PMAP_UNLOCK_EXCLUSIVE(pmap);
2496 return kr;
2497 }
2498
2499 /*
2500 * On large memory machines, pmap_steal_memory() will allocate past
2501 * the 1GB of pre-allocated/mapped virtual kernel area. This function
2502 * expands kernel the page tables to cover a given vaddr. It uses pages
2503 * from the same pool that pmap_steal_memory() uses, since vm_page_grab()
2504 * isn't available yet.
2505 */
2506 void
pmap_pre_expand(pmap_t pmap,vm_map_offset_t vaddr)2507 pmap_pre_expand(
2508 pmap_t pmap,
2509 vm_map_offset_t vaddr)
2510 {
2511 ppnum_t pn;
2512 pt_entry_t *pte;
2513 boolean_t is_ept = is_ept_pmap(pmap);
2514
2515 /*
2516 * This returns failure if a 4K page table already exists.
2517 * Othewise it fills in the page table hierarchy down
2518 * to that level.
2519 */
2520 PMAP_LOCK_EXCLUSIVE(pmap);
2521 if (pmap_pre_expand_large_internal(pmap, vaddr) == KERN_FAILURE) {
2522 PMAP_UNLOCK_EXCLUSIVE(pmap);
2523 return;
2524 }
2525
2526 /* Add the lowest table */
2527 if (!pmap_next_page_hi(&pn, FALSE)) {
2528 panic("pmap_pre_expand");
2529 }
2530
2531 pmap_zero_page(pn);
2532
2533 pte = pmap_pde(pmap, vaddr);
2534
2535 pmap_store_pte(is_ept, pte, pa_to_pte(i386_ptob(pn)) |
2536 PTE_READ(is_ept) |
2537 (is_ept ? (INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_USER) |
2538 PTE_WRITE(is_ept));
2539 PMAP_UNLOCK_EXCLUSIVE(pmap);
2540 }
2541
2542 /*
2543 * pmap_sync_page_data_phys(ppnum_t pa)
2544 *
2545 * Invalidates all of the instruction cache on a physical page and
2546 * pushes any dirty data from the data cache for the same physical page
2547 * Not required in i386.
2548 */
2549 void
pmap_sync_page_data_phys(__unused ppnum_t pa)2550 pmap_sync_page_data_phys(__unused ppnum_t pa)
2551 {
2552 return;
2553 }
2554
2555 /*
2556 * pmap_sync_page_attributes_phys(ppnum_t pa)
2557 *
2558 * Write back and invalidate all cachelines on a physical page.
2559 */
2560 void
pmap_sync_page_attributes_phys(ppnum_t pa)2561 pmap_sync_page_attributes_phys(ppnum_t pa)
2562 {
2563 cache_flush_page_phys(pa);
2564 }
2565
2566 void
pmap_copy_page(ppnum_t src,ppnum_t dst)2567 pmap_copy_page(ppnum_t src, ppnum_t dst)
2568 {
2569 bcopy_phys((addr64_t)i386_ptob(src),
2570 (addr64_t)i386_ptob(dst),
2571 PAGE_SIZE);
2572 }
2573
2574
2575 /*
2576 * Routine: pmap_pageable
2577 * Function:
2578 * Make the specified pages (by pmap, offset)
2579 * pageable (or not) as requested.
2580 *
2581 * A page which is not pageable may not take
2582 * a fault; therefore, its page table entry
2583 * must remain valid for the duration.
2584 *
2585 * This routine is merely advisory; pmap_enter
2586 * will specify that these pages are to be wired
2587 * down (or not) as appropriate.
2588 */
void
pmap_pageable(
	__unused pmap_t pmap,
	__unused vm_map_offset_t start_addr,
	__unused vm_map_offset_t end_addr,
	__unused boolean_t pageable)
{
	/* Advisory only (see block comment above); no-op on this
	 * architecture — wiring is requested via pmap_enter(). */
#ifdef lint
	pmap++; start_addr++; end_addr++; pageable++;
#endif  /* lint */
}
2600
2601 void
invalidate_icache(__unused vm_offset_t addr,__unused unsigned cnt,__unused int phys)2602 invalidate_icache(__unused vm_offset_t addr,
2603 __unused unsigned cnt,
2604 __unused int phys)
2605 {
2606 return;
2607 }
2608
2609 void
flush_dcache(__unused vm_offset_t addr,__unused unsigned count,__unused int phys)2610 flush_dcache(__unused vm_offset_t addr,
2611 __unused unsigned count,
2612 __unused int phys)
2613 {
2614 return;
2615 }
2616
2617 #if CONFIG_DTRACE
2618 /*
2619 * Constrain DTrace copyin/copyout actions
2620 */
2621 extern kern_return_t dtrace_copyio_preflight(addr64_t);
2622 extern kern_return_t dtrace_copyio_postflight(addr64_t);
2623
2624 kern_return_t
dtrace_copyio_preflight(__unused addr64_t va)2625 dtrace_copyio_preflight(__unused addr64_t va)
2626 {
2627 thread_t thread = current_thread();
2628 uint64_t ccr3;
2629 if (current_map() == kernel_map) {
2630 return KERN_FAILURE;
2631 } else if (((ccr3 = get_cr3_base()) != thread->map->pmap->pm_cr3) && (no_shared_cr3 == FALSE)) {
2632 return KERN_FAILURE;
2633 } else if (no_shared_cr3 && (ccr3 != kernel_pmap->pm_cr3)) {
2634 return KERN_FAILURE;
2635 } else {
2636 return KERN_SUCCESS;
2637 }
2638 }
2639
2640 kern_return_t
dtrace_copyio_postflight(__unused addr64_t va)2641 dtrace_copyio_postflight(__unused addr64_t va)
2642 {
2643 return KERN_SUCCESS;
2644 }
2645 #endif /* CONFIG_DTRACE */
2646
2647 #include <mach_vm_debug.h>
2648 #if MACH_VM_DEBUG
2649 #include <vm/vm_debug_internal.h>
2650
2651 int
pmap_list_resident_pages(__unused pmap_t pmap,__unused vm_offset_t * listp,__unused int space)2652 pmap_list_resident_pages(
2653 __unused pmap_t pmap,
2654 __unused vm_offset_t *listp,
2655 __unused int space)
2656 {
2657 return 0;
2658 }
2659 #endif /* MACH_VM_DEBUG */
2660
2661
2662 #if CONFIG_COREDUMP
/* temporary workaround */
/*
 * Decide whether the page at 'va' may be touched while generating a
 * core dump.  Returns FALSE for ranges backed by a device pager
 * (presumably because device memory should not be read by the dumper
 * — see vm_map_entry_has_device_pager); TRUE otherwise.
 */
boolean_t
coredumpok(vm_map_t map, mach_vm_offset_t va)
{
#if 0
	pt_entry_t *ptep;

	ptep = pmap_pte(map->pmap, va);
	if (0 == ptep) {
		return FALSE;
	}
	return (*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED);
#else
	if (vm_map_entry_has_device_pager(map, va)) {
		return FALSE;
	}
	return TRUE;
#endif
}
2682 #endif
2683
2684 boolean_t
phys_page_exists(ppnum_t pn)2685 phys_page_exists(ppnum_t pn)
2686 {
2687 assert(pn != vm_page_fictitious_addr);
2688
2689 if (!pmap_initialized) {
2690 return TRUE;
2691 }
2692
2693 if (pn == vm_page_guard_addr) {
2694 return FALSE;
2695 }
2696
2697 if (!IS_MANAGED_PAGE(ppn_to_pai(pn))) {
2698 return FALSE;
2699 }
2700
2701 return TRUE;
2702 }
2703
2704
2705
/*
 * Switch the current CPU to the address space of 'tpmap' by installing
 * its page-table base via set_dirbase().  Must be called with
 * interrupts disabled.
 */
void
pmap_switch(pmap_t tpmap)
{
	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(tpmap));
	assert(ml_get_interrupts_enabled() == FALSE);
	set_dirbase(tpmap, current_thread(), cpu_number());
	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
2714
2715 void
pmap_require(pmap_t pmap)2716 pmap_require(pmap_t pmap)
2717 {
2718 if (pmap != kernel_pmap) {
2719 zone_id_require(ZONE_ID_PMAP, sizeof(struct pmap), pmap);
2720 }
2721 }
2722
2723 /*
2724 * disable no-execute capability on
2725 * the specified pmap
2726 */
2727 void
pmap_disable_NX(__unused pmap_t pmap)2728 pmap_disable_NX(__unused pmap_t pmap)
2729 {
2730 #if DEVELOPMENT || DEBUG
2731 pmap->nx_enabled = 0;
2732 #endif
2733 }
2734
2735 void
pmap_flush_context_init(pmap_flush_context * pfc)2736 pmap_flush_context_init(pmap_flush_context *pfc)
2737 {
2738 pfc->pfc_cpus = 0;
2739 pfc->pfc_invalid_global = 0;
2740 }
2741
2742 static bool
pmap_tlbi_response(uint32_t lcpu,uint32_t rcpu,bool ngflush)2743 pmap_tlbi_response(uint32_t lcpu, uint32_t rcpu, bool ngflush)
2744 {
2745 bool responded = false;
2746 bool gflushed = (cpu_datap(rcpu)->cpu_tlb_invalid_global_count !=
2747 cpu_datap(lcpu)->cpu_tlb_gen_counts_global[rcpu]);
2748
2749 if (ngflush) {
2750 if (gflushed) {
2751 responded = true;
2752 }
2753 } else {
2754 if (gflushed) {
2755 responded = true;
2756 } else {
2757 bool lflushed = (cpu_datap(rcpu)->cpu_tlb_invalid_local_count !=
2758 cpu_datap(lcpu)->cpu_tlb_gen_counts_local[rcpu]);
2759 if (lflushed) {
2760 responded = true;
2761 }
2762 }
2763 }
2764
2765 if (responded == false) {
2766 if ((cpu_datap(rcpu)->cpu_tlb_invalid == 0) ||
2767 !CPU_CR3_IS_ACTIVE(rcpu) ||
2768 !cpu_is_running(rcpu)) {
2769 responded = true;
2770 }
2771 }
2772 return responded;
2773 }
2774
2775 extern uint64_t TLBTimeOut;
/*
 * Issue the TLB invalidations that were deferred into a
 * pmap_flush_context (via PMAP_UPDATE_TLBS_DELAYED /
 * PMAP_DELAY_TLB_FLUSH): mark each recorded CPU invalid, signal the
 * remote ones, flush the local TLB if needed, and wait (up to
 * TLBTimeOut) for the remote CPUs to acknowledge.
 */
void
pmap_flush(
	pmap_flush_context *pfc)
{
	unsigned int my_cpu;
	unsigned int cpu;
	cpumask_t cpu_bit;
	cpumask_t cpus_to_respond = 0;
	cpumask_t cpus_to_signal = 0;
	cpumask_t cpus_signaled = 0;
	boolean_t flush_self = FALSE;
	uint64_t deadline;
	bool need_global_flush = false;

	mp_disable_preemption();

	my_cpu = cpu_number();
	cpus_to_signal = pfc->pfc_cpus;

	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_START,
	    NULL, cpus_to_signal);

	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus && cpus_to_signal; cpu++, cpu_bit <<= 1) {
		if (cpus_to_signal & cpu_bit) {
			cpus_to_signal &= ~cpu_bit;

			if (!cpu_is_running(cpu)) {
				continue;
			}

			/* Mark the CPU invalid (global or local), record the
			 * generation counts we expect to advance, and fence so
			 * the stores are visible before the IPI. */
			if (pfc->pfc_invalid_global & cpu_bit) {
				cpu_datap(cpu)->cpu_tlb_invalid_global = 1;
				need_global_flush = true;
			} else {
				cpu_datap(cpu)->cpu_tlb_invalid_local = 1;
			}
			cpu_datap(my_cpu)->cpu_tlb_gen_counts_global[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_global_count;
			cpu_datap(my_cpu)->cpu_tlb_gen_counts_local[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_local_count;
			mfence();

			if (cpu == my_cpu) {
				flush_self = TRUE;
				continue;
			}
			if (CPU_CR3_IS_ACTIVE(cpu)) {
				cpus_to_respond |= cpu_bit;
				i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
			}
		}
	}
	cpus_signaled = cpus_to_respond;

	/*
	 * Flush local tlb if required.
	 * Do this now to overlap with other processors responding.
	 */
	if (flush_self) {
		process_pmap_updates(NULL, (pfc->pfc_invalid_global != 0), 0ULL, ~0ULL);
	}

	if (cpus_to_respond) {
		deadline = mach_absolute_time() +
		    (TLBTimeOut ? TLBTimeOut : LockTimeOut);
		boolean_t is_timeout_traced = FALSE;

		/*
		 * Wait for those other cpus to acknowledge
		 */
		while (cpus_to_respond != 0) {
			long orig_acks = 0;

			for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
				bool responded = false;
				if ((cpus_to_respond & cpu_bit) != 0) {
					responded = pmap_tlbi_response(my_cpu, cpu, need_global_flush);
					if (responded) {
						cpus_to_respond &= ~cpu_bit;
					}
					cpu_pause();
				}

				if (cpus_to_respond == 0) {
					break;
				}
			}
			if (cpus_to_respond && (mach_absolute_time() > deadline)) {
				if (machine_timeout_suspended()) {
					continue;
				}
				if (TLBTimeOut == 0) {
					/* Cut a tracepoint (once) but don't panic. */
					if (is_timeout_traced) {
						continue;
					}

					PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
					    NULL, cpus_to_signal, cpus_to_respond);

					is_timeout_traced = TRUE;
					continue;
				}
				orig_acks = NMIPI_acks;
				NMIPI_panic(cpus_to_respond, TLB_FLUSH_TIMEOUT);
				panic("Uninterruptible processor(s): CPU bitmap: 0x%llx, NMIPI acks: 0x%lx, now: 0x%lx, deadline: %llu",
				    cpus_to_respond, orig_acks, NMIPI_acks, deadline);
			}
		}
	}

	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_DELAYED_TLBS) | DBG_FUNC_END,
	    NULL, cpus_signaled, flush_self);

	mp_enable_preemption();
}
2889
2890
/*
 * Execute the INVEPT instruction on the local CPU to invalidate
 * cached EPT translations for the given EPT pointer (single-context
 * invalidation).  Broadcast to all CPUs via mp_cpus_call() by the
 * caller (see pmap_flush_tlbs).
 */
static void
invept(void *eptp)
{
	/* INVEPT takes a 128-bit, 16-byte-aligned descriptor: the EPTP
	 * followed by a reserved quadword that must be zero. */
	struct {
		uint64_t eptp;
		uint64_t reserved;
	} __attribute__((aligned(16), packed)) invept_descriptor = {(uint64_t)eptp, 0};

	__asm__ volatile ("invept (%%rax), %%rcx"
	    : : "c" (PMAP_INVEPT_SINGLE_CONTEXT), "a" (&invept_descriptor)
	    : "cc", "memory");
}
2903
2904 /*
2905 * Called with pmap locked, we:
2906 * - scan through per-cpu data to see which other cpus need to flush
2907 * - send an IPI to each non-idle cpu to be flushed
2908 * - wait for all to signal back that they are inactive or we see that
2909 * they are at a safe point (idle).
2910 * - flush the local tlb if active for this pmap
2911 * - return ... the caller will unlock the pmap
2912 */
2913
2914 void
pmap_flush_tlbs(pmap_t pmap,vm_map_offset_t startv,vm_map_offset_t endv,int options,pmap_flush_context * pfc)2915 pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv, int options, pmap_flush_context *pfc)
2916 {
2917 unsigned int cpu;
2918 cpumask_t cpu_bit;
2919 cpumask_t cpus_to_signal = 0;
2920 unsigned int my_cpu = cpu_number();
2921 pmap_paddr_t pmap_cr3 = pmap->pm_cr3;
2922 boolean_t flush_self = FALSE;
2923 uint64_t deadline;
2924 boolean_t pmap_is_shared = (pmap->pm_shared || (pmap == kernel_pmap));
2925 bool need_global_flush = false;
2926 uint32_t event_code = 0;
2927 vm_map_offset_t event_startv = 0, event_endv = 0;
2928 boolean_t is_ept = is_ept_pmap(pmap);
2929
2930 assert((processor_avail_count < 2) ||
2931 (ml_get_interrupts_enabled() && get_preemption_level() != 0));
2932
2933 assert((endv - startv) >= PAGE_SIZE);
2934 assert(((endv | startv) & PAGE_MASK) == 0);
2935
2936 if (__improbable(kdebug_enable)) {
2937 if (pmap == kernel_pmap) {
2938 event_code = PMAP_CODE(PMAP__FLUSH_KERN_TLBS);
2939 event_startv = VM_KERNEL_UNSLIDE_OR_PERM(startv);
2940 event_endv = VM_KERNEL_UNSLIDE_OR_PERM(endv);
2941 } else if (__improbable(is_ept)) {
2942 event_code = PMAP_CODE(PMAP__FLUSH_EPT);
2943 event_startv = startv;
2944 event_endv = endv;
2945 } else {
2946 event_code = PMAP_CODE(PMAP__FLUSH_TLBS);
2947 event_startv = startv;
2948 event_endv = endv;
2949 }
2950 }
2951
2952 PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_START,
2953 VM_KERNEL_UNSLIDE_OR_PERM(pmap), options,
2954 event_startv, event_endv);
2955
2956 if (__improbable(is_ept)) {
2957 mp_cpus_call(CPUMASK_ALL, ASYNC, invept, (void*)pmap->pm_eptp);
2958 goto out;
2959 }
2960
2961 /*
2962 * Scan other cpus for matching active or task CR3.
2963 * For idle cpus (with no active map) we mark them invalid but
2964 * don't signal -- they'll check as they go busy.
2965 */
2966 if (pmap_pcid_ncpus) {
2967 if (pmap_is_shared) {
2968 need_global_flush = true;
2969 }
2970 pmap_pcid_invalidate_all_cpus(pmap);
2971 mfence();
2972 }
2973
2974 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
2975 if (!cpu_is_running(cpu)) {
2976 continue;
2977 }
2978 uint64_t cpu_active_cr3 = CPU_GET_ACTIVE_CR3(cpu);
2979 uint64_t cpu_task_cr3 = CPU_GET_TASK_CR3(cpu);
2980
2981 if ((pmap_cr3 == cpu_task_cr3) ||
2982 (pmap_cr3 == cpu_active_cr3) ||
2983 (pmap_is_shared)) {
2984 if (options & PMAP_DELAY_TLB_FLUSH) {
2985 if (need_global_flush == true) {
2986 pfc->pfc_invalid_global |= cpu_bit;
2987 }
2988 pfc->pfc_cpus |= cpu_bit;
2989
2990 continue;
2991 }
2992 if (need_global_flush == true) {
2993 cpu_datap(my_cpu)->cpu_tlb_gen_counts_global[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_global_count;
2994 cpu_datap(cpu)->cpu_tlb_invalid_global = 1;
2995 } else {
2996 cpu_datap(my_cpu)->cpu_tlb_gen_counts_local[cpu] = cpu_datap(cpu)->cpu_tlb_invalid_local_count;
2997 cpu_datap(cpu)->cpu_tlb_invalid_local = 1;
2998 }
2999
3000 if (cpu == my_cpu) {
3001 flush_self = TRUE;
3002 continue;
3003 }
3004
3005 mfence();
3006
3007 /*
3008 * We don't need to signal processors which will flush
3009 * lazily at the idle state or kernel boundary.
3010 * For example, if we're invalidating the kernel pmap,
3011 * processors currently in userspace don't need to flush
3012 * their TLBs until the next time they enter the kernel.
3013 * Alterations to the address space of a task active
3014 * on a remote processor result in a signal, to
3015 * account for copy operations. (There may be room
3016 * for optimization in such cases).
3017 * The order of the loads below with respect
3018 * to the store to the "cpu_tlb_invalid" field above
3019 * is important--hence the barrier.
3020 */
3021 if (CPU_CR3_IS_ACTIVE(cpu) &&
3022 (pmap_cr3 == CPU_GET_ACTIVE_CR3(cpu) ||
3023 pmap->pm_shared ||
3024 (pmap_cr3 == CPU_GET_TASK_CR3(cpu)))) {
3025 cpus_to_signal |= cpu_bit;
3026 i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
3027 }
3028 }
3029 }
3030
3031 if ((options & PMAP_DELAY_TLB_FLUSH)) {
3032 goto out;
3033 }
3034
3035 /*
3036 * Flush local tlb if required.
3037 * Do this now to overlap with other processors responding.
3038 */
3039 if (flush_self) {
3040 process_pmap_updates(pmap, pmap_is_shared, startv, endv);
3041 }
3042
3043 if (cpus_to_signal) {
3044 cpumask_t cpus_to_respond = cpus_to_signal;
3045
3046 deadline = mach_absolute_time() +
3047 (TLBTimeOut ? TLBTimeOut : LockTimeOut);
3048 boolean_t is_timeout_traced = FALSE;
3049
3050 /*
3051 * Wait for those other cpus to acknowledge
3052 */
3053 while (cpus_to_respond != 0) {
3054 long orig_acks = 0;
3055
3056 for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
3057 bool responded = false;
3058 if ((cpus_to_respond & cpu_bit) != 0) {
3059 responded = pmap_tlbi_response(my_cpu, cpu, need_global_flush);
3060 if (responded) {
3061 cpus_to_respond &= ~cpu_bit;
3062 }
3063 cpu_pause();
3064 }
3065 if (cpus_to_respond == 0) {
3066 break;
3067 }
3068 }
3069 if (cpus_to_respond && (mach_absolute_time() > deadline)) {
3070 if (machine_timeout_suspended()) {
3071 continue;
3072 }
3073 if (TLBTimeOut == 0) {
3074 /* cut tracepoint but don't panic */
3075 if (is_timeout_traced) {
3076 continue;
3077 }
3078
3079 PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS_TO),
3080 VM_KERNEL_UNSLIDE_OR_PERM(pmap),
3081 cpus_to_signal,
3082 cpus_to_respond);
3083
3084 is_timeout_traced = TRUE;
3085 continue;
3086 }
3087 orig_acks = NMIPI_acks;
3088 uint64_t tstamp1 = mach_absolute_time();
3089 NMIPI_panic(cpus_to_respond, TLB_FLUSH_TIMEOUT);
3090 uint64_t tstamp2 = mach_absolute_time();
3091 panic("IPI timeout, unresponsive CPU bitmap: 0x%llx, NMIPI acks: 0x%lx, now: 0x%lx, deadline: %llu, pre-NMIPI time: 0x%llx, current: 0x%llx, global: %d",
3092 cpus_to_respond, orig_acks, NMIPI_acks, deadline, tstamp1, tstamp2, need_global_flush);
3093 }
3094 }
3095 }
3096
3097 if (__improbable((pmap == kernel_pmap) && (flush_self != TRUE))) {
3098 panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, pmap_cr3: 0x%llx, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, pmap_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map);
3099 }
3100
3101 out:
3102 PMAP_TRACE_CONSTANT(event_code | DBG_FUNC_END,
3103 VM_KERNEL_UNSLIDE_OR_PERM(pmap), cpus_to_signal,
3104 event_startv, event_endv);
3105 }
3106
/*
 * Service a pending TLB invalidation on the local CPU: bump the
 * appropriate generation count (global or local), clear the pending
 * flag, and invalidate translations for [istart, iend).
 *
 * p       - pmap being invalidated, or NULL for "current/all"
 * pshared - true if the pmap is shared (forces a global invalidation)
 */
static void
process_pmap_updates(pmap_t p, bool pshared, addr64_t istart, addr64_t iend)
{
	int ccpu = cpu_number();
	bool gtlbf = false;

	pmap_assert(ml_get_interrupts_enabled() == 0 ||
	    get_preemption_level() != 0);

	/* Acknowledge the request by advancing the matching generation
	 * count before clearing the pending flag (pmap_tlbi_response()
	 * on the initiator watches these counters). */
	if (cpu_datap(ccpu)->cpu_tlb_invalid_global) {
		cpu_datap(ccpu)->cpu_tlb_invalid_global_count++;
		cpu_datap(ccpu)->cpu_tlb_invalid = 0;
		gtlbf = true;
	} else {
		cpu_datap(ccpu)->cpu_tlb_invalid_local_count++;
		cpu_datap(ccpu)->cpu_tlb_invalid_local = 0;
	}

	if (pmap_pcid_ncpus) {
		if (p) {
			/* TODO global generation count to
			 * avoid potentially redundant
			 * csw invalidations post-global invalidation
			 */
			pmap_pcid_validate_cpu(p, ccpu);
			pmap_tlbi_range(istart, iend, (pshared || gtlbf), p->pmap_pcid_cpus[ccpu]);
		} else {
			pmap_pcid_validate_current();
			pmap_tlbi_range(istart, iend, true, 0);
		}
	} else {
		/* Without PCID support, invalidate the whole range. */
		pmap_tlbi_range(0, ~0ULL, true, 0);
	}
}
3141
3142 void
pmap_update_interrupt(void)3143 pmap_update_interrupt(void)
3144 {
3145 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START);
3146
3147 if (current_cpu_datap()->cpu_tlb_invalid) {
3148 process_pmap_updates(NULL, true, 0ULL, ~0ULL);
3149 }
3150
3151 PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END);
3152 }
3153
3154 #include <mach/mach_vm.h> /* mach_vm_region_recurse() */
3155 /* Scan kernel pmap for W+X PTEs, scan kernel VM map for W+X map entries
3156 * and identify ranges with mismatched VM permissions and PTE permissions
3157 */
3158 kern_return_t
pmap_permissions_verify(pmap_t ipmap,vm_map_t ivmmap,vm_offset_t sv,vm_offset_t ev)3159 pmap_permissions_verify(pmap_t ipmap, vm_map_t ivmmap, vm_offset_t sv, vm_offset_t ev)
3160 {
3161 vm_offset_t cv = sv;
3162 kern_return_t rv = KERN_SUCCESS;
3163 uint64_t skip4 = 0, skip2 = 0;
3164
3165 assert(!is_ept_pmap(ipmap));
3166
3167 sv &= ~PAGE_MASK_64;
3168 ev &= ~PAGE_MASK_64;
3169 while (cv < ev) {
3170 if (__improbable((cv > 0x00007FFFFFFFFFFFULL) &&
3171 (cv < 0xFFFF800000000000ULL))) {
3172 cv = 0xFFFF800000000000ULL;
3173 }
3174 /* Potential inconsistencies from not holding pmap lock
3175 * but harmless for the moment.
3176 */
3177 if (((cv & PML4MASK) == 0) && (pmap64_pml4(ipmap, cv) == 0)) {
3178 if ((cv + NBPML4) > cv) {
3179 cv += NBPML4;
3180 } else {
3181 break;
3182 }
3183 skip4++;
3184 continue;
3185 }
3186 if (((cv & PDMASK) == 0) && (pmap_pde(ipmap, cv) == 0)) {
3187 if ((cv + NBPD) > cv) {
3188 cv += NBPD;
3189 } else {
3190 break;
3191 }
3192 skip2++;
3193 continue;
3194 }
3195
3196 pt_entry_t *ptep = pmap_pte(ipmap, cv);
3197 if (ptep && (*ptep & INTEL_PTE_VALID)) {
3198 if (*ptep & INTEL_PTE_WRITE) {
3199 if (!(*ptep & INTEL_PTE_NX)) {
3200 kprintf("W+X PTE at 0x%lx, P4: 0x%llx, P3: 0x%llx, P2: 0x%llx, PT: 0x%llx, VP: %u\n", cv, *pmap64_pml4(ipmap, cv), *pmap64_pdpt(ipmap, cv), *pmap_pde(ipmap, cv), *ptep, pmap_valid_page((ppnum_t)(i386_btop(pte_to_pa(*ptep)))));
3201 rv = KERN_FAILURE;
3202 }
3203 }
3204 }
3205 cv += PAGE_SIZE;
3206 }
3207 kprintf("Completed pmap scan\n");
3208 cv = sv;
3209
3210 struct vm_region_submap_info_64 vbr;
3211 mach_msg_type_number_t vbrcount = 0;
3212 mach_vm_size_t vmsize;
3213 vm_prot_t prot;
3214 uint32_t nesting_depth = 0;
3215 kern_return_t kret;
3216
3217 while (cv < ev) {
3218 for (;;) {
3219 vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64;
3220 if ((kret = mach_vm_region_recurse(ivmmap,
3221 (mach_vm_address_t *) &cv, &vmsize, &nesting_depth,
3222 (vm_region_recurse_info_t)&vbr,
3223 &vbrcount)) != KERN_SUCCESS) {
3224 break;
3225 }
3226
3227 if (vbr.is_submap) {
3228 nesting_depth++;
3229 continue;
3230 } else {
3231 break;
3232 }
3233 }
3234
3235 if (kret != KERN_SUCCESS) {
3236 break;
3237 }
3238
3239 prot = vbr.protection;
3240
3241 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) {
3242 kprintf("W+X map entry at address 0x%lx\n", cv);
3243 rv = KERN_FAILURE;
3244 }
3245
3246 if (prot) {
3247 vm_offset_t pcv;
3248 for (pcv = cv; pcv < cv + vmsize; pcv += PAGE_SIZE) {
3249 pt_entry_t *ptep = pmap_pte(ipmap, pcv);
3250 vm_prot_t tprot;
3251
3252 if ((ptep == NULL) || !(*ptep & INTEL_PTE_VALID)) {
3253 continue;
3254 }
3255 tprot = VM_PROT_READ;
3256 if (*ptep & INTEL_PTE_WRITE) {
3257 tprot |= VM_PROT_WRITE;
3258 }
3259 if ((*ptep & INTEL_PTE_NX) == 0) {
3260 tprot |= VM_PROT_EXECUTE;
3261 }
3262 if (tprot != prot) {
3263 kprintf("PTE/map entry permissions mismatch at address 0x%lx, pte: 0x%llx, protection: 0x%x\n", pcv, *ptep, prot);
3264 rv = KERN_FAILURE;
3265 }
3266 }
3267 }
3268 cv += vmsize;
3269 }
3270 return rv;
3271 }
3272
3273 #if MACH_ASSERT
3274 extern int pmap_ledgers_panic;
3275 extern int pmap_ledgers_panic_leeway;
3276
3277 static void
pmap_check_ledgers(pmap_t pmap)3278 pmap_check_ledgers(
3279 pmap_t pmap)
3280 {
3281 int pid;
3282 char *procname;
3283
3284 if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
3285 /*
3286 * This pmap was not or is no longer fully associated
3287 * with a task (e.g. the old pmap after a fork()/exec() or
3288 * spawn()). Its "ledger" still points at a task that is
3289 * now using a different (and active) address space, so
3290 * we can't check that all the pmap ledgers are balanced here.
3291 *
3292 * If the "pid" is set, that means that we went through
3293 * pmap_set_process() in task_terminate_internal(), so
3294 * this task's ledger should not have been re-used and
3295 * all the pmap ledgers should be back to 0.
3296 */
3297 return;
3298 }
3299
3300 pid = pmap->pmap_pid;
3301 procname = pmap->pmap_procname;
3302
3303 vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
3304 }
3305
3306 void
pmap_set_process(pmap_t pmap,int pid,char * procname)3307 pmap_set_process(
3308 pmap_t pmap,
3309 int pid,
3310 char *procname)
3311 {
3312 if (pmap == NULL || pmap->pmap_pid == -1) {
3313 return;
3314 }
3315
3316 pmap->pmap_pid = pid;
3317 strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
3318 if (pmap_ledgers_panic_leeway) {
3319 /*
3320 * XXX FBDP
3321 * Some processes somehow trigger some issues that make
3322 * the pmap stats and ledgers go off track, causing
3323 * some assertion failures and ledger panics.
3324 * Turn off the sanity checks if we allow some ledger leeway
3325 * because of that. We'll still do a final check in
3326 * pmap_check_ledgers() for discrepancies larger than the
3327 * allowed leeway after the address space has been fully
3328 * cleaned up.
3329 */
3330 pmap->pmap_stats_assert = FALSE;
3331 ledger_disable_panic_on_negative(pmap->ledger,
3332 task_ledgers.phys_footprint);
3333 ledger_disable_panic_on_negative(pmap->ledger,
3334 task_ledgers.internal);
3335 ledger_disable_panic_on_negative(pmap->ledger,
3336 task_ledgers.internal_compressed);
3337 ledger_disable_panic_on_negative(pmap->ledger,
3338 task_ledgers.iokit_mapped);
3339 ledger_disable_panic_on_negative(pmap->ledger,
3340 task_ledgers.alternate_accounting);
3341 ledger_disable_panic_on_negative(pmap->ledger,
3342 task_ledgers.alternate_accounting_compressed);
3343 }
3344 }
3345 #endif /* MACH_ASSERT */
3346
3347
#if DEVELOPMENT || DEBUG
/*
 * Development/debug tunable consulted by pmap_advise_pagezero_range():
 * when cleared, page zero is always reported as inaccessible.
 */
int pmap_pagezero_mitigation = 1;
#endif
3351
3352 void
pmap_advise_pagezero_range(pmap_t lpmap,uint64_t low_bound)3353 pmap_advise_pagezero_range(pmap_t lpmap, uint64_t low_bound)
3354 {
3355 #if DEVELOPMENT || DEBUG
3356 if (pmap_pagezero_mitigation == 0) {
3357 lpmap->pagezero_accessible = FALSE;
3358 return;
3359 }
3360 #endif
3361 lpmap->pagezero_accessible = ((pmap_smap_enabled == FALSE) && (low_bound < 0x1000));
3362 if (lpmap == current_pmap()) {
3363 mp_disable_preemption();
3364 current_cpu_datap()->cpu_pagezero_mapped = lpmap->pagezero_accessible;
3365 mp_enable_preemption();
3366 }
3367 }
3368
3369 uintptr_t
pmap_verify_noncacheable(uintptr_t vaddr)3370 pmap_verify_noncacheable(uintptr_t vaddr)
3371 {
3372 pt_entry_t *ptep = NULL;
3373 ptep = pmap_pte(kernel_pmap, vaddr);
3374 if (ptep == NULL) {
3375 panic("pmap_verify_noncacheable: no translation for 0x%lx", vaddr);
3376 }
3377 /* Non-cacheable OK */
3378 if (*ptep & (INTEL_PTE_NCACHE)) {
3379 return pte_to_pa(*ptep) | (vaddr & INTEL_OFFMASK);
3380 }
3381 /* Write-combined OK */
3382 if (*ptep & (INTEL_PTE_PAT)) {
3383 return pte_to_pa(*ptep) | (vaddr & INTEL_OFFMASK);
3384 }
3385 panic("pmap_verify_noncacheable: IO read from a cacheable address? address: 0x%lx, PTE: %p, *PTE: 0x%llx", vaddr, ptep, *ptep);
3386 /*NOTREACHED*/
3387 return 0;
3388 }
3389
bool
pmap_lookup_in_loaded_trust_caches(const uint8_t __unused cdhash[20])
{
	/* Loaded (runtime) trust caches are unsupported on this
	 * architecture, so no cdhash can ever match. */
	// Unsupported on this architecture.
	return false;
}
3396
3397 uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t __unused cdhash[20])3398 pmap_lookup_in_static_trust_cache(const uint8_t __unused cdhash[20])
3399 {
3400 // Unsupported on this architecture.
3401 return false;
3402 }
3403
int
pmap_cs_configuration(void)
{
	/* PMAP_CS is unsupported on this architecture: report an empty
	 * configuration. */
	return 0;
}
3410
bool
pmap_in_ppl(void)
{
	/* There is no Page Protection Layer on this architecture, so the
	 * caller can never be executing inside one. */
	return false;
}
3417
bool
pmap_has_iofilter_protected_write(void)
{
	/*
	 * I/O-filter-protected writes are not supported on this
	 * architecture.  (Parameter list made an explicit (void): the
	 * previous empty () declared an unprototyped function in pre-C23 C.)
	 */
	return false;
}
3424
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	/* No I/O filter exists on this architecture; any call is a bug,
	 * so panic unconditionally. */
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
void *
pmap_claim_reserved_ppl_page(void)
{
	/* No PPL on this architecture, hence no reserved PPL pages to
	 * claim. */
	return NULL;
}
3437
void
pmap_free_reserved_ppl_page(void __unused *kva)
{
	/* No PPL on this architecture; pmap_claim_reserved_ppl_page()
	 * never hands out a page, so there is nothing to free. */
	// Unsupported on this architecture.
}
3443
3444 #if DEVELOPMENT || DEBUG
3445 /*
3446 * Used for unit testing recovery from text corruptions.
3447 */
3448 kern_return_t
pmap_test_text_corruption(pmap_paddr_t pa)3449 pmap_test_text_corruption(pmap_paddr_t pa)
3450 {
3451 int pai;
3452 uint8_t *va;
3453
3454 pai = ppn_to_pai(atop(pa));
3455 if (!IS_MANAGED_PAGE(pai)) {
3456 return KERN_FAILURE;
3457 }
3458
3459 va = (uint8_t *)PHYSMAP_PTOV(pa);
3460 va[0] = 0x0f; /* opcode for UD2 */
3461 va[1] = 0x0b;
3462
3463 return KERN_SUCCESS;
3464 }
3465 #endif /* DEVELOPMENT || DEBUG */
3466