xref: /xnu-11215.41.3/osfmk/i386/pmap.h (revision 33de042d024d46de5ff4e89f2471de6608e37fa4)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 
59 /*
60  *	File:	pmap.h
61  *
62  *	Authors:  Avadis Tevanian, Jr., Michael Wayne Young
63  *	Date:	1985
64  *
65  *	Machine-dependent structures for the physical map module.
66  */
67 #ifdef KERNEL_PRIVATE
68 #ifndef _PMAP_MACHINE_
69 #define _PMAP_MACHINE_  1
70 
71 #ifndef ASSEMBLER
72 
73 #include <mach/kern_return.h>
74 #include <mach/machine/vm_types.h>
75 #include <mach/vm_prot.h>
76 #include <mach/vm_statistics.h>
77 #include <mach/machine/vm_param.h>
78 #include <kern/kern_types.h>
79 #include <kern/thread.h>
80 #include <kern/simple_lock.h>
81 
82 #include <i386/mp.h>
83 #include <i386/cpu_number.h>
84 #include <i386/proc_reg.h>
85 #include <os/atomic_private.h>
86 #include <i386/pal_routines.h>
87 
88 /*
89  *	Define the generic in terms of the specific
90  */
91 
92 #define INTEL_PGBYTES           I386_PGBYTES
93 #define INTEL_PGSHIFT           I386_PGSHIFT
94 #define intel_btop(x)           i386_btop(x)
95 #define intel_ptob(x)           i386_ptob(x)
96 #define intel_round_page(x)     i386_round_page(x)
97 #define intel_trunc_page(x)     i386_trunc_page(x)
98 
99 /*
100  *	i386/i486/i860 Page Table Entry
101  */
102 
103 #endif  /* ASSEMBLER */
104 
105 #define NPGPTD          4ULL
106 #define PDESHIFT        21ULL
107 #define PTEMASK         0x1ffULL
108 #define PTEINDX         3ULL
109 
110 #define PTESHIFT        12ULL
111 
112 #define LOW_4GB_MASK    ((vm_offset_t)0x00000000FFFFFFFFUL)
113 
114 #define PDESIZE         sizeof(pd_entry_t) /* for assembly files */
115 #define PTESIZE         sizeof(pt_entry_t) /* for assembly files */
116 
117 #define INTEL_OFFMASK   (I386_PGBYTES - 1)
118 #define INTEL_LOFFMASK  (I386_LPGBYTES - 1)
119 #define PG_FRAME        0x000FFFFFFFFFF000ULL
120 #define NPTEPG          (PAGE_SIZE/(sizeof (pt_entry_t)))
121 #define NPTDPG          (PAGE_SIZE/(sizeof (pd_entry_t)))
122 
123 #define NBPTD           (NPGPTD << PAGE_SHIFT)
124 #define NPDEPTD         (NBPTD / (sizeof (pd_entry_t)))
125 #define NPDEPG          (PAGE_SIZE/(sizeof (pd_entry_t)))
126 #define NBPDE           (1ULL << PDESHIFT)
127 #define PDEMASK         (NBPDE - 1)
128 
129 #define PTE_PER_PAGE    512 /* number of PTE's per page on any level */
130 
131 /* cleanly define parameters for all the page table levels */
132 typedef uint64_t        pml4_entry_t;
133 #define NPML4PG         (PAGE_SIZE/(sizeof (pml4_entry_t)))
134 #define PML4SHIFT       39
135 #define PML4PGSHIFT     9
136 #define NBPML4          (1ULL << PML4SHIFT)
137 #define PML4MASK        (NBPML4-1)
138 #define PML4_ENTRY_NULL ((pml4_entry_t *) 0)
139 
140 typedef uint64_t        pdpt_entry_t;
141 #define NPDPTPG         (PAGE_SIZE/(sizeof (pdpt_entry_t)))
142 #define PDPTSHIFT       30
143 #define PDPTPGSHIFT     9
144 #define NBPDPT          (1ULL << PDPTSHIFT)
145 #define PDPTMASK        (NBPDPT-1)
146 #define PDPT_ENTRY_NULL ((pdpt_entry_t *) 0)
147 
148 typedef uint64_t        pd_entry_t;
149 #define NPDPG           (PAGE_SIZE/(sizeof (pd_entry_t)))
150 #define PDSHIFT         21
151 #define PDPGSHIFT       9
152 #define NBPD            (1ULL << PDSHIFT)
153 #define PDMASK          (NBPD-1)
154 #define PD_ENTRY_NULL   ((pd_entry_t *) 0)
155 
156 typedef uint64_t        pt_entry_t;
157 #define NPTPG           (PAGE_SIZE/(sizeof (pt_entry_t)))
158 #define PTSHIFT         12
159 #define PTPGSHIFT       9
160 #define NBPT            (1ULL << PTSHIFT)
161 #define PTMASK          (NBPT-1)
162 #define PT_ENTRY_NULL   ((pt_entry_t *) 0)
163 
164 typedef uint64_t  pmap_paddr_t __kernel_ptr_semantics;
165 
166 #if     DEVELOPMENT || DEBUG
167 #define PMAP_ASSERT 1
168 extern int pmap_asserts_enabled;
169 extern int pmap_asserts_traced;
170 #endif
171 
#if PMAP_ASSERT
/*
 * pmap_assert(ex)
 *
 * Run-time switchable assertion.  When pmap_asserts_enabled is non-zero,
 * `ex` is evaluated and Assert() is called with the file name, line number
 * and stringified expression if it is false.  When pmap_asserts_enabled is
 * zero, `ex` is not evaluated at all.
 */
#define pmap_assert(ex) (pmap_asserts_enabled ? ((ex) ? (void)0 : Assert(__FILE_NAME__, __LINE__, # ex)) : (void)0)

/*
 * pmap_assert2(ex, fmt, args...)
 *
 * Like pmap_assert(), but with an additional printf-style message.
 * On failure one of two things happens:
 *   - pmap_asserts_traced set:  a kdebug record carrying the caller's return
 *     address and the line number is emitted and kdebug_enable is cleared;
 *     no panic is raised on this path.
 *   - otherwise:  the message is written with kprintf() and then passed
 *     to panic().
 * `fmt` must be a string literal because it is concatenated with the
 * fixed "Assertion ... failed" prefix.
 */
#define pmap_assert2(ex, fmt, args...)                                  \
	do {                                                            \
	        if (__improbable(pmap_asserts_enabled && !(ex))) {      \
	                if (pmap_asserts_traced) {                      \
	                        KERNEL_DEBUG_CONSTANT(0xDEAD1000, __builtin_return_address(0), __LINE__, 0, 0, 0); \
	                        kdebug_enable = 0;                      \
	                } else {                                        \
	                                kprintf("Assertion %s failed (%s:%d, caller %p) " fmt , #ex, __FILE_NAME__, __LINE__, __builtin_return_address(0),  ##args); \
	                                panic("Assertion %s failed (%s:%d, caller %p) " fmt , #ex, __FILE_NAME__, __LINE__, __builtin_return_address(0),  ##args); \
	                }                                               \
	        }                                                       \
	} while(0)
#else
/* Assertions compiled out: both macros expand to nothing and `ex` is never evaluated. */
#define pmap_assert(ex)
#define pmap_assert2(ex, fmt, args...)
#endif
191 
192 /* superpages */
193 #define SUPERPAGE_NBASEPAGES 512
194 
195 /* in 64 bit spaces, the number of each type of page in the page tables */
196 #define NPML4PGS        (1ULL * (PAGE_SIZE/(sizeof (pml4_entry_t))))
197 #define NPDPTPGS        (NPML4PGS * (PAGE_SIZE/(sizeof (pdpt_entry_t))))
198 #define NPDEPGS         (NPDPTPGS * (PAGE_SIZE/(sizeof (pd_entry_t))))
199 #define NPTEPGS         (NPDEPGS * (PAGE_SIZE/(sizeof (pt_entry_t))))
200 
201 extern int      kernPhysPML4Index;
202 extern int      kernPhysPML4EntryCount;
203 
204 #define KERNEL_PML4_INDEX               511
205 #define KERNEL_KEXTS_INDEX              (KERNEL_PML4_INDEX - 1)         /* 510: Home of KEXTs - the basement */
206 #define KERNEL_PHYSMAP_PML4_INDEX       (kernPhysPML4Index)             /* 50X: virtual to physical map */
207 #define KERNEL_PHYSMAP_PML4_COUNT       (kernPhysPML4EntryCount)
208 #define KERNEL_PHYSMAP_PML4_COUNT_MAX   (16 - 2)        /* 1 for KERNEL, 1 for BASEMENT */
209 /* 2 PML4s for KASAN to cover a maximum of 16 PML4s {PHYSMAP + BASEMENT + KVA} */
210 #define KERNEL_KASAN_PML4_LAST          (495) /* 511 - 16 */
211 #define KERNEL_KASAN_PML4_FIRST         (494) /* 511 - 17 */
212 #define KERNEL_DBLMAP_PML4_INDEX        (KERNEL_KASAN_PML4_FIRST - 1)
213 #define KERNEL_PML4_COUNT               1
214 #define KERNEL_BASE                     (0ULL - (NBPML4 * KERNEL_PML4_COUNT))
215 #define KERNEL_BASEMENT                 (KERNEL_BASE - NBPML4)  /* Basement uses one PML4 entry */
216 
217 /*
218  * Pte related macros
219  */
/*
 * KVADDR(pmi, pdpi, pdi, pti)
 *
 * Compose a virtual address from four page-table indices.  Bits 47 and
 * above are all set, and each index is shifted to the position of its
 * level (PML4SHIFT, PDPTSHIFT, PDESHIFT, PTESHIFT) and OR-ed in.
 * Indices are not range-checked or masked.
 *
 * NOTE(review): the (vm_offset_t) cast binds only to the first OR operand,
 * so the type of the whole expression is determined by the uint64_t
 * operands rather than by vm_offset_t - confirm that this is intended.
 */
#define KVADDR(pmi, pdpi, pdi, pti)               \
	 ((vm_offset_t)                   \
	        ((uint64_t) -1    << 47)        | \
	        ((uint64_t)(pmi)  << PML4SHIFT) | \
	        ((uint64_t)(pdpi) << PDPTSHIFT) | \
	        ((uint64_t)(pdi)  << PDESHIFT)  | \
	        ((uint64_t)(pti)  << PTESHIFT))
227 
228 
229 #ifndef NKPT
230 #define NKPT            500     /* actual number of bootstrap kernel page tables */
231 #endif
232 
233 
234 
235 /*
236  *	Convert address offset to page descriptor index
237  */
238 #define pdptnum(pmap, a) (((vm_offset_t)(a) >> PDPTSHIFT) & PDPTMASK)
239 #define pdenum(pmap, a) (((vm_offset_t)(a) >> PDESHIFT) & PDEMASK)
240 #define PMAP_INVALID_PDPTNUM (~0ULL)
241 
242 #define pdeidx(pmap, a)    (((a) >> PDSHIFT)   & ((1ULL<<(48 - PDSHIFT)) -1))
243 #define pdptidx(pmap, a)   (((a) >> PDPTSHIFT) & ((1ULL<<(48 - PDPTSHIFT)) -1))
244 #define pml4idx(pmap, a)   (((a) >> PML4SHIFT) & ((1ULL<<(48 - PML4SHIFT)) -1))
245 
246 
247 /*
248  *	Convert page descriptor index to user virtual address
249  */
250 #define pdetova(a)      ((vm_offset_t)(a) << PDESHIFT)
251 
252 /*
253  *	Convert address offset to page table index
254  */
255 #define ptenum(a)       (((vm_offset_t)(a) >> PTESHIFT) & PTEMASK)
256 
257 /*
258  *	Hardware pte bit definitions (to be used directly on the ptes
259  *	without using the bit fields).
260  */
261 
262 #define INTEL_PTE_VALID         0x00000001ULL
263 
264 #define INTEL_PTE_WRITE         0x00000002ULL
265 #define INTEL_PTE_RW            0x00000002ULL
266 
267 #define INTEL_PTE_USER          0x00000004ULL
268 
269 #define INTEL_PTE_WTHRU         0x00000008ULL
270 #define INTEL_PTE_NCACHE        0x00000010ULL
271 
272 #define INTEL_PTE_REF           0x00000020ULL
273 #define INTEL_PTE_MOD           0x00000040ULL
274 
275 #define INTEL_PTE_PS            0x00000080ULL
276 #define INTEL_PTE_PAT           0x00000080ULL
277 
278 #define INTEL_PTE_GLOBAL        0x00000100ULL
279 
280 /* These markers use software available bits ignored by the
281  * processor's 4-level and EPT pagetable walkers.
282  * N.B.: WIRED was originally bit 10, but that conflicts with
283  * execute permissions for EPT entries iff mode-based execute controls
284  * are enabled.
285  */
286 #define INTEL_PTE_SWLOCK        (0x1ULL << 52)
287 #define INTEL_PDPTE_NESTED      (0x1ULL << 53)
288 #define INTEL_PTE_WIRED         (0x1ULL << 54)
289 /* TODO: Compressed markers, potential conflict with protection keys? */
290 #define INTEL_PTE_COMPRESSED_ALT (1ULL << 61) /* compressed but with "alternate accounting" */
291 #define INTEL_PTE_COMPRESSED    (1ULL << 62) /* marker, for invalid PTE only -- ignored by hardware for both regular/EPT entries*/
292 
293 #define INTEL_PTE_PFN           PG_FRAME
294 /* TODO: these should be internal definitions */
295 #define INTEL_PTE_NX            (1ULL << 63)
296 
297 #define INTEL_PTE_INVALID       0
298 /* This is conservative, but suffices */
299 #define INTEL_PTE_RSVD          ((1ULL << 10) | (1ULL << 11))
300 
301 
/* The only bits that may legitimately be set in a "compressed" marker PTE. */
#define INTEL_PTE_COMPRESSED_MASK (INTEL_PTE_COMPRESSED | \
	                           INTEL_PTE_COMPRESSED_ALT | INTEL_PTE_SWLOCK)
/*
 * PTE_IS_COMPRESSED(x, ptep, pmap, vaddr)
 *
 * True when PTE value `x` is an invalid entry carrying the "compressed"
 * marker.  If bits outside INTEL_PTE_COMPRESSED_MASK are also set, the
 * result is whatever pmap_compressed_pte_corruption_repair() returns;
 * that routine receives `&(x)`, so `x` must be an lvalue and may be
 * modified by it (presumably to hold the repaired value - confirm against
 * the routine's definition).  `x` is evaluated several times.
 */
#define PTE_IS_COMPRESSED(x, ptep, pmap, vaddr)                            \
	((((x) & INTEL_PTE_VALID) == 0) && /* PTE is not valid... */       \
	 ((x) & INTEL_PTE_COMPRESSED) && /* ...has "compressed" marker */ \
	 ((!((x) & ~INTEL_PTE_COMPRESSED_MASK)) || /* ...no other bits */  \
	  pmap_compressed_pte_corruption_repair((x), &(x), (ptep), (pmap), (vaddr))))
309 
310 #define pa_to_pte(a)            ((a) & INTEL_PTE_PFN) /* XXX */
311 #define pte_to_pa(p)            ((p) & INTEL_PTE_PFN) /* XXX */
312 #define pte_increment_pa(p)     ((p) += INTEL_OFFMASK+1)
313 
314 #define pte_kernel_rw(p)          ((pt_entry_t)(pa_to_pte(p) | INTEL_PTE_VALID|INTEL_PTE_RW))
315 #define pte_kernel_ro(p)          ((pt_entry_t)(pa_to_pte(p) | INTEL_PTE_VALID))
316 #define pte_user_rw(p)            ((pt_entry_t)(pa_to_pte(p) | INTEL_PTE_VALID|INTEL_PTE_USER|INTEL_PTE_RW))
317 #define pte_user_ro(p)            ((pt_entry_t)(pa_to_pte(p) | INTEL_PTE_VALID|INTEL_PTE_USER))
318 
319 #define PMAP_INVEPT_SINGLE_CONTEXT      1
320 
321 
322 #define INTEL_EPTP_AD           0x00000040ULL
323 
324 #define INTEL_EPT_READ          0x00000001ULL
325 #define INTEL_EPT_WRITE         0x00000002ULL
326 #define INTEL_EPT_EX            0x00000004ULL   /* Supervisor-execute when MBE is enabled */
327 #define INTEL_EPT_IPAT          0x00000040ULL
328 #define INTEL_EPT_PS            0x00000080ULL
329 #define INTEL_EPT_REF           0x00000100ULL
330 #define INTEL_EPT_MOD           0x00000200ULL
331 #define INTEL_EPT_UEX           0x00000400ULL   /* User-execute when MBE is enabled (ignored otherwise) */
332 
333 #define INTEL_EPT_CACHE_MASK    0x00000038ULL
334 #define INTEL_EPT_NCACHE        0x00000000ULL
335 #define INTEL_EPT_WC            0x00000008ULL
336 #define INTEL_EPT_WTHRU         0x00000020ULL
337 #define INTEL_EPT_WP            0x00000028ULL
338 #define INTEL_EPT_WB            0x00000030ULL
339 
340 /*
341  * Routines to filter correct bits depending on the pmap type
342  */
343 
344 static inline pt_entry_t
pte_remove_ex(pt_entry_t pte,boolean_t is_ept)345 pte_remove_ex(pt_entry_t pte, boolean_t is_ept)
346 {
347 	if (__probable(!is_ept)) {
348 		return pte | INTEL_PTE_NX;
349 	}
350 
351 	return pte & (~INTEL_EPT_EX);
352 }
353 
354 static inline pt_entry_t
pte_set_ex(pt_entry_t pte,boolean_t is_ept)355 pte_set_ex(pt_entry_t pte, boolean_t is_ept)
356 {
357 	if (__probable(!is_ept)) {
358 		return pte & (~INTEL_PTE_NX);
359 	}
360 
361 	return pte | INTEL_EPT_EX;
362 }
363 
364 static inline pt_entry_t
pte_set_uex(pt_entry_t pte)365 pte_set_uex(pt_entry_t pte)
366 {
367 	return pte | INTEL_EPT_UEX;
368 }
369 
370 static inline pt_entry_t
physmap_refmod_to_ept(pt_entry_t physmap_pte)371 physmap_refmod_to_ept(pt_entry_t physmap_pte)
372 {
373 	pt_entry_t ept_pte = 0;
374 
375 	if (physmap_pte & INTEL_PTE_MOD) {
376 		ept_pte |= INTEL_EPT_MOD;
377 	}
378 
379 	if (physmap_pte & INTEL_PTE_REF) {
380 		ept_pte |= INTEL_EPT_REF;
381 	}
382 
383 	return ept_pte;
384 }
385 
386 static inline pt_entry_t
ept_refmod_to_physmap(pt_entry_t ept_pte)387 ept_refmod_to_physmap(pt_entry_t ept_pte)
388 {
389 	pt_entry_t physmap_pte = 0;
390 
391 	assert((ept_pte & ~(INTEL_EPT_REF | INTEL_EPT_MOD)) == 0);
392 
393 	if (ept_pte & INTEL_EPT_REF) {
394 		physmap_pte |= INTEL_PTE_REF;
395 	}
396 
397 	if (ept_pte & INTEL_EPT_MOD) {
398 		physmap_pte |= INTEL_PTE_MOD;
399 	}
400 
401 	return physmap_pte;
402 }
403 
404 /*
405  * Note: Not all Intel processors support EPT referenced access and dirty bits.
406  *	 During pmap_init() we check the VMX capability for the current hardware
407  *	 and update this variable accordingly.
408  */
409 extern boolean_t pmap_ept_support_ad;
410 
411 #define PTE_VALID_MASK(is_ept)  ((is_ept) ? (INTEL_EPT_READ | INTEL_EPT_WRITE | INTEL_EPT_EX | INTEL_EPT_UEX) : INTEL_PTE_VALID)
412 #define PTE_READ(is_ept)        ((is_ept) ? INTEL_EPT_READ : INTEL_PTE_VALID)
413 #define PTE_WRITE(is_ept)       ((is_ept) ? INTEL_EPT_WRITE : INTEL_PTE_WRITE)
414 #define PTE_IS_EXECUTABLE(is_ept, pte)  ((is_ept) ? (((pte) & (INTEL_EPT_EX | INTEL_EPT_UEX)) != 0) : (((pte) & INTEL_PTE_NX) == 0))
415 #define PTE_PS                  INTEL_PTE_PS
416 #define PTE_COMPRESSED          INTEL_PTE_COMPRESSED
417 #define PTE_COMPRESSED_ALT      INTEL_PTE_COMPRESSED_ALT
418 #define PTE_NCACHE(is_ept)      ((is_ept) ? INTEL_EPT_NCACHE : INTEL_PTE_NCACHE)
419 #define PTE_WTHRU(is_ept)       ((is_ept) ? INTEL_EPT_WTHRU : INTEL_PTE_WTHRU)
420 #define PTE_REF(is_ept)         ((is_ept) ? INTEL_EPT_REF : INTEL_PTE_REF)
421 #define PTE_MOD(is_ept)         ((is_ept) ? INTEL_EPT_MOD : INTEL_PTE_MOD)
422 #define PTE_WIRED               INTEL_PTE_WIRED
423 
424 
425 #define PMAP_DEFAULT_CACHE      0
426 #define PMAP_INHIBIT_CACHE      1
427 #define PMAP_GUARDED_CACHE      2
428 #define PMAP_ACTIVATE_CACHE     4
429 #define PMAP_NO_GUARD_CACHE     8
430 
431 /* Per-pmap ledger operations */
432 #define pmap_ledger_debit(p, e, a) ledger_debit((p)->ledger, e, a)
433 #define pmap_ledger_credit(p, e, a) ledger_credit((p)->ledger, e, a)
434 
435 #ifndef ASSEMBLER
436 
437 #include <sys/queue.h>
438 
439 /*
440  * Address of current and alternate address space page table maps
441  * and directories.
442  */
443 
444 extern pt_entry_t       *PTmap;
445 extern pdpt_entry_t     *IdlePDPT;
446 extern pml4_entry_t     *IdlePML4;
447 extern boolean_t        no_shared_cr3;
448 extern pd_entry_t       *IdlePTD;       /* physical addr of "Idle" state PTD */
449 
450 extern uint64_t         pmap_pv_hashlist_walks;
451 extern uint64_t         pmap_pv_hashlist_cnts;
452 extern uint32_t         pmap_pv_hashlist_max;
453 extern uint32_t         pmap_kernel_text_ps;
454 
455 #define ID_MAP_VTOP(x)  ((void *)(((uint64_t)(x)) & LOW_4GB_MASK))
456 
457 extern  uint64_t physmap_base, physmap_max;
458 
459 #define NPHYSMAP (MAX(((physmap_max - physmap_base) / GB), 4))
460 
461 extern pt_entry_t *PTE_corrupted_ptr;
462 
463 #if DEVELOPMENT || DEBUG
464 extern int pmap_inject_pte_corruption;
465 #endif
466 
467 static inline void
pmap_corrupted_pte_detected(pt_entry_t * ptep,uint64_t clear_bits,uint64_t set_bits)468 pmap_corrupted_pte_detected(pt_entry_t *ptep, uint64_t clear_bits, uint64_t set_bits)
469 {
470 	if (__c11_atomic_compare_exchange_strong((_Atomic(pt_entry_t *)*) & PTE_corrupted_ptr, &PTE_corrupted_ptr, ptep,
471 	    memory_order_acq_rel_smp, memory_order_relaxed)) {
472 		force_immediate_debugger_NMI = TRUE;
473 		NMIPI_panic(CPUMASK_REAL_OTHERS, PTE_CORRUPTION);
474 		if (clear_bits == 0 && set_bits == 0) {
475 			panic("PTE Corruption detected: ptep 0x%llx pte value 0x%llx", (unsigned long long)(uintptr_t)ptep, *(uint64_t *)ptep);
476 		} else {
477 			panic("PTE Corruption detected: ptep 0x%llx pte value 0x%llx clear 0x%llx set 0x%llx",
478 			    (unsigned long long)(uintptr_t)ptep, *(uint64_t *)ptep, clear_bits, set_bits);
479 		}
480 	}
481 }
482 
483 /*
484  * Atomic 64-bit store of a page table entry.
485  */
486 static inline void
pmap_store_pte(boolean_t is_ept,pt_entry_t * entryp,pt_entry_t value)487 pmap_store_pte(boolean_t is_ept, pt_entry_t *entryp, pt_entry_t value)
488 {
489 	/*
490 	 * In the 32-bit kernel a compare-and-exchange loop was
491 	 * required to provide atomicity. For K64, life is easier:
492 	 */
493 	*entryp = value;
494 
495 #if DEVELOPMENT || DEBUG
496 	if (__improbable(pmap_inject_pte_corruption != 0 && is_ept == FALSE && (value & PTE_COMPRESSED))) {
497 		pmap_inject_pte_corruption = 0;
498 		/* Inject a corruption event */
499 		value |= INTEL_PTE_NX;
500 	}
501 #endif
502 
503 	if (__improbable((is_ept == FALSE) && (value & PTE_COMPRESSED) && (value & INTEL_PTE_NX))) {
504 		pmap_corrupted_pte_detected(entryp, 0, 0);
505 	}
506 }
507 
508 static inline boolean_t
physmap_enclosed(addr64_t a)509 physmap_enclosed(addr64_t a)
510 {
511 	return a < (NPHYSMAP * GB);
512 }
513 
514 static  inline void *
PHYSMAP_PTOV_check(void * paddr)515 PHYSMAP_PTOV_check(void *paddr)
516 {
517 	uint64_t pvaddr = (uint64_t)paddr + physmap_base;
518 
519 	if (__improbable(pvaddr >= physmap_max)) {
520 		panic("PHYSMAP_PTOV bounds exceeded, 0x%qx, 0x%qx, 0x%qx",
521 		    pvaddr, physmap_base, physmap_max);
522 	}
523 
524 	return (void *)pvaddr;
525 }
526 
527 #define PHYSMAP_PTOV(x) (PHYSMAP_PTOV_check((void*) (x)))
528 #define phystokv(x) ((vm_offset_t)(PHYSMAP_PTOV(x)))
529 #if MACH_KERNEL_PRIVATE
530 extern uint64_t dblmap_base, dblmap_max, dblmap_dist;
531 
532 static inline uint64_t
DBLMAP_CHECK(uintptr_t x)533 DBLMAP_CHECK(uintptr_t x)
534 {
535 	uint64_t dbladdr = (uint64_t)x + dblmap_dist;
536 	if (__improbable((dbladdr >= dblmap_max) || (dbladdr < dblmap_base))) {
537 		panic("DBLMAP bounds exceeded, 0x%qx, 0x%qx 0x%qx, 0x%qx",
538 		    (uint64_t)x, dbladdr, dblmap_base, dblmap_max);
539 	}
540 	return dbladdr;
541 }
542 #define DBLMAP(x) (DBLMAP_CHECK((uint64_t) x))
543 extern uint64_t ldt_alias_offset;
544 static inline uint64_t
LDTALIAS_CHECK(uintptr_t x)545 LDTALIAS_CHECK(uintptr_t x)
546 {
547 	uint64_t dbladdr = (uint64_t)x + ldt_alias_offset;
548 	if (__improbable((dbladdr >= dblmap_max) || (dbladdr < dblmap_base))) {
549 		panic("LDTALIAS: bounds exceeded, 0x%qx, 0x%qx 0x%qx, 0x%qx",
550 		    (uint64_t)x, dbladdr, dblmap_base, dblmap_max);
551 	}
552 	return dbladdr;
553 }
554 #define LDTALIAS(x) (LDTALIAS_CHECK((uint64_t) x))
555 #endif
556 
557 /*
558  * For KASLR, we alias the master processor's IDT and GDT at fixed
559  * virtual addresses to defeat SIDT/SGDT address leakage.
560  * And non-boot processor's GDT aliases likewise (skipping LOWGLOBAL_ALIAS)
561  * The low global vector page is mapped at a fixed alias also.
562  */
563 #define LOWGLOBAL_ALIAS         (VM_MIN_KERNEL_ADDRESS + 0x2000)
564 
565 /*
566  * This indicates (roughly) where there is free space for the VM
567  * to use for the heap; this does not need to be precise.
568  */
569 #define KERNEL_PMAP_HEAP_RANGE_START VM_MIN_KERNEL_AND_KEXT_ADDRESS
570 
571 #if MACH_KERNEL_PRIVATE
572 extern void
573 pmap_tlbi_range(uint64_t startv, uint64_t endv, bool global, uint16_t pcid);
574 
575 #include <vm/vm_page.h>
576 
577 /*
578  *	For each vm_page_t, there is a list of all currently
579  *	valid virtual mappings of that page.  An entry is
580  *	a pv_entry_t; the list is the pv_table.
581  */
582 
/*
 * Machine-dependent physical map.
 *
 * A pmap is either a regular pmap (pm_cr3 != 0, pm_eptp == 0) or an EPT
 * pmap (pm_cr3 == 0, pm_eptp != 0); is_ept_pmap() asserts this invariant.
 */
struct pmap {
	lck_rw_t        pmap_rwl __attribute((aligned(64)));    /* read/write lock, 64-byte aligned */
	pmap_paddr_t    pm_cr3 __attribute((aligned(64))); /* Kernel+user shared PML4 physical*/
	pmap_paddr_t    pm_ucr3;        /* Mirrored user PML4 physical */
	pml4_entry_t    *pm_pml4;       /* VKA of top level */
	pml4_entry_t    *pm_upml4;      /* Shadow VKA of top level */
	pmap_paddr_t    pm_eptp;        /* EPTP */

	task_map_t      pm_task_map;    /* copied into the per-CPU cpu_task_map by set_dirbase() */
	boolean_t       pagezero_accessible;    /* consulted by set_dirbase() when choosing which cr3 to load */
	boolean_t       pm_vm_map_cs_enforced; /* is vm_map cs_enforced? */
#define PMAP_PCID_MAX_CPUS      MAX_CPUS        /* Must be a multiple of 8 */
	pcid_t          pmap_pcid_cpus[PMAP_PCID_MAX_CPUS];     /* presumably indexed by cpu number - TODO confirm */
	volatile uint8_t pmap_pcid_coherency_vector[PMAP_PCID_MAX_CPUS];
	boolean_t       pm_shared;
	os_refcnt_t     ref_count;      /* reference count */
	pdpt_entry_t    *pm_pdpt;       /* KVA of 3rd level page */
	vm_object_t     pm_obj;         /* object to hold pde's */
	vm_object_t     pm_obj_pdpt;    /* holds pdpt pages */
	vm_object_t     pm_obj_pml4;    /* holds pml4 pages */
#if     DEVELOPMENT || DEBUG
	int             nx_enabled;
#endif
	ledger_t        ledger;         /* ledger tracking phys mappings */
	uint64_t        corrected_compressed_ptes_count;        /* presumably counts repaired compressed PTEs - TODO confirm */
#if MACH_ASSERT
	boolean_t       pmap_stats_assert;
	int             pmap_pid;
	char            pmap_procname[17];
#endif /* MACH_ASSERT */
};
614 
615 static inline boolean_t
is_ept_pmap(pmap_t p)616 is_ept_pmap(pmap_t p)
617 {
618 	if (__probable(p->pm_cr3 != 0)) {
619 		assert(p->pm_eptp == 0);
620 		return FALSE;
621 	}
622 
623 	assert(p->pm_eptp != 0);
624 
625 	return TRUE;
626 }
627 
628 void hv_ept_pmap_create(void **ept_pmap, void **eptp);
629 
/*
 * Describes one region of physical memory in terms of page numbers,
 * together with the allocation cursors that record which of its pages
 * have already been "stolen".
 */
typedef struct pmap_memory_regions {
	ppnum_t base;            /* first page of this region */
	ppnum_t alloc_up;        /* pages below this one have been "stolen" */
	ppnum_t alloc_down;      /* pages above this one have been "stolen" */
	ppnum_t alloc_frag_up;   /* low page of fragment after large page alloc */
	ppnum_t alloc_frag_down; /* high page of fragment after large page alloc */
	ppnum_t end;             /* last page of this region */
	uint32_t type;           /* region type; encoding not visible here - TODO confirm */
	uint64_t attribute;      /* region attributes; encoding not visible here - TODO confirm */
} pmap_memory_region_t;
640 
641 extern unsigned pmap_memory_region_count;
642 extern unsigned pmap_memory_region_current;
643 
644 #define PMAP_MEMORY_REGIONS_SIZE 128
645 
646 extern pmap_memory_region_t pmap_memory_regions[];
647 #include <i386/pmap_pcid.h>
648 
/*
 * Make `tpmap` the current address space on the calling CPU.
 *
 * Records the pmap's cr3 values and task map in the per-CPU data and in
 * its shadow copy, then reloads CR3 (directly, or through
 * pmap_pcid_activate() when PCIDs are in use) if the CPU is not already
 * running on the right page tables.
 *
 *   tpmap   - pmap to activate
 *   thread  - thread being switched to; only its CopyIOActive flag is read
 *   my_cpu  - number of the calling CPU (asserted to equal cpu_number())
 *
 * Must be called with preemption or interrupts disabled (asserted).
 */
static inline void
set_dirbase(pmap_t tpmap, thread_t thread, int my_cpu)
{
	int ccpu = my_cpu;
	uint64_t pcr3 = tpmap->pm_cr3, ucr3 = tpmap->pm_ucr3;
	/* Publish the new task cr3 in both the per-CPU data and its shadow. */
	cpu_datap(ccpu)->cpu_task_cr3 = pcr3;
	cpu_shadowp(ccpu)->cpu_shadowtask_cr3 = pcr3;

	cpu_datap(ccpu)->cpu_ucr3 = ucr3;
	cpu_shadowp(ccpu)->cpu_ucr3 = ucr3;

	cpu_datap(ccpu)->cpu_task_map = cpu_shadowp(ccpu)->cpu_task_map =
	    tpmap->pm_task_map;

	assert((get_preemption_level() > 0) || (ml_get_interrupts_enabled() == FALSE));
	assert(ccpu == cpu_number());
	/*
	 * Switch cr3 if necessary
	 * - unless running with no_shared_cr3 debugging mode
	 *   and we're not on the kernel's cr3 (after pre-empted copyio)
	 */
	/* Note: despite its name, nopagezero holds pagezero_accessible. */
	boolean_t nopagezero = tpmap->pagezero_accessible;
	/* Remember the previous per-CPU state before overwriting it. */
	boolean_t priorpagezero = cpu_datap(ccpu)->cpu_pagezero_mapped;
	cpu_datap(ccpu)->cpu_pagezero_mapped = nopagezero;

	if (__probable(!no_shared_cr3)) {
		if (__improbable(nopagezero)) {
			/*
			 * pagezero_accessible pmap: the pmap's own cr3 is loaded
			 * only while the thread has copyio active; otherwise the
			 * CPU runs on its kernel cr3.
			 */
			boolean_t copyio_active = ((thread->machine.specFlags & CopyIOActive) != 0);
			if (pmap_pcid_ncpus) {
				pmap_pcid_activate(tpmap, ccpu, TRUE, copyio_active);
			} else {
				if (copyio_active) {
					if (get_cr3_base() != tpmap->pm_cr3) {
						set_cr3_raw(tpmap->pm_cr3);
					}
				} else if (get_cr3_base() != cpu_datap(ccpu)->cpu_kernel_cr3) {
					set_cr3_raw(cpu_datap(ccpu)->cpu_kernel_cr3);
				}
			}
		} else if ((get_cr3_base() != tpmap->pm_cr3) || priorpagezero) {
			/*
			 * Ordinary pmap: reload when the current cr3 differs, or
			 * when this CPU's cpu_pagezero_mapped flag was previously set.
			 */
			if (pmap_pcid_ncpus) {
				pmap_pcid_activate(tpmap, ccpu, FALSE, FALSE);
			} else {
				set_cr3_raw(tpmap->pm_cr3);
			}
		}
	} else {
		/* no_shared_cr3 mode: make sure the CPU is on its kernel cr3. */
		if (get_cr3_base() != cpu_datap(ccpu)->cpu_kernel_cr3) {
			set_cr3_raw(cpu_datap(ccpu)->cpu_kernel_cr3);
		}
	}
}
701 
702 /*
703  *	External declarations for PMAP_ACTIVATE.
704  */
705 
706 extern void             pmap_update_interrupt(void);
707 
708 extern addr64_t(kvtophys)(
709 	vm_offset_t     addr);
710 
711 extern kern_return_t    pmap_expand(
712 	pmap_t          pmap,
713 	vm_map_offset_t addr,
714 	unsigned int options);
715 extern vm_offset_t      pmap_map(
716 	vm_offset_t     virt,
717 	vm_map_offset_t start,
718 	vm_map_offset_t end,
719 	vm_prot_t       prot,
720 	unsigned int    flags);
721 
722 extern vm_offset_t      pmap_map_bd(
723 	vm_offset_t     virt,
724 	vm_map_offset_t start,
725 	vm_map_offset_t end,
726 	vm_prot_t       prot,
727 	unsigned int    flags);
728 extern void             pmap_bootstrap(
729 	vm_offset_t     load_start,
730 	boolean_t       IA32e);
731 
732 extern boolean_t        pmap_valid_page(
733 	ppnum_t pn);
734 
735 extern int              pmap_list_resident_pages(
736 	struct pmap     *pmap,
737 	vm_offset_t     *listp,
738 	int             space);
739 extern void             x86_filter_TLB_coherency_interrupts(boolean_t);
740 
741 extern void
742 pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX,
743     boolean_t ro);
744 
745 /*
746  * Get cache attributes (as pagetable bits) for the specified phys page
747  */
748 extern  unsigned        pmap_get_cache_attributes(ppnum_t, boolean_t is_ept);
749 
750 extern kern_return_t    pmap_map_block_addr(
751 	pmap_t pmap,
752 	addr64_t va,
753 	pmap_paddr_t pa,
754 	uint32_t size,
755 	vm_prot_t prot,
756 	int attr,
757 	unsigned int flags);
758 extern kern_return_t    pmap_map_block(
759 	pmap_t pmap,
760 	addr64_t va,
761 	ppnum_t pa,
762 	uint32_t size,
763 	vm_prot_t prot,
764 	int attr,
765 	unsigned int flags);
766 
767 extern void invalidate_icache(vm_offset_t addr, unsigned cnt, int phys);
768 extern void flush_dcache(vm_offset_t addr, unsigned count, int phys);
769 extern pmap_paddr_t pmap_find_pa(pmap_t map, addr64_t va);
770 extern ppnum_t pmap_find_phys(pmap_t map, addr64_t va);
771 extern ppnum_t pmap_find_phys_nofault(pmap_t pmap, addr64_t va);
772 
773 extern kern_return_t pmap_get_prot(pmap_t pmap, addr64_t va, vm_prot_t *protp);
774 
775 extern void pmap_cpu_init(void);
776 extern void pmap_disable_NX(pmap_t pmap);
777 
778 extern void pmap_pagetable_corruption_msg_log(int (*)(const char * fmt, ...)__printflike(1, 2));
779 
780 extern void x86_64_protect_data_const(void);
781 
782 extern uint64_t pmap_commpage_size_min(pmap_t pmap);
783 
784 static inline vm_offset_t
pmap_ro_zone_align(vm_offset_t value)785 pmap_ro_zone_align(vm_offset_t value)
786 {
787 	return value;
788 }
789 
790 extern void pmap_ro_zone_memcpy(zone_id_t zid, vm_offset_t va, vm_offset_t offset,
791     vm_offset_t new_data, vm_size_t new_data_size);
792 extern uint64_t pmap_ro_zone_atomic_op(zone_id_t zid, vm_offset_t va, vm_offset_t offset,
793     uint32_t op, uint64_t value);
794 extern void pmap_ro_zone_bzero(zone_id_t zid, vm_offset_t va, vm_offset_t offset, vm_size_t size);
795 
796 /*
797  *	Macros for speed.
798  */
799 
800 
801 #include <kern/spl.h>
802 
803 
/*
 * PMAP_ACTIVATE_MAP(map, thread, my_cpu)
 *
 * Look up the pmap of `map` with vm_map_pmap() and hand it to
 * set_dirbase() together with `thread` and `my_cpu`.
 * Expands to a brace-enclosed block that declares a local named `tpmap`,
 * so arguments must not refer to a caller variable of that name.
 */
#define PMAP_ACTIVATE_MAP(map, thread, my_cpu)  {                               \
	pmap_t		tpmap;                                  \
                                                                        \
	tpmap = vm_map_pmap(map);                                       \
	set_dirbase(tpmap, thread, my_cpu);                                     \
}
810 
/*
 * PMAP_DEACTIVATE_MAP(map, thread, ccpu)
 *
 * On x86_64 this performs no deactivation work; it only asserts (through
 * pmap_assert2) that, when PCIDs are in use, the PCID computed for
 * (map->pmap, thread, ccpu) matches the low 12 bits of the current CR3.
 * The expansion already ends in a semicolon.  Arguments are substituted
 * without parentheses (the asserted expression is stringified into the
 * failure message), so pass simple expressions only.
 *
 * On other architectures the macro expands to nothing.  It takes the same
 * three parameters there so that PMAP_SWITCH_USER, which always passes
 * three arguments, still preprocesses.
 */
#if   defined(__x86_64__)
#define PMAP_DEACTIVATE_MAP(map, thread, ccpu)                          \
	pmap_assert2((pmap_pcid_ncpus ? (pcid_for_pmap_cpu_tuple(map->pmap, thread, ccpu) == (get_cr3_raw() & 0xFFF)) : TRUE),"PCIDs: 0x%x, active PCID: 0x%x, CR3: 0x%lx, pmap_cr3: 0x%llx, kernel_cr3: 0x%llx, kernel pmap cr3: 0x%llx, CPU active PCID: 0x%x, CPU kernel PCID: 0x%x, specflags: 0x%x, pagezero: 0x%x", pmap_pcid_ncpus, pcid_for_pmap_cpu_tuple(map->pmap, thread, ccpu), get_cr3_raw(), map->pmap->pm_cr3, cpu_datap(ccpu)->cpu_kernel_cr3, kernel_pmap->pm_cr3, cpu_datap(ccpu)->cpu_active_pcid, cpu_datap(ccpu)->cpu_kernel_pcid, thread->machine.specFlags, map->pmap->pagezero_accessible);
#else
#define PMAP_DEACTIVATE_MAP(map, thread, ccpu)
#endif
817 
/*
 * PMAP_SWITCH_USER(th, new_map, my_cpu)
 *
 * Switch thread `th` to `new_map`: inside an splhigh()/splx() bracket,
 * run PMAP_DEACTIVATE_MAP on the thread's current map, store `new_map`
 * in th->map, then run PMAP_ACTIVATE_MAP on it.
 * Expands to a brace-enclosed block that declares a local named `spl`;
 * `th` is evaluated several times.
 */
#define PMAP_SWITCH_USER(th, new_map, my_cpu) {                         \
	spl_t		spl;                                            \
                                                                        \
	spl = splhigh();                                                \
	PMAP_DEACTIVATE_MAP(th->map, th, my_cpu);                       \
	th->map = new_map;                                              \
	PMAP_ACTIVATE_MAP(th->map, th, my_cpu);                         \
	splx(spl);                                                      \
}
827 
828 /*
829  * Marking the current cpu's cr3 inactive is achieved by setting its lsb.
 * Marking the current cpu's cr3 active once more involves clearing this bit.
831  * Note that valid page tables are page-aligned and so the bottom 12 bits
832  * are normally zero, modulo PCID.
833  * We can only mark the current cpu active/inactive but we can test any cpu.
834  */
/* Set bit 0 of the current CPU's cpu_active_cr3 (marks it inactive). */
#define CPU_CR3_MARK_INACTIVE()                                         \
	current_cpu_datap()->cpu_active_cr3 |= 1

/* Clear bit 0 of the current CPU's cpu_active_cr3 (marks it active). */
#define CPU_CR3_MARK_ACTIVE()                                           \
	current_cpu_datap()->cpu_active_cr3 &= ~1

/* True when bit 0 of the given CPU's cpu_active_cr3 is clear. */
#define CPU_CR3_IS_ACTIVE(cpu)                                          \
	((cpu_datap(cpu)->cpu_active_cr3 & 1) == 0)

/* The given CPU's cpu_active_cr3 with the inactive marker (bit 0) masked off. */
#define CPU_GET_ACTIVE_CR3(cpu)                                         \
	(cpu_datap(cpu)->cpu_active_cr3 & ~1)

/* The given CPU's cpu_task_cr3, as last recorded by set_dirbase(). */
#define CPU_GET_TASK_CR3(cpu)                                           \
	(cpu_datap(cpu)->cpu_task_cr3)
849 
850 /*
851  *	Mark this cpu idle, and remove it from the active set,
852  *	since it is not actively using any pmap.  Signal_cpus
853  *	will notice that it is idle, and avoid signaling it,
854  *	but will queue the update request for when the cpu
855  *	becomes active.
856  */
/*
 * MARK_CPU_IDLE(my_cpu): with interrupts disabled (asserted), mark the
 * current CPU's cr3 inactive and issue mfence().  The `my_cpu` argument
 * is not used by the expansion; the current CPU is always the one affected.
 */
#define MARK_CPU_IDLE(my_cpu)   {                                       \
	assert(ml_get_interrupts_enabled() == FALSE);                   \
	CPU_CR3_MARK_INACTIVE();                                        \
	mfence();                                                                       \
}

/*
 * MARK_CPU_ACTIVE(my_cpu): with interrupts disabled (asserted), mark the
 * current CPU's cr3 active, issue mfence(), then call
 * pmap_update_interrupt().  The `my_cpu` argument is not used by the
 * expansion.
 */
#define MARK_CPU_ACTIVE(my_cpu) {                                       \
	assert(ml_get_interrupts_enabled() == FALSE);                   \
	/* \
	 *	If a kernel_pmap update was requested while this cpu \
	 *	was idle, process it as if we got the interrupt. \
	 *	Before doing so, remove this cpu from the idle set. \
	 *	Since we do not grab any pmap locks while we flush \
	 *	our TLB, another cpu may start an update operation \
	 *	before we finish.  Removing this cpu from the idle \
	 *	set assures that we will receive another update \
	 *	interrupt if this happens. \
	 */                                                             \
	CPU_CR3_MARK_ACTIVE();                                          \
	mfence();                                                       \
	pmap_update_interrupt();                                        \
}
879 
880 #define PMAP_CONTEXT(pmap, thread)
881 
882 #define pmap_kernel_va(VA)      \
883 	((((vm_offset_t) (VA)) >= vm_min_kernel_address) &&     \
884 	 (((vm_offset_t) (VA)) <= vm_max_kernel_address))
885 
886 
887 #define pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
888 #define pmap_attribute(pmap, addr, size, attr, value) \
889 	                                (KERN_INVALID_ADDRESS)
890 #define pmap_attribute_cache_sync(addr, size, attr, value) \
891 	                                (KERN_INVALID_ADDRESS)
892 
893 extern boolean_t pmap_is_empty(pmap_t           pmap,
894     vm_map_offset_t  start,
895     vm_map_offset_t  end);
896 
897 #define MACHINE_BOOTSTRAPPTD    1       /* Static bootstrap page-tables */
898 
899 kern_return_t
900     pmap_permissions_verify(pmap_t, vm_map_t, vm_offset_t, vm_offset_t);
901 
902 #if DEVELOPMENT || DEBUG
903 extern kern_return_t pmap_test_text_corruption(pmap_paddr_t);
904 #endif /* DEVELOPMENT || DEBUG */
905 
#if MACH_ASSERT
extern int pmap_stats_assert;
/*
 * PMAP_STATS_ASSERTF(args): when the global pmap_stats_assert is non-zero,
 * invoke assertf with `args`.  `args` must be a complete parenthesized
 * argument list, because it is pasted directly after `assertf`.
 */
#define PMAP_STATS_ASSERTF(args)                \
	MACRO_BEGIN                             \
	if (pmap_stats_assert) assertf args;    \
	MACRO_END
#else /* MACH_ASSERT */
/* Assertions compiled out: expands to nothing and `args` is not evaluated. */
#define PMAP_STATS_ASSERTF(args)
#endif /* MACH_ASSERT */
915 #endif /* MACH_KERNEL_PRIVATE */
916 #endif  /* ASSEMBLER */
917 #endif  /* _PMAP_MACHINE_ */
918 #endif  /* KERNEL_PRIVATE */
919