xref: /xnu-12377.81.4/osfmk/arm64/arm_vm_init.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796)
1 /*
2  * Copyright (c) 2007-2011 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <mach_kdp.h>
30 #include <debug.h>
31 
32 #include <kern/assert.h>
33 #include <kern/misc_protos.h>
34 #include <kern/monotonic.h>
35 #include <mach/vm_types.h>
36 #include <mach/vm_param.h>
37 #include <vm/vm_kern.h>
38 #include <vm/vm_page_internal.h>
39 #include <vm/pmap.h>
40 
41 #include <machine/atomic.h>
42 #include <arm64/proc_reg.h>
43 #include <arm64/lowglobals.h>
44 #include <arm/cpu_data_internal.h>
45 #include <arm/misc_protos.h>
46 #include <pexpert/arm64/boot.h>
47 #include <pexpert/device_tree.h>
48 
49 #include <libkern/kernel_mach_header.h>
50 #include <libkern/section_keywords.h>
51 
52 #include <san/kasan.h>
53 
54 #if __ARM_KERNEL_PROTECT__
55 /*
56  * If we want to support __ARM_KERNEL_PROTECT__, we need a sufficient amount of
57  * mappable space preceding the kernel (as we unmap the kernel by cutting the
58  * range covered by TTBR1 in half).  This must also cover the exception vectors.
59  */
60 static_assert(KERNEL_PMAP_HEAP_RANGE_START > ARM_KERNEL_PROTECT_EXCEPTION_START);
61 
62 /* The exception vectors and the kernel cannot share root TTEs. */
63 static_assert((KERNEL_PMAP_HEAP_RANGE_START & ~ARM_TT_ROOT_OFFMASK) > ARM_KERNEL_PROTECT_EXCEPTION_START);
64 
65 /*
66  * We must have enough space in the TTBR1_EL1 range to create the EL0 mapping of
67  * the exception vectors.
68  */
69 static_assert((KERN_PROTECT_REGION_SIZE * 2ULL) <= KERN_ADDRESS_SPACE_SIZE);
70 #endif /* __ARM_KERNEL_PROTECT__ */
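
/*
 * Illustrative note (derived from arm_vm_kernel_el0_map() further below): under
 * __ARM_KERNEL_PROTECT__ the EL0 alias of the exception vectors is installed at
 * (vaddr - KERN_PROTECT_REGION_SIZE), one protect-region below the EL1 mapping.
 * Both the EL1 mapping and its EL0 alias must fit in the TTBR1 range, hence the
 * assertion above that 2 * KERN_PROTECT_REGION_SIZE <= KERN_ADDRESS_SPACE_SIZE.
 */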
71 
72 #define ARM_DYNAMIC_TABLE_XN (ARM_TTE_TABLE_PXN | ARM_TTE_TABLE_XN)
73 
74 #if KASAN
75 extern vm_offset_t shadow_pbase;
76 extern vm_offset_t shadow_ptop;
77 extern vm_offset_t physmap_vbase;
78 extern vm_offset_t physmap_vtop;
79 #endif
80 
81 /*
82  * We explicitly place this in const, as it is not const from a language
83  * perspective, but it is only modified before we actually switch away from
84  * the bootstrap page tables.
85  */
86 SECURITY_READ_ONLY_LATE(uint8_t) bootstrap_pagetables[BOOTSTRAP_TABLE_SIZE] __attribute__((aligned(ARM_PGBYTES)));
87 
88 /*
89  * Denotes the end of xnu.
90  */
91 extern void *last_kernel_symbol;
92 
93 extern void arm64_replace_bootstack(cpu_data_t*);
94 extern void PE_slide_devicetree(vm_offset_t);
95 
96 /*
97  * KASLR parameters
98  */
99 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_kernel_base;
100 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_kernel_top;
101 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_kext_base;
102 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_kext_top;
103 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_kernel_stext;
104 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_kernel_etext;
105 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_kernel_slide;
106 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_kernel_slid_base;
107 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_kernel_slid_top;
108 
109 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_prelink_stext;
110 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_prelink_etext;
111 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_prelink_sdata;
112 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_prelink_edata;
113 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_prelink_sinfo;
114 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_prelink_einfo;
115 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_slinkedit;
116 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_elinkedit;
117 
118 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_kernel_builtinkmod_text;
119 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_kernel_builtinkmod_text_end;
120 
121 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_kernelcache_base;
122 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_kernelcache_top;
123 
124 /* Used by <mach/arm/vm_param.h> */
125 SECURITY_READ_ONLY_LATE(unsigned long) gVirtBase;
126 SECURITY_READ_ONLY_LATE(unsigned long) gPhysBase;
127 SECURITY_READ_ONLY_LATE(unsigned long) gPhysSize;
128 SECURITY_READ_ONLY_LATE(unsigned long) gT0Sz = T0SZ_BOOT;
129 SECURITY_READ_ONLY_LATE(unsigned long) gT1Sz = T1SZ_BOOT;
130 
131 /* 23543331 - step 1 of kext / kernel __TEXT and __DATA colocation is to move
132  * all kexts before the kernel.  This is only for arm64 devices and looks
133  * something like the following:
134  * -- vmaddr order --
135  * 0xffffff8004004000 __PRELINK_TEXT
136  * 0xffffff8007004000 __TEXT (xnu)
137  * 0xffffff80075ec000 __DATA (xnu)
138  * 0xffffff80076dc000 __KLD (xnu)
139  * 0xffffff80076e0000 __LAST (xnu)
140  * 0xffffff80076e4000 __LINKEDIT (xnu)
141  * 0xffffff80076e4000 __PRELINK_DATA (not used yet)
142  * 0xffffff800782c000 __PRELINK_INFO
143  * 0xffffff80078e4000 -- End of kernelcache
144  */
145 
146 /* 24921709 - make XNU ready for KTRR
147  *
148  * Two possible kernel cache layouts, depending on which kcgen is being used.
149  * VAs increasing downwards.
150  * Old KCGEN:
151  *
152  * __PRELINK_TEXT
153  * __TEXT
154  * __DATA_CONST
155  * __TEXT_EXEC
156  * __KLD
157  * __LAST
158  * __DATA
159  * __PRELINK_DATA (expected empty)
160  * __LINKEDIT
161  * __PRELINK_INFO
162  *
163  * New kcgen:
164  *
165  * __PRELINK_TEXT    <--- First KTRR (ReadOnly) segment
166  * __PLK_DATA_CONST
167  * __PLK_TEXT_EXEC
168  * __TEXT
169  * __DATA_CONST
170  * __TEXT_EXEC
171  * __KLD
172  * __LAST            <--- Last KTRR (ReadOnly) segment
173  * __DATA
174  * __BOOTDATA (if present)
175  * __LINKEDIT
176  * __PRELINK_DATA (expected populated now)
177  * __PLK_LINKEDIT
178  * __PRELINK_INFO
179  *
180  */
181 
182 vm_offset_t mem_size;                             /* Size of actual physical memory present
183                                                    * minus any performance buffer and possibly
184                                                    * limited by mem_limit in bytes */
185 uint64_t    mem_actual;                           /* The "One True" physical memory size;
186                                                    * in practice, it is the highest physical
187                                                    * address + 1 */
188 uint64_t    max_mem;                              /* Size of physical memory (bytes), adjusted
189                                                    * by maxmem */
190 uint64_t    max_mem_actual;                       /* Actual size of physical memory (bytes),
191                                                    * adjusted by the maxmem boot-arg */
192 uint64_t    sane_size;                            /* Memory size to use for defaults
193                                                    * calculations */
194 /* This no longer appears to be used; kill it? */
195 addr64_t    vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Highest kernel
196                                                    * virtual address known
197                                                    * to the VM system */
198 
199 SECURITY_READ_ONLY_LATE(vm_offset_t)              segEXTRADATA;
200 SECURITY_READ_ONLY_LATE(unsigned long)            segSizeEXTRADATA;
201 
202 /* Trust cache portion of EXTRADATA (if within it) */
203 SECURITY_READ_ONLY_LATE(vm_offset_t)              segTRUSTCACHE;
204 SECURITY_READ_ONLY_LATE(unsigned long)            segSizeTRUSTCACHE;
205 
206 SECURITY_READ_ONLY_LATE(vm_offset_t)          segLOWESTTEXT;
207 SECURITY_READ_ONLY_LATE(vm_offset_t)          segLOWEST;
208 SECURITY_READ_ONLY_LATE(vm_offset_t)          segLOWESTRO;
209 SECURITY_READ_ONLY_LATE(vm_offset_t)          segHIGHESTRO;
210 
211 /* Only set when booted from MH_FILESET kernel collections */
212 SECURITY_READ_ONLY_LATE(vm_offset_t)          segLOWESTKC;
213 SECURITY_READ_ONLY_LATE(vm_offset_t)          segHIGHESTKC;
214 SECURITY_READ_ONLY_LATE(vm_offset_t)          segLOWESTROKC;
215 SECURITY_READ_ONLY_LATE(vm_offset_t)          segHIGHESTROKC;
216 SECURITY_READ_ONLY_LATE(vm_offset_t)          segLOWESTAuxKC;
217 SECURITY_READ_ONLY_LATE(vm_offset_t)          segHIGHESTAuxKC;
218 SECURITY_READ_ONLY_LATE(vm_offset_t)          segLOWESTROAuxKC;
219 SECURITY_READ_ONLY_LATE(vm_offset_t)          segHIGHESTROAuxKC;
220 SECURITY_READ_ONLY_LATE(vm_offset_t)          segLOWESTRXAuxKC;
221 SECURITY_READ_ONLY_LATE(vm_offset_t)          segHIGHESTRXAuxKC;
222 SECURITY_READ_ONLY_LATE(vm_offset_t)          segHIGHESTNLEAuxKC;
223 
224 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segTEXTB;
225 SECURITY_READ_ONLY_LATE(static unsigned long) segSizeTEXT;
226 
227 #if XNU_MONITOR
228 SECURITY_READ_ONLY_LATE(vm_offset_t)          segPPLTEXTB;
229 SECURITY_READ_ONLY_LATE(unsigned long)        segSizePPLTEXT;
230 
231 SECURITY_READ_ONLY_LATE(vm_offset_t)          segPPLTRAMPB;
232 SECURITY_READ_ONLY_LATE(unsigned long)        segSizePPLTRAMP;
233 
234 SECURITY_READ_ONLY_LATE(vm_offset_t)          segPPLDATACONSTB;
235 SECURITY_READ_ONLY_LATE(unsigned long)        segSizePPLDATACONST;
236 SECURITY_READ_ONLY_LATE(void *)               pmap_stacks_start = NULL;
237 SECURITY_READ_ONLY_LATE(void *)               pmap_stacks_end = NULL;
238 #if HAS_GUARDED_IO_FILTER
239 SECURITY_READ_ONLY_LATE(void *)               iofilter_stacks_start = NULL;
240 SECURITY_READ_ONLY_LATE(void *)               iofilter_stacks_end = NULL;
241 #endif
242 #endif
243 
244 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segDATACONSTB;
245 SECURITY_READ_ONLY_LATE(static unsigned long) segSizeDATACONST;
246 
247 SECURITY_READ_ONLY_LATE(vm_offset_t)   segTEXTEXECB;
248 SECURITY_READ_ONLY_LATE(unsigned long) segSizeTEXTEXEC;
249 
250 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segDATAB;
251 SECURITY_READ_ONLY_LATE(static unsigned long) segSizeDATA;
252 
253 #if XNU_MONITOR
254 SECURITY_READ_ONLY_LATE(vm_offset_t)          segPPLDATAB;
255 SECURITY_READ_ONLY_LATE(unsigned long)        segSizePPLDATA;
256 #endif
257 
258 SECURITY_READ_ONLY_LATE(vm_offset_t)          segBOOTDATAB;
259 SECURITY_READ_ONLY_LATE(unsigned long)        segSizeBOOTDATA;
260 extern vm_offset_t                            intstack_low_guard;
261 extern vm_offset_t                            intstack_high_guard;
262 extern vm_offset_t                            excepstack_high_guard;
263 
264 SECURITY_READ_ONLY_LATE(vm_offset_t)          segLINKB;
265 SECURITY_READ_ONLY_LATE(static unsigned long) segSizeLINK;
266 
267 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segKLDB;
268 SECURITY_READ_ONLY_LATE(unsigned long)        segSizeKLD;
269 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segKLDDATAB;
270 SECURITY_READ_ONLY_LATE(static unsigned long) segSizeKLDDATA;
271 SECURITY_READ_ONLY_LATE(vm_offset_t)          segLASTB;
272 SECURITY_READ_ONLY_LATE(unsigned long)        segSizeLAST;
273 SECURITY_READ_ONLY_LATE(vm_offset_t)          segLASTDATACONSTB;
274 SECURITY_READ_ONLY_LATE(unsigned long)        segSizeLASTDATACONST;
275 
276 SECURITY_READ_ONLY_LATE(vm_offset_t)          sectHIBTEXTB;
277 SECURITY_READ_ONLY_LATE(unsigned long)        sectSizeHIBTEXT;
278 SECURITY_READ_ONLY_LATE(vm_offset_t)          segHIBDATAB;
279 SECURITY_READ_ONLY_LATE(unsigned long)        segSizeHIBDATA;
280 SECURITY_READ_ONLY_LATE(vm_offset_t)          sectHIBDATACONSTB;
281 SECURITY_READ_ONLY_LATE(unsigned long)        sectSizeHIBDATACONST;
282 
283 SECURITY_READ_ONLY_LATE(vm_offset_t)          segPRELINKTEXTB;
284 SECURITY_READ_ONLY_LATE(unsigned long)        segSizePRELINKTEXT;
285 
286 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segPLKTEXTEXECB;
287 SECURITY_READ_ONLY_LATE(static unsigned long) segSizePLKTEXTEXEC;
288 
289 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segPLKDATACONSTB;
290 SECURITY_READ_ONLY_LATE(static unsigned long) segSizePLKDATACONST;
291 
292 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segPRELINKDATAB;
293 SECURITY_READ_ONLY_LATE(static unsigned long) segSizePRELINKDATA;
294 
295 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segPLKLLVMCOVB = 0;
296 SECURITY_READ_ONLY_LATE(static unsigned long) segSizePLKLLVMCOV = 0;
297 
298 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segPLKLINKEDITB;
299 SECURITY_READ_ONLY_LATE(static unsigned long) segSizePLKLINKEDIT;
300 
301 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segPRELINKINFOB;
302 SECURITY_READ_ONLY_LATE(static unsigned long) segSizePRELINKINFO;
303 
304 /* Only set when booted from MH_FILESET primary kernel collection */
305 SECURITY_READ_ONLY_LATE(vm_offset_t)          segKCTEXTEXECB;
306 SECURITY_READ_ONLY_LATE(unsigned long)        segSizeKCTEXTEXEC;
307 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segKCDATACONSTB;
308 SECURITY_READ_ONLY_LATE(static unsigned long) segSizeKCDATACONST;
309 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segKCDATAB;
310 SECURITY_READ_ONLY_LATE(static unsigned long) segSizeKCDATA;
311 
312 SECURITY_READ_ONLY_LATE(static boolean_t) use_contiguous_hint = TRUE;
313 
314 SECURITY_READ_ONLY_LATE(int) PAGE_SHIFT_CONST;
315 
316 SECURITY_READ_ONLY_LATE(vm_offset_t) end_kern;
317 SECURITY_READ_ONLY_LATE(vm_offset_t) etext;
318 SECURITY_READ_ONLY_LATE(vm_offset_t) sdata;
319 SECURITY_READ_ONLY_LATE(vm_offset_t) edata;
320 
321 SECURITY_READ_ONLY_LATE(static vm_offset_t) auxkc_mh, auxkc_base, auxkc_right_above;
322 
323 vm_offset_t alloc_ptpage(boolean_t map_static);
324 SECURITY_READ_ONLY_LATE(vm_offset_t) ropage_next;
325 extern int dtrace_keep_kernel_symbols(void);
326 
327 /*
328  * Bootstrap the system enough to run with virtual memory.
329  * Map the kernel's code and data, and allocate the system page table.
330  * Page_size must already be set.
331  *
332  * Parameters:
333  * first_avail: first available physical page -
334  *              after kernel page tables
335  * avail_start: PA of first physical page
336  * avail_end:   PA of last physical page
337  */
338 SECURITY_READ_ONLY_LATE(vm_offset_t)     first_avail;
339 SECURITY_READ_ONLY_LATE(vm_offset_t)     static_memory_end;
340 SECURITY_READ_ONLY_LATE(pmap_paddr_t)    avail_start;
341 SECURITY_READ_ONLY_LATE(pmap_paddr_t)    avail_end;
342 SECURITY_READ_ONLY_LATE(pmap_paddr_t)    real_avail_end;
343 SECURITY_READ_ONLY_LATE(unsigned long)   real_phys_size;
344 SECURITY_READ_ONLY_LATE(vm_map_address_t) physmap_base = (vm_map_address_t)0;
345 SECURITY_READ_ONLY_LATE(vm_map_address_t) physmap_end = (vm_map_address_t)0;
346 
347 /**
348  * First physical address freely available to xnu.
349  */
350 SECURITY_READ_ONLY_LATE(addr64_t) first_avail_phys = 0;
351 
352 /*
353  * Bounds of the kernelcache; used for accounting.
354  */
355 SECURITY_READ_ONLY_LATE(vm_offset_t) arm_vm_kernelcache_phys_start;
356 SECURITY_READ_ONLY_LATE(vm_offset_t) arm_vm_kernelcache_phys_end;
357 
358 #if __ARM_KERNEL_PROTECT__
359 extern void ExceptionVectorsBase;
360 extern void ExceptionVectorsEnd;
361 #endif /* __ARM_KERNEL_PROTECT__ */
362 
363 typedef struct {
364 	pmap_paddr_t pa;
365 	vm_map_address_t va;
366 	vm_size_t len;
367 } ptov_table_entry;
368 
369 #if HAS_MTE
370 #define PTOV_TABLE_SIZE 9
371 #else /* HAS_MTE */
372 #define PTOV_TABLE_SIZE 8
373 #endif /* HAS_MTE */
374 
375 SECURITY_READ_ONLY_LATE(static ptov_table_entry)        ptov_table[PTOV_TABLE_SIZE];
376 SECURITY_READ_ONLY_LATE(static boolean_t)               kva_active = FALSE;
377 
378 #define ARM64_PAGE_UNGUARDED (0)
379 #define ARM64_PAGE_GUARDED   (1)
380 
381 /* "physical to kernel virtual" - given a physical address, return the corresponding physical aperture address */
382 vm_map_address_t
383 phystokv(pmap_paddr_t pa)
384 {
385 
386 	for (size_t i = 0; (i < PTOV_TABLE_SIZE) && (ptov_table[i].len != 0); i++) {
387 		if ((pa >= ptov_table[i].pa) && (pa < (ptov_table[i].pa + ptov_table[i].len))) {
388 			return pa - ptov_table[i].pa + ptov_table[i].va;
389 		}
390 	}
391 	if (__improbable((pa < gPhysBase) || ((pa - gPhysBase) >= real_phys_size))) {
392 		panic("%s: illegal PA: 0x%llx; phys base 0x%llx, size 0x%llx", __func__,
393 		    (unsigned long long)pa, (unsigned long long)gPhysBase, (unsigned long long)real_phys_size);
394 	}
395 	return pa - gPhysBase + gVirtBase;
396 }
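
/*
 * Usage sketch (illustrative only, not compiled): phystokv() is how this file
 * turns a physical page it owns back into a usable kernel virtual address; the
 * dynamic branch of alloc_ptpage() below does exactly this with avail_start.
 */
#if 0
pmap_paddr_t pa = avail_start;          /* hypothetical physical page owned by xnu */
vm_map_address_t kva = phystokv(pa);    /* ptov_table hit, or physical-aperture fallback */
bzero((void *)kva, ARM_PGBYTES);        /* the page is now accessible through kva */
#endif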
397 
398 vm_map_address_t
399 phystokv_range(pmap_paddr_t pa, vm_size_t *max_len)
400 {
401 
402 	vm_size_t len;
403 	for (size_t i = 0; (i < PTOV_TABLE_SIZE) && (ptov_table[i].len != 0); i++) {
404 		if ((pa >= ptov_table[i].pa) && (pa < (ptov_table[i].pa + ptov_table[i].len))) {
405 			len = ptov_table[i].len - (pa - ptov_table[i].pa);
406 			if (*max_len > len) {
407 				*max_len = len;
408 			}
409 			return pa - ptov_table[i].pa + ptov_table[i].va;
410 		}
411 	}
412 	len = PAGE_SIZE - (pa & PAGE_MASK);
413 	if (*max_len > len) {
414 		*max_len = len;
415 	}
416 	if (__improbable((pa < gPhysBase) || ((pa - gPhysBase) >= real_phys_size))) {
417 		panic("%s: illegal PA: 0x%llx; phys base 0x%llx, size 0x%llx", __func__,
418 		    (unsigned long long)pa, (unsigned long long)gPhysBase, (unsigned long long)real_phys_size);
419 	}
420 	return pa - gPhysBase + gVirtBase;
421 }
422 
423 vm_offset_t
424 ml_static_vtop(vm_offset_t va)
425 {
426 	for (size_t i = 0; (i < PTOV_TABLE_SIZE) && (ptov_table[i].len != 0); i++) {
427 		if ((va >= ptov_table[i].va) && (va < (ptov_table[i].va + ptov_table[i].len))) {
428 			return va - ptov_table[i].va + ptov_table[i].pa;
429 		}
430 	}
431 	if (__improbable((va < gVirtBase) || (((vm_address_t)(va) - gVirtBase) >= gPhysSize))) {
432 		panic("%s: illegal VA: %p; virt base 0x%llx, size 0x%llx", __func__,
433 		    (void*)va, (unsigned long long)gVirtBase, (unsigned long long)gPhysSize);
434 	}
435 	return (vm_address_t)(va) - gVirtBase + gPhysBase;
436 }
437 
438 /*
439  * This rounds the given address up to the nearest boundary for a PTE contiguous
440  * hint.
441  */
442 static vm_offset_t
443 round_up_pte_hint_address(vm_offset_t address)
444 {
445 	vm_offset_t hint_size = ARM_PTE_SIZE << ARM_PTE_HINT_ENTRIES_SHIFT;
446 	return (address + (hint_size - 1)) & ~(hint_size - 1);
447 }
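
/*
 * Worked example (granule values assumed for illustration): with 16 KB pages and
 * ARM_PTE_HINT_ENTRIES_SHIFT == 7 (128 contiguous entries), hint_size is 2 MB, so
 * 0xfffffff007210000 rounds up to 0xfffffff007400000, while an address already on
 * a 2 MB boundary is returned unchanged.
 */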
448 
449 /* allocate a page for a page table: we support static and dynamic mappings.
450  *
451  * returns a virtual address for the allocated page
452  *
453  * for static mappings, we allocate from the region ropagetable_begin to ropagetable_end-1,
454  * which is defined in the DATA_CONST segment and will be protected RNX when vm_prot_finalize runs.
455  *
456  * for dynamic mappings, we allocate from avail_start, which should remain RWNX.
457  */
458 
459 vm_offset_t
460 alloc_ptpage(boolean_t map_static)
461 {
462 	vm_offset_t vaddr;
463 
464 #if !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR))
465 	map_static = FALSE;
466 #endif
467 
468 	if (!ropage_next) {
469 		ropage_next = (vm_offset_t)&ropagetable_begin;
470 	}
471 
472 	if (map_static) {
473 		assert(ropage_next < (vm_offset_t)&ropagetable_end);
474 
475 		vaddr = ropage_next;
476 		ropage_next += ARM_PGBYTES;
477 
478 		return vaddr;
479 	} else {
480 		vaddr = phystokv(avail_start);
481 		avail_start += ARM_PGBYTES;
482 
483 		return vaddr;
484 	}
485 }
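
/*
 * Usage sketch (illustrative only, not compiled): pagetable pages that must end up
 * read-only under KTRR/CTRR come from the ropagetable region; everything else is
 * carved off avail_start.  Callers in this file zero the page themselves.
 */
#if 0
pt_entry_t *ro_table  = (pt_entry_t *)alloc_ptpage(TRUE);   /* static: ropagetable_begin..ropagetable_end */
pt_entry_t *dyn_table = (pt_entry_t *)alloc_ptpage(FALSE);  /* dynamic: consumes a page at avail_start */
bzero(ro_table, ARM_PGBYTES);
bzero(dyn_table, ARM_PGBYTES);
#endif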
486 
487 #if DEBUG
488 
489 void dump_kva_l2(vm_offset_t tt_base, tt_entry_t *tt, int indent, uint64_t *rosz_out, uint64_t *rwsz_out);
490 
491 void
492 dump_kva_l2(vm_offset_t tt_base, tt_entry_t *tt, int indent, uint64_t *rosz_out, uint64_t *rwsz_out)
493 {
494 	unsigned int i;
495 	boolean_t cur_ro, prev_ro = 0;
496 	int start_entry = -1;
497 	tt_entry_t cur, prev = 0;
498 	pmap_paddr_t robegin = kvtophys((vm_offset_t)&ropagetable_begin);
499 	pmap_paddr_t roend = kvtophys((vm_offset_t)&ropagetable_end);
500 	boolean_t tt_static = kvtophys((vm_offset_t)tt) >= robegin &&
501 	    kvtophys((vm_offset_t)tt) < roend;
502 
503 	for (i = 0; i < TTE_PGENTRIES; i++) {
504 		int tte_type = tt[i] & ARM_TTE_TYPE_MASK;
505 		cur = tt[i] & ARM_TTE_TABLE_MASK;
506 
507 		if (tt_static) {
508 			/* addresses mapped by this entry are static if it is a block mapping,
509 			 * or the table was allocated from the RO page table region */
510 			cur_ro = (tte_type == ARM_TTE_TYPE_BLOCK) || (cur >= robegin && cur < roend);
511 		} else {
512 			cur_ro = 0;
513 		}
514 
515 		if ((cur == 0 && prev != 0) || (cur_ro != prev_ro && prev != 0)) { // falling edge
516 			uintptr_t start, end, sz;
517 
518 			start = (uintptr_t)start_entry << ARM_TT_L2_SHIFT;
519 			start += tt_base;
520 			end = ((uintptr_t)i << ARM_TT_L2_SHIFT) - 1;
521 			end += tt_base;
522 
523 			sz = end - start + 1;
524 			printf("%*s0x%08x_%08x-0x%08x_%08x %s (%luMB)\n",
525 			    indent * 4, "",
526 			    (uint32_t)(start >> 32), (uint32_t)start,
527 			    (uint32_t)(end >> 32), (uint32_t)end,
528 			    prev_ro ? "Static " : "Dynamic",
529 			    (sz >> 20));
530 
531 			if (prev_ro) {
532 				*rosz_out += sz;
533 			} else {
534 				*rwsz_out += sz;
535 			}
536 		}
537 
538 		if ((prev == 0 && cur != 0) || cur_ro != prev_ro) { // rising edge: set start
539 			start_entry = i;
540 		}
541 
542 		prev = cur;
543 		prev_ro = cur_ro;
544 	}
545 }
546 
547 void
548 dump_kva_space()
549 {
550 	uint64_t tot_rosz = 0, tot_rwsz = 0;
551 	int ro_ptpages, rw_ptpages;
552 	pmap_paddr_t robegin = kvtophys((vm_offset_t)&ropagetable_begin);
553 	pmap_paddr_t roend = kvtophys((vm_offset_t)&ropagetable_end);
554 	boolean_t root_static = kvtophys((vm_offset_t)cpu_tte) >= robegin &&
555 	    kvtophys((vm_offset_t)cpu_tte) < roend;
556 	uint64_t kva_base = ~((1ULL << (64 - T1SZ_BOOT)) - 1);
557 
558 	printf("Root page table: %s\n", root_static ? "Static" : "Dynamic");
559 
560 	for (unsigned int i = 0; i < TTE_PGENTRIES; i++) {
561 		pmap_paddr_t cur;
562 		boolean_t cur_ro;
563 		uintptr_t start, end;
564 		uint64_t rosz = 0, rwsz = 0;
565 
566 		if ((cpu_tte[i] & ARM_TTE_VALID) == 0) {
567 			continue;
568 		}
569 
570 		cur = cpu_tte[i] & ARM_TTE_TABLE_MASK;
571 		start = (uint64_t)i << ARM_TT_L1_SHIFT;
572 		start = start + kva_base;
573 		end = start + (ARM_TT_L1_SIZE - 1);
574 		cur_ro = cur >= robegin && cur < roend;
575 
576 		printf("0x%08x_%08x-0x%08x_%08x %s\n",
577 		    (uint32_t)(start >> 32), (uint32_t)start,
578 		    (uint32_t)(end >> 32), (uint32_t)end,
579 		    cur_ro ? "Static " : "Dynamic");
580 
581 		dump_kva_l2(start, (tt_entry_t*)phystokv(cur), 1, &rosz, &rwsz);
582 		tot_rosz += rosz;
583 		tot_rwsz += rwsz;
584 	}
585 
586 	printf("L2 Address space mapped: Static %lluMB Dynamic %lluMB Total %lluMB\n",
587 	    tot_rosz >> 20,
588 	    tot_rwsz >> 20,
589 	    (tot_rosz >> 20) + (tot_rwsz >> 20));
590 
591 	ro_ptpages = (int)((ropage_next - (vm_offset_t)&ropagetable_begin) >> ARM_PGSHIFT);
592 	rw_ptpages = (int)(lowGlo.lgStaticSize  >> ARM_PGSHIFT);
593 	printf("Pages used: static %d dynamic %d\n", ro_ptpages, rw_ptpages);
594 }
595 
596 #endif /* DEBUG */
597 
598 #if __ARM_KERNEL_PROTECT__ || XNU_MONITOR
599 /*
600  * arm_vm_map:
601  *   root_ttp: The kernel virtual address for the root of the target page tables
602  *   vaddr: The target virtual address
603  *   pte: A page table entry value (may be ARM_PTE_EMPTY)
604  *
605  * This function installs pte at vaddr in root_ttp.  Any page table pages needed
606  * to install pte will be allocated by this function.
607  */
608 static void
609 arm_vm_map(tt_entry_t * root_ttp, vm_offset_t vaddr, pt_entry_t pte)
610 {
611 	vm_offset_t ptpage = 0;
612 	tt_entry_t * ttp = root_ttp;
613 
614 	tt_entry_t * l1_ttep = NULL;
615 	tt_entry_t l1_tte = 0;
616 
617 	tt_entry_t * l2_ttep = NULL;
618 	tt_entry_t l2_tte = 0;
619 	pt_entry_t * ptep = NULL;
620 	pt_entry_t cpte = 0;
621 
622 	/*
623 	 * Walk the target page table to find the PTE for the given virtual
624 	 * address.  Allocate any page table pages needed to do this.
625 	 */
626 	l1_ttep = ttp + L1_TABLE_T1_INDEX(vaddr, TCR_EL1_BOOT);
627 	l1_tte = *l1_ttep;
628 
629 	if (l1_tte == ARM_TTE_EMPTY) {
630 		ptpage = alloc_ptpage(TRUE);
631 		bzero((void *)ptpage, ARM_PGBYTES);
632 		l1_tte = kvtophys(ptpage);
633 		l1_tte &= ARM_TTE_TABLE_MASK;
634 		l1_tte |= ARM_TTE_VALID | ARM_TTE_TYPE_TABLE | ARM_TTE_TABLE_AP(ARM_TTE_TABLE_AP_USER_NA);
635 		*l1_ttep = l1_tte;
636 		ptpage = 0;
637 	}
638 
639 	ttp = (tt_entry_t *)phystokv(l1_tte & ARM_TTE_TABLE_MASK);
640 
641 	l2_ttep = ttp + ((vaddr & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
642 	l2_tte = *l2_ttep;
643 
644 	if (l2_tte == ARM_TTE_EMPTY) {
645 		ptpage = alloc_ptpage(TRUE);
646 		bzero((void *)ptpage, ARM_PGBYTES);
647 		l2_tte = kvtophys(ptpage);
648 		l2_tte &= ARM_TTE_TABLE_MASK;
649 		l2_tte |= ARM_TTE_VALID | ARM_TTE_TYPE_TABLE;
650 		*l2_ttep = l2_tte;
651 		ptpage = 0;
652 	}
653 
654 	ttp = (tt_entry_t *)phystokv(l2_tte & ARM_TTE_TABLE_MASK);
655 
656 	ptep = ttp + ((vaddr & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT);
657 	cpte = *ptep;
658 
659 	/*
660 	 * If the existing PTE is not empty, then we are replacing a valid
661 	 * mapping.
662 	 */
663 	if (cpte != ARM_PTE_EMPTY) {
664 		panic("%s: cpte=%#llx is not empty, "
665 		    "vaddr=%#lx, pte=%#llx",
666 		    __FUNCTION__, cpte,
667 		    vaddr, pte);
668 	}
669 
670 	*ptep = pte;
671 }
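
/*
 * Usage sketch (illustrative only, not compiled): the caller supplies a fully
 * formed PTE and arm_vm_map() only walks/extends the tables and installs it.
 * The PTE construction mirrors the style used elsewhere in this file; pa and
 * vaddr are hypothetical.
 */
#if 0
pt_entry_t new_pte = (pa & ARM_PTE_MASK) | ARM_PTE_TYPE_VALID | ARM_PTE_AF |
    ARM_PTE_SH(SH_OUTER_MEMORY) | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT) |
    ARM_PTE_AP(AP_RONA) | ARM_PTE_NX;
arm_vm_map(cpu_tte, vaddr, new_pte);   /* allocates any missing L2/L3 tables, then installs the PTE */
#endif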
672 
673 #endif /* __ARM_KERNEL_PROTECT__ || XNU_MONITOR */
674 
675 #if __ARM_KERNEL_PROTECT__
676 
677 /*
678  * arm_vm_kernel_el0_map:
679  *   vaddr: The target virtual address
680  *   pte: A page table entry value (may be ARM_PTE_EMPTY)
681  *
682  * This function installs pte at vaddr for the EL0 kernel mappings.
683  */
684 static void
685 arm_vm_kernel_el0_map(vm_offset_t vaddr, pt_entry_t pte)
686 {
687 	/* Calculate where vaddr will be in the EL1 kernel page tables. */
688 	vm_offset_t kernel_pmap_vaddr = vaddr - KERN_PROTECT_REGION_SIZE;
689 	arm_vm_map(cpu_tte, kernel_pmap_vaddr, pte);
690 }
691 
692 /*
693  * arm_vm_kernel_el1_map:
694  *   vaddr: The target virtual address
695  *   pte: A page table entry value (may be ARM_PTE_EMPTY)
696  *
697  * This function installs pte at vaddr for the EL1 kernel mappings.
698  */
699 static void
700 arm_vm_kernel_el1_map(vm_offset_t vaddr, pt_entry_t pte)
701 {
702 	arm_vm_map(cpu_tte, vaddr, pte);
703 }
704 
705 /*
706  * arm_vm_kernel_pte:
707  *   vaddr: The target virtual address
708  *
709  * This function returns the PTE value for the given vaddr from the kernel page
710  * tables.  If the region has been block mapped, we return what an
711  * equivalent PTE value would be (as regards permissions and flags).  We also
712  * remove the HINT bit (as we are not necessarily creating contiguous mappings).
713  */
714 static pt_entry_t
715 arm_vm_kernel_pte(vm_offset_t vaddr)
716 {
717 	tt_entry_t * ttp = cpu_tte;
718 	tt_entry_t * ttep = NULL;
719 	tt_entry_t tte = 0;
720 	pt_entry_t * ptep = NULL;
721 	pt_entry_t pte = 0;
722 
723 	ttep = ttp + L1_TABLE_T1_INDEX(vaddr, TCR_EL1_BOOT);
724 	tte = *ttep;
725 
726 	assert(tte & ARM_TTE_VALID);
727 
728 	if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
729 		/* This is a block mapping; return the equivalent PTE value. */
730 		pte = (pt_entry_t)(tte & ~ARM_TTE_TYPE_MASK);
731 		pte |= ARM_PTE_TYPE_VALID;
732 		pte |= vaddr & ((ARM_TT_L1_SIZE - 1) & ARM_PTE_PAGE_MASK);
733 		pte &= ~ARM_PTE_HINT_MASK;
734 		return pte;
735 	}
736 
737 	ttp = (tt_entry_t *)phystokv(tte & ARM_TTE_TABLE_MASK);
738 	ttep = ttp + ((vaddr & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
739 	tte = *ttep;
740 
741 	assert(tte & ARM_TTE_VALID);
742 
743 	if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
744 		/* This is a block mapping; return the equivalent PTE value. */
745 		pte = (pt_entry_t)(tte & ~ARM_TTE_TYPE_MASK);
746 		pte |= ARM_PTE_TYPE_VALID;
747 		pte |= vaddr & ((ARM_TT_L2_SIZE - 1) & ARM_PTE_PAGE_MASK);
748 		pte &= ~ARM_PTE_HINT_MASK;
749 		return pte;
750 	}
751 
752 	ttp = (tt_entry_t *)phystokv(tte & ARM_TTE_TABLE_MASK);
753 
754 	ptep = ttp + ((vaddr & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT);
755 	pte = *ptep;
756 	pte &= ~ARM_PTE_HINT_MASK;
757 	return pte;
758 }
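
/*
 * Worked example (block size assumed for illustration): if vaddr falls inside a
 * 2 MB L2 block whose output address is 0x80200000 and vaddr is 0x4000 into that
 * block, the synthesized PTE keeps the block's attribute bits, is marked as a
 * valid page entry with output address 0x80204000, and has the contiguous hint
 * cleared -- i.e. what a purely page-granular table would have held for that page.
 */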
759 
760 /*
761  * arm_vm_prepare_kernel_el0_mappings:
762  *   alloc_only: Indicates if PTE values should be copied from the EL1 kernel
763  *     mappings.
764  *
765  * This function expands the kernel page tables to support the EL0 kernel
766  * mappings, and conditionally installs the PTE values for the EL0 kernel
767  * mappings (if alloc_only is false).
768  */
769 static void
770 arm_vm_prepare_kernel_el0_mappings(bool alloc_only)
771 {
772 	pt_entry_t pte = 0;
773 	vm_offset_t start = ((vm_offset_t)&ExceptionVectorsBase) & ~PAGE_MASK;
774 	vm_offset_t end = (((vm_offset_t)&ExceptionVectorsEnd) + PAGE_MASK) & ~PAGE_MASK;
775 	vm_offset_t cur = 0;
776 	vm_offset_t cur_fixed = 0;
777 
778 	/* Expand for/map the exception vectors in the EL0 kernel mappings. */
779 	for (cur = start, cur_fixed = ARM_KERNEL_PROTECT_EXCEPTION_START; cur < end; cur += ARM_PGBYTES, cur_fixed += ARM_PGBYTES) {
780 		/*
781 		 * We map the exception vectors at a different address than that
782 		 * of the kernelcache to avoid sharing page table pages with the
783 		 * kernelcache (as this may cause issues with TLB caching of
784 		 * page table pages.
785 		 */
786 		if (!alloc_only) {
787 			pte = arm_vm_kernel_pte(cur);
788 		}
789 
790 		arm_vm_kernel_el1_map(cur_fixed, pte);
791 		arm_vm_kernel_el0_map(cur_fixed, pte);
792 	}
793 
794 	__builtin_arm_dmb(DMB_ISH);
795 	__builtin_arm_isb(ISB_SY);
796 
797 	if (!alloc_only) {
798 		/*
799 		 * If we have created the alternate exception vector mappings,
800 		 * the boot CPU may now switch over to them.
801 		 */
802 		set_vbar_el1(ARM_KERNEL_PROTECT_EXCEPTION_START);
803 		__builtin_arm_isb(ISB_SY);
804 	}
805 }
806 
807 /*
808  * arm_vm_populate_kernel_el0_mappings:
809  *
810  * This function adds all required mappings to the EL0 kernel mappings.
811  */
812 static void
813 arm_vm_populate_kernel_el0_mappings(void)
814 {
815 	arm_vm_prepare_kernel_el0_mappings(FALSE);
816 }
817 
818 /*
819  * arm_vm_expand_kernel_el0_mappings:
820  *
821  * This function expands the kernel page tables to accommodate the EL0 kernel
822  * mappings.
823  */
824 static void
825 arm_vm_expand_kernel_el0_mappings(void)
826 {
827 	arm_vm_prepare_kernel_el0_mappings(TRUE);
828 }
829 #endif /* __ARM_KERNEL_PROTECT__ */
830 
831 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
832 extern void bootstrap_instructions;
833 
834 /*
835  * arm_replace_identity_map takes the V=P map that we construct in start.s
836  * and repurposes it so that it maps only the page we need in order
837  * to turn on the MMU.  This prevents us from running into issues where
838  * KTRR will cause us to fault on executable block mappings that cross the
839  * KTRR boundary.
840  */
841 static void
842 arm_replace_identity_map(void)
843 {
844 	vm_offset_t addr;
845 	pmap_paddr_t paddr;
846 
847 	pmap_paddr_t l1_ptp_phys = 0;
848 	tt_entry_t *l1_ptp_virt = NULL;
849 	tt_entry_t *tte1 = NULL;
850 	pmap_paddr_t l2_ptp_phys = 0;
851 	tt_entry_t *l2_ptp_virt = NULL;
852 	tt_entry_t *tte2 = NULL;
853 	pmap_paddr_t l3_ptp_phys = 0;
854 	pt_entry_t *l3_ptp_virt = NULL;
855 	pt_entry_t *ptep = NULL;
856 
857 	addr = ((vm_offset_t)&bootstrap_instructions) & ~ARM_PGMASK;
858 	paddr = kvtophys(addr);
859 
860 	/*
861 	 * Grab references to the V=P page tables, and allocate an L3 page.
862 	 */
863 	l1_ptp_phys = kvtophys((vm_offset_t)&bootstrap_pagetables);
864 	l1_ptp_virt = (tt_entry_t *)phystokv(l1_ptp_phys);
865 	tte1 = &l1_ptp_virt[L1_TABLE_T1_INDEX(paddr, TCR_EL1_BOOT)];
866 
867 	l2_ptp_virt = L2_TABLE_VA(tte1);
868 	l2_ptp_phys = (*tte1) & ARM_TTE_TABLE_MASK;
869 	tte2 = &l2_ptp_virt[L2_TABLE_INDEX(paddr)];
870 
871 	l3_ptp_virt = (pt_entry_t *)alloc_ptpage(TRUE);
872 	l3_ptp_phys = kvtophys((vm_offset_t)l3_ptp_virt);
873 	ptep = &l3_ptp_virt[L3_TABLE_INDEX(paddr)];
874 
875 	/*
876 	 * Replace the large V=P mapping with a mapping that provides only the
877 	 * mappings needed to turn on the MMU.
878 	 */
879 
880 	bzero(l1_ptp_virt, ARM_PGBYTES);
881 	*tte1 = ARM_TTE_BOOT_TABLE | (l2_ptp_phys & ARM_TTE_TABLE_MASK);
882 
883 	bzero(l2_ptp_virt, ARM_PGBYTES);
884 	*tte2 = ARM_TTE_BOOT_TABLE | (l3_ptp_phys & ARM_TTE_TABLE_MASK);
885 
886 	*ptep = (paddr & ARM_PTE_MASK) |
887 	    ARM_PTE_TYPE_VALID |
888 	    ARM_PTE_SH(SH_OUTER_MEMORY) |
889 	    ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) |
890 	    ARM_PTE_AF |
891 	    ARM_PTE_AP(AP_RONA) |
892 	    ARM_PTE_NX;
893 }
894 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */
895 
896 tt_entry_t *arm_kva_to_tte(vm_offset_t);
897 
898 tt_entry_t *
899 arm_kva_to_tte(vm_offset_t va)
900 {
901 	tt_entry_t *tte1, *tte2;
902 	tte1 = cpu_tte + L1_TABLE_T1_INDEX(va, TCR_EL1_BOOT);
903 	tte2 = L2_TABLE_VA(tte1) + L2_TABLE_INDEX(va);
904 
905 	return tte2;
906 }
907 
908 #if XNU_MONITOR
909 
910 static inline pt_entry_t *
911 arm_kva_to_pte(vm_offset_t va)
912 {
913 	tt_entry_t *tte2 = arm_kva_to_tte(va);
914 	return L3_TABLE_VA(tte2) + L3_TABLE_INDEX(va);
915 }
916 
917 #endif
918 
919 #define ARM64_GRANULE_ALLOW_BLOCK (1 << 0)
920 #define ARM64_GRANULE_ALLOW_HINT (1 << 1)
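
/*
 * Summary of the granule flags used by the arm_vm_page_granular_* helpers below:
 * ARM64_GRANULE_ALLOW_BLOCK permits L2 block mappings for suitably aligned chunks,
 * ARM64_GRANULE_ALLOW_HINT additionally permits the PTE contiguous hint, and a
 * granule of 0 forces page-granule L3 mappings.  For example (calls taken from
 * arm_vm_prot_init() below):
 */
#if 0
arm_vm_page_granular_RWNX(segDATAB, segSizeDATA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
arm_vm_page_granular_RWNX(segBOOTDATAB, segSizeBOOTDATA, 0);   /* page granule only */
#endif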
921 
922 /**
923  * Updates a translation table entry (TTE) with the supplied value, unless doing so might render
924  * the pagetable region read-only before subsequent updates have finished.  In that case, the TTE
925  * value will be saved off for deferred processing.
926  *
927  * @param ttep address of the TTE to update
928  * @param entry the value to store in ttep
929  * @param pa the base physical address mapped by the TTE
930  * @param ttebase L3-page- or L2-block-aligned base virtual address of the pagetable region
931  * @param granule mask indicating whether L2 block or L3 hint mappings are allowed for this segment
932  * @param deferred_ttep_pair 2-element array of addresses of deferred TTEs
933  * @param deferred_tte_pair 2-element array containing TTE values for deferred assignment to
934  *        corresponding elements of deferred_ttep_pair
935  */
936 static void
937 update_or_defer_tte(tt_entry_t *ttep, tt_entry_t entry, pmap_paddr_t pa, vm_map_address_t ttebase,
938     unsigned granule __unused, tt_entry_t **deferred_ttep_pair, tt_entry_t *deferred_tte_pair)
939 {
940 	/*
941 	 * If we're trying to assign an entry that maps the current TTE region (identified by ttebase),
942 	 * and the pagetable is already live (indicated by kva_active), defer assignment of the current
943 	 * entry and possibly the entry after it until all other mappings in the segment have been
944 	 * updated.  Otherwise we may end up immediately marking the pagetable region read-only
945 	 * updated.  Otherwise we may end up immediately marking the pagetable region read-only,
946 	 * leading to a fault on a later assignment if we manage to outrun the TLB.  This can
947 	 * that map that segment must come from the segment itself.  We therefore store the initial
948 	 * recursive TTE in deferred_ttep_pair[0] and its value in deferred_tte_pair[0].  We may also
949 	 * defer assignment of the TTE following that recursive TTE and store its value in
950 	 * deferred_tte_pair[1], because the TTE region following the current one may also contain
951 	 * pagetables and we must avoid marking that region read-only before updating those tables.
952 	 *
953 	 * We require that such recursive mappings must exist in regions that can be mapped with L2
954 	 * block entries if they are sufficiently large.  This is what allows us to assume that no
955 	 * more than 2 deferred TTEs will be required, because:
956 	 * 	--If more than 2 adjacent L3 PTEs were required to map our pagetables, that would mean
957 	 * 	  we would have at least one full L3 pagetable page and would instead use an L2 block.
958 	 *	--If more than 2 adjacent L2 blocks were required to map our pagetables, that would
959 	 * 	  mean we would have at least one full L2-block-sized region of TTEs and something
960 	 *	  is very wrong because no segment should be that large.
961 	 */
962 	if ((deferred_ttep_pair != NULL) && (deferred_ttep_pair[0] != NULL) && (ttep == (deferred_ttep_pair[0] + 1))) {
963 		assert(deferred_tte_pair[1] == 0);
964 		deferred_ttep_pair[1] = ttep;
965 		deferred_tte_pair[1] = entry;
966 	} else if (kva_active && (phystokv(pa) == ttebase)) {
967 		assert(deferred_ttep_pair != NULL);
968 		assert(granule & ARM64_GRANULE_ALLOW_BLOCK);
969 		if (deferred_ttep_pair[0] == NULL) {
970 			deferred_ttep_pair[0] = ttep;
971 			deferred_tte_pair[0] = entry;
972 		} else {
973 			assert(deferred_ttep_pair[1] == NULL);
974 			deferred_ttep_pair[1] = ttep;
975 			deferred_tte_pair[1] = entry;
976 		}
977 	} else {
978 		*ttep = entry;
979 	}
980 }
981 
982 /*
983  * arm_vm_page_granular_helper updates protections at the L3 level.  It will (if
984  * necessary) allocate a page for the L3 table and update the corresponding L2
985  * entry.  Then, it will iterate over the L3 table, updating protections as necessary.
986  * This expects to be invoked on a L2 entry or sub L2 entry granularity, so this should
987  * not be invoked from a context that does not do L2 iteration separately (basically,
988  * don't call this except from arm_vm_page_granular_prot).
989  *
990  * unsigned granule: 0 => force to page granule, or a combination of
991  * ARM64_GRANULE_* flags declared above.
992  *
993  * unsigned int guarded => flag indicating whether this range should be
994  * considered an ARM "guarded" page. This enables BTI enforcement for a region.
995  */
996 
997 static void
998 arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va, pmap_paddr_t pa_offset,
999     int pte_prot_APX, int pte_prot_XN, unsigned granule, __unused unsigned int guarded,
1000     tt_entry_t **deferred_ttep_pair, tt_entry_t *deferred_tte_pair)
1001 {
1002 	if (va & ARM_TT_L2_OFFMASK) { /* ragged edge hanging over a ARM_TT_L2_SIZE  boundary */
1003 		tt_entry_t *tte2;
1004 		tt_entry_t tmplate;
1005 		pmap_paddr_t pa;
1006 		pt_entry_t *ppte, ptmp;
1007 		addr64_t ppte_phys;
1008 		unsigned i;
1009 
1010 		va &= ~ARM_TT_L2_OFFMASK;
1011 		pa = va - gVirtBase + gPhysBase - pa_offset;
1012 
1013 		if (pa >= real_avail_end) {
1014 			return;
1015 		}
1016 
1017 		tte2 = arm_kva_to_tte(va);
1018 
1019 		assert(_end >= va);
1020 		tmplate = *tte2;
1021 
1022 		if (ARM_TTE_TYPE_TABLE == (tmplate & ARM_TTE_TYPE_MASK)) {
1023 			/* pick up the existing page table. */
1024 			ppte = (pt_entry_t *)phystokv((tmplate & ARM_TTE_TABLE_MASK));
1025 		} else {
1026 			// TTE must be reincarnated with page level mappings.
1027 
1028 			// ... but we don't want to break up blocks on live
1029 			// translation tables.
1030 			assert(!kva_active);
1031 
1032 			ppte = (pt_entry_t*)alloc_ptpage(pa_offset == 0);
1033 			bzero(ppte, ARM_PGBYTES);
1034 			ppte_phys = kvtophys((vm_offset_t)ppte);
1035 
1036 			*tte2 = pa_to_tte(ppte_phys) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
1037 		}
1038 
1039 		vm_offset_t len = _end - va;
1040 		if ((pa + len) > real_avail_end) {
1041 			_end -= (pa + len - real_avail_end);
1042 		}
1043 		assert((start - gVirtBase + gPhysBase - pa_offset) >= gPhysBase);
1044 
1045 		/* Round up to the nearest PAGE_SIZE boundary when creating mappings:
1046 		 * PAGE_SIZE may be a multiple of ARM_PGBYTES, and we don't want to leave
1047 		 * a ragged non-PAGE_SIZE-aligned edge. */
1048 		vm_offset_t rounded_end = round_page(_end);
1049 		/* Apply the desired protections to the specified page range */
1050 		for (i = 0; i <= (ARM_TT_L3_INDEX_MASK >> ARM_TT_L3_SHIFT); i++) {
1051 			if ((start <= va) && (va < rounded_end)) {
1052 				ptmp = pa | ARM_PTE_AF | ARM_PTE_SH(SH_OUTER_MEMORY) | ARM_PTE_TYPE_VALID;
1053 				ptmp = ptmp | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
1054 				ptmp = ptmp | ARM_PTE_AP(pte_prot_APX);
1055 				ptmp = ptmp | ARM_PTE_NX;
1056 #if __ARM_KERNEL_PROTECT__
1057 				ptmp = ptmp | ARM_PTE_NG;
1058 #endif /* __ARM_KERNEL_PROTECT__ */
1059 
1060 				if (pte_prot_XN) {
1061 					ptmp = ptmp | ARM_PTE_PNX;
1062 				}
1063 
1064 				/*
1065 				 * If we can, apply the contiguous hint to this range.  The hint is
1066 				 * applicable if the current address falls within a hint-sized range that will
1067 				 * be fully covered by this mapping request.
1068 				 */
1069 				if ((va >= round_up_pte_hint_address(start)) && (round_up_pte_hint_address(va + 1) <= _end) &&
1070 				    (granule & ARM64_GRANULE_ALLOW_HINT) && use_contiguous_hint) {
1071 					assert((va & ((1 << ARM_PTE_HINT_ADDR_SHIFT) - 1)) == ((pa & ((1 << ARM_PTE_HINT_ADDR_SHIFT) - 1))));
1072 					ptmp |= ARM_PTE_HINT;
1073 					/* Do not attempt to reapply the hint bit to an already-active mapping.
1074 					 * This very likely means we're attempting to change attributes on an already-active mapping,
1075 					 * which violates the requirement of the hint bit.*/
1076 					assert(!kva_active || (ppte[i] == ARM_PTE_EMPTY));
1077 				}
1078 
1079 #if BTI_ENFORCED
1080 				/*
1081 				 * Set the 'guarded page' flag to enable ARM BTI enforcement.
1082 				 */
1083 				if (guarded) {
1084 					ptmp |= ARM_PTE_GP;
1085 				}
1086 #endif /* BTI_ENFORCED */
1087 				/*
1088 				 * Do not change the contiguous bit on an active mapping.  Even in a single-threaded
1089 				 * environment, it's possible for prefetch to produce a TLB conflict by trying to pull in
1090 				 * a hint-sized entry on top of one or more existing page-sized entries.  It's also useful
1091 				 * to make sure we're not trying to unhint a sub-range of a larger hinted range, which
1092 				 * could produce a later TLB conflict.
1093 				 */
1094 				assert(!kva_active || (ppte[i] == ARM_PTE_EMPTY) || ((ppte[i] & ARM_PTE_HINT) == (ptmp & ARM_PTE_HINT)));
1095 
1096 				update_or_defer_tte(&ppte[i], ptmp, pa, (vm_map_address_t)ppte, granule, deferred_ttep_pair, deferred_tte_pair);
1097 			}
1098 
1099 			va += ARM_PGBYTES;
1100 			pa += ARM_PGBYTES;
1101 		}
1102 	}
1103 }
1104 
1105 /*
1106  * arm_vm_page_granular_prot updates protections by iterating over the L2 entries and
1107  * changing them.  If a particular chunk necessitates L3 entries (for reasons of
1108  * alignment or length, or an explicit request that the entry be fully expanded), we
1109  * hand off to arm_vm_page_granular_helper to deal with the L3 chunk of the logic.
1110  */
1111 static void
1112 arm_vm_page_granular_prot(vm_offset_t start, unsigned long size, pmap_paddr_t pa_offset,
1113     int tte_prot_XN, int pte_prot_APX, int pte_prot_XN,
1114     unsigned granule, unsigned int guarded)
1115 {
1116 	tt_entry_t *deferred_ttep_pair[2] = {NULL};
1117 	tt_entry_t deferred_tte_pair[2] = {0};
1118 	vm_offset_t _end = start + size;
1119 	vm_offset_t align_start = (start + ARM_TT_L2_OFFMASK) & ~ARM_TT_L2_OFFMASK;
1120 
1121 	if (size == 0x0UL) {
1122 		return;
1123 	}
1124 
1125 	if (align_start > _end) {
1126 		align_start = _end;
1127 	}
1128 
1129 	arm_vm_page_granular_helper(start, align_start, start, pa_offset, pte_prot_APX, pte_prot_XN, granule, guarded, deferred_ttep_pair, deferred_tte_pair);
1130 
1131 	while ((_end - align_start) >= ARM_TT_L2_SIZE) {
1132 		if (!(granule & ARM64_GRANULE_ALLOW_BLOCK)) {
1133 			arm_vm_page_granular_helper(align_start, align_start + ARM_TT_L2_SIZE, align_start + 1, pa_offset,
1134 			    pte_prot_APX, pte_prot_XN, granule, guarded, deferred_ttep_pair, deferred_tte_pair);
1135 		} else {
1136 			pmap_paddr_t pa = align_start - gVirtBase + gPhysBase - pa_offset;
1137 			assert((pa & ARM_TT_L2_OFFMASK) == 0);
1138 			tt_entry_t *tte2;
1139 			tt_entry_t tmplate;
1140 
1141 			tte2 = arm_kva_to_tte(align_start);
1142 
1143 			if ((pa >= gPhysBase) && (pa < real_avail_end)) {
1144 				tmplate = (pa & ARM_TTE_BLOCK_L2_MASK) | ARM_TTE_TYPE_BLOCK
1145 				    | ARM_TTE_VALID | ARM_TTE_BLOCK_AF | ARM_TTE_BLOCK_NX
1146 				    | ARM_TTE_BLOCK_AP(pte_prot_APX) | ARM_TTE_BLOCK_SH(SH_OUTER_MEMORY)
1147 				    | ARM_TTE_BLOCK_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
1148 
1149 #if __ARM_KERNEL_PROTECT__
1150 				tmplate = tmplate | ARM_TTE_BLOCK_NG;
1151 #endif /* __ARM_KERNEL_PROTECT__ */
1152 				if (tte_prot_XN) {
1153 					tmplate = tmplate | ARM_TTE_BLOCK_PNX;
1154 				}
1155 
1156 				update_or_defer_tte(tte2, tmplate, pa, (vm_map_address_t)tte2 & ~ARM_TT_L2_OFFMASK,
1157 				    granule, deferred_ttep_pair, deferred_tte_pair);
1158 			}
1159 		}
1160 		align_start += ARM_TT_L2_SIZE;
1161 	}
1162 
1163 	if (align_start < _end) {
1164 		arm_vm_page_granular_helper(align_start, _end, _end, pa_offset, pte_prot_APX, pte_prot_XN, granule, guarded, deferred_ttep_pair, deferred_tte_pair);
1165 	}
1166 
1167 	if (deferred_ttep_pair[0] != NULL) {
1168 #if DEBUG || DEVELOPMENT
1169 		/*
1170 		 * Flush the TLB to catch bugs that might cause us to prematurely revoke write access from the pagetable page.
1171 		 * These bugs may otherwise be hidden by TLB entries in most cases, resulting in very rare panics.
1172 		 * Note that we always flush the TLB at the end of arm_vm_prot_finalize().
1173 		 */
1174 		flush_mmu_tlb();
1175 #endif
1176 		/*
1177 		 * The first TTE in the pair is a recursive mapping of the pagetable region, so we must update it last
1178 		 * to avoid potentially marking deferred_ttep_pair[1] read-only.
1179 		 */
1180 		if (deferred_tte_pair[1] != 0) {
1181 			os_atomic_store(deferred_ttep_pair[1], deferred_tte_pair[1], release);
1182 		}
1183 		os_atomic_store(deferred_ttep_pair[0], deferred_tte_pair[0], release);
1184 	}
1185 }
1186 
1187 static inline void
1188 arm_vm_page_granular_RNX(vm_offset_t start, unsigned long size, unsigned granule)
1189 {
1190 	arm_vm_page_granular_prot(start, size, 0, 1, AP_RONA, 1, granule, ARM64_PAGE_UNGUARDED);
1191 }
1192 
1193 static inline void
1194 arm_vm_page_granular_ROX(vm_offset_t start, unsigned long size, unsigned granule, unsigned int guarded)
1195 {
1196 	arm_vm_page_granular_prot(start, size, 0, 0, AP_RONA, 0, granule, guarded);
1197 }
1198 
1199 static inline void
1200 arm_vm_page_granular_RWNX(vm_offset_t start, unsigned long size, unsigned granule)
1201 {
1202 	arm_vm_page_granular_prot(start, size, 0, 1, AP_RWNA, 1, granule, ARM64_PAGE_UNGUARDED);
1203 }
1204 
1205 // Populate seg...AuxKC and fixup AuxKC permissions
1206 static bool
1207 arm_vm_auxkc_init(void)
1208 {
1209 	if (auxkc_mh == 0 || auxkc_base == 0) {
1210 		return false; // no auxKC.
1211 	}
1212 
1213 	/* Fixup AuxKC and populate seg*AuxKC globals used below */
1214 	arm_auxkc_init((void*)auxkc_mh, (void*)auxkc_base);
1215 
1216 	if (segLOWESTAuxKC != segLOWEST) {
1217 		panic("segLOWESTAuxKC (%p) not equal to segLOWEST (%p). auxkc_mh: %p, auxkc_base: %p",
1218 		    (void*)segLOWESTAuxKC, (void*)segLOWEST,
1219 		    (void*)auxkc_mh, (void*)auxkc_base);
1220 	}
1221 
1222 	/*
1223 	 * The AuxKC LINKEDIT segment needs to be covered by the RO region but is excluded
1224 	 * from the RO address range returned by kernel_collection_adjust_mh_addrs().
1225 	 * Ensure the highest non-LINKEDIT address in the AuxKC is the current end of
1226 	 * its RO region before extending it.
1227 	 */
1228 	assert(segHIGHESTROAuxKC == segHIGHESTNLEAuxKC);
1229 	assert(segHIGHESTAuxKC >= segHIGHESTROAuxKC);
1230 	if (segHIGHESTAuxKC > segHIGHESTROAuxKC) {
1231 		segHIGHESTROAuxKC = segHIGHESTAuxKC;
1232 	}
1233 
1234 	/*
1235 	 * The AuxKC RO region must be right below the device tree/trustcache so that it can be covered
1236 	 * by CTRR, and the AuxKC RX region must be within the RO region.
1237 	 */
1238 	assert(segHIGHESTROAuxKC == auxkc_right_above);
1239 	assert(segHIGHESTRXAuxKC <= segHIGHESTROAuxKC);
1240 	assert(segLOWESTRXAuxKC <= segHIGHESTRXAuxKC);
1241 	assert(segLOWESTROAuxKC <= segLOWESTRXAuxKC);
1242 	assert(segLOWESTAuxKC <= segLOWESTROAuxKC);
1243 
1244 	if (segHIGHESTRXAuxKC < segLOWEST) {
1245 		arm_vm_page_granular_RNX(segHIGHESTRXAuxKC, segLOWEST - segHIGHESTRXAuxKC, 0);
1246 	}
1247 	if (segLOWESTRXAuxKC < segHIGHESTRXAuxKC) {
1248 		/*
1249 		 * We cannot mark auxKC text as guarded because doing so would enforce
1250 		 * BTI on oblivious third-party kexts and break ABI compatibility.
1251 		 * Doing this defeats the purpose of BTI (branches to these pages are
1252 		 * unchecked!) but given both the relative rarity and the diversity of
1253 		 * third-party kexts, we expect that this is likely impractical to
1254 		 * exploit in practice.
1255 		 */
1256 		arm_vm_page_granular_ROX(segLOWESTRXAuxKC, segHIGHESTRXAuxKC - segLOWESTRXAuxKC, 0, ARM64_PAGE_UNGUARDED); // Refined in OSKext::readPrelinkedExtensions
1257 	}
1258 	if (segLOWESTROAuxKC < segLOWESTRXAuxKC) {
1259 		arm_vm_page_granular_RNX(segLOWESTROAuxKC, segLOWESTRXAuxKC - segLOWESTROAuxKC, 0);
1260 	}
1261 	if (segLOWESTAuxKC < segLOWESTROAuxKC) {
1262 		arm_vm_page_granular_RWNX(segLOWESTAuxKC, segLOWESTROAuxKC - segLOWESTAuxKC, 0);
1263 	}
1264 
1265 	return true;
1266 }
1267 
1268 void
1269 arm_vm_prot_init(__unused boot_args * args)
1270 {
1271 	segLOWESTTEXT = UINT64_MAX;
1272 	if (segSizePRELINKTEXT && (segPRELINKTEXTB < segLOWESTTEXT)) {
1273 		segLOWESTTEXT = segPRELINKTEXTB;
1274 	}
1275 	assert(segSizeTEXT);
1276 	if (segTEXTB < segLOWESTTEXT) {
1277 		segLOWESTTEXT = segTEXTB;
1278 	}
1279 	assert(segLOWESTTEXT < UINT64_MAX);
1280 
1281 	segEXTRADATA = 0;
1282 	segSizeEXTRADATA = 0;
1283 	segTRUSTCACHE = 0;
1284 	segSizeTRUSTCACHE = 0;
1285 
1286 	segLOWEST = segLOWESTTEXT;
1287 	segLOWESTRO = segLOWESTTEXT;
1288 
1289 	if (segLOWESTKC && segLOWESTKC < segLOWEST) {
1290 		/*
1291 		 * kernel collections have segments below the kernel. In particular the collection mach header
1292 		 * is below PRELINK_TEXT and is not covered by any other segments already tracked.
1293 		 */
1294 		arm_vm_page_granular_RNX(segLOWESTKC, segLOWEST - segLOWESTKC, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
1295 		segLOWEST = segLOWESTKC;
1296 		if (segLOWESTROKC && segLOWESTROKC < segLOWESTRO) {
1297 			segLOWESTRO = segLOWESTROKC;
1298 		}
1299 		if (segHIGHESTROKC && segHIGHESTROKC > segHIGHESTRO) {
1300 			segHIGHESTRO = segHIGHESTROKC;
1301 		}
1302 	}
1303 
1304 	DTEntry memory_map;
1305 	int err;
1306 
1307 	// Device Tree portion of EXTRADATA
1308 	if (SecureDTIsLockedDown()) {
1309 		segEXTRADATA = (vm_offset_t)PE_state.deviceTreeHead;
1310 		segSizeEXTRADATA = PE_state.deviceTreeSize;
1311 	}
1312 
1313 	// Trust Caches portion of EXTRADATA
1314 	{
1315 		DTMemoryMapRange const *trustCacheRange;
1316 		unsigned int trustCacheRangeSize;
1317 
1318 		err = SecureDTLookupEntry(NULL, "chosen/memory-map", &memory_map);
1319 		assert(err == kSuccess);
1320 
1321 		err = SecureDTGetProperty(memory_map, "TrustCache", (void const **)&trustCacheRange, &trustCacheRangeSize);
1322 		if (err == kSuccess) {
1323 			if (trustCacheRangeSize != sizeof(DTMemoryMapRange)) {
1324 				panic("Unexpected /chosen/memory-map/TrustCache property size %u != %zu", trustCacheRangeSize, sizeof(DTMemoryMapRange));
1325 			}
1326 
1327 			vm_offset_t const trustCacheRegion = phystokv(trustCacheRange->paddr);
1328 			if (trustCacheRegion < segLOWEST) {
1329 				if (segEXTRADATA != 0) {
1330 					if (trustCacheRegion != segEXTRADATA + segSizeEXTRADATA) {
1331 						panic("Unexpected location of TrustCache region: %#lx != %#lx",
1332 						    trustCacheRegion, segEXTRADATA + segSizeEXTRADATA);
1333 					}
1334 					segSizeEXTRADATA += trustCacheRange->length;
1335 				} else {
1336 					// Not all devices support CTRR device trees.
1337 					segEXTRADATA = trustCacheRegion;
1338 					segSizeEXTRADATA = trustCacheRange->length;
1339 				}
1340 			}
1341 #if !(DEVELOPMENT || DEBUG)
1342 			else {
1343 				panic("TrustCache region is in an unexpected place: %#lx > %#lx", trustCacheRegion, segLOWEST);
1344 			}
1345 #endif
1346 			segTRUSTCACHE = trustCacheRegion;
1347 			segSizeTRUSTCACHE = trustCacheRange->length;
1348 		}
1349 	}
1350 
1351 	if (segSizeEXTRADATA != 0) {
1352 		if (segEXTRADATA <= segLOWEST) {
1353 			segLOWEST = segEXTRADATA;
1354 			if (segEXTRADATA <= segLOWESTRO) {
1355 				segLOWESTRO = segEXTRADATA;
1356 			}
1357 		} else {
1358 			panic("EXTRADATA is in an unexpected place: %#lx > %#lx", segEXTRADATA, segLOWEST);
1359 		}
1360 
1361 		arm_vm_page_granular_RNX(segEXTRADATA, segSizeEXTRADATA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
1362 	}
1363 
1364 	const DTMemoryMapRange *auxKC_range, *auxKC_header_range;
1365 	unsigned int auxKC_range_size, auxKC_header_range_size;
1366 
1367 	err = SecureDTGetProperty(memory_map, "AuxKC", (const void**)&auxKC_range,
1368 	    &auxKC_range_size);
1369 	if (err != kSuccess) {
1370 		goto noAuxKC;
1371 	}
1372 	assert(auxKC_range_size == sizeof(DTMemoryMapRange));
1373 	err = SecureDTGetProperty(memory_map, "AuxKC-mach_header",
1374 	    (const void**)&auxKC_header_range, &auxKC_header_range_size);
1375 	if (err != kSuccess) {
1376 		goto noAuxKC;
1377 	}
1378 	assert(auxKC_header_range_size == sizeof(DTMemoryMapRange));
1379 
1380 	if (auxKC_header_range->paddr == 0 || auxKC_range->paddr == 0) {
1381 		goto noAuxKC;
1382 	}
1383 
1384 	auxkc_mh = phystokv(auxKC_header_range->paddr);
1385 	auxkc_base = phystokv(auxKC_range->paddr);
1386 
1387 	if (auxkc_base < segLOWEST) {
1388 		auxkc_right_above = segLOWEST;
1389 		segLOWEST = auxkc_base;
1390 	} else {
1391 		panic("auxkc_base (%p) not below segLOWEST (%p)", (void*)auxkc_base, (void*)segLOWEST);
1392 	}
1393 
1394 	/* Map AuxKC RWNX initially so that arm_vm_auxkc_init can traverse
1395 	 * it and apply fixups (after we're off the bootstrap translation
1396 	 * tables).
1397 	 */
1398 	arm_vm_page_granular_RWNX(auxkc_base, auxKC_range->length, 0);
1399 
1400 noAuxKC:
1401 	/* Map coalesced kext TEXT segment RWNX for now */
1402 	arm_vm_page_granular_RWNX(segPRELINKTEXTB, segSizePRELINKTEXT, ARM64_GRANULE_ALLOW_BLOCK); // Refined in OSKext::readPrelinkedExtensions
1403 
1404 	/* Map coalesced kext DATA_CONST segment RWNX (could be empty) */
1405 	arm_vm_page_granular_RWNX(segPLKDATACONSTB, segSizePLKDATACONST, ARM64_GRANULE_ALLOW_BLOCK); // Refined in OSKext::readPrelinkedExtensions
1406 
1407 	/* Map coalesced kext TEXT_EXEC segment RX (could be empty) */
1408 	arm_vm_page_granular_ROX(segPLKTEXTEXECB, segSizePLKTEXTEXEC, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT, ARM64_PAGE_GUARDED); // Refined in OSKext::readPrelinkedExtensions
1409 
1410 	/* If the new segments are not present, set the space between PRELINK_TEXT and xnu's TEXT to RWNX;
1411 	 * otherwise we no longer expect any space between the coalesced kext read-only segments and xnu's read-only segments.
1412 	 */
1413 	if (!segSizePLKDATACONST && !segSizePLKTEXTEXEC) {
1414 		if (segSizePRELINKTEXT) {
1415 			arm_vm_page_granular_RWNX(segPRELINKTEXTB + segSizePRELINKTEXT, segTEXTB - (segPRELINKTEXTB + segSizePRELINKTEXT),
1416 			    ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
1417 		}
1418 	} else {
1419 		/*
1420 		 * If we have the new segments, we should still protect the gap between kext
1421 		 * read-only pages and kernel read-only pages, in the event that this gap
1422 		 * exists.
1423 		 */
1424 		if ((segPLKDATACONSTB + segSizePLKDATACONST) < segTEXTB) {
1425 			arm_vm_page_granular_RWNX(segPLKDATACONSTB + segSizePLKDATACONST, segTEXTB - (segPLKDATACONSTB + segSizePLKDATACONST),
1426 			    ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
1427 		}
1428 	}
1429 
1430 	/*
1431 	 * Protection on kernel text is loose here to allow shenanigans early on.  These
1432 	 * protections are tightened in arm_vm_prot_finalize().  This is necessary because
1433 	 * we currently patch LowResetVectorBase in cpu.c.
1434 	 *
1435 	 * TEXT segment contains mach headers and other non-executable data. This will become RONX later.
1436 	 */
1437 	arm_vm_page_granular_RNX(segTEXTB, segSizeTEXT, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
1438 
1439 	/* Can DATACONST start out and stay RNX?
1440 	 * NO, stuff in this segment gets modified during startup (viz. mac_policy_init()/mac_policy_list)
1441 	 * Make RNX in prot_finalize
1442 	 */
1443 	arm_vm_page_granular_RWNX(segDATACONSTB, segSizeDATACONST, ARM64_GRANULE_ALLOW_BLOCK);
1444 
1445 	arm_vm_page_granular_ROX(segTEXTEXECB, segSizeTEXTEXEC, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT, ARM64_PAGE_GUARDED);
1446 
1447 #if XNU_MONITOR
1448 	arm_vm_page_granular_ROX(segPPLTEXTB, segSizePPLTEXT, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT, ARM64_PAGE_UNGUARDED);
1449 	arm_vm_page_granular_ROX(segPPLTRAMPB, segSizePPLTRAMP, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT, ARM64_PAGE_UNGUARDED);
1450 	arm_vm_page_granular_RNX(segPPLDATACONSTB, segSizePPLDATACONST, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
1451 #endif
1452 
1453 	/* DATA segment will remain RWNX */
1454 	arm_vm_page_granular_RWNX(segDATAB, segSizeDATA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
1455 #if XNU_MONITOR
1456 	arm_vm_page_granular_RWNX(segPPLDATAB, segSizePPLDATA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
1457 #endif
1458 
1459 	arm_vm_page_granular_RWNX(segHIBDATAB, segSizeHIBDATA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
1460 
1461 	arm_vm_page_granular_RWNX(segBOOTDATAB, segSizeBOOTDATA, 0);
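	/* Map the interrupt and exception stack guard pages RNX so that a stack overflow faults immediately rather than silently corrupting adjacent data. */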
1462 	arm_vm_page_granular_RNX((vm_offset_t)&intstack_low_guard, PAGE_MAX_SIZE, 0);
1463 	arm_vm_page_granular_RNX((vm_offset_t)&intstack_high_guard, PAGE_MAX_SIZE, 0);
1464 	arm_vm_page_granular_RNX((vm_offset_t)&excepstack_high_guard, PAGE_MAX_SIZE, 0);
1465 
1466 	arm_vm_page_granular_ROX(segKLDB, segSizeKLD, 0, ARM64_PAGE_GUARDED);
1467 	arm_vm_page_granular_RNX(segKLDDATAB, segSizeKLDDATA, 0);
1468 	arm_vm_page_granular_RWNX(segLINKB, segSizeLINK, 0);
1469 	arm_vm_page_granular_RWNX(segPLKLINKEDITB, segSizePLKLINKEDIT, 0); // Coalesced kext LINKEDIT segment
1470 	arm_vm_page_granular_ROX(segLASTB, segSizeLAST, ARM64_GRANULE_ALLOW_BLOCK, ARM64_PAGE_GUARDED); // __LAST may be empty, but we cannot assume this
1471 	if (segLASTDATACONSTB) {
1472 		arm_vm_page_granular_RWNX(segLASTDATACONSTB, segSizeLASTDATACONST, ARM64_GRANULE_ALLOW_BLOCK); // __LASTDATA_CONST may be empty, but we cannot assume this
1473 	}
1474 	arm_vm_page_granular_RWNX(segPRELINKDATAB, segSizePRELINKDATA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); // Prelink __DATA for kexts (RW data)
1475 
1476 	if (segSizePLKLLVMCOV > 0) {
1477 		arm_vm_page_granular_RWNX(segPLKLLVMCOVB, segSizePLKLLVMCOV, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); // LLVM code coverage data
1478 	}
1479 	arm_vm_page_granular_RWNX(segPRELINKINFOB, segSizePRELINKINFO, 0); /* PreLinkInfoDictionary */
1480 
1481 	/* Record the bounds of the kernelcache. */
1482 	vm_kernelcache_base = segLOWEST;
1483 	vm_kernelcache_top = end_kern;
1484 }
1485 
1486 /*
1487  * return < 0 for a < b
1488  *          0 for a == b
1489  *        > 0 for a > b
1490  */
1491 typedef int (*cmpfunc_t)(const void *a, const void *b);
1492 
1493 extern void
1494 qsort(void *a, size_t n, size_t es, cmpfunc_t cmp);
1495 
1496 static int
1497 cmp_ptov_entries(const void *a, const void *b)
1498 {
1499 	const ptov_table_entry *entry_a = a;
1500 	const ptov_table_entry *entry_b = b;
1501 	// Sort in descending order of segment length
1502 	if (entry_a->len < entry_b->len) {
1503 		return 1;
1504 	} else if (entry_a->len > entry_b->len) {
1505 		return -1;
1506 	} else {
1507 		return 0;
1508 	}
1509 }
1510 
1511 SECURITY_READ_ONLY_LATE(static unsigned int) ptov_index = 0;
1512 
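/*
 * Round an address up to the next L1 or twig (next-to-last level) translation
 * table boundary; an already-aligned address is returned unchanged.
 */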
1513 #define ROUND_L1(addr) (((addr) + ARM_TT_L1_OFFMASK) & ~(ARM_TT_L1_OFFMASK))
1514 #define ROUND_TWIG(addr) (((addr) + ARM_TT_TWIG_OFFMASK) & ~(ARM_TT_TWIG_OFFMASK))
1515 
1516 static void
1517 arm_vm_physmap_slide(ptov_table_entry *temp_ptov_table, vm_map_address_t orig_va, vm_size_t len, int pte_prot_APX, unsigned granule)
1518 {
1519 	pmap_paddr_t pa_offset;
1520 
1521 	if (__improbable(ptov_index >= PTOV_TABLE_SIZE)) {
1522 		panic("%s: PTOV table limit exceeded; segment va = 0x%llx, size = 0x%llx", __func__,
1523 		    (unsigned long long)orig_va, (unsigned long long)len);
1524 	}
1525 	assert((orig_va & ARM_PGMASK) == 0);
1526 	temp_ptov_table[ptov_index].pa = orig_va - gVirtBase + gPhysBase;
1527 	if (ptov_index == 0) {
1528 		temp_ptov_table[ptov_index].va = physmap_base;
1529 	} else {
1530 		temp_ptov_table[ptov_index].va = temp_ptov_table[ptov_index - 1].va + temp_ptov_table[ptov_index - 1].len;
1531 	}
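	/*
	 * Block (twig-level) mappings require the VA and PA to share the same
	 * offset within a twig-sized region, so nudge the VA forward until the
	 * offsets match.
	 */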
1532 	if (granule & ARM64_GRANULE_ALLOW_BLOCK) {
1533 		vm_map_address_t orig_offset = temp_ptov_table[ptov_index].pa & ARM_TT_TWIG_OFFMASK;
1534 		vm_map_address_t new_offset = temp_ptov_table[ptov_index].va & ARM_TT_TWIG_OFFMASK;
1535 		if (new_offset < orig_offset) {
1536 			temp_ptov_table[ptov_index].va += (orig_offset - new_offset);
1537 		} else if (new_offset > orig_offset) {
1538 			temp_ptov_table[ptov_index].va = ROUND_TWIG(temp_ptov_table[ptov_index].va) + orig_offset;
1539 		}
1540 	}
1541 	assert((temp_ptov_table[ptov_index].va & ARM_PGMASK) == 0);
1542 	temp_ptov_table[ptov_index].len = round_page(len);
1543 	pa_offset = temp_ptov_table[ptov_index].va - orig_va;
1544 	arm_vm_page_granular_prot(temp_ptov_table[ptov_index].va, temp_ptov_table[ptov_index].len, pa_offset, 1, pte_prot_APX, 1, granule, ARM64_PAGE_UNGUARDED);
1545 	++ptov_index;
1546 }
1547 
1548 #if XNU_MONITOR
1549 
1550 SECURITY_READ_ONLY_LATE(static boolean_t) keep_linkedit = FALSE;
1551 
1552 static void
1553 arm_vm_physmap_init(boot_args *args)
1554 {
1555 	ptov_table_entry temp_ptov_table[PTOV_TABLE_SIZE];
1556 	bzero(temp_ptov_table, sizeof(temp_ptov_table));
1557 
1558 	// This is memory that will either be handed back to the VM layer via ml_static_mfree(),
1559 	// or will be available for general-purpose use.   Physical aperture mappings for this memory
1560 	// must be at page granularity, so that PPL ownership or cache attribute changes can be reflected
1561 	// in the physical aperture mappings.
1562 
1563 	// Slid region between gPhysBase and beginning of protected text
1564 	arm_vm_physmap_slide(temp_ptov_table, gVirtBase, segLOWEST - gVirtBase, AP_RWNA, 0);
1565 
1566 	// kext bootstrap segments
1567 #if !defined(KERNEL_INTEGRITY_KTRR) && !defined(KERNEL_INTEGRITY_CTRR) && !defined(KERNEL_INTEGRITY_PV_CTRR)
1568 	/* __KLD,__text is covered by the rorgn */
1569 	arm_vm_physmap_slide(temp_ptov_table, segKLDB, segSizeKLD, AP_RONA, 0);
1570 #endif
1571 	arm_vm_physmap_slide(temp_ptov_table, segKLDDATAB, segSizeKLDDATA, AP_RONA, 0);
1572 
1573 	// Early-boot data
1574 	arm_vm_physmap_slide(temp_ptov_table, segBOOTDATAB, segSizeBOOTDATA, AP_RONA, 0);
1575 
1576 	PE_parse_boot_argn("keepsyms", &keep_linkedit, sizeof(keep_linkedit));
1577 #if CONFIG_DTRACE
1578 	if (dtrace_keep_kernel_symbols()) {
1579 		keep_linkedit = TRUE;
1580 	}
1581 #endif /* CONFIG_DTRACE */
1582 #if KASAN_DYNAMIC_DENYLIST
1583 	/* KASAN's dynamic denylist needs to query the LINKEDIT segment at runtime.  As such, the
1584 	 * kext bootstrap code will not jettison LINKEDIT on kasan kernels, so don't bother to relocate it. */
1585 	keep_linkedit = TRUE;
1586 #endif /* KASAN_DYNAMIC_DENYLIST */
1587 	if (!keep_linkedit) {
1588 		// Kernel LINKEDIT
1589 		arm_vm_physmap_slide(temp_ptov_table, segLINKB, segSizeLINK, AP_RWNA, 0);
1590 
1591 		if (segSizePLKLINKEDIT) {
1592 			// Prelinked kernel LINKEDIT
1593 			arm_vm_physmap_slide(temp_ptov_table, segPLKLINKEDITB, segSizePLKLINKEDIT, AP_RWNA, 0);
1594 		}
1595 	}
1596 
1597 	// Prelinked kernel plists
1598 	arm_vm_physmap_slide(temp_ptov_table, segPRELINKINFOB, segSizePRELINKINFO, AP_RWNA, 0);
1599 
1600 	// Device tree (if not locked down), ramdisk, boot args
1601 	arm_vm_physmap_slide(temp_ptov_table, end_kern, (args->topOfKernelData - gPhysBase + gVirtBase) - end_kern, AP_RWNA, 0);
1602 	if (!SecureDTIsLockedDown()) {
1603 		PE_slide_devicetree(temp_ptov_table[ptov_index - 1].va - end_kern);
1604 	}
1605 
1606 	// Remainder of physical memory
1607 	arm_vm_physmap_slide(temp_ptov_table, (args->topOfKernelData - gPhysBase + gVirtBase),
1608 	    real_avail_end - args->topOfKernelData, AP_RWNA, 0);
1609 
1610 
1611 #if HAS_MTE
1612 	arm_vm_physmap_tag_region_init(temp_ptov_table);
1613 #endif /* HAS_MTE */
1614 
1615 	assert((temp_ptov_table[ptov_index - 1].va + temp_ptov_table[ptov_index - 1].len) <= physmap_end);
1616 
1617 	// Sort in descending order of segment length.  LUT traversal is linear, so largest (most likely used)
1618 	// segments should be placed earliest in the table to optimize lookup performance.
1619 	qsort(temp_ptov_table, PTOV_TABLE_SIZE, sizeof(temp_ptov_table[0]), cmp_ptov_entries);
1620 
1621 	memcpy(ptov_table, temp_ptov_table, sizeof(ptov_table));
1622 }
1623 
1624 #else
1625 
1626 static void
1627 arm_vm_physmap_init(boot_args *args)
1628 {
1629 	ptov_table_entry temp_ptov_table[PTOV_TABLE_SIZE];
1630 	bzero(temp_ptov_table, sizeof(temp_ptov_table));
1631 
1632 	// Will be handed back to VM layer through ml_static_mfree() in arm_vm_prot_finalize()
1633 	arm_vm_physmap_slide(temp_ptov_table, gVirtBase, segLOWEST - gVirtBase, AP_RWNA,
1634 	    ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT);
1635 
1636 	arm_vm_page_granular_RWNX(end_kern, phystokv(args->topOfKernelData) - end_kern,
1637 	    ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); /* Device Tree (if not locked down), RAM Disk (if present), bootArgs */
1638 
1639 	arm_vm_physmap_slide(temp_ptov_table, (args->topOfKernelData - gPhysBase + gVirtBase),
1640 	    real_avail_end - args->topOfKernelData, AP_RWNA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); // rest of physmem
1641 
1642 	assert((temp_ptov_table[ptov_index - 1].va + temp_ptov_table[ptov_index - 1].len) <= physmap_end);
1643 
1644 	// Sort in descending order of segment length.  LUT traversal is linear, so largest (most likely used)
1645 	// segments should be placed earliest in the table to optimize lookup performance.
1646 	qsort(temp_ptov_table, PTOV_TABLE_SIZE, sizeof(temp_ptov_table[0]), cmp_ptov_entries);
1647 
1648 	memcpy(ptov_table, temp_ptov_table, sizeof(ptov_table));
1649 }
1650 
1651 #endif // XNU_MONITOR
1652 
1653 void
1654 arm_vm_prot_finalize(boot_args * args __unused)
1655 {
1656 	/*
1657 	 * At this point, we are far enough along in the boot process that it will be
1658 	 * safe to free up all of the memory preceding the kernel.  It may in fact
1659 	 * be safe to do this earlier.
1660 	 *
1661 	 * This keeps the memory in the V-to-P mapping, but advertises it to the VM
1662 	 * as usable.
1663 	 */
1664 
1665 	/*
1666 	 * If the old-style PRELINK segment exists, free the memory before it, as well as the memory
1667 	 * between it and XNU's TEXT. Otherwise we're dealing with a new-style kernel cache, so we should
1668 	 * just free the memory before the PRELINK_TEXT segment, since the rest of the kext read-only data
1669 	 * segments should be immediately followed by XNU's TEXT segment.
1670 	 */
1671 
1672 	ml_static_mfree(phystokv(gPhysBase), segLOWEST - gVirtBase);
1673 
1674 	/*
1675 	 * KTRR support means we will be mucking with these pages and trying to
1676 	 * protect them; we cannot free the pages to the VM if we do this.
1677 	 */
1678 	if (!segSizePLKDATACONST && !segSizePLKTEXTEXEC && segSizePRELINKTEXT) {
1679 		/* If new segments not present, PRELINK_TEXT is not dynamically sized, free DRAM between it and xnu TEXT */
1680 		ml_static_mfree(segPRELINKTEXTB + segSizePRELINKTEXT, segTEXTB - (segPRELINKTEXTB + segSizePRELINKTEXT));
1681 	}
1682 
1683 	/* tighten permissions on kext read only data and code */
1684 	arm_vm_page_granular_RNX(segPRELINKTEXTB, segSizePRELINKTEXT, ARM64_GRANULE_ALLOW_BLOCK);
1685 	arm_vm_page_granular_RNX(segPLKDATACONSTB, segSizePLKDATACONST, ARM64_GRANULE_ALLOW_BLOCK);
1686 
1687 	cpu_stack_alloc(&BootCpuData);
1688 	arm64_replace_bootstack(&BootCpuData);
1689 	ml_static_mfree(phystokv(segBOOTDATAB - gVirtBase + gPhysBase), segSizeBOOTDATA);
1690 
1691 #if __ARM_KERNEL_PROTECT__
1692 	arm_vm_populate_kernel_el0_mappings();
1693 #endif /* __ARM_KERNEL_PROTECT__ */
1694 
1695 #if XNU_MONITOR
1696 #if !defined(KERNEL_INTEGRITY_KTRR) && !defined(KERNEL_INTEGRITY_CTRR) && !defined(KERNEL_INTEGRITY_PV_CTRR)
1697 	/* __KLD,__text is covered by the rorgn */
1698 	for (vm_offset_t va = segKLDB; va < (segKLDB + segSizeKLD); va += ARM_PGBYTES) {
1699 		pt_entry_t *pte = arm_kva_to_pte(va);
1700 		*pte = ARM_PTE_EMPTY;
1701 	}
1702 #endif
1703 	for (vm_offset_t va = segKLDDATAB; va < (segKLDDATAB + segSizeKLDDATA); va += ARM_PGBYTES) {
1704 		pt_entry_t *pte = arm_kva_to_pte(va);
1705 		*pte = ARM_PTE_EMPTY;
1706 	}
1707 	/* Clear the original stack mappings; these pages should be mapped through ptov_table. */
1708 	for (vm_offset_t va = segBOOTDATAB; va < (segBOOTDATAB + segSizeBOOTDATA); va += ARM_PGBYTES) {
1709 		pt_entry_t *pte = arm_kva_to_pte(va);
1710 		*pte = ARM_PTE_EMPTY;
1711 	}
1712 	/* Clear the original PRELINKINFO mapping. This segment should be jettisoned during I/O Kit
1713 	 * initialization before we reach this point. */
1714 	for (vm_offset_t va = segPRELINKINFOB; va < (segPRELINKINFOB + segSizePRELINKINFO); va += ARM_PGBYTES) {
1715 		pt_entry_t *pte = arm_kva_to_pte(va);
1716 		*pte = ARM_PTE_EMPTY;
1717 	}
1718 	if (!keep_linkedit) {
1719 		for (vm_offset_t va = segLINKB; va < (segLINKB + segSizeLINK); va += ARM_PGBYTES) {
1720 			pt_entry_t *pte = arm_kva_to_pte(va);
1721 			*pte = ARM_PTE_EMPTY;
1722 		}
1723 		if (segSizePLKLINKEDIT) {
1724 			for (vm_offset_t va = segPLKLINKEDITB; va < (segPLKLINKEDITB + segSizePLKLINKEDIT); va += ARM_PGBYTES) {
1725 				pt_entry_t *pte = arm_kva_to_pte(va);
1726 				*pte = ARM_PTE_EMPTY;
1727 			}
1728 		}
1729 	}
1730 #endif /* XNU_MONITOR */
1731 
1732 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
1733 	/*
1734 	 * __LAST,__pinst should no longer be executable.
1735 	 */
1736 	arm_vm_page_granular_RNX(segLASTB, segSizeLAST, ARM64_GRANULE_ALLOW_BLOCK);
1737 
1738 	/* __LASTDATA_CONST should no longer be writable. */
1739 	if (segLASTDATACONSTB) {
1740 		arm_vm_page_granular_RNX(segLASTDATACONSTB, segSizeLASTDATACONST, ARM64_GRANULE_ALLOW_BLOCK);
1741 	}
1742 
1743 	/*
1744 	 * __KLD,__text should no longer be executable.
1745 	 */
1746 	arm_vm_page_granular_RNX(segKLDB, segSizeKLD, ARM64_GRANULE_ALLOW_BLOCK);
1747 
1748 	/*
1749 	 * Must wait until all other region permissions are set before locking down DATA_CONST
1750 	 * as the kernel static page tables live in DATA_CONST on KTRR enabled systems
1751 	 * and will become immutable.
1752 	 */
1753 #endif
1754 
1755 	arm_vm_page_granular_RNX(segDATACONSTB, segSizeDATACONST, ARM64_GRANULE_ALLOW_BLOCK);
1756 
1757 	__builtin_arm_dsb(DSB_ISH);
1758 	flush_mmu_tlb();
1759 }
1760 
1761 /*
1762  * Initialize and enter blank (invalid) page tables in an L1 translation table for a given VA range.
1763  *
1764  * This is a helper function used to build up the initial page tables for the kernel translation table.
1765  * With KERNEL_INTEGRITY we keep at least the root level of the kernel page table immutable, thus the need
1766  * to preallocate before machine_lockdown any L1 entries necessary during the entire kernel runtime.
1767  *
1768  * For a given VA range, if necessary, allocate new L2 translation tables and install the table entries in
1769  * the appropriate L1 table indexes. Called before the translation table is active.
1770  *
1771  * parameters:
1772  *
1773  * tt: virtual address of L1 translation table to modify
1774  * start: beginning of VA range
1775  * end: end of VA range
1776  * static_map: whether to allocate the new translation table page from read only memory
1777  * table_attrs: attributes of new table entry in addition to VALID and TYPE_TABLE attributes
1778  *
1779  */
1780 
1781 static void
1782 init_ptpages(tt_entry_t *tt, vm_map_address_t start, vm_map_address_t end, bool static_map, uint64_t table_attrs)
1783 {
1784 	tt_entry_t *l1_tte;
1785 	vm_offset_t ptpage_vaddr;
1786 
1787 	l1_tte = tt + L1_TABLE_T1_INDEX(start, TCR_EL1_BOOT);
1788 
1789 	while (start < end) {
1790 		if (*l1_tte == ARM_TTE_EMPTY) {
1791 			/* Allocate a page and setup L1 Table TTE in L1 */
1792 			ptpage_vaddr = alloc_ptpage(static_map);
1793 			*l1_tte = (kvtophys(ptpage_vaddr) & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | table_attrs;
1794 			bzero((void *)ptpage_vaddr, ARM_PGBYTES);
1795 		}
1796 
1797 		if ((start + ARM_TT_L1_SIZE) < start) {
1798 			/* If this is the last L1 entry, it must cover the last mapping. */
1799 			break;
1800 		}
1801 
1802 		start += ARM_TT_L1_SIZE;
1803 		l1_tte++;
1804 	}
1805 }
1806 
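/*
 * The base of the kernel physical aperture (physmap) is randomized at boot:
 * arm_vm_init() below adds a page-aligned slide chosen from this 1 GB range
 * to physmap_base.
 */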
1807 #define ARM64_PHYSMAP_SLIDE_RANGE (1ULL << 30) // 1 GB
1808 #define ARM64_PHYSMAP_SLIDE_MASK  (ARM64_PHYSMAP_SLIDE_RANGE - 1)
1809 
1810 void
1811 arm_vm_init(uint64_t memory_size, boot_args * args)
1812 {
1813 	vm_map_address_t va_l1, va_l1_end;
1814 	tt_entry_t       *cpu_l1_tte;
1815 	vm_map_address_t va_l2, va_l2_end;
1816 	tt_entry_t       *cpu_l2_tte;
1817 	pmap_paddr_t     boot_ttep;
1818 	tt_entry_t       *boot_tte;
1819 	uint64_t         mem_segments;
1820 	vm_offset_t      ptpage_vaddr;
1821 	vm_map_address_t dynamic_memory_begin;
1822 
1823 	/*
1824 	 * Get the virtual and physical kernel-managed memory base from boot_args.
1825 	 */
1826 	gVirtBase = args->virtBase;
1827 	gPhysBase = args->physBase;
1828 #if KASAN
1829 	real_phys_size = args->memSize + (shadow_ptop - shadow_pbase);
1830 #else
1831 	real_phys_size = args->memSize;
1832 #endif
1833 	/*
1834 	 * Ensure the physical region we specify for the VM to manage ends on a
1835 	 * software page boundary.  Note that the software page size (PAGE_SIZE)
1836 	 * may be a multiple of the hardware page size specified in ARM_PGBYTES.
1837 	 * We must round the reported memory size down to the nearest PAGE_SIZE
1838 	 * boundary to ensure the VM does not try to manage a page it does not
1839 	 * completely own.  The KASAN shadow region, if present, is managed entirely
1840 	 * in units of the hardware page size and should not need similar treatment.
1841 	 */
1842 	gPhysSize = mem_size = ((gPhysBase + args->memSize) & ~PAGE_MASK) - gPhysBase;
1843 #if HAS_MTE
1844 	/*
1845 	 * If MTE is enabled, iBoot pushed us down a contiguous memory region that
1846 	 * contains both the memory we can freely use along with the memory
1847 	 * that is reserved for tags. Fixup gPhysSize and mem_size until we enable
1848 	 * tag page reclaiming.
1849 	 */
1850 	if (is_mte_enabled) {
1851 		arm_vm_mte_init();
1852 		gPhysSize = mem_size = mte_tag_storage_start - gPhysBase;
1853 	}
1854 #endif /* HAS_MTE */
1855 
1856 	mem_actual = args->memSizeActual ? args->memSizeActual : mem_size;
1857 
1858 	if ((memory_size != 0) && (mem_size > memory_size)) {
1859 		mem_size = memory_size;
1860 		max_mem_actual = memory_size;
1861 	} else {
1862 		max_mem_actual = mem_actual;
1863 	}
1864 #if !defined(ARM_LARGE_MEMORY)
1865 	if (mem_size >= ((VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 2)) {
1866 		panic("Unsupported memory configuration %lx", mem_size);
1867 	}
1868 #endif
1869 
1870 #if defined(ARM_LARGE_MEMORY)
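	/*
	 * On large-memory configurations, reserve enough L1 entries immediately
	 * below VM_MIN_KERNEL_ADDRESS for the physical aperture to cover all of
	 * physical memory plus the maximum possible slide.
	 */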
1871 	unsigned long physmap_l1_entries = ((real_phys_size + ARM64_PHYSMAP_SLIDE_RANGE) >> ARM_TT_L1_SHIFT) + 1;
1872 	physmap_base = VM_MIN_KERNEL_ADDRESS - (physmap_l1_entries << ARM_TT_L1_SHIFT);
1873 #else
1874 	physmap_base = phystokv(args->topOfKernelData);
1875 #endif
1876 
1877 	// Slide the physical aperture to a random page-aligned location within the slide range
1878 	uint64_t physmap_slide = early_random() & ARM64_PHYSMAP_SLIDE_MASK & ~((uint64_t)PAGE_MASK);
1879 	assert(physmap_slide < ARM64_PHYSMAP_SLIDE_RANGE);
1880 
1881 	physmap_base += physmap_slide;
1882 
1883 #if XNU_MONITOR
1884 	physmap_base = ROUND_TWIG(physmap_base);
1885 #if defined(ARM_LARGE_MEMORY)
1886 	static_memory_end = phystokv(args->topOfKernelData);
1887 #else
1888 	static_memory_end = physmap_base + mem_size;
1889 #endif // ARM_LARGE_MEMORY
1890 	physmap_end = physmap_base + real_phys_size;
1891 
1892 #if HAS_MTE
1893 	physmap_end += gDramSize / MTE_PAGES_PER_TAG_PAGE;
1894 #endif /* HAS_MTE */
1895 
1896 #else
1897 #if defined(ARM_LARGE_MEMORY)
1898 	/* For large memory systems with no PPL such as virtual machines */
1899 	static_memory_end = phystokv(args->topOfKernelData);
1900 	physmap_end = physmap_base + real_phys_size;
1901 #else
1902 	static_memory_end = physmap_base + mem_size + (PTOV_TABLE_SIZE * ARM_TT_TWIG_SIZE); // worst possible case for block alignment
1903 	physmap_end = physmap_base + real_phys_size + (PTOV_TABLE_SIZE * ARM_TT_TWIG_SIZE);
1904 #endif // ARM_LARGE_MEMORY
1905 #endif
1906 
1907 #if KASAN && !defined(ARM_LARGE_MEMORY)
1908 	/* add the KASAN stolen memory to the physmap */
1909 	dynamic_memory_begin = static_memory_end + (shadow_ptop - shadow_pbase);
1910 #else
1911 	dynamic_memory_begin = static_memory_end;
1912 #endif
1913 #if XNU_MONITOR
1914 	pmap_stacks_start = (void*)dynamic_memory_begin;
1915 	dynamic_memory_begin += PPL_STACK_REGION_SIZE;
1916 	pmap_stacks_end = (void*)dynamic_memory_begin;
1917 
1918 #if HAS_GUARDED_IO_FILTER
1919 	iofilter_stacks_start = (void*)dynamic_memory_begin;
1920 	dynamic_memory_begin += IOFILTER_STACK_REGION_SIZE;
1921 	iofilter_stacks_end = (void*)dynamic_memory_begin;
1922 #endif
1923 #endif
1924 	if (dynamic_memory_begin > VM_MAX_KERNEL_ADDRESS) {
1925 		panic("Unsupported memory configuration %lx", mem_size);
1926 	}
1927 
1928 	boot_tte = (tt_entry_t *)&bootstrap_pagetables;
1929 	boot_ttep = kvtophys((vm_offset_t)boot_tte);
1930 
1931 #if DEVELOPMENT || DEBUG
1932 	/* Sanity check - assert that BOOTSTRAP_TABLE_SIZE is sufficiently large to
1933 	 * hold our bootstrap mappings for any possible slide */
1934 	size_t bytes_mapped = dynamic_memory_begin - gVirtBase;
1935 	size_t l1_entries = 1 + ((bytes_mapped + ARM_TT_L1_SIZE - 1) / ARM_TT_L1_SIZE);
1936 	/* 1 L1 each for V=P and KVA, plus 1 page for each L2 */
1937 	size_t pages_used = 2 * (l1_entries + 1);
1938 	if (pages_used > BOOTSTRAP_TABLE_SIZE) {
1939 		panic("BOOTSTRAP_TABLE_SIZE too small for memory config");
1940 	}
1941 #endif
1942 
1943 	/*
1944 	 *  TTBR0 L1, TTBR0 L2 - 1:1 bootstrap mapping.
1945 	 *  TTBR1 L1, TTBR1 L2 - kernel mapping
1946 	 */
1947 
1948 	/*
1949 	 * TODO: free bootstrap table memory back to allocator.
1950 	 * On large memory systems the bootstrap tables could be quite large.
1951 	 * After bootstrap completes, xnu can warm start with a single 16KB page mapping
1952 	 * to trampoline to KVA; this requires only 3 pages to stay resident.
1953 	 */
1954 	first_avail_phys = avail_start = args->topOfKernelData;
1955 
1956 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
1957 	arm_replace_identity_map();
1958 #endif
1959 
1960 	/* Initialize invalid tte page */
1961 	invalid_tte = (tt_entry_t *)alloc_ptpage(TRUE);
1962 	invalid_ttep = kvtophys((vm_offset_t)invalid_tte);
1963 	bzero(invalid_tte, ARM_PGBYTES);
1964 
1965 	/*
1966 	 * Initialize l1 page table page
1967 	 */
1968 	cpu_tte = (tt_entry_t *)alloc_ptpage(TRUE);
1969 	cpu_ttep = kvtophys((vm_offset_t)cpu_tte);
1970 	bzero(cpu_tte, ARM_PGBYTES);
1971 	avail_end = gPhysBase + mem_size;
1972 	assert(!(avail_end & PAGE_MASK));
1973 
1974 #if KASAN
1975 	real_avail_end = gPhysBase + real_phys_size;
1976 #else
1977 	real_avail_end = avail_end;
1978 #endif
1979 
1980 	/*
1981 	 * Initialize l1 and l2 page table pages :
1982 	 *   map physical memory at the kernel base virtual address
1983 	 *   cover the kernel dynamic address range section
1984 	 *
1985 	 *   the so-called physical aperture should be statically mapped
1986 	 */
1987 	init_ptpages(cpu_tte, gVirtBase, dynamic_memory_begin, TRUE, ARM_TTE_TABLE_AP(ARM_TTE_TABLE_AP_USER_NA));
1988 
1989 #if defined(ARM_LARGE_MEMORY)
1990 	/*
1991 	 * Initialize l1 page table pages :
1992 	 *   on large memory systems the physical aperture exists separately below
1993 	 *   the rest of the kernel virtual address space
1994 	 */
1995 	init_ptpages(cpu_tte, physmap_base, ROUND_L1(physmap_end), TRUE, ARM_DYNAMIC_TABLE_XN | ARM_TTE_TABLE_AP(ARM_TTE_TABLE_AP_USER_NA));
1996 #endif
1997 
1998 
1999 #if __ARM_KERNEL_PROTECT__
2000 	/* Expand the page tables to prepare for the EL0 mappings. */
2001 	arm_vm_expand_kernel_el0_mappings();
2002 #endif /* __ARM_KERNEL_PROTECT__ */
2003 
2004 	/*
2005 	 * Now retrieve addresses for various segments from kernel mach-o header
2006 	 */
2007 	segPRELINKTEXTB  = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PRELINK_TEXT", &segSizePRELINKTEXT);
2008 	segPLKDATACONSTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PLK_DATA_CONST", &segSizePLKDATACONST);
2009 	segPLKTEXTEXECB  = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PLK_TEXT_EXEC", &segSizePLKTEXTEXEC);
2010 	segTEXTB         = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__TEXT", &segSizeTEXT);
2011 	segDATACONSTB    = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__DATA_CONST", &segSizeDATACONST);
2012 	segTEXTEXECB     = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__TEXT_EXEC", &segSizeTEXTEXEC);
2013 #if XNU_MONITOR
2014 	segPPLTEXTB      = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PPLTEXT", &segSizePPLTEXT);
2015 	segPPLTRAMPB     = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PPLTRAMP", &segSizePPLTRAMP);
2016 	segPPLDATACONSTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PPLDATA_CONST", &segSizePPLDATACONST);
2017 #endif
2018 	segDATAB         = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__DATA", &segSizeDATA);
2019 #if XNU_MONITOR
2020 	segPPLDATAB      = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PPLDATA", &segSizePPLDATA);
2021 #endif
2022 
2023 	segBOOTDATAB     = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__BOOTDATA", &segSizeBOOTDATA);
2024 	segLINKB         = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LINKEDIT", &segSizeLINK);
2025 	segKLDB          = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__KLD", &segSizeKLD);
2026 	segKLDDATAB      = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__KLDDATA", &segSizeKLDDATA);
2027 	segPRELINKDATAB  = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PRELINK_DATA", &segSizePRELINKDATA);
2028 	segPRELINKINFOB  = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PRELINK_INFO", &segSizePRELINKINFO);
2029 	segPLKLLVMCOVB   = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PLK_LLVM_COV", &segSizePLKLLVMCOV);
2030 	segPLKLINKEDITB  = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PLK_LINKEDIT", &segSizePLKLINKEDIT);
2031 	segLASTB         = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LAST", &segSizeLAST);
2032 	segLASTDATACONSTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LASTDATA_CONST", &segSizeLASTDATACONST);
2033 
2034 	sectHIBTEXTB     = (vm_offset_t) getsectdatafromheader(&_mh_execute_header, "__TEXT_EXEC", "__hib_text", &sectSizeHIBTEXT);
2035 	sectHIBDATACONSTB = (vm_offset_t) getsectdatafromheader(&_mh_execute_header, "__DATA_CONST", "__hib_const", &sectSizeHIBDATACONST);
2036 	segHIBDATAB      = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__HIBDATA", &segSizeHIBDATA);
2037 
2038 	if (kernel_mach_header_is_in_fileset(&_mh_execute_header)) {
2039 		kernel_mach_header_t *kc_mh = PE_get_kc_header(KCKindPrimary);
2040 
2041 		// fileset has kext PLK_TEXT_EXEC under kernel collection TEXT_EXEC following kernel's LAST
2042 		segKCTEXTEXECB = (vm_offset_t) getsegdatafromheader(kc_mh,             "__TEXT_EXEC", &segSizeKCTEXTEXEC);
2043 		assert(segPLKTEXTEXECB && !segSizePLKTEXTEXEC);                        // kernel PLK_TEXT_EXEC must be empty
2044 
2045 		assert(segLASTB);                                                      // kernel LAST can be empty, but it must have
2046 		                                                                       // a valid address for computations below.
2047 
2048 		assert(segKCTEXTEXECB <= segLASTB);                                    // KC TEXT_EXEC must contain kernel LAST
2049 		assert(segKCTEXTEXECB + segSizeKCTEXTEXEC >= segLASTB + segSizeLAST);
2050 		segPLKTEXTEXECB = segLASTB + segSizeLAST;
2051 		segSizePLKTEXTEXEC = segSizeKCTEXTEXEC - (segPLKTEXTEXECB - segKCTEXTEXECB);
2052 
2053 		// fileset has kext PLK_DATA_CONST under kernel collection DATA_CONST following kernel's LASTDATA_CONST
2054 		segKCDATACONSTB = (vm_offset_t) getsegdatafromheader(kc_mh,            "__DATA_CONST", &segSizeKCDATACONST);
2055 		assert(segPLKDATACONSTB && !segSizePLKDATACONST);                      // kernel PLK_DATA_CONST must be empty
2056 		assert(segLASTDATACONSTB && segSizeLASTDATACONST);                     // kernel LASTDATA_CONST must be non-empty
2057 		assert(segKCDATACONSTB <= segLASTDATACONSTB);                          // KC DATA_CONST must contain kernel LASTDATA_CONST
2058 		assert(segKCDATACONSTB + segSizeKCDATACONST >= segLASTDATACONSTB + segSizeLASTDATACONST);
2059 		segPLKDATACONSTB = segLASTDATACONSTB + segSizeLASTDATACONST;
2060 		segSizePLKDATACONST = segSizeKCDATACONST - (segPLKDATACONSTB - segKCDATACONSTB);
2061 
2062 		// fileset has kext PRELINK_DATA under kernel collection DATA following kernel's empty PRELINK_DATA
2063 		segKCDATAB      = (vm_offset_t) getsegdatafromheader(kc_mh,            "__DATA", &segSizeKCDATA);
2064 		assert(segPRELINKDATAB && !segSizePRELINKDATA);                        // kernel PRELINK_DATA must be empty
2065 		assert(segKCDATAB <= segPRELINKDATAB);                                 // KC DATA must contain kernel PRELINK_DATA
2066 		assert(segKCDATAB + segSizeKCDATA >= segPRELINKDATAB + segSizePRELINKDATA);
2067 		segSizePRELINKDATA = segSizeKCDATA - (segPRELINKDATAB - segKCDATAB);
2068 
2069 		// fileset has consolidated PRELINK_TEXT, PRELINK_INFO and LINKEDIT at the kernel collection level
2070 		assert(segPRELINKTEXTB && !segSizePRELINKTEXT);                        // kernel PRELINK_TEXT must be empty
2071 		segPRELINKTEXTB = (vm_offset_t) getsegdatafromheader(kc_mh,            "__PRELINK_TEXT", &segSizePRELINKTEXT);
2072 		assert(segPRELINKINFOB && !segSizePRELINKINFO);                        // kernel PRELINK_INFO must be empty
2073 		segPRELINKINFOB = (vm_offset_t) getsegdatafromheader(kc_mh,            "__PRELINK_INFO", &segSizePRELINKINFO);
2074 		segLINKB        = (vm_offset_t) getsegdatafromheader(kc_mh,            "__LINKEDIT", &segSizeLINK);
2075 	}
2076 
2077 	(void) PE_parse_boot_argn("use_contiguous_hint", &use_contiguous_hint, sizeof(use_contiguous_hint));
2078 	assert(segSizePRELINKTEXT < 0x03000000); /* 23355738 */
2079 
2080 	/* if one of the new segments is present, the other one better be as well */
2081 	if (segSizePLKDATACONST || segSizePLKTEXTEXEC) {
2082 		assert(segSizePLKDATACONST && segSizePLKTEXTEXEC);
2083 	}
2084 
2085 	etext = (vm_offset_t) segTEXTB + segSizeTEXT;
2086 	sdata = (vm_offset_t) segDATAB;
2087 	edata = (vm_offset_t) segDATAB + segSizeDATA;
2088 	end_kern = round_page(segHIGHESTKC ? segHIGHESTKC : getlastkerneladdr()); /* Force end to next page */
2089 
2090 	vm_set_page_size();
2091 
2092 	vm_kernel_base = segTEXTB;
2093 	vm_kernel_top = (vm_offset_t) &last_kernel_symbol;
2094 	vm_kext_base = segPRELINKTEXTB;
2095 	vm_kext_top = vm_kext_base + segSizePRELINKTEXT;
2096 
2097 	vm_prelink_stext = segPRELINKTEXTB;
2098 	if (!segSizePLKTEXTEXEC && !segSizePLKDATACONST) {
2099 		vm_prelink_etext = segPRELINKTEXTB + segSizePRELINKTEXT;
2100 	} else {
2101 		vm_prelink_etext = segPRELINKTEXTB + segSizePRELINKTEXT + segSizePLKDATACONST + segSizePLKTEXTEXEC;
2102 	}
2103 	vm_prelink_sinfo = segPRELINKINFOB;
2104 	vm_prelink_einfo = segPRELINKINFOB + segSizePRELINKINFO;
2105 	vm_slinkedit = segLINKB;
2106 	vm_elinkedit = segLINKB + segSizeLINK;
2107 
2108 	vm_prelink_sdata = segPRELINKDATAB;
2109 	vm_prelink_edata = segPRELINKDATAB + segSizePRELINKDATA;
2110 
2111 	arm_vm_prot_init(args);
2112 
2113 	/*
2114 	 * Initialize the page tables for the low globals:
2115 	 *   cover this address range:
2116 	 *     LOW_GLOBAL_BASE_ADDRESS + 2MB
2117 	 */
2118 	va_l1 = va_l2 = LOW_GLOBAL_BASE_ADDRESS;
2119 	cpu_l1_tte = cpu_tte + L1_TABLE_T1_INDEX(va_l1, TCR_EL1_BOOT);
2120 	cpu_l2_tte = ((tt_entry_t *) phystokv(((*cpu_l1_tte) & ARM_TTE_TABLE_MASK))) + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
2121 	ptpage_vaddr = alloc_ptpage(TRUE);
2122 	*cpu_l2_tte = (kvtophys(ptpage_vaddr) & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_TTE_TABLE_PXN | ARM_TTE_TABLE_XN;
2123 	bzero((void *)ptpage_vaddr, ARM_PGBYTES);
2124 
2125 	/*
2126 	 * Initialize l2 page table pages :
2127 	 *   cover this address range:
2128 	 *    KERNEL_DYNAMIC_ADDR - VM_MAX_KERNEL_ADDRESS
2129 	 */
2130 #if defined(ARM_LARGE_MEMORY)
2131 	/*
2132 	 * Dynamically mapped memory outside the VM allocator VA range is required to bootstrap the VM system.
2133 	 * We don't expect it to exceed 64GB; there's no sense mapping any more space between here and the VM heap range.
2134 	 */
2135 	init_ptpages(cpu_tte, dynamic_memory_begin, ROUND_L1(dynamic_memory_begin), FALSE, ARM_DYNAMIC_TABLE_XN | ARM_TTE_TABLE_AP(ARM_TTE_TABLE_AP_USER_NA));
2136 #else
2137 	/*
2138 	 * TODO: do these pages really need to come from RO memory?
2139 	 * With legacy 3 level table systems we never mapped more than a single L1 entry so this may be dead code
2140 	 */
2141 	init_ptpages(cpu_tte, dynamic_memory_begin, VM_MAX_KERNEL_ADDRESS, TRUE, ARM_DYNAMIC_TABLE_XN | ARM_TTE_TABLE_AP(ARM_TTE_TABLE_AP_USER_NA));
2142 #endif
2143 
2144 #if KASAN
2145 	/* record the extent of the physmap */
2146 	physmap_vbase = physmap_base;
2147 	physmap_vtop = physmap_end;
2148 	kasan_init();
2149 #endif /* KASAN */
2150 
2151 #if CONFIG_CPU_COUNTERS
2152 	mt_early_init();
2153 #endif /* CONFIG_CPU_COUNTERS */
2154 
2155 	arm_vm_physmap_init(args);
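	/*
	 * Activate the newly built kernel page tables: cpu_ttep becomes the
	 * kernel (TTBR1) translation table base, and the bootstrap V=P tables are
	 * retired by pointing the TTBR0 base at the invalid (empty) table.
	 */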
2156 	set_mmu_ttb_alternate(cpu_ttep & TTBR_BADDR_MASK);
2157 
2158 	ml_enable_monitor();
2159 
2160 	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
2161 
2162 	flush_mmu_tlb();
2163 	kva_active = TRUE;
2164 	// global table pointers may need to be different due to physical aperture remapping
2165 	cpu_tte = (tt_entry_t*)(phystokv(cpu_ttep));
2166 	invalid_tte = (tt_entry_t*)(phystokv(invalid_ttep));
2167 
2168 	// From here on out, we're off the bootstrap translation tables.
2169 
2170 
2171 	/* AuxKC initialization has to be deferred until this point, since
2172 	 * the AuxKC may not have been fully mapped in the bootstrap
2173 	 * tables, if it spilled downwards into the prior L2 block.
2174 	 *
2175 	 * Now that its mapping set up by arm_vm_prot_init() is active,
2176 	 * we can traverse and fix it up.
2177 	 */
2178 
2179 	/* Calculate the physical bounds of the kernelcache; using
2180 	 * gVirtBase/gPhysBase math to do this directly is generally a bad idea
2181 	 * as the physmap is no longer physically contiguous.  However, this is
2182 	 * done here as segLOWEST and end_kern are both virtual addresses in the
2183 	 * bootstrap physmap, and because kvtophys references the page tables
2184 	 * (at least at the time this comment was written), meaning that at
2185 	 * least end_kern may not point to a valid mapping on some kernelcache
2186 	 * configurations, so kvtophys would report a physical address of 0.
2187 	 *
2188 	 * Long term, the kernelcache should probably be described in terms of
2189 	 * multiple physical ranges, as there is no strong guarantee or
2190 	 * requirement that the kernelcache will always be physically
2191 	 * contiguous.
2192 	 */
2193 	arm_vm_kernelcache_phys_start = segLOWEST - gVirtBase + gPhysBase;
2194 	arm_vm_kernelcache_phys_end = end_kern - gVirtBase + gPhysBase;
2195 
2196 	/* Calculate the number of pages that belong to the kernelcache. */
2197 	vm_page_kernelcache_count = (unsigned int) (atop_64(arm_vm_kernelcache_phys_end - arm_vm_kernelcache_phys_start));
2198 
2199 	if (arm_vm_auxkc_init()) {
2200 		if (segLOWESTROAuxKC < segLOWESTRO) {
2201 			segLOWESTRO = segLOWESTROAuxKC;
2202 		}
2203 		if (segHIGHESTROAuxKC > segHIGHESTRO) {
2204 			segHIGHESTRO = segHIGHESTROAuxKC;
2205 		}
2206 		if (segLOWESTRXAuxKC < segLOWESTTEXT) {
2207 			segLOWESTTEXT = segLOWESTRXAuxKC;
2208 		}
2209 		assert(segLOWEST == segLOWESTAuxKC);
2210 
2211 		// The preliminary auxKC mapping has been broken up.
2212 		flush_mmu_tlb();
2213 	}
2214 
2215 	sane_size = mem_size - (avail_start - gPhysBase);
2216 	max_mem = mem_size;
2217 	vm_kernel_slid_base = segLOWESTTEXT;
2218 	// vm_kernel_slide is set by arm_init()->arm_slide_rebase_and_sign_image()
2219 	vm_kernel_stext = segTEXTB;
2220 
2221 	if (kernel_mach_header_is_in_fileset(&_mh_execute_header)) {
2222 		vm_kernel_etext = segTEXTEXECB + segSizeTEXTEXEC;
2223 		vm_kernel_slid_top = vm_slinkedit;
2224 	} else {
2225 		assert(segDATACONSTB == segTEXTB + segSizeTEXT);
2226 		assert(segTEXTEXECB == segDATACONSTB + segSizeDATACONST);
2227 		vm_kernel_etext = segTEXTB + segSizeTEXT + segSizeDATACONST + segSizeTEXTEXEC;
2228 		vm_kernel_slid_top = vm_prelink_einfo;
2229 	}
2230 
2231 	dynamic_memory_begin = ROUND_TWIG(dynamic_memory_begin);
2232 #if (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST)
2233 	// reserve a 32MB region without permission overrides to use later for a CTRR unit test
2234 	{
2235 		extern vm_offset_t ctrr_test_page;
2236 		tt_entry_t *new_tte;
2237 
2238 		ctrr_test_page = dynamic_memory_begin;
2239 		dynamic_memory_begin += ARM_TT_L2_SIZE;
2240 		cpu_l1_tte = cpu_tte + L1_TABLE_T1_INDEX(ctrr_test_page, TCR_EL1_BOOT);
2241 		assert((*cpu_l1_tte) & ARM_TTE_VALID);
2242 		cpu_l2_tte = ((tt_entry_t *) phystokv(((*cpu_l1_tte) & ARM_TTE_TABLE_MASK))) + ((ctrr_test_page & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
2243 		assert((*cpu_l2_tte) == ARM_TTE_EMPTY);
2244 		new_tte = (tt_entry_t *)alloc_ptpage(FALSE);
2245 		bzero(new_tte, ARM_PGBYTES);
2246 		*cpu_l2_tte = (kvtophys((vm_offset_t)new_tte) & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
2247 	}
2248 #endif /* (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST) */
2249 #if XNU_MONITOR
2250 	for (vm_offset_t cur = (vm_offset_t)pmap_stacks_start; cur < (vm_offset_t)pmap_stacks_end; cur += ARM_PGBYTES) {
2251 		arm_vm_map(cpu_tte, cur, ARM_PTE_EMPTY);
2252 	}
2253 #if HAS_GUARDED_IO_FILTER
2254 	for (vm_offset_t cur = (vm_offset_t)iofilter_stacks_start; cur < (vm_offset_t)iofilter_stacks_end; cur += ARM_PGBYTES) {
2255 		arm_vm_map(cpu_tte, cur, ARM_PTE_EMPTY);
2256 	}
2257 #endif
2258 #endif
2259 	pmap_bootstrap(dynamic_memory_begin);
2260 
2261 	disable_preemption();
2262 
2263 	/*
2264 	 * Initialize l3 page table pages :
2265 	 *   cover this address range:
2266 	 *    2MB + FrameBuffer size + 10MB for each 256MB segment
2267 	 */
2268 
2269 	mem_segments = (mem_size + 0x0FFFFFFF) >> 28;
2270 
2271 	va_l1 = dynamic_memory_begin;
2272 	va_l1_end = va_l1 + ((2 + (mem_segments * 10)) << 20);
2273 	va_l1_end += round_page(args->Video.v_height * args->Video.v_rowBytes);
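	/* Round va_l1_end up to an 8 MB boundary. */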
2274 	va_l1_end = (va_l1_end + 0x00000000007FFFFFULL) & 0xFFFFFFFFFF800000ULL;
2275 
2276 	cpu_l1_tte = cpu_tte + L1_TABLE_T1_INDEX(va_l1, TCR_EL1_BOOT);
2277 
2278 	while (va_l1 < va_l1_end) {
2279 		va_l2 = va_l1;
2280 
2281 		if (((va_l1 & ~ARM_TT_L1_OFFMASK) + ARM_TT_L1_SIZE) < va_l1) {
2282 			/* If this is the last L1 entry, it must cover the last mapping. */
2283 			va_l2_end = va_l1_end;
2284 		} else {
2285 			va_l2_end = MIN((va_l1 & ~ARM_TT_L1_OFFMASK) + ARM_TT_L1_SIZE, va_l1_end);
2286 		}
2287 
2288 		cpu_l2_tte = ((tt_entry_t *) phystokv(((*cpu_l1_tte) & ARM_TTE_TABLE_MASK))) + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
2289 
2290 		while (va_l2 < va_l2_end) {
2291 			pt_entry_t *    ptp;
2292 			pmap_paddr_t    ptp_phys;
2293 
2294 			/* Allocate a page and setup L3 Table TTE in L2 */
2295 			ptp = (pt_entry_t *) alloc_ptpage(FALSE);
2296 			ptp_phys = (pmap_paddr_t)kvtophys((vm_offset_t)ptp);
2297 
2298 			bzero(ptp, ARM_PGBYTES);
2299 			pmap_init_pte_page(kernel_pmap, ptp, va_l2, 3, TRUE);
2300 
2301 			*cpu_l2_tte = (pa_to_tte(ptp_phys)) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_DYNAMIC_TABLE_XN;
2302 
2303 			va_l2 += ARM_TT_L2_SIZE;
2304 			cpu_l2_tte++;
2305 		}
2306 
2307 		va_l1 = va_l2_end;
2308 		cpu_l1_tte++;
2309 	}
2310 
2311 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
2312 	/*
2313 	 * In this configuration, the bootstrap mappings (arm_vm_init) and
2314 	 * the heap mappings occupy separate L1 regions.  Explicitly set up
2315 	 * the heap L1 allocations here.
2316 	 */
2317 #if defined(ARM_LARGE_MEMORY)
2318 	init_ptpages(cpu_tte, KERNEL_PMAP_HEAP_RANGE_START & ~ARM_TT_L1_OFFMASK, VM_MAX_KERNEL_ADDRESS, FALSE, ARM_DYNAMIC_TABLE_XN | ARM_TTE_TABLE_AP(ARM_TTE_TABLE_AP_USER_NA));
2319 #else // defined(ARM_LARGE_MEMORY)
2320 	va_l1 = VM_MIN_KERNEL_ADDRESS & ~ARM_TT_L1_OFFMASK;
2321 	init_ptpages(cpu_tte, VM_MIN_KERNEL_ADDRESS & ~ARM_TT_L1_OFFMASK, VM_MAX_KERNEL_ADDRESS, FALSE, ARM_DYNAMIC_TABLE_XN | ARM_TTE_TABLE_AP(ARM_TTE_TABLE_AP_USER_NA));
2322 #endif // defined(ARM_LARGE_MEMORY)
2323 #else
2324 #if defined(ARM_LARGE_MEMORY)
2325 	/* For large memory systems with no KTRR/CTRR such as virtual machines */
2326 	init_ptpages(cpu_tte, KERNEL_PMAP_HEAP_RANGE_START & ~ARM_TT_L1_OFFMASK, VM_MAX_KERNEL_ADDRESS, FALSE, ARM_DYNAMIC_TABLE_XN | ARM_TTE_TABLE_AP(ARM_TTE_TABLE_AP_USER_NA));
2327 #endif
2328 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
2329 
2330 	/*
2331 	 * Initialize l3 page table pages :
2332 	 *   cover this address range:
2333 	 *   ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - PE_EARLY_BOOT_VA) to VM_MAX_KERNEL_ADDRESS
2334 	 */
2335 	va_l1 = (VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - PE_EARLY_BOOT_VA;
2336 	va_l1_end = VM_MAX_KERNEL_ADDRESS;
2337 
2338 	cpu_l1_tte = cpu_tte + L1_TABLE_T1_INDEX(va_l1, TCR_EL1_BOOT);
2339 
2340 	while (va_l1 < va_l1_end) {
2341 		va_l2 = va_l1;
2342 
2343 		if (((va_l1 & ~ARM_TT_L1_OFFMASK) + ARM_TT_L1_SIZE) < va_l1) {
2344 			/* If this is the last L1 entry, it must cover the last mapping. */
2345 			va_l2_end = va_l1_end;
2346 		} else {
2347 			va_l2_end = MIN((va_l1 & ~ARM_TT_L1_OFFMASK) + ARM_TT_L1_SIZE, va_l1_end);
2348 		}
2349 
2350 		cpu_l2_tte = ((tt_entry_t *) phystokv(((*cpu_l1_tte) & ARM_TTE_TABLE_MASK))) + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
2351 
2352 		while (va_l2 < va_l2_end) {
2353 			pt_entry_t *    ptp;
2354 			pmap_paddr_t    ptp_phys;
2355 
2356 			/* Allocate a page and setup L3 Table TTE in L2 */
2357 			ptp = (pt_entry_t *) alloc_ptpage(FALSE);
2358 			ptp_phys = (pmap_paddr_t)kvtophys((vm_offset_t)ptp);
2359 
2360 			bzero(ptp, ARM_PGBYTES);
2361 			pmap_init_pte_page(kernel_pmap, ptp, va_l2, 3, TRUE);
2362 
2363 			*cpu_l2_tte = (pa_to_tte(ptp_phys)) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_DYNAMIC_TABLE_XN;
2364 
2365 			va_l2 += ARM_TT_L2_SIZE;
2366 			cpu_l2_tte++;
2367 		}
2368 
2369 		va_l1 = va_l2_end;
2370 		cpu_l1_tte++;
2371 	}
2372 
2373 
2374 	/*
2375 	 * Adjust avail_start so that the range that the VM owns
2376 	 * starts on a PAGE_SIZE aligned boundary.
2377 	 */
2378 	avail_start = (avail_start + PAGE_MASK) & ~PAGE_MASK;
2379 
2380 #if XNU_MONITOR
2381 	pmap_static_allocations_done();
2382 #endif
2383 	first_avail = avail_start;
2384 	patch_low_glo_static_region(args->topOfKernelData, avail_start - args->topOfKernelData);
2385 	enable_preemption();
2386 }
2387 
2388 /*
2389  * Returns true if the address is within __TEXT, __TEXT_EXEC or __DATA_CONST
2390  * segment range. This is what [vm_kernel_stext, vm_kernel_etext) range used to
2391  * cover. The segments together may not be contiguous anymore and so individual
2392  * intervals are inspected.
2393  */
2394 bool
2395 kernel_text_contains(vm_offset_t addr)
2396 {
2397 	if (segTEXTB <= addr && addr < (segTEXTB + segSizeTEXT)) {
2398 		return true;
2399 	}
2400 	if (segTEXTEXECB <= addr && addr < (segTEXTEXECB + segSizeTEXTEXEC)) {
2401 		return true;
2402 	}
2403 	return segDATACONSTB <= addr && addr < (segDATACONSTB + segSizeDATACONST);
2404 }
2405