/*
 * Copyright (c) 2003-2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989, 1988 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */


#include <mach/i386/vm_param.h>

#include <string.h>
#include <mach/vm_param.h>
#include <mach/vm_prot.h>
#include <mach/machine.h>
#include <mach/time_value.h>
#include <kern/spl.h>
#include <kern/assert.h>
#include <kern/debug.h>
#include <kern/misc_protos.h>
#include <kern/cpu_data.h>
#include <kern/processor.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <i386/pmap.h>
#include <i386/misc_protos.h>
#include <i386/cpuid.h>
#include <mach/thread_status.h>
#include <pexpert/i386/efi.h>
#include <pexpert/pexpert.h>
#include <i386/i386_lowmem.h>
#include <i386/misc_protos.h>
#include <x86_64/lowglobals.h>
#include <i386/pal_routines.h>

#include <mach-o/loader.h>
#include <libkern/kernel_mach_header.h>

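/*
 * Round x up to the next multiple of align (align must be a power of two),
 * using the two's-complement identity -(-(x) & -(align)); e.g.
 * P2ROUNDUP(5, 4) == 8 and P2ROUNDUP(8, 4) == 8.
 */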
#define P2ROUNDUP(x, align)             (-(-(x) & -(align)))

vm_size_t       mem_size = 0;
pmap_paddr_t    first_avail = 0;/* first after page tables */

uint64_t        max_mem;        /* Size of physical memory minus carveouts (bytes), adjusted by maxmem */
uint64_t        max_mem_actual; /* Actual size of physical memory (bytes) adjusted by
                                 * the maxmem boot-arg */
uint64_t        mem_actual;
uint64_t        sane_size = 0;  /* Memory size for defaults calculations */

/*
 * KASLR parameters
 */
ppnum_t         vm_kernel_base_page;
vm_offset_t     vm_kernel_base;
vm_offset_t     vm_kernel_top;
vm_offset_t     vm_kernel_stext;
vm_offset_t     vm_kernel_etext;
vm_offset_t     vm_kernel_slide;
vm_offset_t     vm_kernel_slid_base;
vm_offset_t     vm_kernel_slid_top;
vm_offset_t vm_hib_base;
vm_offset_t     vm_kext_base = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
vm_offset_t     vm_kext_top = VM_MIN_KERNEL_ADDRESS;

vm_offset_t vm_prelink_stext;
vm_offset_t vm_prelink_etext;
vm_offset_t vm_prelink_sinfo;
vm_offset_t vm_prelink_einfo;
vm_offset_t vm_slinkedit;
vm_offset_t vm_elinkedit;

vm_offset_t vm_kernel_builtinkmod_text;
vm_offset_t vm_kernel_builtinkmod_text_end;

#define MAXLORESERVE    (32 * 1024 * 1024)

ppnum_t         max_ppnum = 0;

/*
 * pmap_high_used* are the highest range of physical memory used for kernel
 * internals (page tables, vm_pages) via pmap_steal_memory() that don't
 * need to be encrypted in hibernation images. There can be one gap in
 * the middle of this due to fragmentation when using a mix of small
 * and large pages.  In that case, the fragment lives between the high
 * and middle ranges.
 */
ppnum_t pmap_high_used_top = 0;
ppnum_t pmap_high_used_bottom = 0;
ppnum_t pmap_middle_used_top = 0;
ppnum_t pmap_middle_used_bottom = 0;

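/*
 * Bookkeeping for EFI_MEMORY_KERN_RESERVED ranges:
 * pmap_reserved_range_indices[] records, for each such range, its index
 * into pmap_memory_regions[]; see pmap_next_page_reserved() below.
 */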
enum {PMAP_MAX_RESERVED_RANGES = 32};
uint32_t pmap_reserved_pages_allocated = 0;
uint32_t pmap_reserved_range_indices[PMAP_MAX_RESERVED_RANGES];
uint32_t pmap_last_reserved_range_index = 0;
uint32_t pmap_reserved_ranges = 0;

extern unsigned int bsd_mbuf_cluster_reserve(boolean_t *);

pmap_paddr_t     avail_start, avail_end;
vm_offset_t     virtual_avail, virtual_end;
static pmap_paddr_t     avail_remaining;
vm_offset_t     static_memory_end = 0;

vm_offset_t     sHIB, eHIB, stext, etext, sdata, edata, end, sconst, econst;

/*
 * _mh_execute_header is the mach_header for the currently executing kernel
 */
vm_offset_t segTEXTB; unsigned long segSizeTEXT;
vm_offset_t segDATAB; unsigned long segSizeDATA;
vm_offset_t segLINKB; unsigned long segSizeLINK;
vm_offset_t segPRELINKTEXTB; unsigned long segSizePRELINKTEXT;
vm_offset_t segPRELINKINFOB; unsigned long segSizePRELINKINFO;
vm_offset_t segHIBB; unsigned long segSizeHIB;
unsigned long segSizeConst;

static kernel_segment_command_t *segTEXT, *segDATA;
static kernel_section_t *cursectTEXT, *lastsectTEXT;
static kernel_segment_command_t *segCONST;

extern uint64_t firmware_Conventional_bytes;
extern uint64_t firmware_RuntimeServices_bytes;
extern uint64_t firmware_ACPIReclaim_bytes;
extern uint64_t firmware_ACPINVS_bytes;
extern uint64_t firmware_PalCode_bytes;
extern uint64_t firmware_Reserved_bytes;
extern uint64_t firmware_Unusable_bytes;
extern uint64_t firmware_other_bytes;
uint64_t firmware_MMIO_bytes;

/*
 * Linker magic to establish the highest address in the kernel.
 */
extern void     *last_kernel_symbol;

#define LG_PPNUM_PAGES (I386_LPGBYTES >> PAGE_SHIFT)
#define LG_PPNUM_MASK (I386_LPGMASK >> PAGE_SHIFT)
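/*
 * With 2 MB large pages and 4 KB base pages, LG_PPNUM_PAGES is 512
 * base pages per large page and LG_PPNUM_MASK is 0x1ff.
 */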

/* Reset a region's large-page fragment range to empty; up > down means no fragment pages exist */
#define RESET_FRAG(r) (((r)->alloc_frag_up = 1), ((r)->alloc_frag_down = 0))

boolean_t       memmap = FALSE;
#if     DEBUG || DEVELOPMENT
static void
kprint_memmap(vm_offset_t maddr, unsigned int msize, unsigned int mcount)
{
	unsigned int         i;
	unsigned int         j;
	pmap_memory_region_t *p = pmap_memory_regions;
	EfiMemoryRange       *mptr;
	addr64_t             region_start, region_end;
	addr64_t             efi_start, efi_end;

	for (j = 0; j < pmap_memory_region_count; j++, p++) {
		kprintf("pmap region %d type %d base 0x%llx alloc_up 0x%llx alloc_down 0x%llx"
		    " alloc_frag_up 0x%llx alloc_frag_down 0x%llx top 0x%llx\n",
		    j, p->type,
		    (addr64_t) p->base << I386_PGSHIFT,
		    (addr64_t) p->alloc_up << I386_PGSHIFT,
		    (addr64_t) p->alloc_down << I386_PGSHIFT,
		    (addr64_t) p->alloc_frag_up << I386_PGSHIFT,
		    (addr64_t) p->alloc_frag_down << I386_PGSHIFT,
		    (addr64_t) p->end   << I386_PGSHIFT);
		region_start = (addr64_t) p->base << I386_PGSHIFT;
		region_end = ((addr64_t) p->end << I386_PGSHIFT) - 1;
		mptr = (EfiMemoryRange *) maddr;
		for (i = 0;
		    i < mcount;
		    i++, mptr = (EfiMemoryRange *)(((vm_offset_t)mptr) + msize)) {
			if (mptr->Type != kEfiLoaderCode &&
			    mptr->Type != kEfiLoaderData &&
			    mptr->Type != kEfiBootServicesCode &&
			    mptr->Type != kEfiBootServicesData &&
			    mptr->Type != kEfiConventionalMemory) {
				efi_start = (addr64_t)mptr->PhysicalStart;
				efi_end = efi_start + ((vm_offset_t)mptr->NumberOfPages << I386_PGSHIFT) - 1;
				if ((efi_start >= region_start && efi_start <= region_end) ||
				    (efi_end >= region_start && efi_end <= region_end)) {
					kprintf(" *** Overlapping region with EFI runtime region %d\n", i);
				}
			}
		}
	}
}
#define DPRINTF(x...)   do { if (memmap) kprintf(x); } while (0)

#else

static void
kprint_memmap(vm_offset_t maddr, unsigned int msize, unsigned int mcount)
{
#pragma unused(maddr, msize, mcount)
}

#define DPRINTF(x...)
#endif /* DEBUG || DEVELOPMENT */

/*
 * Basic VM initialization.
 */
void
i386_vm_init(uint64_t   maxmem,
    boolean_t  IA32e,
    boot_args  *args)
{
	pmap_memory_region_t *pmptr;
	pmap_memory_region_t *prev_pmptr;
	EfiMemoryRange *mptr;
	unsigned int mcount;
	unsigned int msize;
	vm_offset_t maddr;
	ppnum_t fap;
	unsigned int i;
	ppnum_t maxpg = 0;
	uint32_t pmap_type;
	uint32_t maxloreserve;
	uint32_t maxdmaaddr;
	uint32_t  mbuf_reserve = 0;
	boolean_t mbuf_override = FALSE;
	boolean_t coalescing_permitted;
	vm_kernel_base_page = i386_btop(args->kaddr);
	vm_offset_t base_address;
	vm_offset_t static_base_address;

	PE_parse_boot_argn("memmap", &memmap, sizeof(memmap));

	/*
	 * Establish the KASLR parameters.
	 */
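	/* The slide is the delta between the load address (args->kaddr) and the static, link-time base. */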
	static_base_address = ml_static_ptovirt(KERNEL_BASE_OFFSET);
	base_address        = ml_static_ptovirt(args->kaddr);
	vm_kernel_slide     = base_address - static_base_address;
	if (args->kslide) {
		kprintf("KASLR slide: 0x%016lx dynamic\n", vm_kernel_slide);
		if (vm_kernel_slide != ((vm_offset_t)args->kslide)) {
			panic("Kernel base inconsistent with slide - rebased?");
		}
	} else {
		/* No slide relative to on-disk symbols */
		kprintf("KASLR slide: 0x%016lx static and ignored\n",
		    vm_kernel_slide);
		vm_kernel_slide = 0;
	}

	/*
	 * Zero out local relocations to avoid confusing kxld.
	 * TODO: might be better to move this code to OSKext::initialize
	 */
	if (_mh_execute_header.flags & MH_PIE) {
		struct load_command *loadcmd;
		uint32_t cmd;

		loadcmd = (struct load_command *)((uintptr_t)&_mh_execute_header +
		    sizeof(_mh_execute_header));

		for (cmd = 0; cmd < _mh_execute_header.ncmds; cmd++) {
			if (loadcmd->cmd == LC_DYSYMTAB) {
				struct dysymtab_command *dysymtab;

				dysymtab = (struct dysymtab_command *)loadcmd;
				dysymtab->nlocrel = 0;
				dysymtab->locreloff = 0;
				kprintf("Hiding local relocations\n");
				break;
			}
			loadcmd = (struct load_command *)((uintptr_t)loadcmd + loadcmd->cmdsize);
		}
	}

	/*
	 * Now retrieve addresses for end, edata, and etext
	 * from MACH-O headers.
	 */
	segTEXTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
	    "__TEXT", &segSizeTEXT);
	segDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
	    "__DATA", &segSizeDATA);
	segLINKB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
	    "__LINKEDIT", &segSizeLINK);
	segHIBB  = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
	    "__HIB", &segSizeHIB);
	segPRELINKTEXTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
	    "__PRELINK_TEXT", &segSizePRELINKTEXT);
	segPRELINKINFOB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
	    "__PRELINK_INFO", &segSizePRELINKINFO);
	segTEXT = getsegbynamefromheader(&_mh_execute_header,
	    "__TEXT");
	segDATA = getsegbynamefromheader(&_mh_execute_header,
	    "__DATA");
	segCONST = getsegbynamefromheader(&_mh_execute_header,
	    "__DATA_CONST");
	cursectTEXT = lastsectTEXT = firstsect(segTEXT);
	/* Discover the last TEXT section within the TEXT segment */
	while ((cursectTEXT = nextsect(segTEXT, cursectTEXT)) != NULL) {
		lastsectTEXT = cursectTEXT;
	}

	sHIB  = segHIBB;
	eHIB  = segHIBB + segSizeHIB;
	vm_hib_base = sHIB;
	/* Zero-padded from eHIB to stext if text is 2M-aligned */
	stext = segTEXTB;
	lowGlo.lgStext = stext;
	etext = (vm_offset_t) round_page_64(lastsectTEXT->addr + lastsectTEXT->size);
	/* Zero-padded from etext to sdata if text is 2M-aligned */
	sdata = segDATAB;
	edata = segDATAB + segSizeDATA;

	sconst = segCONST->vmaddr;
	segSizeConst = segCONST->vmsize;
	econst = sconst + segSizeConst;

	kc_format_t kc_format = KCFormatUnknown;

	/* XXX: FIXME_IN_dyld: For new-style kernel caches, the ending address of __DATA_CONST may not be page-aligned */
	if (PE_get_primary_kc_format(&kc_format) && kc_format == KCFormatFileset) {
		/* Round up the end */
		econst = P2ROUNDUP(econst, PAGE_SIZE);
		edata = P2ROUNDUP(edata, PAGE_SIZE);
	} else {
		assert(((sconst | econst) & PAGE_MASK) == 0);
		assert(((sdata | edata) & PAGE_MASK) == 0);
	}

	DPRINTF("segTEXTB    = %p\n", (void *) segTEXTB);
	DPRINTF("segDATAB    = %p\n", (void *) segDATAB);
	DPRINTF("segLINKB    = %p\n", (void *) segLINKB);
	DPRINTF("segHIBB     = %p\n", (void *) segHIBB);
	DPRINTF("segPRELINKTEXTB = %p\n", (void *) segPRELINKTEXTB);
	DPRINTF("segPRELINKINFOB = %p\n", (void *) segPRELINKINFOB);
	DPRINTF("sHIB        = %p\n", (void *) sHIB);
	DPRINTF("eHIB        = %p\n", (void *) eHIB);
	DPRINTF("stext       = %p\n", (void *) stext);
	DPRINTF("etext       = %p\n", (void *) etext);
	DPRINTF("sdata       = %p\n", (void *) sdata);
	DPRINTF("edata       = %p\n", (void *) edata);
	DPRINTF("sconst      = %p\n", (void *) sconst);
	DPRINTF("econst      = %p\n", (void *) econst);
	DPRINTF("kernel_top  = %p\n", (void *) &last_kernel_symbol);

	vm_kernel_base  = sHIB;
	vm_kernel_top   = (vm_offset_t) &last_kernel_symbol;
	vm_kernel_stext = stext;
	vm_kernel_etext = etext;
	vm_prelink_stext = segPRELINKTEXTB;
	vm_prelink_etext = segPRELINKTEXTB + segSizePRELINKTEXT;
	vm_prelink_sinfo = segPRELINKINFOB;
	vm_prelink_einfo = segPRELINKINFOB + segSizePRELINKINFO;
	vm_slinkedit = segLINKB;
	vm_elinkedit = segLINKB + segSizeLINK;

	/*
	 * In the fileset world, we want to be able to (un)slide addresses from
	 * the kernel or any of the kexts (e.g., for kernel logging metadata
	 * passed between the kernel and logd in userspace). VM_KERNEL_UNSLIDE
	 * (via VM_KERNEL_IS_SLID) should apply to the addresses in the range
	 * from the first basement address to the last boot kc address.
	 *
	 *                     ^
	 *                     :
	 *                     |
	 *  vm_kernel_slid_top - ---------------------------------------------
	 *                     |
	 *                     :
	 *                     : Boot kc (kexts in the boot kc here)
	 *                     : - - - - - - - - - - - - - - - - - - - - - - -
	 *                     :
	 *                     :
	 *                     | Boot kc (kernel here)
	 *                     - ---------------------------------------------
	 *                     |
	 *                     :
	 *                     | Basement (kexts in pageable and aux kcs here)
	 * vm_kernel_slid_base - ---------------------------------------------
	 *                     0
	 */

	vm_kernel_slid_base = vm_kext_base + vm_kernel_slide;
	vm_kernel_slid_top = (kc_format == KCFormatFileset) ?
	    vm_slinkedit : vm_prelink_einfo;

	vm_page_kernelcache_count = (unsigned int) (atop_64(vm_kernel_top - vm_kernel_base));

	vm_set_page_size();

	/*
	 * Compute the memory size.
	 */

	avail_remaining = 0;
	avail_end = 0;
	pmptr = pmap_memory_regions;
	prev_pmptr = 0;
	pmap_memory_region_count = pmap_memory_region_current = 0;
	fap = (ppnum_t) i386_btop(first_avail);

	maddr = ml_static_ptovirt((vm_offset_t)args->MemoryMap);
	mptr = (EfiMemoryRange *)maddr;
	if (args->MemoryMapDescriptorSize == 0) {
		panic("Invalid memory map descriptor size");
	}
	msize = args->MemoryMapDescriptorSize;
	mcount = args->MemoryMapSize / msize;

#define FOURGIG 0x0000000100000000ULL
#define ONEGIG  0x0000000040000000ULL
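/* 4 GiB and 1 GiB in bytes: used below to pin mem_size to 2 GiB and to size the low-memory reserve */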

	for (i = 0; i < mcount; i++, mptr = (EfiMemoryRange *)(((vm_offset_t)mptr) + msize)) {
		ppnum_t base, top;
		uint64_t region_bytes = 0;

		if (pmap_memory_region_count >= PMAP_MEMORY_REGIONS_SIZE) {
			kprintf("WARNING: truncating memory region count at %d\n", pmap_memory_region_count);
			break;
		}
		base = (ppnum_t) (mptr->PhysicalStart >> I386_PGSHIFT);
		top = (ppnum_t) (((mptr->PhysicalStart) >> I386_PGSHIFT) + mptr->NumberOfPages - 1);

		if (base == 0) {
			/*
			 * Avoid having to deal with the edge case of the
			 * very first possible physical page and the roll-over
			 * to -1; just ignore that page.
			 */
			kprintf("WARNING: ignoring first page in [0x%llx:0x%llx]\n", (uint64_t) base, (uint64_t) top);
			base++;
		}
		if (top + 1 == 0) {
			/*
			 * Avoid having to deal with the edge case of the
			 * very last possible physical page and the roll-over
			 * to 0; just ignore that page.
			 */
			kprintf("WARNING: ignoring last page in [0x%llx:0x%llx]\n", (uint64_t) base, (uint64_t) top);
			top--;
		}
		if (top < base) {
			/*
			 * That was the only page in that region, so
			 * ignore the whole region.
			 */
			continue;
		}

#if     MR_RSV_TEST
		static uint32_t nmr = 0;
		if ((base > 0x20000) && (nmr++ < 4)) {
			mptr->Attribute |= EFI_MEMORY_KERN_RESERVED;
		}
#endif
		region_bytes = (uint64_t)(mptr->NumberOfPages << I386_PGSHIFT);
		pmap_type = mptr->Type;

		switch (mptr->Type) {
		case kEfiLoaderCode:
		case kEfiLoaderData:
		case kEfiBootServicesCode:
		case kEfiBootServicesData:
		case kEfiConventionalMemory:
			/*
			 * Consolidate usable memory types into one.
			 */
			pmap_type = kEfiConventionalMemory;
			sane_size += region_bytes;
			firmware_Conventional_bytes += region_bytes;
			break;
		/*
		 * sane_size should reflect the total amount of physical
		 * RAM in the system, not just the amount that is
		 * available for the OS to use.
		 * We now get this value from SMBIOS tables
		 * rather than reverse engineering the memory map.
		 * But the legacy computation of "sane_size" is kept
		 * for diagnostic information.
		 */

		case kEfiRuntimeServicesCode:
		case kEfiRuntimeServicesData:
			firmware_RuntimeServices_bytes += region_bytes;
			sane_size += region_bytes;
			break;
		case kEfiACPIReclaimMemory:
			firmware_ACPIReclaim_bytes += region_bytes;
			sane_size += region_bytes;
			break;
		case kEfiACPIMemoryNVS:
			firmware_ACPINVS_bytes += region_bytes;
			sane_size += region_bytes;
			break;
		case kEfiPalCode:
			firmware_PalCode_bytes += region_bytes;
			sane_size += region_bytes;
			break;

		case kEfiReservedMemoryType:
			firmware_Reserved_bytes += region_bytes;
			break;
		case kEfiUnusableMemory:
			firmware_Unusable_bytes += region_bytes;
			break;
		case kEfiMemoryMappedIO:
		case kEfiMemoryMappedIOPortSpace:
			firmware_MMIO_bytes += region_bytes;
			break;
		default:
			firmware_other_bytes += region_bytes;
			break;
		}

		DPRINTF("EFI region %d: type %u/%d, base 0x%x, top 0x%x %s\n",
		    i, mptr->Type, pmap_type, base, top,
		    (mptr->Attribute & EFI_MEMORY_KERN_RESERVED)? "RESERVED" :
		    (mptr->Attribute & EFI_MEMORY_RUNTIME)? "RUNTIME" : "");

		if (maxpg) {
			if (base >= maxpg) {
				break;
			}
			top = (top > maxpg) ? maxpg : top;
		}

		/*
		 * handle each region
		 */
		if ((mptr->Attribute & EFI_MEMORY_RUNTIME) == EFI_MEMORY_RUNTIME ||
		    pmap_type != kEfiConventionalMemory) {
			prev_pmptr = 0;
			continue;
		} else {
			/*
			 * Usable memory region
			 */
			if (top < I386_LOWMEM_RESERVED ||
			    !pal_is_usable_memory(base, top)) {
				prev_pmptr = 0;
				continue;
			}
			/*
			 * A range may be marked with the
			 * EFI_MEMORY_KERN_RESERVED attribute
			 * on some systems, to indicate that the range
			 * must not be made available to devices.
			 */

			if (mptr->Attribute & EFI_MEMORY_KERN_RESERVED) {
				if (++pmap_reserved_ranges > PMAP_MAX_RESERVED_RANGES) {
					panic("Too many reserved ranges %u", pmap_reserved_ranges);
				}
			}

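			/*
			 * Classify the region by where it sits relative to
			 * first_avail (fap): entirely below it, straddling
			 * it, or entirely above it.
			 */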
			if (top < fap) {
				/*
				 * Entire range is below first_avail:
				 * salvage some low memory pages.
				 * We use some very low memory at startup,
				 * so mark it as already allocated here.
				 */
				if (base >= I386_LOWMEM_RESERVED) {
					pmptr->base = base;
				} else {
					pmptr->base = I386_LOWMEM_RESERVED;
				}

				pmptr->end = top;


				if ((mptr->Attribute & EFI_MEMORY_KERN_RESERVED) &&
				    (top < vm_kernel_base_page)) {
					pmptr->alloc_up = pmptr->base;
					pmptr->alloc_down = pmptr->end;
					RESET_FRAG(pmptr);
					pmap_reserved_range_indices[pmap_last_reserved_range_index++] = pmap_memory_region_count;
				} else {
					/*
					 * mark as already mapped;
					 * alloc_up > alloc_down encodes an
					 * empty free range
					 */
					pmptr->alloc_up = top + 1;
					pmptr->alloc_down = top;
					RESET_FRAG(pmptr);
				}
				pmptr->type = pmap_type;
				pmptr->attribute = mptr->Attribute;
			} else if ((base < fap) && (top > fap)) {
				/*
				 * Range spans first_avail:
				 * put the memory below first_avail in the
				 * table, but mark it already allocated.
				 */
				pmptr->base = base;
				pmptr->end = (fap - 1);
				pmptr->alloc_up = pmptr->end + 1;
				pmptr->alloc_down = pmptr->end;
				RESET_FRAG(pmptr);
				pmptr->type = pmap_type;
				pmptr->attribute = mptr->Attribute;
				/*
				 * we bump these here inline so the accounting
				 * below works correctly
				 */
				pmptr++;
				pmap_memory_region_count++;

				pmptr->alloc_up = pmptr->base = fap;
				pmptr->type = pmap_type;
				pmptr->attribute = mptr->Attribute;
				pmptr->alloc_down = pmptr->end = top;
				RESET_FRAG(pmptr);

				if (mptr->Attribute & EFI_MEMORY_KERN_RESERVED) {
					pmap_reserved_range_indices[pmap_last_reserved_range_index++] = pmap_memory_region_count;
				}
			} else {
				/*
				 * entire range usable
				 */
				pmptr->alloc_up = pmptr->base = base;
				pmptr->type = pmap_type;
				pmptr->attribute = mptr->Attribute;
				pmptr->alloc_down = pmptr->end = top;
				RESET_FRAG(pmptr);
				if (mptr->Attribute & EFI_MEMORY_KERN_RESERVED) {
					pmap_reserved_range_indices[pmap_last_reserved_range_index++] = pmap_memory_region_count;
				}
			}

			if (i386_ptob(pmptr->end) > avail_end) {
				avail_end = i386_ptob(pmptr->end);
			}

			avail_remaining += (pmptr->end - pmptr->base);
			coalescing_permitted = (prev_pmptr && (pmptr->attribute == prev_pmptr->attribute) && ((pmptr->attribute & EFI_MEMORY_KERN_RESERVED) == 0));
			/*
			 * Consolidate contiguous memory regions, if possible:
			 * same type and attributes, not kern-reserved,
			 * physically adjacent, and no allocations at the seam.
			 */
			if (prev_pmptr &&
			    (pmptr->type == prev_pmptr->type) &&
			    (coalescing_permitted) &&
			    (pmptr->base == pmptr->alloc_up) &&
			    (prev_pmptr->end == prev_pmptr->alloc_down) &&
			    (pmptr->base == (prev_pmptr->end + 1))) {
				prev_pmptr->end = pmptr->end;
				prev_pmptr->alloc_down = pmptr->alloc_down;
				RESET_FRAG(pmptr);
			} else {
				pmap_memory_region_count++;
				prev_pmptr = pmptr;
				pmptr++;
			}
		}
	}

	if (memmap) {
		kprint_memmap(maddr, msize, mcount);
	}

	avail_start = first_avail;
	mem_actual = args->PhysicalMemorySize;

	/*
	 * For user visible memory size, round up to 128 Mb
	 * - accounting for the various stolen memory not reported by EFI.
	 * This is maintained for historical, comparison purposes but
	 * we now use the memory size reported by EFI/Booter.
	 */
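	/* For example, 8000 MB rounds up to 8064 MB (63 * 128 MB). */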
	sane_size = (sane_size + 128 * MB - 1) & ~((uint64_t)(128 * MB - 1));
	if (sane_size != mem_actual) {
		printf("mem_actual: 0x%llx\n legacy sane_size: 0x%llx\n",
		    mem_actual, sane_size);
	}
	sane_size = mem_actual;

	/*
	 * We cap at KERNEL_MAXMEM bytes (see vm_param.h),
	 * unless overridden by the maxmem= boot-arg
	 * -- which arrives here as a non-zero maxmem argument.
	 */
	if (maxmem == 0 && sane_size > KERNEL_MAXMEM) {
		maxmem = KERNEL_MAXMEM;
		printf("Physical memory %lld bytes capped at %dGB\n",
		    sane_size, (uint32_t) (KERNEL_MAXMEM / GB));
	}

	/*
	 * if user set maxmem, reduce memory sizes
	 */
	if ((maxmem > (uint64_t)first_avail) && (maxmem < sane_size)) {
		ppnum_t discarded_pages  = (ppnum_t)((sane_size - maxmem) >> I386_PGSHIFT);
		ppnum_t highest_pn = 0;
		ppnum_t cur_end  = 0;
		uint64_t        pages_to_use;
		unsigned        cur_region = 0;

		sane_size = maxmem;

		if (avail_remaining > discarded_pages) {
			avail_remaining -= discarded_pages;
		} else {
			avail_remaining = 0;
		}

		pages_to_use = avail_remaining;

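		/*
		 * Walk regions from low to high, keeping pages until the
		 * reduced budget is spent; any regions wholly above the cap
		 * are dropped from the table.
		 */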
		while (cur_region < pmap_memory_region_count && pages_to_use) {
			for (cur_end = pmap_memory_regions[cur_region].base;
			    cur_end < pmap_memory_regions[cur_region].end && pages_to_use;
			    cur_end++) {
				if (cur_end > highest_pn) {
					highest_pn = cur_end;
				}
				pages_to_use--;
			}
			if (pages_to_use == 0) {
				pmap_memory_regions[cur_region].end = cur_end;
				pmap_memory_regions[cur_region].alloc_down = cur_end;
				RESET_FRAG(&pmap_memory_regions[cur_region]);
			}

			cur_region++;
		}
		pmap_memory_region_count = cur_region;

		avail_end = i386_ptob(highest_pn + 1);
	}

	/*
	 * mem_size is only a 32 bit container... follow the PPC route
	 * and pin it to a 2 Gbyte maximum
	 */
	if (sane_size > (FOURGIG >> 1)) {
		mem_size = (vm_size_t)(FOURGIG >> 1);
	} else {
		mem_size = (vm_size_t)sane_size;
	}
	max_mem = sane_size;
	max_mem_actual = sane_size;

	kprintf("Physical memory %llu MB\n", sane_size / MB);

	max_valid_low_ppnum = (2 * GB) / PAGE_SIZE;

	if (!PE_parse_boot_argn("max_valid_dma_addr", &maxdmaaddr, sizeof(maxdmaaddr))) {
		max_valid_dma_address = (uint64_t)4 * (uint64_t)GB;
	} else {
		max_valid_dma_address = ((uint64_t) maxdmaaddr) * MB;

		if ((max_valid_dma_address / PAGE_SIZE) < max_valid_low_ppnum) {
			max_valid_low_ppnum = (ppnum_t)(max_valid_dma_address / PAGE_SIZE);
		}
	}
	if (avail_end >= max_valid_dma_address) {
		if (!PE_parse_boot_argn("maxloreserve", &maxloreserve, sizeof(maxloreserve))) {
			if (sane_size >= (ONEGIG * 15)) {
				maxloreserve = (MAXLORESERVE / PAGE_SIZE) * 4;
			} else if (sane_size >= (ONEGIG * 7)) {
				maxloreserve = (MAXLORESERVE / PAGE_SIZE) * 2;
			} else {
				maxloreserve = MAXLORESERVE / PAGE_SIZE;
			}

#if SOCKETS
			mbuf_reserve = bsd_mbuf_cluster_reserve(&mbuf_override) / PAGE_SIZE;
#endif
		} else {
			maxloreserve = (maxloreserve * (1024 * 1024)) / PAGE_SIZE;
		}

		if (maxloreserve) {
			vm_lopage_free_limit = maxloreserve;

			if (mbuf_override == TRUE) {
				vm_lopage_free_limit += mbuf_reserve;
				vm_lopage_lowater = 0;
			} else {
				vm_lopage_lowater = vm_lopage_free_limit / 16;
			}

			vm_lopage_refill = TRUE;
			vm_lopage_needed = TRUE;
		}
	}

	/*
	 *	Initialize kernel physical map.
	 *	Kernel virtual address starts at VM_KERNEL_MIN_ADDRESS.
	 */
	kprintf("avail_remaining = 0x%lx\n", (unsigned long)avail_remaining);
	pmap_bootstrap(0, IA32e);
}


unsigned int
pmap_free_pages(void)
{
	return (unsigned int)avail_remaining;
}

boolean_t pmap_next_page_reserved(ppnum_t *);

/*
 * Pick a page from a "kernel private" reserved range; works around
 * errata on some hardware. EFI marks pages which can't be used for
 * certain kinds of I/O-ish activities as reserved. We reserve them for
 * kernel internal usage and prevent them from ever going on the regular
 * free list.
 */
boolean_t
pmap_next_page_reserved(
	ppnum_t              *pn)
{
	uint32_t             n;
	pmap_memory_region_t *region;
	uint32_t             reserved_index;

	if (pmap_reserved_ranges) {
		for (n = 0; n < pmap_last_reserved_range_index; n++) {
			reserved_index = pmap_reserved_range_indices[n];
			region = &pmap_memory_regions[reserved_index];
			if (region->alloc_up <= region->alloc_down) {
				*pn = region->alloc_up++;
			} else if (region->alloc_frag_up <= region->alloc_frag_down) {
				*pn = region->alloc_frag_up++;
			} else {
				continue;
			}
			avail_remaining--;

			if (*pn > max_ppnum) {
				max_ppnum = *pn;
			}

			pmap_reserved_pages_allocated++;
#if DEBUG
			if (region->alloc_up > region->alloc_down) {
				kprintf("Exhausted reserved range index: %u, base: 0x%x end: 0x%x, type: 0x%x, attribute: 0x%llx\n", reserved_index, region->base, region->end, region->type, region->attribute);
			}
#endif
			return TRUE;
		}
	}
	return FALSE;
}

/*
 * Return the highest large page available. Fails once there are no more large pages.
 */
kern_return_t
pmap_next_page_large(
	ppnum_t              *pn)
{
	int                  r;
	pmap_memory_region_t *region;
	ppnum_t              frag_start;
	ppnum_t              lgpg;

	if (avail_remaining < LG_PPNUM_PAGES) {
		return KERN_FAILURE;
	}

	for (r = pmap_memory_region_count - 1; r >= 0; r--) {
		region = &pmap_memory_regions[r];

		/*
		 * First check if there is enough memory.
		 */
		if (region->alloc_down < region->alloc_up ||
		    (region->alloc_down - region->alloc_up + 1) < LG_PPNUM_PAGES) {
			continue;
		}

		/*
		 * Find the starting large page, creating a fragment if needed.
		 */
		if ((region->alloc_down & LG_PPNUM_MASK) == LG_PPNUM_MASK) {
			lgpg = (region->alloc_down & ~LG_PPNUM_MASK);
		} else {
			/* Can only have 1 fragment per region at a time */
			if (region->alloc_frag_up <= region->alloc_frag_down) {
				continue;
			}

			/* Check for enough room below any fragment. */
			frag_start = (region->alloc_down & ~LG_PPNUM_MASK);
			if (frag_start < region->alloc_up ||
			    frag_start - region->alloc_up < LG_PPNUM_PAGES) {
				continue;
			}

			lgpg = frag_start - LG_PPNUM_PAGES;
			region->alloc_frag_up = frag_start;
			region->alloc_frag_down = region->alloc_down;
		}

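		/* lgpg is the first page of a fully free, large-page-aligned run; carve it off the top of the region */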
		*pn = lgpg;
		region->alloc_down = lgpg - 1;


		avail_remaining -= LG_PPNUM_PAGES;
		if (*pn + LG_PPNUM_MASK > max_ppnum) {
			max_ppnum = *pn + LG_PPNUM_MASK;
		}

		return KERN_SUCCESS;
	}
	return KERN_FAILURE;
}

boolean_t
pmap_next_page_hi(
	ppnum_t              *pn,
	boolean_t            might_free)
{
	pmap_memory_region_t *region;
	int                  n;

	if (!might_free && pmap_next_page_reserved(pn)) {
		return TRUE;
	}

	if (avail_remaining) {
		for (n = pmap_memory_region_count - 1; n >= 0; n--) {
			region = &pmap_memory_regions[n];
			if (region->alloc_frag_up <= region->alloc_frag_down) {
				*pn = region->alloc_frag_down--;
			} else if (region->alloc_down >= region->alloc_up) {
				*pn = region->alloc_down--;
			} else {
				continue;
			}

			avail_remaining--;

			if (*pn > max_ppnum) {
				max_ppnum = *pn;
			}

			return TRUE;
		}
	}
	return FALSE;
}

/*
 * Record which high pages have been allocated so far,
 * so that pmap_init() can mark them PMAP_NOENCRYPT, which
 * makes hibernation faster.
 *
 * Because of the code in pmap_next_page_large(), we could
 * theoretically have fragments in several regions.
 * In practice that just doesn't happen. The last pmap region
 * is normally the largest and will satisfy all pmap_next_hi/large()
 * allocations. Since this information is used as an optimization
 * and it's ok to be conservative, we'll just record the information
 * for the final region.
 */
void
pmap_hi_pages_done(void)
{
	pmap_memory_region_t *r;

	r = &pmap_memory_regions[pmap_memory_region_count - 1];
	pmap_high_used_top = r->end;
	if (r->alloc_frag_up <= r->alloc_frag_down) {
		pmap_high_used_bottom = r->alloc_frag_down + 1;
		pmap_middle_used_top = r->alloc_frag_up - 1;
		if (r->alloc_up <= r->alloc_down) {
			pmap_middle_used_bottom = r->alloc_down + 1;
		} else {
			pmap_high_used_bottom = r->base;
		}
	} else {
		if (r->alloc_up <= r->alloc_down) {
			pmap_high_used_bottom = r->alloc_down + 1;
		} else {
			pmap_high_used_bottom = r->base;
		}
	}
#if     DEBUG || DEVELOPMENT
	kprintf("pmap_high_used_top      0x%x\n", pmap_high_used_top);
	kprintf("pmap_high_used_bottom   0x%x\n", pmap_high_used_bottom);
	kprintf("pmap_middle_used_top    0x%x\n", pmap_middle_used_top);
	kprintf("pmap_middle_used_bottom 0x%x\n", pmap_middle_used_bottom);
#endif
}

/*
 * Return the next available page from lowest memory for general use.
 */
boolean_t
pmap_next_page(
	ppnum_t              *pn)
{
	pmap_memory_region_t *region;

	if (avail_remaining) {
		while (pmap_memory_region_current < pmap_memory_region_count) {
			region = &pmap_memory_regions[pmap_memory_region_current];
			if (region->alloc_up <= region->alloc_down) {
				*pn = region->alloc_up++;
			} else if (region->alloc_frag_up <= region->alloc_frag_down) {
				*pn = region->alloc_frag_up++;
			} else {
				pmap_memory_region_current++;
				continue;
			}
			avail_remaining--;

			if (*pn > max_ppnum) {
				max_ppnum = *pn;
			}

			return TRUE;
		}
	}
	return FALSE;
}


boolean_t
pmap_valid_page(
	ppnum_t pn)
{
	unsigned int i;
	pmap_memory_region_t *pmptr = pmap_memory_regions;

	for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
		if ((pn >= pmptr->base) && (pn <= pmptr->end)) {
			return TRUE;
		}
	}
	return FALSE;
}

/*
 * Returns true if the address lies in the kernel __TEXT segment range.
 */
bool
kernel_text_contains(vm_offset_t addr)
{
	return vm_kernel_stext <= addr && addr < vm_kernel_etext;
}