xref: /xnu-11417.101.15/osfmk/i386/i386_vm_init.c (revision e3723e1f17661b24996789d8afc084c0c3303b26)
1 /*
2  * Copyright (c) 2003-2019 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989, 1988 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 
57 
58 #include <mach/i386/vm_param.h>
59 
60 #include <string.h>
61 #include <mach/vm_param.h>
62 #include <mach/vm_prot.h>
63 #include <mach/machine.h>
64 #include <mach/time_value.h>
65 #include <kern/spl.h>
66 #include <kern/assert.h>
67 #include <kern/debug.h>
68 #include <kern/misc_protos.h>
69 #include <kern/cpu_data.h>
70 #include <kern/processor.h>
71 #include <vm/vm_page.h>
72 #include <vm/pmap.h>
73 #include <vm/vm_kern.h>
74 #include <i386/pmap.h>
75 #include <i386/misc_protos.h>
76 #include <i386/cpuid.h>
77 #include <mach/thread_status.h>
78 #include <pexpert/i386/efi.h>
79 #include <pexpert/pexpert.h>
80 #include <i386/i386_lowmem.h>
81 #include <i386/misc_protos.h>
82 #include <x86_64/lowglobals.h>
83 #include <i386/pal_routines.h>
84 #include <vm/vm_page_internal.h>
85 
86 #include <mach-o/loader.h>
87 #include <libkern/kernel_mach_header.h>
88 
/* Round x up to the next multiple of align (align must be a power of two). */
#define P2ROUNDUP(x, align)             (-(-(x) & -(align)))

vm_size_t       mem_size = 0;
pmap_paddr_t    first_avail = 0;/* first after page tables */

uint64_t        max_mem;        /* Size of physical memory minus carveouts (bytes), adjusted by maxmem */
uint64_t        max_mem_actual; /* Actual size of physical memory (bytes) adjusted by
                                 * the maxmem boot-arg */
uint64_t        mem_actual;
uint64_t        sane_size = 0;  /* Memory size for defaults calculations */

/*
 * KASLR parameters
 */
ppnum_t         vm_kernel_base_page;
vm_offset_t     vm_kernel_base;
vm_offset_t     vm_kernel_top;
vm_offset_t     vm_kernel_stext;
vm_offset_t     vm_kernel_etext;
vm_offset_t     vm_kernel_slide;
vm_offset_t     vm_kernel_slid_base;
vm_offset_t     vm_kernel_slid_top;
vm_offset_t vm_hib_base;
vm_offset_t     vm_kext_base = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
vm_offset_t     vm_kext_top = VM_MIN_KERNEL_ADDRESS;

/* Bounds of the prelinked (kext) text/info and LINKEDIT regions. */
vm_offset_t vm_prelink_stext;
vm_offset_t vm_prelink_etext;
vm_offset_t vm_prelink_sinfo;
vm_offset_t vm_prelink_einfo;
vm_offset_t vm_slinkedit;
vm_offset_t vm_elinkedit;

vm_offset_t vm_kernel_builtinkmod_text;
vm_offset_t vm_kernel_builtinkmod_text_end;

/* Default size cap for the low-memory (below max_valid_dma_address) page reserve. */
#define MAXLORESERVE    (32 * 1024 * 1024)

ppnum_t         max_ppnum = 0;

/*
 * pmap_high_used* are the highest range of physical memory used for kernel
 * internals (page tables, vm_pages) via pmap_steal_memory() that don't
 * need to be encrypted in hibernation images. There can be one gap in
 * the middle of this due to fragmentation when using a mix of small
 * and large pages.  In that case, the fragment lives between the high
 * and middle ranges.
 */
ppnum_t pmap_high_used_top = 0;
ppnum_t pmap_high_used_bottom = 0;
ppnum_t pmap_middle_used_top = 0;
ppnum_t pmap_middle_used_bottom = 0;

/*
 * Bookkeeping for EFI_MEMORY_KERN_RESERVED ranges: indices into
 * pmap_memory_regions[] for ranges that must stay off the regular
 * free list (see pmap_next_page_reserved()).
 */
enum {PMAP_MAX_RESERVED_RANGES = 32};
uint32_t pmap_reserved_pages_allocated = 0;
uint32_t pmap_reserved_range_indices[PMAP_MAX_RESERVED_RANGES];
uint32_t pmap_last_reserved_range_index = 0;
uint32_t pmap_reserved_ranges = 0;

extern unsigned int bsd_mbuf_cluster_reserve(boolean_t *);

pmap_paddr_t     avail_start, avail_end;
vm_offset_t     virtual_avail, virtual_end;
static pmap_paddr_t     avail_remaining;
vm_offset_t     static_memory_end = 0;

/* Segment boundary addresses of the running kernel, derived from Mach-O headers. */
vm_offset_t     sHIB, eHIB, stext, etext, sdata, edata, end, sconst, econst;

/*
 * _mh_execute_header is the mach_header for the currently executing kernel
 */
vm_offset_t segTEXTB; unsigned long segSizeTEXT;
vm_offset_t segDATAB; unsigned long segSizeDATA;
vm_offset_t segLINKB; unsigned long segSizeLINK;
vm_offset_t segPRELINKTEXTB; unsigned long segSizePRELINKTEXT;
vm_offset_t segPRELINKINFOB; unsigned long segSizePRELINKINFO;
vm_offset_t segHIBB; unsigned long segSizeHIB;
unsigned long segSizeConst;

static kernel_segment_command_t *segTEXT, *segDATA;
static kernel_section_t *cursectTEXT, *lastsectTEXT;
static kernel_segment_command_t *segCONST;

/* Per-EFI-memory-type accounting, reported elsewhere for diagnostics. */
extern uint64_t firmware_Conventional_bytes;
extern uint64_t firmware_RuntimeServices_bytes;
extern uint64_t firmware_ACPIReclaim_bytes;
extern uint64_t firmware_ACPINVS_bytes;
extern uint64_t firmware_PalCode_bytes;
extern uint64_t firmware_Reserved_bytes;
extern uint64_t firmware_Unusable_bytes;
extern uint64_t firmware_other_bytes;
uint64_t firmware_MMIO_bytes;

/*
 * Linker magic to establish the highest address in the kernel.
 */
extern void     *last_kernel_symbol;

/* Number of 4K pages per 2M large page, and the corresponding page-number mask. */
#define LG_PPNUM_PAGES (I386_LPGBYTES >> PAGE_SHIFT)
#define LG_PPNUM_MASK (I386_LPGMASK >> PAGE_SHIFT)

/* set so no region large page fragment pages exist */
#define RESET_FRAG(r) (((r)->alloc_frag_up = 1), ((r)->alloc_frag_down = 0))

/* Set via the "memmap" boot-arg; enables the EFI memory-map dump below. */
boolean_t       memmap = FALSE;
194 #if     DEBUG || DEVELOPMENT
195 static void
kprint_memmap(vm_offset_t maddr,unsigned int msize,unsigned int mcount)196 kprint_memmap(vm_offset_t maddr, unsigned int msize, unsigned int mcount)
197 {
198 	unsigned int         i;
199 	unsigned int         j;
200 	pmap_memory_region_t *p = pmap_memory_regions;
201 	EfiMemoryRange       *mptr;
202 	addr64_t             region_start, region_end;
203 	addr64_t             efi_start, efi_end;
204 
205 	for (j = 0; j < pmap_memory_region_count; j++, p++) {
206 		kprintf("pmap region %d type %d base 0x%llx alloc_up 0x%llx alloc_down 0x%llx"
207 		    " alloc_frag_up 0x%llx alloc_frag_down 0x%llx top 0x%llx\n",
208 		    j, p->type,
209 		    (addr64_t) p->base << I386_PGSHIFT,
210 		    (addr64_t) p->alloc_up << I386_PGSHIFT,
211 		    (addr64_t) p->alloc_down << I386_PGSHIFT,
212 		    (addr64_t) p->alloc_frag_up << I386_PGSHIFT,
213 		    (addr64_t) p->alloc_frag_down << I386_PGSHIFT,
214 		    (addr64_t) p->end   << I386_PGSHIFT);
215 		region_start = (addr64_t) p->base << I386_PGSHIFT;
216 		region_end = ((addr64_t) p->end << I386_PGSHIFT) - 1;
217 		mptr = (EfiMemoryRange *) maddr;
218 		for (i = 0;
219 		    i < mcount;
220 		    i++, mptr = (EfiMemoryRange *)(((vm_offset_t)mptr) + msize)) {
221 			if (mptr->Type != kEfiLoaderCode &&
222 			    mptr->Type != kEfiLoaderData &&
223 			    mptr->Type != kEfiBootServicesCode &&
224 			    mptr->Type != kEfiBootServicesData &&
225 			    mptr->Type != kEfiConventionalMemory) {
226 				efi_start = (addr64_t)mptr->PhysicalStart;
227 				efi_end = efi_start + ((vm_offset_t)mptr->NumberOfPages << I386_PGSHIFT) - 1;
228 				if ((efi_start >= region_start && efi_start <= region_end) ||
229 				    (efi_end >= region_start && efi_end <= region_end)) {
230 					kprintf(" *** Overlapping region with EFI runtime region %d\n", i);
231 				}
232 			}
233 		}
234 	}
235 }
236 #define DPRINTF(x...)   do { if (memmap) kprintf(x); } while (0)
237 
238 #else
239 
/* Release-build stub: memory-map dumping is compiled out when neither
 * DEBUG nor DEVELOPMENT is set; arguments are intentionally unused. */
static void
kprint_memmap(vm_offset_t maddr, unsigned int msize, unsigned int mcount)
{
#pragma unused(maddr, msize, mcount)
}
245 
246 #define DPRINTF(x...)
247 #endif /* DEBUG */
248 
249 /*
250  * Basic VM initialization.
251  */
/*
 * Basic VM initialization:
 *  - establish the KASLR slide and the kernel segment boundary globals
 *    from the Mach-O headers,
 *  - walk the EFI memory map handed over by the booter and build
 *    pmap_memory_regions[] (coalescing usable ranges, tracking
 *    EFI_MEMORY_KERN_RESERVED ranges separately),
 *  - compute mem_size/sane_size/max_mem, applying the maxmem= cap,
 *  - size the low-memory (DMA-able) page reserve,
 *  - and finally hand off to pmap_bootstrap().
 *
 * maxmem  - non-zero if the maxmem= boot-arg capped physical memory (bytes)
 * IA32e   - passed through to pmap_bootstrap()
 * args    - boot_args from the booter (kernel base, slide, EFI memory map)
 */
void
i386_vm_init(uint64_t   maxmem,
    boolean_t  IA32e,
    boot_args  *args)
{
	pmap_memory_region_t *pmptr;
	pmap_memory_region_t *prev_pmptr;
	EfiMemoryRange *mptr;
	unsigned int mcount;
	unsigned int msize;
	vm_offset_t maddr;
	ppnum_t fap;
	unsigned int i;
	ppnum_t maxpg = 0;
	uint32_t pmap_type;
	uint32_t maxloreserve;
	uint32_t maxdmaaddr;
	uint32_t  mbuf_reserve = 0;
	boolean_t mbuf_override = FALSE;
	boolean_t coalescing_permitted;
	vm_kernel_base_page = i386_btop(args->kaddr);
	vm_offset_t base_address;
	vm_offset_t static_base_address;

	PE_parse_boot_argn("memmap", &memmap, sizeof(memmap));

	/*
	 * Establish the KASLR parameters: the slide is the delta between
	 * where the kernel was linked to load and where the booter
	 * actually placed it.
	 */
	static_base_address = ml_static_ptovirt(KERNEL_BASE_OFFSET);
	base_address        = ml_static_ptovirt(args->kaddr);
	vm_kernel_slide     = base_address - static_base_address;
	if (args->kslide) {
		kprintf("KASLR slide: 0x%016lx dynamic\n", vm_kernel_slide);
		/* Cross-check the computed slide against the booter's value. */
		if (vm_kernel_slide != ((vm_offset_t)args->kslide)) {
			panic("Kernel base inconsistent with slide - rebased?");
		}
	} else {
		/* No slide relative to on-disk symbols */
		kprintf("KASLR slide: 0x%016lx static and ignored\n",
		    vm_kernel_slide);
		vm_kernel_slide = 0;
	}

	/*
	 * Zero out local relocations to avoid confusing kxld.
	 * TODO: might be better to move this code to OSKext::initialize
	 */
	if (_mh_execute_header.flags & MH_PIE) {
		struct load_command *loadcmd;
		uint32_t cmd;

		loadcmd = (struct load_command *)((uintptr_t)&_mh_execute_header +
		    sizeof(_mh_execute_header));

		for (cmd = 0; cmd < _mh_execute_header.ncmds; cmd++) {
			if (loadcmd->cmd == LC_DYSYMTAB) {
				struct dysymtab_command *dysymtab;

				dysymtab = (struct dysymtab_command *)loadcmd;
				dysymtab->nlocrel = 0;
				dysymtab->locreloff = 0;
				kprintf("Hiding local relocations\n");
				break;
			}
			loadcmd = (struct load_command *)((uintptr_t)loadcmd + loadcmd->cmdsize);
		}
	}

	/*
	 * Now retrieve addresses for end, edata, and etext
	 * from MACH-O headers.
	 */
	segTEXTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
	    "__TEXT", &segSizeTEXT);
	segDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
	    "__DATA", &segSizeDATA);
	segLINKB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
	    "__LINKEDIT", &segSizeLINK);
	segHIBB  = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
	    "__HIB", &segSizeHIB);
	segPRELINKTEXTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
	    "__PRELINK_TEXT", &segSizePRELINKTEXT);
	segPRELINKINFOB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
	    "__PRELINK_INFO", &segSizePRELINKINFO);
	segTEXT = getsegbynamefromheader(&_mh_execute_header,
	    "__TEXT");
	segDATA = getsegbynamefromheader(&_mh_execute_header,
	    "__DATA");
	segCONST = getsegbynamefromheader(&_mh_execute_header,
	    "__DATA_CONST");
	cursectTEXT = lastsectTEXT = firstsect(segTEXT);
	/* Discover the last TEXT section within the TEXT segment */
	while ((cursectTEXT = nextsect(segTEXT, cursectTEXT)) != NULL) {
		lastsectTEXT = cursectTEXT;
	}

	sHIB  = segHIBB;
	eHIB  = segHIBB + segSizeHIB;
	vm_hib_base = sHIB;
	/* Zero-padded from ehib to stext if text is 2M-aligned */
	stext = segTEXTB;
	lowGlo.lgStext = stext;
	etext = (vm_offset_t) round_page_64(lastsectTEXT->addr + lastsectTEXT->size);
	/* Zero-padded from etext to sdata if text is 2M-aligned */
	sdata = segDATAB;
	edata = segDATAB + segSizeDATA;

	sconst = segCONST->vmaddr;
	segSizeConst = segCONST->vmsize;
	econst = sconst + segSizeConst;

	kc_format_t kc_format = KCFormatUnknown;

	/* XXX: FIXME_IN_dyld: For new-style kernel caches, the ending address of __DATA_CONST may not be page-aligned */
	if (PE_get_primary_kc_format(&kc_format) && kc_format == KCFormatFileset) {
		/* Round up the end */
		econst = P2ROUNDUP(econst, PAGE_SIZE);
		edata = P2ROUNDUP(edata, PAGE_SIZE);
	} else {
		assert(((sconst | econst) & PAGE_MASK) == 0);
		assert(((sdata | edata) & PAGE_MASK) == 0);
	}

	DPRINTF("segTEXTB    = %p\n", (void *) segTEXTB);
	DPRINTF("segDATAB    = %p\n", (void *) segDATAB);
	DPRINTF("segLINKB    = %p\n", (void *) segLINKB);
	DPRINTF("segHIBB     = %p\n", (void *) segHIBB);
	DPRINTF("segPRELINKTEXTB = %p\n", (void *) segPRELINKTEXTB);
	DPRINTF("segPRELINKINFOB = %p\n", (void *) segPRELINKINFOB);
	DPRINTF("sHIB        = %p\n", (void *) sHIB);
	DPRINTF("eHIB        = %p\n", (void *) eHIB);
	DPRINTF("stext       = %p\n", (void *) stext);
	DPRINTF("etext       = %p\n", (void *) etext);
	DPRINTF("sdata       = %p\n", (void *) sdata);
	DPRINTF("edata       = %p\n", (void *) edata);
	DPRINTF("sconst      = %p\n", (void *) sconst);
	DPRINTF("econst      = %p\n", (void *) econst);
	DPRINTF("kernel_top  = %p\n", (void *) &last_kernel_symbol);

	/* Publish kernel layout for unsliding / diagnostics. */
	vm_kernel_base  = sHIB;
	vm_kernel_top   = (vm_offset_t) &last_kernel_symbol;
	vm_kernel_stext = stext;
	vm_kernel_etext = etext;
	vm_prelink_stext = segPRELINKTEXTB;
	vm_prelink_etext = segPRELINKTEXTB + segSizePRELINKTEXT;
	vm_prelink_sinfo = segPRELINKINFOB;
	vm_prelink_einfo = segPRELINKINFOB + segSizePRELINKINFO;
	vm_slinkedit = segLINKB;
	vm_elinkedit = segLINKB + segSizeLINK;

	/*
	 * In the fileset world, we want to be able to (un)slide addresses from
	 * the kernel or any of the kexts (e.g., for kernel logging metadata
	 * passed between the kernel and logd in userspace). VM_KERNEL_UNSLIDE
	 * (via VM_KERNEL_IS_SLID) should apply to the addresses in the range
	 * from the first basement address to the last boot kc address.
	 *
	 *                     ^
	 *                     :
	 *                     |
	 *  vm_kernel_slid_top - ---------------------------------------------
	 *                     |
	 *                     :
	 *                     : Boot kc (kexts in the boot kc here)
	 *                     : - - - - - - - - - - - - - - - - - - - - - - -
	 *                     :
	 *                     :
	 *                     | Boot kc (kernel here)
	 *                     - ---------------------------------------------
	 *                     |
	 *                     :
	 *                     | Basement (kexts in pageable and aux kcs here)
	 * vm_kernel_slid_base - ---------------------------------------------
	 *                     0
	 */

	vm_kernel_slid_base = vm_kext_base + vm_kernel_slide;
	vm_kernel_slid_top = (kc_format == KCFormatFileset) ?
	    vm_slinkedit : vm_prelink_einfo;

	vm_page_kernelcache_count = (unsigned int) (atop_64(vm_kernel_top - vm_kernel_base));

	vm_set_page_size();

	/*
	 * Compute the memory size.
	 */

	avail_remaining = 0;
	avail_end = 0;
	pmptr = pmap_memory_regions;
	prev_pmptr = 0;
	pmap_memory_region_count = pmap_memory_region_current = 0;
	fap = (ppnum_t) i386_btop(first_avail);

	maddr = ml_static_ptovirt((vm_offset_t)args->MemoryMap);
	mptr = (EfiMemoryRange *)maddr;
	if (args->MemoryMapDescriptorSize == 0) {
		panic("Invalid memory map descriptor size");
	}
	msize = args->MemoryMapDescriptorSize;
	mcount = args->MemoryMapSize / msize;

#define FOURGIG 0x0000000100000000ULL
#define ONEGIG  0x0000000040000000ULL

	/*
	 * Walk every EFI memory descriptor, accounting bytes by type and
	 * appending usable conventional memory to pmap_memory_regions[].
	 */
	for (i = 0; i < mcount; i++, mptr = (EfiMemoryRange *)(((vm_offset_t)mptr) + msize)) {
		ppnum_t base, top;
		uint64_t region_bytes = 0;

		if (pmap_memory_region_count >= PMAP_MEMORY_REGIONS_SIZE) {
			kprintf("WARNING: truncating memory region count at %d\n", pmap_memory_region_count);
			break;
		}
		base = (ppnum_t) (mptr->PhysicalStart >> I386_PGSHIFT);
		top = (ppnum_t) (((mptr->PhysicalStart) >> I386_PGSHIFT) + mptr->NumberOfPages - 1);

		if (base == 0) {
			/*
			 * Avoid having to deal with the edge case of the
			 * very first possible physical page and the roll-over
			 * to -1; just ignore that page.
			 */
			kprintf("WARNING: ignoring first page in [0x%llx:0x%llx]\n", (uint64_t) base, (uint64_t) top);
			base++;
		}
		if (top + 1 == 0) {
			/*
			 * Avoid having to deal with the edge case of the
			 * very last possible physical page and the roll-over
			 * to 0; just ignore that page.
			 */
			kprintf("WARNING: ignoring last page in [0x%llx:0x%llx]\n", (uint64_t) base, (uint64_t) top);
			top--;
		}
		if (top < base) {
			/*
			 * That was the only page in that region, so
			 * ignore the whole region.
			 */
			continue;
		}

#if     MR_RSV_TEST
		static uint32_t nmr = 0;
		if ((base > 0x20000) && (nmr++ < 4)) {
			mptr->Attribute |= EFI_MEMORY_KERN_RESERVED;
		}
#endif
		region_bytes = (uint64_t)(mptr->NumberOfPages << I386_PGSHIFT);
		pmap_type = mptr->Type;

		switch (mptr->Type) {
		case kEfiLoaderCode:
		case kEfiLoaderData:
		case kEfiBootServicesCode:
		case kEfiBootServicesData:
		case kEfiConventionalMemory:
			/*
			 * Consolidate usable memory types into one.
			 */
			pmap_type = kEfiConventionalMemory;
			sane_size += region_bytes;
			firmware_Conventional_bytes += region_bytes;
			break;
		/*
		 * sane_size should reflect the total amount of physical
		 * RAM in the system, not just the amount that is
		 * available for the OS to use.
		 * We now get this value from SMBIOS tables
		 * rather than reverse engineering the memory map.
		 * But the legacy computation of "sane_size" is kept
		 * for diagnostic information.
		 */

		case kEfiRuntimeServicesCode:
		case kEfiRuntimeServicesData:
			firmware_RuntimeServices_bytes += region_bytes;
			sane_size += region_bytes;
			break;
		case kEfiACPIReclaimMemory:
			firmware_ACPIReclaim_bytes += region_bytes;
			sane_size += region_bytes;
			break;
		case kEfiACPIMemoryNVS:
			firmware_ACPINVS_bytes += region_bytes;
			sane_size += region_bytes;
			break;
		case kEfiPalCode:
			firmware_PalCode_bytes += region_bytes;
			sane_size += region_bytes;
			break;

		case kEfiReservedMemoryType:
			firmware_Reserved_bytes += region_bytes;
			break;
		case kEfiUnusableMemory:
			firmware_Unusable_bytes += region_bytes;
			break;
		case kEfiMemoryMappedIO:
		case kEfiMemoryMappedIOPortSpace:
			firmware_MMIO_bytes += region_bytes;
			break;
		default:
			firmware_other_bytes += region_bytes;
			break;
		}

		DPRINTF("EFI region %d: type %u/%d, base 0x%x, top 0x%x %s\n",
		    i, mptr->Type, pmap_type, base, top,
		    (mptr->Attribute & EFI_MEMORY_KERN_RESERVED)? "RESERVED" :
		    (mptr->Attribute & EFI_MEMORY_RUNTIME)? "RUNTIME" : "");

		/* maxpg is only non-zero if a page cap was established above. */
		if (maxpg) {
			if (base >= maxpg) {
				break;
			}
			top = (top > maxpg) ? maxpg : top;
		}

		/*
		 * handle each region
		 */
		if ((mptr->Attribute & EFI_MEMORY_RUNTIME) == EFI_MEMORY_RUNTIME ||
		    pmap_type != kEfiConventionalMemory) {
			prev_pmptr = 0;
			continue;
		} else {
			/*
			 * Usable memory region
			 */
			if (top < I386_LOWMEM_RESERVED ||
			    !pal_is_usable_memory(base, top)) {
				prev_pmptr = 0;
				continue;
			}
			/*
			 * A range may be marked with the
			 * EFI_MEMORY_KERN_RESERVED attribute
			 * on some systems, to indicate that the range
			 * must not be made available to devices.
			 */

			if (mptr->Attribute & EFI_MEMORY_KERN_RESERVED) {
				if (++pmap_reserved_ranges > PMAP_MAX_RESERVED_RANGES) {
					panic("Too many reserved ranges %u", pmap_reserved_ranges);
				}
			}

			if (top < fap) {
				/*
				 * entire range below first_avail
				 * salvage some low memory pages
				 * we use some very low memory at startup
				 * mark as already allocated here
				 */
				if (base >= I386_LOWMEM_RESERVED) {
					pmptr->base = base;
				} else {
					pmptr->base = I386_LOWMEM_RESERVED;
				}

				pmptr->end = top;


				if ((mptr->Attribute & EFI_MEMORY_KERN_RESERVED) &&
				    (top < vm_kernel_base_page)) {
					/* Keep low reserved ranges fully allocatable for kernel use. */
					pmptr->alloc_up = pmptr->base;
					pmptr->alloc_down = pmptr->end;
					RESET_FRAG(pmptr);
					pmap_reserved_range_indices[pmap_last_reserved_range_index++] = pmap_memory_region_count;
				} else {
					/*
					 * mark as already mapped
					 */
					pmptr->alloc_up = top + 1;
					pmptr->alloc_down = top;
					RESET_FRAG(pmptr);
				}
				pmptr->type = pmap_type;
				pmptr->attribute = mptr->Attribute;
			} else if ((base < fap) && (top > fap)) {
				/*
				 * spans first_avail
				 * put mem below first avail in table but
				 * mark already allocated
				 */
				pmptr->base = base;
				pmptr->end = (fap - 1);
				pmptr->alloc_up = pmptr->end + 1;
				pmptr->alloc_down = pmptr->end;
				RESET_FRAG(pmptr);
				pmptr->type = pmap_type;
				pmptr->attribute = mptr->Attribute;
				/*
				 * we bump these here inline so the accounting
				 * below works correctly
				 */
				pmptr++;
				pmap_memory_region_count++;

				pmptr->alloc_up = pmptr->base = fap;
				pmptr->type = pmap_type;
				pmptr->attribute = mptr->Attribute;
				pmptr->alloc_down = pmptr->end = top;
				RESET_FRAG(pmptr);

				if (mptr->Attribute & EFI_MEMORY_KERN_RESERVED) {
					pmap_reserved_range_indices[pmap_last_reserved_range_index++] = pmap_memory_region_count;
				}
			} else {
				/*
				 * entire range usable
				 */
				pmptr->alloc_up = pmptr->base = base;
				pmptr->type = pmap_type;
				pmptr->attribute = mptr->Attribute;
				pmptr->alloc_down = pmptr->end = top;
				RESET_FRAG(pmptr);
				if (mptr->Attribute & EFI_MEMORY_KERN_RESERVED) {
					pmap_reserved_range_indices[pmap_last_reserved_range_index++] = pmap_memory_region_count;
				}
			}

			if (i386_ptob(pmptr->end) > avail_end) {
				avail_end = i386_ptob(pmptr->end);
			}

			avail_remaining += (pmptr->end - pmptr->base);
			coalescing_permitted = (prev_pmptr && (pmptr->attribute == prev_pmptr->attribute) && ((pmptr->attribute & EFI_MEMORY_KERN_RESERVED) == 0));
			/*
			 * Consolidate contiguous memory regions, if possible
			 */
			if (prev_pmptr &&
			    (pmptr->type == prev_pmptr->type) &&
			    (coalescing_permitted) &&
			    (pmptr->base == pmptr->alloc_up) &&
			    (prev_pmptr->end == prev_pmptr->alloc_down) &&
			    (pmptr->base == (prev_pmptr->end + 1))) {
				prev_pmptr->end = pmptr->end;
				prev_pmptr->alloc_down = pmptr->alloc_down;
				RESET_FRAG(pmptr);
			} else {
				pmap_memory_region_count++;
				prev_pmptr = pmptr;
				pmptr++;
			}
		}
	}

	if (memmap) {
		kprint_memmap(maddr, msize, mcount);
	}

	avail_start = first_avail;
	mem_actual = args->PhysicalMemorySize;

	/*
	 * For user visible memory size, round up to 128 Mb
	 * - accounting for the various stolen memory not reported by EFI.
	 * This is maintained for historical, comparison purposes but
	 * we now use the memory size reported by EFI/Booter.
	 */
	sane_size = (sane_size + 128 * MB - 1) & ~((uint64_t)(128 * MB - 1));
	if (sane_size != mem_actual) {
		printf("mem_actual: 0x%llx\n legacy sane_size: 0x%llx\n",
		    mem_actual, sane_size);
	}
	sane_size = mem_actual;

	/*
	 * We cap at KERNEL_MAXMEM bytes (see vm_param.h).
	 * Unless overridden by the maxmem= boot-arg
	 * -- which is a non-zero maxmem argument to this function.
	 */
	if (maxmem == 0 && sane_size > KERNEL_MAXMEM) {
		maxmem = KERNEL_MAXMEM;
		printf("Physical memory %lld bytes capped at %dGB\n",
		    sane_size, (uint32_t) (KERNEL_MAXMEM / GB));
	}

	/*
	 * if user set maxmem, reduce memory sizes
	 */
	if ((maxmem > (uint64_t)first_avail) && (maxmem < sane_size)) {
		ppnum_t discarded_pages  = (ppnum_t)((sane_size - maxmem) >> I386_PGSHIFT);
		ppnum_t highest_pn = 0;
		ppnum_t cur_end  = 0;
		uint64_t        pages_to_use;
		unsigned        cur_region = 0;

		sane_size = maxmem;

		if (avail_remaining > discarded_pages) {
			avail_remaining -= discarded_pages;
		} else {
			avail_remaining = 0;
		}

		pages_to_use = avail_remaining;

		/*
		 * Walk regions from the bottom, keeping only enough pages to
		 * satisfy the cap; the last (partial) region is trimmed and
		 * everything above it is dropped.
		 */
		while (cur_region < pmap_memory_region_count && pages_to_use) {
			for (cur_end = pmap_memory_regions[cur_region].base;
			    cur_end < pmap_memory_regions[cur_region].end && pages_to_use;
			    cur_end++) {
				if (cur_end > highest_pn) {
					highest_pn = cur_end;
				}
				pages_to_use--;
			}
			if (pages_to_use == 0) {
				pmap_memory_regions[cur_region].end = cur_end;
				pmap_memory_regions[cur_region].alloc_down = cur_end;
				RESET_FRAG(&pmap_memory_regions[cur_region]);
			}

			cur_region++;
		}
		pmap_memory_region_count = cur_region;

		avail_end = i386_ptob(highest_pn + 1);
	}

	/*
	 * mem_size is only a 32 bit container... follow the PPC route
	 * and pin it to a 2 Gbyte maximum
	 */
	if (sane_size > (FOURGIG >> 1)) {
		mem_size = (vm_size_t)(FOURGIG >> 1);
	} else {
		mem_size = (vm_size_t)sane_size;
	}
	max_mem = sane_size;
	max_mem_actual = sane_size;

	kprintf("Physical memory %llu MB\n", sane_size / MB);

	max_valid_low_ppnum = (2 * GB) / PAGE_SIZE;

	/* Determine the highest address devices can DMA to (default 4GB). */
	if (!PE_parse_boot_argn("max_valid_dma_addr", &maxdmaaddr, sizeof(maxdmaaddr))) {
		max_valid_dma_address = (uint64_t)4 * (uint64_t)GB;
	} else {
		max_valid_dma_address = ((uint64_t) maxdmaaddr) * MB;

		if ((max_valid_dma_address / PAGE_SIZE) < max_valid_low_ppnum) {
			max_valid_low_ppnum = (ppnum_t)(max_valid_dma_address / PAGE_SIZE);
		}
	}
	/*
	 * If RAM extends beyond the DMA-reachable limit, reserve a pool of
	 * low pages for device I/O; scale it with total memory (and mbuf
	 * cluster demand when SOCKETS is configured).
	 */
	if (avail_end >= max_valid_dma_address) {
		if (!PE_parse_boot_argn("maxloreserve", &maxloreserve, sizeof(maxloreserve))) {
			if (sane_size >= (ONEGIG * 15)) {
				maxloreserve = (MAXLORESERVE / PAGE_SIZE) * 4;
			} else if (sane_size >= (ONEGIG * 7)) {
				maxloreserve = (MAXLORESERVE / PAGE_SIZE) * 2;
			} else {
				maxloreserve = MAXLORESERVE / PAGE_SIZE;
			}

#if SOCKETS
			mbuf_reserve = bsd_mbuf_cluster_reserve(&mbuf_override) / PAGE_SIZE;
#endif
		} else {
			maxloreserve = (maxloreserve * (1024 * 1024)) / PAGE_SIZE;
		}

		if (maxloreserve) {
			vm_lopage_free_limit = maxloreserve;

			if (mbuf_override == TRUE) {
				vm_lopage_free_limit += mbuf_reserve;
				vm_lopage_lowater = 0;
			} else {
				vm_lopage_lowater = vm_lopage_free_limit / 16;
			}

			vm_lopage_refill = TRUE;
			vm_lopage_needed = TRUE;
		}
	}

	/*
	 *	Initialize kernel physical map.
	 *	Kernel virtual address starts at VM_KERNEL_MIN_ADDRESS.
	 */
	kprintf("avail_remaining = 0x%lx\n", (unsigned long)avail_remaining);
	pmap_bootstrap(0, IA32e);
}
840 
841 
842 unsigned int
pmap_free_pages(void)843 pmap_free_pages(void)
844 {
845 	return (unsigned int)avail_remaining;
846 }
847 
848 boolean_t pmap_next_page_reserved(ppnum_t *);
849 
850 /*
851  * Pick a page from a "kernel private" reserved range; works around
852  * errata on some hardware. EFI marks pages which can't be used for
853  * certain kinds of I/O-ish activities as reserved. We reserve them for
854  * kernel internal usage and prevent them from ever going on regular
855  * free list.
856  */
/*
 * Pick a page from a "kernel private" reserved range; works around
 * errata on some hardware. EFI marks pages which can't be used for
 * certain kinds of I/O-ish activities as reserved. We reserve them for
 * kernel internal usage and prevent them from ever going on regular
 * free list.
 *
 * On success, stores the chosen physical page number in *pn, decrements
 * avail_remaining, updates max_ppnum, and returns TRUE; returns FALSE
 * when every reserved range (and fragment) is exhausted.
 */
boolean_t
pmap_next_page_reserved(
	ppnum_t              *pn)
{
	uint32_t             n;
	pmap_memory_region_t *region;
	uint32_t             reserved_index;

	if (pmap_reserved_ranges) {
		/* Scan each registered reserved range in registration order. */
		for (n = 0; n < pmap_last_reserved_range_index; n++) {
			reserved_index = pmap_reserved_range_indices[n];
			region = &pmap_memory_regions[reserved_index];
			if (region->alloc_up <= region->alloc_down) {
				/* Main span still has pages: take the lowest. */
				*pn = region->alloc_up++;
			} else if (region->alloc_frag_up <= region->alloc_frag_down) {
				/* Main span exhausted; fall back to the large-page fragment. */
				*pn = region->alloc_frag_up++;
			} else {
				continue;
			}
			avail_remaining--;

			if (*pn > max_ppnum) {
				max_ppnum = *pn;
			}

			pmap_reserved_pages_allocated++;
#if DEBUG
			if (region->alloc_up > region->alloc_down) {
				kprintf("Exhausted reserved range index: %u, base: 0x%x end: 0x%x, type: 0x%x, attribute: 0x%llx\n", reserved_index, region->base, region->end, region->type, region->attribute);
			}
#endif
			return TRUE;
		}
	}
	return FALSE;
}
893 
894 /*
895  * Return the highest large page available. Fails once there are no more large pages.
896  */
/*
 * Return the highest large page available. Fails once there are no more large pages.
 *
 * Carves one naturally-aligned large page (LG_PPNUM_PAGES small pages)
 * from the top of the highest region that can supply one, storing its
 * first page number in *pn. If the region's top is not large-page
 * aligned, the unaligned tail above the carved page is recorded as the
 * region's single "fragment" (alloc_frag_up/alloc_frag_down) so those
 * pages can still be handed out individually later.
 */
kern_return_t
pmap_next_page_large(
	ppnum_t              *pn)
{
	int                  r;
	pmap_memory_region_t *region;
	ppnum_t              frag_start;
	ppnum_t              lgpg;

	/* Quick global check before scanning regions. */
	if (avail_remaining < LG_PPNUM_PAGES) {
		return KERN_FAILURE;
	}

	/* Work from the highest region downward. */
	for (r = pmap_memory_region_count - 1; r >= 0; r--) {
		region = &pmap_memory_regions[r];

		/*
		 * First check if there is enough memory.
		 */
		if (region->alloc_down < region->alloc_up ||
		    (region->alloc_down - region->alloc_up + 1) < LG_PPNUM_PAGES) {
			continue;
		}

		/*
		 * Find the starting large page, creating a fragment if needed.
		 */
		if ((region->alloc_down & LG_PPNUM_MASK) == LG_PPNUM_MASK) {
			/* Top already ends on a large-page boundary: take it directly. */
			lgpg = (region->alloc_down & ~LG_PPNUM_MASK);
		} else {
			/* Can only have 1 fragment per region at a time */
			if (region->alloc_frag_up <= region->alloc_frag_down) {
				continue;
			}

			/* Check for enough room below any fragment. */
			frag_start = (region->alloc_down & ~LG_PPNUM_MASK);
			if (frag_start < region->alloc_up ||
			    frag_start - region->alloc_up < LG_PPNUM_PAGES) {
				continue;
			}

			/* Carve the large page below the new fragment. */
			lgpg = frag_start - LG_PPNUM_PAGES;
			region->alloc_frag_up = frag_start;
			region->alloc_frag_down = region->alloc_down;
		}

		*pn = lgpg;
		region->alloc_down = lgpg - 1;


		avail_remaining -= LG_PPNUM_PAGES;
		if (*pn + LG_PPNUM_MASK > max_ppnum) {
			max_ppnum = *pn + LG_PPNUM_MASK;
		}

		return KERN_SUCCESS;
	}
	return KERN_FAILURE;
}
957 
958 boolean_t
pmap_next_page_hi(ppnum_t * pn,boolean_t might_free)959 pmap_next_page_hi(
960 	ppnum_t              *pn,
961 	boolean_t            might_free)
962 {
963 	pmap_memory_region_t *region;
964 	int                  n;
965 
966 	if (!might_free && pmap_next_page_reserved(pn)) {
967 		return TRUE;
968 	}
969 
970 	if (avail_remaining) {
971 		for (n = pmap_memory_region_count - 1; n >= 0; n--) {
972 			region = &pmap_memory_regions[n];
973 			if (region->alloc_frag_up <= region->alloc_frag_down) {
974 				*pn = region->alloc_frag_down--;
975 			} else if (region->alloc_down >= region->alloc_up) {
976 				*pn = region->alloc_down--;
977 			} else {
978 				continue;
979 			}
980 
981 			avail_remaining--;
982 
983 			if (*pn > max_ppnum) {
984 				max_ppnum = *pn;
985 			}
986 
987 			return TRUE;
988 		}
989 	}
990 	return FALSE;
991 }
992 
993 /*
994  * Record which high pages have been allocated so far,
995  * so that pmap_init() can mark them PMAP_NOENCRYPT, which
996  * makes hibernation faster.
997  *
998  * Because of the code in pmap_next_page_large(), we could
999  * theoretically have fragments in several regions.
1000  * In practice that just doesn't happen. The last pmap region
1001  * is normally the largest and will satisfy all pmap_next_hi/large()
1002  * allocations. Since this information is used as an optimization
1003  * and it's ok to be conservative, we'll just record the information
1004  * for the final region.
1005  */
void
pmap_hi_pages_done(void)
{
	pmap_memory_region_t *r;

	/* Only the final (normally largest) region is recorded; see above. */
	r = &pmap_memory_regions[pmap_memory_region_count - 1];
	pmap_high_used_top = r->end;
	if (r->alloc_frag_up <= r->alloc_frag_down) {
		/*
		 * A free fragment remains: pages above the fragment were
		 * allocated top-down, pages below it via large-page carving.
		 */
		pmap_high_used_bottom = r->alloc_frag_down + 1;
		pmap_middle_used_top = r->alloc_frag_up - 1;
		if (r->alloc_up <= r->alloc_down) {
			/* Free pages remain at the bottom of the region. */
			pmap_middle_used_bottom = r->alloc_down + 1;
		} else {
			/*
			 * Low span exhausted: conservatively extend the "high
			 * used" range down to the region base (covers the free
			 * fragment too, which is acceptable since this data only
			 * drives a hibernation optimization).
			 * NOTE(review): this overwrites pmap_high_used_bottom and
			 * leaves pmap_middle_used_bottom unset — confirm intended.
			 */
			pmap_high_used_bottom = r->base;
		}
	} else {
		/* No fragment: a single contiguous high range was consumed. */
		if (r->alloc_up <= r->alloc_down) {
			pmap_high_used_bottom = r->alloc_down + 1;
		} else {
			/* Whole region consumed. */
			pmap_high_used_bottom = r->base;
		}
	}
#if     DEBUG || DEVELOPMENT
	kprintf("pmap_high_used_top      0x%x\n", pmap_high_used_top);
	kprintf("pmap_high_used_bottom   0x%x\n", pmap_high_used_bottom);
	kprintf("pmap_middle_used_top    0x%x\n", pmap_middle_used_top);
	kprintf("pmap_middle_used_bottom 0x%x\n", pmap_middle_used_bottom);
#endif
}
1035 
1036 /*
1037  * Return the next available page from lowest memory for general use.
1038  */
1039 boolean_t
pmap_next_page(ppnum_t * pn)1040 pmap_next_page(
1041 	ppnum_t              *pn)
1042 {
1043 	pmap_memory_region_t *region;
1044 
1045 	if (avail_remaining) {
1046 		while (pmap_memory_region_current < pmap_memory_region_count) {
1047 			region = &pmap_memory_regions[pmap_memory_region_current];
1048 			if (region->alloc_up <= region->alloc_down) {
1049 				*pn = region->alloc_up++;
1050 			} else if (region->alloc_frag_up <= region->alloc_frag_down) {
1051 				*pn = region->alloc_frag_up++;
1052 			} else {
1053 				pmap_memory_region_current++;
1054 				continue;
1055 			}
1056 			avail_remaining--;
1057 
1058 			if (*pn > max_ppnum) {
1059 				max_ppnum = *pn;
1060 			}
1061 
1062 			return TRUE;
1063 		}
1064 	}
1065 	return FALSE;
1066 }
1067 
1068 
1069 boolean_t
pmap_valid_page(ppnum_t pn)1070 pmap_valid_page(
1071 	ppnum_t pn)
1072 {
1073 	unsigned int i;
1074 	pmap_memory_region_t *pmptr = pmap_memory_regions;
1075 
1076 	for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
1077 		if ((pn >= pmptr->base) && (pn <= pmptr->end)) {
1078 			return TRUE;
1079 		}
1080 	}
1081 	return FALSE;
1082 }
1083 
1084 /*
1085  * Returns true if the address lies in the kernel __TEXT segment range.
1086  */
1087 bool
kernel_text_contains(vm_offset_t addr)1088 kernel_text_contains(vm_offset_t addr)
1089 {
1090 	return vm_kernel_stext <= addr && addr < vm_kernel_etext;
1091 }
1092