1 /*
2 * Copyright (c) 2003-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989, 1988 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56
57
58 #include <mach/i386/vm_param.h>
59
60 #include <string.h>
61 #include <mach/vm_param.h>
62 #include <mach/vm_prot.h>
63 #include <mach/machine.h>
64 #include <mach/time_value.h>
65 #include <kern/spl.h>
66 #include <kern/assert.h>
67 #include <kern/debug.h>
68 #include <kern/misc_protos.h>
69 #include <kern/cpu_data.h>
70 #include <kern/processor.h>
71 #include <vm/vm_page.h>
72 #include <vm/pmap.h>
73 #include <vm/vm_kern.h>
74 #include <i386/pmap.h>
75 #include <i386/misc_protos.h>
76 #include <i386/cpuid.h>
77 #include <mach/thread_status.h>
78 #include <pexpert/i386/efi.h>
79 #include <pexpert/pexpert.h>
80 #include <i386/i386_lowmem.h>
81 #include <i386/misc_protos.h>
82 #include <x86_64/lowglobals.h>
83 #include <i386/pal_routines.h>
84 #include <vm/vm_page_internal.h>
85
86 #include <mach-o/loader.h>
87 #include <libkern/kernel_mach_header.h>
88
89 #define P2ROUNDUP(x, align) (-(-(x) & -(align)))
90
91 vm_size_t mem_size = 0;
92 pmap_paddr_t first_avail = 0;/* first after page tables */
93
94 uint64_t max_mem; /* Size of physical memory minus carveouts (bytes), adjusted by maxmem */
95 uint64_t max_mem_actual; /* Actual size of physical memory (bytes) adjusted by
96 * the maxmem boot-arg */
97 uint64_t mem_actual;
98 uint64_t sane_size = 0; /* Memory size for defaults calculations */
99
100 /*
101 * KASLR parameters
102 */
103 ppnum_t vm_kernel_base_page;
104 vm_offset_t vm_kernel_base;
105 vm_offset_t vm_kernel_top;
106 vm_offset_t vm_kernel_stext;
107 vm_offset_t vm_kernel_etext;
108 vm_offset_t vm_kernel_slide;
109 vm_offset_t vm_kernel_slid_base;
110 vm_offset_t vm_kernel_slid_top;
111 vm_offset_t vm_hib_base;
112 vm_offset_t vm_kext_base = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
113 vm_offset_t vm_kext_top = VM_MIN_KERNEL_ADDRESS;
114
115 vm_offset_t vm_prelink_stext;
116 vm_offset_t vm_prelink_etext;
117 vm_offset_t vm_prelink_sinfo;
118 vm_offset_t vm_prelink_einfo;
119 vm_offset_t vm_slinkedit;
120 vm_offset_t vm_elinkedit;
121
122 vm_offset_t vm_kernel_builtinkmod_text;
123 vm_offset_t vm_kernel_builtinkmod_text_end;
124
125 #define MAXLORESERVE (32 * 1024 * 1024)
126
127 ppnum_t max_ppnum = 0;
128
129 /*
130 * pmap_high_used* are the highest range of physical memory used for kernel
131 * internals (page tables, vm_pages) via pmap_steal_memory() that don't
132 * need to be encrypted in hibernation images. There can be one gap in
133 * the middle of this due to fragmentation when using a mix of small
134 * and large pages. In that case, the fragment lives between the high
135 * and middle ranges.
136 */
137 ppnum_t pmap_high_used_top = 0;
138 ppnum_t pmap_high_used_bottom = 0;
139 ppnum_t pmap_middle_used_top = 0;
140 ppnum_t pmap_middle_used_bottom = 0;
141
142 enum {PMAP_MAX_RESERVED_RANGES = 32};
143 uint32_t pmap_reserved_pages_allocated = 0;
144 uint32_t pmap_reserved_range_indices[PMAP_MAX_RESERVED_RANGES];
145 uint32_t pmap_last_reserved_range_index = 0;
146 uint32_t pmap_reserved_ranges = 0;
147
148 extern unsigned int bsd_mbuf_cluster_reserve(boolean_t *);
149
150 pmap_paddr_t avail_start, avail_end;
151 vm_offset_t virtual_avail, virtual_end;
152 static pmap_paddr_t avail_remaining;
153 vm_offset_t static_memory_end = 0;
154
155 vm_offset_t sHIB, eHIB, stext, etext, sdata, edata, end, sconst, econst;
156
157 /*
158 * _mh_execute_header is the mach_header for the currently executing kernel
159 */
160 vm_offset_t segTEXTB; unsigned long segSizeTEXT;
161 vm_offset_t segDATAB; unsigned long segSizeDATA;
162 vm_offset_t segLINKB; unsigned long segSizeLINK;
163 vm_offset_t segPRELINKTEXTB; unsigned long segSizePRELINKTEXT;
164 vm_offset_t segPRELINKINFOB; unsigned long segSizePRELINKINFO;
165 vm_offset_t segHIBB; unsigned long segSizeHIB;
166 unsigned long segSizeConst;
167
168 static kernel_segment_command_t *segTEXT, *segDATA;
169 static kernel_section_t *cursectTEXT, *lastsectTEXT;
170 static kernel_segment_command_t *segCONST;
171
172 extern uint64_t firmware_Conventional_bytes;
173 extern uint64_t firmware_RuntimeServices_bytes;
174 extern uint64_t firmware_ACPIReclaim_bytes;
175 extern uint64_t firmware_ACPINVS_bytes;
176 extern uint64_t firmware_PalCode_bytes;
177 extern uint64_t firmware_Reserved_bytes;
178 extern uint64_t firmware_Unusable_bytes;
179 extern uint64_t firmware_other_bytes;
180 uint64_t firmware_MMIO_bytes;
181
182 /*
183 * Linker magic to establish the highest address in the kernel.
184 */
185 extern void *last_kernel_symbol;
186
187 #define LG_PPNUM_PAGES (I386_LPGBYTES >> PAGE_SHIFT)
188 #define LG_PPNUM_MASK (I386_LPGMASK >> PAGE_SHIFT)
189
190 /* set so no region large page fragment pages exist */
191 #define RESET_FRAG(r) (((r)->alloc_frag_up = 1), ((r)->alloc_frag_down = 0))
192
193 boolean_t memmap = FALSE;
194 #if DEBUG || DEVELOPMENT
195 static void
kprint_memmap(vm_offset_t maddr,unsigned int msize,unsigned int mcount)196 kprint_memmap(vm_offset_t maddr, unsigned int msize, unsigned int mcount)
197 {
198 unsigned int i;
199 unsigned int j;
200 pmap_memory_region_t *p = pmap_memory_regions;
201 EfiMemoryRange *mptr;
202 addr64_t region_start, region_end;
203 addr64_t efi_start, efi_end;
204
205 for (j = 0; j < pmap_memory_region_count; j++, p++) {
206 kprintf("pmap region %d type %d base 0x%llx alloc_up 0x%llx alloc_down 0x%llx"
207 " alloc_frag_up 0x%llx alloc_frag_down 0x%llx top 0x%llx\n",
208 j, p->type,
209 (addr64_t) p->base << I386_PGSHIFT,
210 (addr64_t) p->alloc_up << I386_PGSHIFT,
211 (addr64_t) p->alloc_down << I386_PGSHIFT,
212 (addr64_t) p->alloc_frag_up << I386_PGSHIFT,
213 (addr64_t) p->alloc_frag_down << I386_PGSHIFT,
214 (addr64_t) p->end << I386_PGSHIFT);
215 region_start = (addr64_t) p->base << I386_PGSHIFT;
216 region_end = ((addr64_t) p->end << I386_PGSHIFT) - 1;
217 mptr = (EfiMemoryRange *) maddr;
218 for (i = 0;
219 i < mcount;
220 i++, mptr = (EfiMemoryRange *)(((vm_offset_t)mptr) + msize)) {
221 if (mptr->Type != kEfiLoaderCode &&
222 mptr->Type != kEfiLoaderData &&
223 mptr->Type != kEfiBootServicesCode &&
224 mptr->Type != kEfiBootServicesData &&
225 mptr->Type != kEfiConventionalMemory) {
226 efi_start = (addr64_t)mptr->PhysicalStart;
227 efi_end = efi_start + ((vm_offset_t)mptr->NumberOfPages << I386_PGSHIFT) - 1;
228 if ((efi_start >= region_start && efi_start <= region_end) ||
229 (efi_end >= region_start && efi_end <= region_end)) {
230 kprintf(" *** Overlapping region with EFI runtime region %d\n", i);
231 }
232 }
233 }
234 }
235 }
236 #define DPRINTF(x...) do { if (memmap) kprintf(x); } while (0)
237
238 #else
239
240 static void
kprint_memmap(vm_offset_t maddr,unsigned int msize,unsigned int mcount)241 kprint_memmap(vm_offset_t maddr, unsigned int msize, unsigned int mcount)
242 {
243 #pragma unused(maddr, msize, mcount)
244 }
245
246 #define DPRINTF(x...)
247 #endif /* DEBUG */
248
249 /*
250 * Basic VM initialization.
251 */
/*
 * Bootstrap the x86_64 VM layer.
 *
 * Establishes KASLR parameters from the booter-supplied slide, discovers
 * the kernel's Mach-O segment layout, walks the EFI memory map to build
 * the pmap_memory_regions[] table of usable physical memory, applies the
 * maxmem= cap, sizes the low-memory (DMA) reserve, and finally calls
 * pmap_bootstrap().
 *
 * maxmem - non-zero to cap usable physical memory (bytes), from maxmem=
 * IA32e  - passed through to pmap_bootstrap()
 * args   - boot_args handed up from the booter (memory map, kaddr, kslide)
 */
void
i386_vm_init(uint64_t maxmem,
    boolean_t IA32e,
    boot_args *args)
{
	pmap_memory_region_t *pmptr;
	pmap_memory_region_t *prev_pmptr;
	EfiMemoryRange *mptr;
	unsigned int mcount;
	unsigned int msize;
	vm_offset_t maddr;
	ppnum_t fap;
	unsigned int i;
	ppnum_t maxpg = 0;
	uint32_t pmap_type;
	uint32_t maxloreserve;
	uint32_t maxdmaaddr;
	uint32_t mbuf_reserve = 0;
	boolean_t mbuf_override = FALSE;
	boolean_t coalescing_permitted;
	vm_kernel_base_page = i386_btop(args->kaddr);
	vm_offset_t base_address;
	vm_offset_t static_base_address;

	/* "memmap" boot-arg enables the verbose region dump near the end. */
	PE_parse_boot_argn("memmap", &memmap, sizeof(memmap));

	/*
	 * Establish the KASLR parameters.
	 */
	static_base_address = ml_static_ptovirt(KERNEL_BASE_OFFSET);
	base_address = ml_static_ptovirt(args->kaddr);
	vm_kernel_slide = base_address - static_base_address;
	if (args->kslide) {
		kprintf("KASLR slide: 0x%016lx dynamic\n", vm_kernel_slide);
		/* Cross-check the computed slide against what the booter reported. */
		if (vm_kernel_slide != ((vm_offset_t)args->kslide)) {
			panic("Kernel base inconsistent with slide - rebased?");
		}
	} else {
		/* No slide relative to on-disk symbols */
		kprintf("KASLR slide: 0x%016lx static and ignored\n",
		    vm_kernel_slide);
		vm_kernel_slide = 0;
	}

	/*
	 * Zero out local relocations to avoid confusing kxld.
	 * TODO: might be better to move this code to OSKext::initialize
	 */
	if (_mh_execute_header.flags & MH_PIE) {
		struct load_command *loadcmd;
		uint32_t cmd;

		loadcmd = (struct load_command *)((uintptr_t)&_mh_execute_header +
		    sizeof(_mh_execute_header));

		for (cmd = 0; cmd < _mh_execute_header.ncmds; cmd++) {
			if (loadcmd->cmd == LC_DYSYMTAB) {
				struct dysymtab_command *dysymtab;

				dysymtab = (struct dysymtab_command *)loadcmd;
				dysymtab->nlocrel = 0;
				dysymtab->locreloff = 0;
				kprintf("Hiding local relocations\n");
				break;
			}
			loadcmd = (struct load_command *)((uintptr_t)loadcmd + loadcmd->cmdsize);
		}
	}

	/*
	 * Now retrieve addresses for end, edata, and etext
	 * from MACH-O headers.
	 */
	segTEXTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
	    "__TEXT", &segSizeTEXT);
	segDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
	    "__DATA", &segSizeDATA);
	segLINKB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
	    "__LINKEDIT", &segSizeLINK);
	segHIBB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
	    "__HIB", &segSizeHIB);
	segPRELINKTEXTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
	    "__PRELINK_TEXT", &segSizePRELINKTEXT);
	segPRELINKINFOB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header,
	    "__PRELINK_INFO", &segSizePRELINKINFO);
	segTEXT = getsegbynamefromheader(&_mh_execute_header,
	    "__TEXT");
	segDATA = getsegbynamefromheader(&_mh_execute_header,
	    "__DATA");
	segCONST = getsegbynamefromheader(&_mh_execute_header,
	    "__DATA_CONST");
	cursectTEXT = lastsectTEXT = firstsect(segTEXT);
	/* Discover the last TEXT section within the TEXT segment */
	while ((cursectTEXT = nextsect(segTEXT, cursectTEXT)) != NULL) {
		lastsectTEXT = cursectTEXT;
	}

	sHIB = segHIBB;
	eHIB = segHIBB + segSizeHIB;
	vm_hib_base = sHIB;
	/* Zero-padded from ehib to stext if text is 2M-aligned */
	stext = segTEXTB;
	lowGlo.lgStext = stext;
	etext = (vm_offset_t) round_page_64(lastsectTEXT->addr + lastsectTEXT->size);
	/* Zero-padded from etext to sdata if text is 2M-aligned */
	sdata = segDATAB;
	edata = segDATAB + segSizeDATA;

	sconst = segCONST->vmaddr;
	segSizeConst = segCONST->vmsize;
	econst = sconst + segSizeConst;

	kc_format_t kc_format = KCFormatUnknown;

	/* XXX: FIXME_IN_dyld: For new-style kernel caches, the ending address of __DATA_CONST may not be page-aligned */
	if (PE_get_primary_kc_format(&kc_format) && kc_format == KCFormatFileset) {
		/* Round up the end */
		econst = P2ROUNDUP(econst, PAGE_SIZE);
		edata = P2ROUNDUP(edata, PAGE_SIZE);
	} else {
		/* Legacy kernel caches must already be page-aligned. */
		assert(((sconst | econst) & PAGE_MASK) == 0);
		assert(((sdata | edata) & PAGE_MASK) == 0);
	}

	DPRINTF("segTEXTB    = %p\n", (void *) segTEXTB);
	DPRINTF("segDATAB    = %p\n", (void *) segDATAB);
	DPRINTF("segLINKB    = %p\n", (void *) segLINKB);
	DPRINTF("segHIBB     = %p\n", (void *) segHIBB);
	DPRINTF("segPRELINKTEXTB = %p\n", (void *) segPRELINKTEXTB);
	DPRINTF("segPRELINKINFOB = %p\n", (void *) segPRELINKINFOB);
	DPRINTF("sHIB        = %p\n", (void *) sHIB);
	DPRINTF("eHIB        = %p\n", (void *) eHIB);
	DPRINTF("stext       = %p\n", (void *) stext);
	DPRINTF("etext       = %p\n", (void *) etext);
	DPRINTF("sdata       = %p\n", (void *) sdata);
	DPRINTF("edata       = %p\n", (void *) edata);
	DPRINTF("sconst      = %p\n", (void *) sconst);
	DPRINTF("econst      = %p\n", (void *) econst);
	DPRINTF("kernel_top  = %p\n", (void *) &last_kernel_symbol);

	/* Publish the slid kernel layout for VM_KERNEL_* macros and logging. */
	vm_kernel_base = sHIB;
	vm_kernel_top = (vm_offset_t) &last_kernel_symbol;
	vm_kernel_stext = stext;
	vm_kernel_etext = etext;
	vm_prelink_stext = segPRELINKTEXTB;
	vm_prelink_etext = segPRELINKTEXTB + segSizePRELINKTEXT;
	vm_prelink_sinfo = segPRELINKINFOB;
	vm_prelink_einfo = segPRELINKINFOB + segSizePRELINKINFO;
	vm_slinkedit = segLINKB;
	vm_elinkedit = segLINKB + segSizeLINK;

	/*
	 * In the fileset world, we want to be able to (un)slide addresses from
	 * the kernel or any of the kexts (e.g., for kernel logging metadata
	 * passed between the kernel and logd in userspace). VM_KERNEL_UNSLIDE
	 * (via VM_KERNEL_IS_SLID) should apply to the addresses in the range
	 * from the first basement address to the last boot kc address.
	 *
	 *                     ^
	 *                     :
	 *                     |
	 *  vm_kernel_slid_top - ---------------------------------------------
	 *                     |
	 *                     :
	 *                     : Boot kc (kexts in the boot kc here)
	 *                     : - - - - - - - - - - - - - - - - - - - - - - -
	 *                     :
	 *                     :
	 *                     | Boot kc (kernel here)
	 *                     - ---------------------------------------------
	 *                     |
	 *                     :
	 *                     | Basement (kexts in pageable and aux kcs here)
	 * vm_kernel_slid_base - ---------------------------------------------
	 *                     0
	 */

	vm_kernel_slid_base = vm_kext_base + vm_kernel_slide;
	vm_kernel_slid_top = (kc_format == KCFormatFileset) ?
	    vm_slinkedit : vm_prelink_einfo;

	vm_page_kernelcache_count = (unsigned int) (atop_64(vm_kernel_top - vm_kernel_base));

	vm_set_page_size();

	/*
	 * Compute the memory size.
	 */

	avail_remaining = 0;
	avail_end = 0;
	pmptr = pmap_memory_regions;
	prev_pmptr = 0;
	pmap_memory_region_count = pmap_memory_region_current = 0;
	fap = (ppnum_t) i386_btop(first_avail);

	maddr = ml_static_ptovirt((vm_offset_t)args->MemoryMap);
	mptr = (EfiMemoryRange *)maddr;
	if (args->MemoryMapDescriptorSize == 0) {
		panic("Invalid memory map descriptor size");
	}
	msize = args->MemoryMapDescriptorSize;
	mcount = args->MemoryMapSize / msize;

#define FOURGIG 0x0000000100000000ULL
#define ONEGIG  0x0000000040000000ULL

	/*
	 * Walk the EFI memory map, translating each usable descriptor into a
	 * pmap_memory_region_t and accounting the rest by firmware type.
	 */
	for (i = 0; i < mcount; i++, mptr = (EfiMemoryRange *)(((vm_offset_t)mptr) + msize)) {
		ppnum_t base, top;
		uint64_t region_bytes = 0;

		if (pmap_memory_region_count >= PMAP_MEMORY_REGIONS_SIZE) {
			kprintf("WARNING: truncating memory region count at %d\n", pmap_memory_region_count);
			break;
		}
		base = (ppnum_t) (mptr->PhysicalStart >> I386_PGSHIFT);
		top = (ppnum_t) (((mptr->PhysicalStart) >> I386_PGSHIFT) + mptr->NumberOfPages - 1);

		if (base == 0) {
			/*
			 * Avoid having to deal with the edge case of the
			 * very first possible physical page and the roll-over
			 * to -1; just ignore that page.
			 */
			kprintf("WARNING: ignoring first page in [0x%llx:0x%llx]\n", (uint64_t) base, (uint64_t) top);
			base++;
		}
		if (top + 1 == 0) {
			/*
			 * Avoid having to deal with the edge case of the
			 * very last possible physical page and the roll-over
			 * to 0; just ignore that page.
			 */
			kprintf("WARNING: ignoring last page in [0x%llx:0x%llx]\n", (uint64_t) base, (uint64_t) top);
			top--;
		}
		if (top < base) {
			/*
			 * That was the only page in that region, so
			 * ignore the whole region.
			 */
			continue;
		}

#if MR_RSV_TEST
		/* Test hook: artificially mark a few ranges kernel-reserved. */
		static uint32_t nmr = 0;
		if ((base > 0x20000) && (nmr++ < 4)) {
			mptr->Attribute |= EFI_MEMORY_KERN_RESERVED;
		}
#endif
		region_bytes = (uint64_t)(mptr->NumberOfPages << I386_PGSHIFT);
		pmap_type = mptr->Type;

		switch (mptr->Type) {
		case kEfiLoaderCode:
		case kEfiLoaderData:
		case kEfiBootServicesCode:
		case kEfiBootServicesData:
		case kEfiConventionalMemory:
			/*
			 * Consolidate usable memory types into one.
			 */
			pmap_type = kEfiConventionalMemory;
			sane_size += region_bytes;
			firmware_Conventional_bytes += region_bytes;
			break;
		/*
		 * sane_size should reflect the total amount of physical
		 * RAM in the system, not just the amount that is
		 * available for the OS to use.
		 * We now get this value from SMBIOS tables
		 * rather than reverse engineering the memory map.
		 * But the legacy computation of "sane_size" is kept
		 * for diagnostic information.
		 */

		case kEfiRuntimeServicesCode:
		case kEfiRuntimeServicesData:
			firmware_RuntimeServices_bytes += region_bytes;
			sane_size += region_bytes;
			break;
		case kEfiACPIReclaimMemory:
			firmware_ACPIReclaim_bytes += region_bytes;
			sane_size += region_bytes;
			break;
		case kEfiACPIMemoryNVS:
			firmware_ACPINVS_bytes += region_bytes;
			sane_size += region_bytes;
			break;
		case kEfiPalCode:
			firmware_PalCode_bytes += region_bytes;
			sane_size += region_bytes;
			break;

		case kEfiReservedMemoryType:
			firmware_Reserved_bytes += region_bytes;
			break;
		case kEfiUnusableMemory:
			firmware_Unusable_bytes += region_bytes;
			break;
		case kEfiMemoryMappedIO:
		case kEfiMemoryMappedIOPortSpace:
			firmware_MMIO_bytes += region_bytes;
			break;
		default:
			firmware_other_bytes += region_bytes;
			break;
		}

		DPRINTF("EFI region %d: type %u/%d, base 0x%x, top 0x%x %s\n",
		    i, mptr->Type, pmap_type, base, top,
		    (mptr->Attribute & EFI_MEMORY_KERN_RESERVED)? "RESERVED" :
		    (mptr->Attribute & EFI_MEMORY_RUNTIME)? "RUNTIME" : "");

		if (maxpg) {
			if (base >= maxpg) {
				break;
			}
			top = (top > maxpg) ? maxpg : top;
		}

		/*
		 * handle each region
		 */
		if ((mptr->Attribute & EFI_MEMORY_RUNTIME) == EFI_MEMORY_RUNTIME ||
		    pmap_type != kEfiConventionalMemory) {
			prev_pmptr = 0;
			continue;
		} else {
			/*
			 * Usable memory region
			 */
			if (top < I386_LOWMEM_RESERVED ||
			    !pal_is_usable_memory(base, top)) {
				prev_pmptr = 0;
				continue;
			}
			/*
			 * A range may be marked with the
			 * EFI_MEMORY_KERN_RESERVED attribute
			 * on some systems, to indicate that the range
			 * must not be made available to devices.
			 */

			if (mptr->Attribute & EFI_MEMORY_KERN_RESERVED) {
				if (++pmap_reserved_ranges > PMAP_MAX_RESERVED_RANGES) {
					panic("Too many reserved ranges %u", pmap_reserved_ranges);
				}
			}

			if (top < fap) {
				/*
				 * entire range below first_avail
				 * salvage some low memory pages
				 * we use some very low memory at startup
				 * mark as already allocated here
				 */
				if (base >= I386_LOWMEM_RESERVED) {
					pmptr->base = base;
				} else {
					pmptr->base = I386_LOWMEM_RESERVED;
				}

				pmptr->end = top;


				if ((mptr->Attribute & EFI_MEMORY_KERN_RESERVED) &&
				    (top < vm_kernel_base_page)) {
					pmptr->alloc_up = pmptr->base;
					pmptr->alloc_down = pmptr->end;
					RESET_FRAG(pmptr);
					pmap_reserved_range_indices[pmap_last_reserved_range_index++] = pmap_memory_region_count;
				} else {
					/*
					 * mark as already mapped
					 */
					pmptr->alloc_up = top + 1;
					pmptr->alloc_down = top;
					RESET_FRAG(pmptr);
				}
				pmptr->type = pmap_type;
				pmptr->attribute = mptr->Attribute;
			} else if ((base < fap) && (top > fap)) {
				/*
				 * spans first_avail
				 * put mem below first avail in table but
				 * mark already allocated
				 */
				pmptr->base = base;
				pmptr->end = (fap - 1);
				pmptr->alloc_up = pmptr->end + 1;
				pmptr->alloc_down = pmptr->end;
				RESET_FRAG(pmptr);
				pmptr->type = pmap_type;
				pmptr->attribute = mptr->Attribute;
				/*
				 * we bump these here inline so the accounting
				 * below works correctly
				 */
				pmptr++;
				pmap_memory_region_count++;

				pmptr->alloc_up = pmptr->base = fap;
				pmptr->type = pmap_type;
				pmptr->attribute = mptr->Attribute;
				pmptr->alloc_down = pmptr->end = top;
				RESET_FRAG(pmptr);

				if (mptr->Attribute & EFI_MEMORY_KERN_RESERVED) {
					pmap_reserved_range_indices[pmap_last_reserved_range_index++] = pmap_memory_region_count;
				}
			} else {
				/*
				 * entire range usable
				 */
				pmptr->alloc_up = pmptr->base = base;
				pmptr->type = pmap_type;
				pmptr->attribute = mptr->Attribute;
				pmptr->alloc_down = pmptr->end = top;
				RESET_FRAG(pmptr);
				if (mptr->Attribute & EFI_MEMORY_KERN_RESERVED) {
					pmap_reserved_range_indices[pmap_last_reserved_range_index++] = pmap_memory_region_count;
				}
			}

			if (i386_ptob(pmptr->end) > avail_end) {
				avail_end = i386_ptob(pmptr->end);
			}

			avail_remaining += (pmptr->end - pmptr->base);
			coalescing_permitted = (prev_pmptr && (pmptr->attribute == prev_pmptr->attribute) && ((pmptr->attribute & EFI_MEMORY_KERN_RESERVED) == 0));
			/*
			 * Consolidate contiguous memory regions, if possible
			 */
			if (prev_pmptr &&
			    (pmptr->type == prev_pmptr->type) &&
			    (coalescing_permitted) &&
			    (pmptr->base == pmptr->alloc_up) &&
			    (prev_pmptr->end == prev_pmptr->alloc_down) &&
			    (pmptr->base == (prev_pmptr->end + 1))) {
				prev_pmptr->end = pmptr->end;
				prev_pmptr->alloc_down = pmptr->alloc_down;
				RESET_FRAG(pmptr);
			} else {
				pmap_memory_region_count++;
				prev_pmptr = pmptr;
				pmptr++;
			}
		}
	}

	if (memmap) {
		kprint_memmap(maddr, msize, mcount);
	}

	avail_start = first_avail;
	mem_actual = args->PhysicalMemorySize;

	/*
	 * For user visible memory size, round up to 128 Mb
	 * - accounting for the various stolen memory not reported by EFI.
	 * This is maintained for historical, comparison purposes but
	 * we now use the memory size reported by EFI/Booter.
	 */
	sane_size = (sane_size + 128 * MB - 1) & ~((uint64_t)(128 * MB - 1));
	if (sane_size != mem_actual) {
		printf("mem_actual: 0x%llx\n legacy sane_size: 0x%llx\n",
		    mem_actual, sane_size);
	}
	sane_size = mem_actual;

	/*
	 * We cap at KERNEL_MAXMEM bytes (see vm_param.h).
	 * Unless overridden by the maxmem= boot-arg
	 * -- which is a non-zero maxmem argument to this function.
	 */
	if (maxmem == 0 && sane_size > KERNEL_MAXMEM) {
		maxmem = KERNEL_MAXMEM;
		printf("Physical memory %lld bytes capped at %dGB\n",
		    sane_size, (uint32_t) (KERNEL_MAXMEM / GB));
	}

	/*
	 * if user set maxmem, reduce memory sizes
	 */
	if ((maxmem > (uint64_t)first_avail) && (maxmem < sane_size)) {
		ppnum_t discarded_pages  = (ppnum_t)((sane_size - maxmem) >> I386_PGSHIFT);
		ppnum_t highest_pn = 0;
		ppnum_t cur_end  = 0;
		uint64_t pages_to_use;
		unsigned cur_region = 0;

		sane_size = maxmem;

		if (avail_remaining > discarded_pages) {
			avail_remaining -= discarded_pages;
		} else {
			avail_remaining = 0;
		}

		pages_to_use = avail_remaining;

		/* Keep only the lowest pages_to_use pages; trim the region table. */
		while (cur_region < pmap_memory_region_count && pages_to_use) {
			for (cur_end = pmap_memory_regions[cur_region].base;
			    cur_end < pmap_memory_regions[cur_region].end && pages_to_use;
			    cur_end++) {
				if (cur_end > highest_pn) {
					highest_pn = cur_end;
				}
				pages_to_use--;
			}
			if (pages_to_use == 0) {
				pmap_memory_regions[cur_region].end = cur_end;
				pmap_memory_regions[cur_region].alloc_down = cur_end;
				RESET_FRAG(&pmap_memory_regions[cur_region]);
			}

			cur_region++;
		}
		pmap_memory_region_count = cur_region;

		avail_end = i386_ptob(highest_pn + 1);
	}

	/*
	 * mem_size is only a 32 bit container... follow the PPC route
	 * and pin it to a 2 Gbyte maximum
	 */
	if (sane_size > (FOURGIG >> 1)) {
		mem_size = (vm_size_t)(FOURGIG >> 1);
	} else {
		mem_size = (vm_size_t)sane_size;
	}
	max_mem = sane_size;
	max_mem_actual = sane_size;

	kprintf("Physical memory %llu MB\n", sane_size / MB);

	max_valid_low_ppnum = (2 * GB) / PAGE_SIZE;

	/* max_valid_dma_addr= boot-arg (in MB) lowers the 32-bit DMA ceiling. */
	if (!PE_parse_boot_argn("max_valid_dma_addr", &maxdmaaddr, sizeof(maxdmaaddr))) {
		max_valid_dma_address = (uint64_t)4 * (uint64_t)GB;
	} else {
		max_valid_dma_address = ((uint64_t) maxdmaaddr) * MB;

		if ((max_valid_dma_address / PAGE_SIZE) < max_valid_low_ppnum) {
			max_valid_low_ppnum = (ppnum_t)(max_valid_dma_address / PAGE_SIZE);
		}
	}
	/*
	 * If memory extends above the DMA ceiling, size the low-page reserve
	 * so 32-bit DMA allocations can still be satisfied later.
	 */
	if (avail_end >= max_valid_dma_address) {
		if (!PE_parse_boot_argn("maxloreserve", &maxloreserve, sizeof(maxloreserve))) {
			if (sane_size >= (ONEGIG * 15)) {
				maxloreserve = (MAXLORESERVE / PAGE_SIZE) * 4;
			} else if (sane_size >= (ONEGIG * 7)) {
				maxloreserve = (MAXLORESERVE / PAGE_SIZE) * 2;
			} else {
				maxloreserve = MAXLORESERVE / PAGE_SIZE;
			}

#if SOCKETS
			mbuf_reserve = bsd_mbuf_cluster_reserve(&mbuf_override) / PAGE_SIZE;
#endif
		} else {
			maxloreserve = (maxloreserve * (1024 * 1024)) / PAGE_SIZE;
		}

		if (maxloreserve) {
			vm_lopage_free_limit = maxloreserve;

			if (mbuf_override == TRUE) {
				vm_lopage_free_limit += mbuf_reserve;
				vm_lopage_lowater = 0;
			} else {
				vm_lopage_lowater = vm_lopage_free_limit / 16;
			}

			vm_lopage_refill = TRUE;
			vm_lopage_needed = TRUE;
		}
	}

	/*
	 * Initialize kernel physical map.
	 * Kernel virtual address starts at VM_KERNEL_MIN_ADDRESS.
	 */
	kprintf("avail_remaining = 0x%lx\n", (unsigned long)avail_remaining);
	pmap_bootstrap(0, IA32e);
}
840
841
842 unsigned int
pmap_free_pages(void)843 pmap_free_pages(void)
844 {
845 return (unsigned int)avail_remaining;
846 }
847
848 boolean_t pmap_next_page_reserved(ppnum_t *);
849
850 /*
851 * Pick a page from a "kernel private" reserved range; works around
852 * errata on some hardware. EFI marks pages which can't be used for
853 * certain kinds of I/O-ish activities as reserved. We reserve them for
854 * kernel internal usage and prevent them from ever going on regular
855 * free list.
856 */
857 boolean_t
pmap_next_page_reserved(ppnum_t * pn)858 pmap_next_page_reserved(
859 ppnum_t *pn)
860 {
861 uint32_t n;
862 pmap_memory_region_t *region;
863 uint32_t reserved_index;
864
865 if (pmap_reserved_ranges) {
866 for (n = 0; n < pmap_last_reserved_range_index; n++) {
867 reserved_index = pmap_reserved_range_indices[n];
868 region = &pmap_memory_regions[reserved_index];
869 if (region->alloc_up <= region->alloc_down) {
870 *pn = region->alloc_up++;
871 } else if (region->alloc_frag_up <= region->alloc_frag_down) {
872 *pn = region->alloc_frag_up++;
873 } else {
874 continue;
875 }
876 avail_remaining--;
877
878 if (*pn > max_ppnum) {
879 max_ppnum = *pn;
880 }
881
882 pmap_reserved_pages_allocated++;
883 #if DEBUG
884 if (region->alloc_up > region->alloc_down) {
885 kprintf("Exhausted reserved range index: %u, base: 0x%x end: 0x%x, type: 0x%x, attribute: 0x%llx\n", reserved_index, region->base, region->end, region->type, region->attribute);
886 }
887 #endif
888 return TRUE;
889 }
890 }
891 return FALSE;
892 }
893
894 /*
895 * Return the highest large page available. Fails once there are no more large pages.
896 */
/*
 * Allocate the highest-addressed available large (2MB) page, scanning
 * regions from last (highest) to first. *pn receives the first 4K page
 * number of the large page. If the top of a region's free span is not
 * large-page aligned, the leftover 4K pages above the chosen large page
 * are recorded as the region's (single) fragment so pmap_next_page*()
 * can still hand them out. Returns KERN_FAILURE once no whole large
 * page remains.
 */
kern_return_t
pmap_next_page_large(
	ppnum_t *pn)
{
	int r;
	pmap_memory_region_t *region;
	ppnum_t frag_start;
	ppnum_t lgpg;

	if (avail_remaining < LG_PPNUM_PAGES) {
		return KERN_FAILURE;
	}

	/* Scan from the highest region downward. */
	for (r = pmap_memory_region_count - 1; r >= 0; r--) {
		region = &pmap_memory_regions[r];

		/*
		 * First check if there is enough memory.
		 */
		if (region->alloc_down < region->alloc_up ||
		    (region->alloc_down - region->alloc_up + 1) < LG_PPNUM_PAGES) {
			continue;
		}

		/*
		 * Find the starting large page, creating a fragment if needed.
		 */
		if ((region->alloc_down & LG_PPNUM_MASK) == LG_PPNUM_MASK) {
			/* Top of the free span ends exactly on a large-page boundary. */
			lgpg = (region->alloc_down & ~LG_PPNUM_MASK);
		} else {
			/* Can only have 1 fragment per region at a time */
			if (region->alloc_frag_up <= region->alloc_frag_down) {
				continue;
			}

			/* Check for enough room below any fragment. */
			frag_start = (region->alloc_down & ~LG_PPNUM_MASK);
			if (frag_start < region->alloc_up ||
			    frag_start - region->alloc_up < LG_PPNUM_PAGES) {
				continue;
			}

			/* Take the large page below the boundary; pages from
			 * frag_start..alloc_down become the region's fragment. */
			lgpg = frag_start - LG_PPNUM_PAGES;
			region->alloc_frag_up = frag_start;
			region->alloc_frag_down = region->alloc_down;
		}

		*pn = lgpg;
		region->alloc_down = lgpg - 1;


		avail_remaining -= LG_PPNUM_PAGES;
		if (*pn + LG_PPNUM_MASK > max_ppnum) {
			max_ppnum = *pn + LG_PPNUM_MASK;
		}

		return KERN_SUCCESS;
	}
	return KERN_FAILURE;
}
957
958 boolean_t
pmap_next_page_hi(ppnum_t * pn,boolean_t might_free)959 pmap_next_page_hi(
960 ppnum_t *pn,
961 boolean_t might_free)
962 {
963 pmap_memory_region_t *region;
964 int n;
965
966 if (!might_free && pmap_next_page_reserved(pn)) {
967 return TRUE;
968 }
969
970 if (avail_remaining) {
971 for (n = pmap_memory_region_count - 1; n >= 0; n--) {
972 region = &pmap_memory_regions[n];
973 if (region->alloc_frag_up <= region->alloc_frag_down) {
974 *pn = region->alloc_frag_down--;
975 } else if (region->alloc_down >= region->alloc_up) {
976 *pn = region->alloc_down--;
977 } else {
978 continue;
979 }
980
981 avail_remaining--;
982
983 if (*pn > max_ppnum) {
984 max_ppnum = *pn;
985 }
986
987 return TRUE;
988 }
989 }
990 return FALSE;
991 }
992
993 /*
994 * Record which high pages have been allocated so far,
995 * so that pmap_init() can mark them PMAP_NOENCRYPT, which
996 * makes hibernation faster.
997 *
998 * Because of the code in pmap_next_page_large(), we could
999 * theoretically have fragments in several regions.
1000 * In practice that just doesn't happen. The last pmap region
1001 * is normally the largest and will satisfy all pmap_next_hi/large()
1002 * allocations. Since this information is used as an optimization
1003 * and it's ok to be conservative, we'll just record the information
1004 * for the final region.
1005 */
/*
 * Record the span(s) of high memory consumed by pmap_next_page_hi() /
 * pmap_next_page_large() so pmap_init() can mark them PMAP_NOENCRYPT
 * (see the block comment above). Only the final (normally largest)
 * region is inspected. When a large-page fragment exists, the used
 * memory forms two ranges: a "high" range above the fragment and a
 * "middle" range between the fragment and the unallocated span.
 */
void
pmap_hi_pages_done(void)
{
	pmap_memory_region_t *r;

	r = &pmap_memory_regions[pmap_memory_region_count - 1];
	pmap_high_used_top = r->end;
	if (r->alloc_frag_up <= r->alloc_frag_down) {
		/* A fragment splits the used memory into high and middle ranges. */
		pmap_high_used_bottom = r->alloc_frag_down + 1;
		pmap_middle_used_top = r->alloc_frag_up - 1;
		if (r->alloc_up <= r->alloc_down) {
			pmap_middle_used_bottom = r->alloc_down + 1;
		} else {
			/*
			 * Region fully consumed below the fragment: treat the
			 * whole region as the high range.
			 * NOTE(review): this overwrites pmap_high_used_bottom set
			 * above and leaves pmap_middle_used_bottom at 0 — looks
			 * conservative (over-covers) rather than exact; confirm
			 * intent before changing.
			 */
			pmap_high_used_bottom = r->base;
		}
	} else {
		/* No fragment: a single used range at the top of the region. */
		if (r->alloc_up <= r->alloc_down) {
			pmap_high_used_bottom = r->alloc_down + 1;
		} else {
			pmap_high_used_bottom = r->base;
		}
	}
#if DEBUG || DEVELOPMENT
	kprintf("pmap_high_used_top      0x%x\n", pmap_high_used_top);
	kprintf("pmap_high_used_bottom   0x%x\n", pmap_high_used_bottom);
	kprintf("pmap_middle_used_top    0x%x\n", pmap_middle_used_top);
	kprintf("pmap_middle_used_bottom 0x%x\n", pmap_middle_used_bottom);
#endif
}
1035
1036 /*
1037 * Return the next available page from lowest memory for general use.
1038 */
1039 boolean_t
pmap_next_page(ppnum_t * pn)1040 pmap_next_page(
1041 ppnum_t *pn)
1042 {
1043 pmap_memory_region_t *region;
1044
1045 if (avail_remaining) {
1046 while (pmap_memory_region_current < pmap_memory_region_count) {
1047 region = &pmap_memory_regions[pmap_memory_region_current];
1048 if (region->alloc_up <= region->alloc_down) {
1049 *pn = region->alloc_up++;
1050 } else if (region->alloc_frag_up <= region->alloc_frag_down) {
1051 *pn = region->alloc_frag_up++;
1052 } else {
1053 pmap_memory_region_current++;
1054 continue;
1055 }
1056 avail_remaining--;
1057
1058 if (*pn > max_ppnum) {
1059 max_ppnum = *pn;
1060 }
1061
1062 return TRUE;
1063 }
1064 }
1065 return FALSE;
1066 }
1067
1068
1069 boolean_t
pmap_valid_page(ppnum_t pn)1070 pmap_valid_page(
1071 ppnum_t pn)
1072 {
1073 unsigned int i;
1074 pmap_memory_region_t *pmptr = pmap_memory_regions;
1075
1076 for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
1077 if ((pn >= pmptr->base) && (pn <= pmptr->end)) {
1078 return TRUE;
1079 }
1080 }
1081 return FALSE;
1082 }
1083
1084 /*
1085 * Returns true if the address lies in the kernel __TEXT segment range.
1086 */
1087 bool
kernel_text_contains(vm_offset_t addr)1088 kernel_text_contains(vm_offset_t addr)
1089 {
1090 return vm_kernel_stext <= addr && addr < vm_kernel_etext;
1091 }
1092