/*
 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */


#ifndef _I386_PMAP_INTERNAL_
#define _I386_PMAP_INTERNAL_
#ifdef MACH_KERNEL_PRIVATE

#include <vm/pmap.h>
#include <sys/kdebug.h>
#include <kern/ledger.h>
#include <kern/simple_lock.h>
#include <i386/bit_routines.h>

/*
 * pmap locking
 */

static inline void
PMAP_LOCK_EXCLUSIVE(pmap_t p)
{
	mp_disable_preemption();
	lck_rw_lock_exclusive(&p->pmap_rwl);
}

static inline void
PMAP_LOCK_SHARED(pmap_t p)
{
	mp_disable_preemption();
	lck_rw_lock_shared(&p->pmap_rwl);
}

static inline void
PMAP_LOCK_SHARED_TO_EXCLUSIVE(pmap_t p)
{
	lck_rw_lock_shared_to_exclusive(&p->pmap_rwl);
}

static inline void
PMAP_LOCK_EXCLUSIVE_TO_SHARED(pmap_t p)
{
	lck_rw_lock_exclusive_to_shared(&p->pmap_rwl);
}

static inline void
PMAP_UNLOCK_EXCLUSIVE(pmap_t p)
{
	lck_rw_unlock_exclusive(&p->pmap_rwl);
	mp_enable_preemption();
}

static inline void
PMAP_UNLOCK_SHARED(pmap_t p)
{
	lck_rw_unlock_shared(&p->pmap_rwl);
	mp_enable_preemption();
}
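
/*
 * Illustrative usage sketch (not an actual pmap routine): take the lock
 * shared on a read-mostly path and upgrade only when a modification
 * turns out to be necessary. Note that lck_rw_lock_shared_to_exclusive()
 * can fail (dropping the lock); the void wrapper above does not surface
 * that, so this sketch assumes the upgrade succeeds.
 *
 *	PMAP_LOCK_SHARED(p);
 *	if (must_modify) {
 *		PMAP_LOCK_SHARED_TO_EXCLUSIVE(p);
 *		... mutate the pmap ...
 *		PMAP_UNLOCK_EXCLUSIVE(p);
 *	} else {
 *		PMAP_UNLOCK_SHARED(p);
 *	}
 */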

#define iswired(pte)    ((pte) & INTEL_PTE_WIRED)

#ifdef  PMAP_TRACES
extern  boolean_t       pmap_trace;
#define PMAP_TRACE(...) \
	do { \
		if (pmap_trace) { \
			KDBG_RELEASE(__VA_ARGS__); \
		} \
	} while (0)
#else
#define PMAP_TRACE(...) KDBG_DEBUG(__VA_ARGS__)
#endif /* PMAP_TRACES */

#define PMAP_TRACE_CONSTANT(...) KDBG_RELEASE(__VA_ARGS__)

kern_return_t   pmap_expand_pml4(
	pmap_t          map,
	vm_map_offset_t v,
	unsigned int options);

kern_return_t   pmap_expand_pdpt(
	pmap_t          map,
	vm_map_offset_t v,
	unsigned int options);

void            phys_attribute_set(
	ppnum_t         phys,
	int             bits);

void            pmap_set_reference(
	ppnum_t pn);

boolean_t       phys_page_exists(
	ppnum_t pn);

void            pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t, int, pmap_flush_context *);

void            pmap_update_cache_attributes_locked(ppnum_t, unsigned);


static inline void
PMAP_UPDATE_TLBS(pmap_t fp, addr64_t s, addr64_t e)
{
	pmap_flush_tlbs(fp, s, e, 0, NULL);
}

#define PMAP_DELAY_TLB_FLUSH            0x01

static inline void
PMAP_UPDATE_TLBS_DELAYED(pmap_t fp, addr64_t s, addr64_t e, pmap_flush_context *pfc)
{
	pmap_flush_tlbs(fp, s, e, PMAP_DELAY_TLB_FLUSH, pfc);
}

/*
 *	Private data structures.
 */

/*
 *	For each vm_page_t, there is a list of all currently
 *	valid virtual mappings of that page.  An entry is
 *	a pv_rooted_entry_t; the list is the pv_table.
 *
 *	N.B. With the new combo rooted/hashed scheme it is
 *	only possible to remove individual non-rooted entries
 *	if they are found via the hashed chains, as there is no
 *	way to unlink the singly linked hashed entries when navigated to
 *	via the queue list off the rooted entries.  Think of it as
 *	hash/walk/pull, keeping track of the prev pointer while walking
 *	the singly linked hash list.  All of this is to save memory and
 *	keep both types of pv_entries as small as possible.
 */

/*
 *
 *  PV HASHING Changes - JK 1/2007
 *
 *  Pve's establish physical to virtual mappings.  These are used for aliasing of a
 *  physical page to (potentially many) virtual addresses within pmaps. In the
 *  previous implementation the structure of the pv_entries (each 16 bytes in size) was
 *
 *  typedef struct pv_entry {
 *      struct pv_entry  *next;
 *      pmap_t           pmap;
 *      vm_map_offset_t  va;
 *  } *pv_entry_t;
 *
 *  An initial array of these is created at boot time, one per physical page of
 *  memory, indexed by the physical page number. Additionally, a pool of entries
 *  is created from a pv_zone to be used as needed by pmap_enter() when it is
 *  creating new mappings.  Originally, we kept this pool around because the code
 *  in pmap_enter() was unable to block if it needed an entry and none were
 *  available - we'd panic.  Some time ago I restructured the pmap_enter() code
 *  so that for user pmaps it can block while zalloc'ing a pv structure and restart,
 *  removing a panic from the code (in the case of the kernel pmap we cannot block,
 *  so we still panic there; hence we keep a separate hot pool for use only on
 *  kernel pmaps).  The pool has not been removed since there is a large performance
 *  gain from keeping freed pv's around for reuse rather than suffering the overhead
 *  of zalloc for every new pv we need.
 *
 *  As pmap_enter() creates new mappings, it links the new pve's for them off the
 *  fixed pv array for that ppn (off the next pointer).  These pve's are accessed
 *  for several operations, one of them being address space teardown. In that case,
 *  we basically do this
 *
 *       for (every page/pte in the space) {
 *               calc pve_ptr from the ppn in the pte
 *               for (every pv in the list for the ppn) {
 *                       if (this pv is for this pmap/vaddr) {
 *                               do housekeeping
 *                               unlink/free the pv
 *                       }
 *               }
 *       }
 *
 *  The problem arose when we were running, say, 8000 (or even 2000) apache or
 *  other processes and one or all of them terminated. The list hanging off each
 *  pv array entry could have thousands of entries.  We were continuously linearly
 *  searching each of these lists as we stepped through the address space we were
 *  tearing down.  Because of the locks we held and the interrupt disabling needed
 *  for MP safety, and because each list node likely cost a cache miss, the system
 *  became completely unresponsive for many seconds while we did this.
 *
 *  Realizing that pve's are accessed in two distinct ways (linearly running the
 *  list by ppn for operations like pmap_page_protect, and finding and
 *  modifying/removing a single pve as part of pmap_enter processing) has led to
 *  modifying the pve structures and databases.
 *
 *  There are now two types of pve structures.  A "rooted" structure, which is
 *  basically the original structure accessed in an array by ppn, and a "hashed"
 *  structure accessed on a hash list via a hash of [pmap, vaddr]. These have been
 *  designed with the two goals of minimizing wired memory and making the lookup of
 *  a ppn faster.  Since a vast majority of pages in the system are not aliased
 *  and hence represented by a single pv entry, I've kept the rooted entry size as
 *  small as possible because there is one of these dedicated to every physical
 *  page of memory.  The hashed pve's are larger due to the addition of the hash
 *  link and the ppn entry needed for matching while running the hash list to find
 *  the entry we are looking for.  This way, only systems that have lots of
 *  aliasing (like 2000+ httpd procs) will pay the extra memory price. Both
 *  structures have the same first three fields, allowing some simplification in
 *  the code.
 *
 *  They have these shapes
 *
 *  typedef struct pv_rooted_entry {
 *      queue_head_t    qlink;
 *      vm_map_offset_t va;
 *      pmap_t          pmap;
 *  } *pv_rooted_entry_t;
 *
 *
 *  typedef struct pv_hashed_entry {
 *      queue_head_t    qlink;
 *      vm_map_offset_t va;
 *      pmap_t          pmap;
 *      ppnum_t         ppn;
 *      struct pv_hashed_entry *nexth;
 *  } *pv_hashed_entry_t;
 *
 *  The main flow difference is that the code is now aware of the rooted entry and
 *  the hashed entries.  Code that runs the pv list still starts with the rooted
 *  entry and then continues down the qlink onto the hashed entries.  Code that is
 *  looking up a specific pv entry first checks the rooted entry and then hashes
 *  and runs the hash list for the match. The hash list lengths are much smaller
 *  than the original pv lists that contained all aliases for the specific ppn.
 *
 */

typedef struct pv_rooted_entry {
	/* first three entries must match pv_hashed_entry_t */
	queue_head_t            qlink;
	vm_map_offset_t         va_and_flags;   /* virtual address for mapping */
	pmap_t                  pmap;           /* pmap where mapping lies */
} *pv_rooted_entry_t;

#define PV_ROOTED_ENTRY_NULL    ((pv_rooted_entry_t) 0)

typedef struct pv_hashed_entry {
	/* first three entries must match pv_rooted_entry_t */
	queue_head_t            qlink;
	vm_map_offset_t         va_and_flags;
	pmap_t                  pmap;
	ppnum_t                 ppn;
	struct pv_hashed_entry  *nexth;
} *pv_hashed_entry_t;

#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)

#define PVE_VA(pve) ((pve)->va_and_flags & (vm_map_offset_t)~PAGE_MASK)
#define PVE_FLAGS(pve) ((pve)->va_and_flags & PAGE_MASK)
#define PVE_IS_ALTACCT 0x001
#define PVE_IS_ALTACCT_PAGE(pve) \
	(((pve)->va_and_flags & PVE_IS_ALTACCT) ? TRUE : FALSE)
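
/*
 * Example (illustrative): with 4K pages, a mapping of VA 0x7fff5fc01000
 * carrying the ALTACCT flag is stored as va_and_flags == 0x7fff5fc01001;
 * PVE_VA() recovers 0x7fff5fc01000 and PVE_FLAGS() recovers 0x001.
 */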

//#define PV_DEBUG 1   /* uncomment to enable some PV debugging code */
#ifdef PV_DEBUG
#define CHK_NPVHASH() if (0 == npvhashmask) { panic("npvhash uninitialized"); }
#else
#define CHK_NPVHASH()
#endif

#define NPVHASHBUCKETS (4096)
#define NPVHASHMASK ((NPVHASHBUCKETS) - 1) /* MUST BE 2^N - 1 */
#define PV_HASHED_LOW_WATER_MARK_DEFAULT 5000
#define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT 2000
#define PV_HASHED_ALLOC_CHUNK_INITIAL 2000
#define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL 200

extern volatile uint32_t        mappingrecurse;
extern uint32_t pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark;

/*
 * PV hash locking
 */

#define LOCK_PV_HASH(hash)      lock_hash_hash(hash)
#define UNLOCK_PV_HASH(hash)    unlock_hash_hash(hash)
extern uint32_t npvhashmask;
extern pv_hashed_entry_t        *pv_hash_table;  /* hash lists */
extern pv_hashed_entry_t        pv_hashed_free_list;
extern pv_hashed_entry_t        pv_hashed_kern_free_list;
decl_simple_lock_data(extern, pv_hashed_free_list_lock);
decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock);
decl_simple_lock_data(extern, pv_hash_table_lock);
decl_simple_lock_data(extern, phys_backup_lock);

extern zone_t           pv_hashed_list_zone;    /* zone of pv_hashed_entry
                                                 * structures */

extern uint32_t         pv_hashed_free_count;
extern uint32_t         pv_hashed_kern_free_count;
/*
 *	Each entry in the pv_head_table is locked by a bit in the
 *	pv_lock_table.  The lock bits are accessed by the address of
 *	the frame they lock.
 */
#define pv_lock_table_size(n)   (((n)+BYTE_SIZE-1)/BYTE_SIZE)
#define pv_hash_lock_table_size(n)  (((n)+BYTE_SIZE-1)/BYTE_SIZE)
extern char             *pv_lock_table;         /* pointer to array of bits */
extern char             *pv_hash_lock_table;
extern pv_rooted_entry_t pv_head_table; /* array of entries, one per page */

extern event_t mapping_replenish_event;

static inline void
PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep)
{
	pmap_assert(*pvh_ep == PV_HASHED_ENTRY_NULL);
	simple_lock(&pv_hashed_free_list_lock, LCK_GRP_NULL);
	/*
	 * If the kernel reserved pool is low, let non-kernel mappings allocate
	 * synchronously, possibly subject to a throttle.
	 */
	if ((pv_hashed_kern_free_count > pv_hashed_kern_low_water_mark) && ((*pvh_ep = pv_hashed_free_list) != 0)) {
		pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next;
		pv_hashed_free_count--;
	}

	simple_unlock(&pv_hashed_free_list_lock);

	if (pv_hashed_free_count <= pv_hashed_low_water_mark) {
		if (!mappingrecurse && os_atomic_cmpxchg(&mappingrecurse, 0, 1, acq_rel)) {
			thread_wakeup(&mapping_replenish_event);
		}
	}
}

static inline void
PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt)
{
	simple_lock(&pv_hashed_free_list_lock, LCK_GRP_NULL);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;
	pv_hashed_free_list = pvh_eh;
	pv_hashed_free_count += (uint32_t)pv_cnt;
	simple_unlock(&pv_hashed_free_list_lock);
}

extern unsigned pmap_kern_reserve_alloc_stat;

static inline void
PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e)
{
	pmap_assert(*pvh_e == PV_HASHED_ENTRY_NULL);
	simple_lock(&pv_hashed_kern_free_list_lock, LCK_GRP_NULL);

	if ((*pvh_e = pv_hashed_kern_free_list) != 0) {
		pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next;
		pv_hashed_kern_free_count--;
		pmap_kern_reserve_alloc_stat++;
	}

	simple_unlock(&pv_hashed_kern_free_list_lock);

	if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) {
		if (!mappingrecurse && os_atomic_cmpxchg(&mappingrecurse, 0, 1, acq_rel)) {
			thread_wakeup(&mapping_replenish_event);
		}
	}
}

static inline void
PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt)
{
	simple_lock(&pv_hashed_kern_free_list_lock, LCK_GRP_NULL);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;
	pv_hashed_kern_free_list = pvh_eh;
	pv_hashed_kern_free_count += (uint32_t)pv_cnt;
	simple_unlock(&pv_hashed_kern_free_list_lock);
}

extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters;
extern event_t pmap_user_pv_throttle_event;

static inline void
pmap_pv_throttle(__unused pmap_t p)
{
	pmap_assert(p != kernel_pmap);
	/* Apply throttle on non-kernel mappings */
	if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) {
		pmap_pv_throttle_stat++;
		/*
		 * This doesn't need to be strictly accurate, merely a hint
		 * to eliminate the timeout when the reserve is replenished.
		 */
		pmap_pv_throttled_waiters++;
		assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC);
		thread_block(THREAD_CONTINUE_NULL);
	}
}

/*
 *	Index into pv_head table, its lock bits, and the modify/reference and managed bits
 */

#define pa_index(pa)            (i386_btop(pa))
#define ppn_to_pai(ppn)         ((int)ppn)

#define pai_to_pvh(pai)         (&pv_head_table[pai])
#define lock_pvh_pai(pai)       bit_lock(pai, (void *)pv_lock_table)
#define unlock_pvh_pai(pai)     bit_unlock(pai, (void *)pv_lock_table)
#define pvhash(idx)             (&pv_hash_table[idx])
#define lock_hash_hash(hash)    bit_lock(hash, (void *)pv_hash_lock_table)
#define unlock_hash_hash(hash)  bit_unlock(hash, (void *)pv_hash_lock_table)

#define IS_MANAGED_PAGE(x)                              \
	((unsigned int)(x) <= last_managed_page &&      \
	 ((unsigned long long)pmap_phys_attributes[x] & PHYS_MANAGED))
#define IS_INTERNAL_PAGE(x)                     \
	(IS_MANAGED_PAGE(x) && ((unsigned long long)pmap_phys_attributes[x] & PHYS_INTERNAL))
#define IS_REUSABLE_PAGE(x)                     \
	(IS_MANAGED_PAGE(x) && ((unsigned long long)pmap_phys_attributes[x] & PHYS_REUSABLE))
#define IS_ALTACCT_PAGE(x, pve)                          \
	(IS_MANAGED_PAGE((x)) &&                        \
	 (PVE_IS_ALTACCT_PAGE((pve))))

/*
 *	Physical page attributes.  Copy bits from PTE definition.
 */
#define PHYS_MODIFIED   INTEL_PTE_MOD   /* page modified */
#define PHYS_REFERENCED INTEL_PTE_REF   /* page referenced */
#define PHYS_MANAGED    INTEL_PTE_VALID /* page is managed */
#define PHYS_NOENCRYPT  INTEL_PTE_USER  /* no need to encrypt this page in the hibernation image */
#define PHYS_NCACHE     INTEL_PTE_NCACHE
#define PHYS_PAT        INTEL_PTE_PAT
#define PHYS_CACHEABILITY_MASK (INTEL_PTE_PAT | INTEL_PTE_NCACHE)
#define PHYS_INTERNAL   INTEL_PTE_WTHRU /* page from internal object */
#define PHYS_REUSABLE   INTEL_PTE_WRITE /* page is "reusable" */

#if DEVELOPMENT || DEBUG
extern boolean_t        pmap_disable_kheap_nx;
extern boolean_t        pmap_disable_kstack_nx;
#endif

#define PMAP_EXPAND_OPTIONS_NONE (0x0)
#define PMAP_EXPAND_OPTIONS_NOWAIT (PMAP_OPTIONS_NOWAIT)
#define PMAP_EXPAND_OPTIONS_NOENTER (PMAP_OPTIONS_NOENTER)
#define PMAP_EXPAND_OPTIONS_ALIASMAP (0x40000000U)

/*
 *	Amount of virtual memory mapped by one
 *	page-directory entry.
 */
#define PDE_MAPPED_SIZE         (pdetova(1))

/*
 *	Locking and TLB invalidation
 */

/*
 *	Locking Protocols: (changed 2/2007 JK)
 *
 *	There are two structures in the pmap module that need locking:
 *	the pmaps themselves, and the per-page pv_lists (which are locked
 *	by locking the pv_lock_table entry that corresponds to the pv_head
 *	for the list in question.)  Most routines want to lock a pmap and
 *	then do operations in it that require pv_list locking -- however
 *	pmap_remove_all and pmap_copy_on_write operate on a physical page
 *	basis and want to do the locking in the reverse order, i.e. lock
 *	a pv_list and then go through all the pmaps referenced by that list.
 *
 *	The system-wide pmap lock has been removed. Now, paths take a lock
 *	on the pmap before changing its 'shape', and the reverse-order lockers
 *	(coming in by phys ppn) take a lock on the corresponding pv, then
 *	retest to be sure nothing changed during the window before they locked,
 *	and can then run up/down the pv lists holding the list lock. This also
 *	lets the pmap layer run (nearly completely) interrupt-enabled, unlike
 *	previously.
 */

/*
 * PV locking
 */

#define LOCK_PVH(index) {               \
	mp_disable_preemption();        \
	lock_pvh_pai(index);            \
}

#define UNLOCK_PVH(index) {             \
	unlock_pvh_pai(index);          \
	mp_enable_preemption();         \
}
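
/*
 * Illustrative sketch (schematic, not an actual routine) of the
 * reverse-order protocol described above: lock the pv_head entry for a
 * physical page, retest that a mapping is still rooted there, then walk
 * the pv list while holding the lock.
 *
 *	int pai = ppn_to_pai(ppn);
 *	LOCK_PVH(pai);
 *	pv_rooted_entry_t pv_h = pai_to_pvh(pai);
 *	if (pv_h->pmap != PMAP_NULL) {
 *		... run up/down the pv list rooted at pv_h ...
 *	}
 *	UNLOCK_PVH(pai);
 */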

extern uint64_t pde_mapped_size;

extern char             *pmap_phys_attributes;
extern ppnum_t          last_managed_page;

/*
 * Used to record high memory allocated to kernel before
 * pmap_init() gets called.
 */
extern ppnum_t pmap_high_used_top;
extern ppnum_t pmap_high_used_bottom;
extern ppnum_t pmap_middle_used_top;
extern ppnum_t pmap_middle_used_bottom;

/*
 * When spinning through pmap_remove, ensure that we don't spend too
 * much time with preemption disabled; the current threshold is 20us.
 */
#define MAX_PREEMPTION_LATENCY_NS 20000
extern uint64_t max_preemption_latency_tsc;

#if DEBUG
#define PMAP_INTR_DEBUG (1)
#endif

#if PMAP_INTR_DEBUG
#define pmap_intr_assert() {                                                    \
	if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) {        \
	        panic("pmap interrupt assert %d", processor_avail_count);       \
	}                                                                       \
}
#else
#define pmap_intr_assert()
#endif
#if DEVELOPMENT || DEBUG
extern int              nx_enabled;
#endif
extern unsigned int     inuse_ptepages_count;

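/*
 * Hash a (pmap, va) pair to a PV hash bucket index. npvhashmask is of the
 * form 2^n - 1 (see NPVHASHMASK above), so the final AND is a cheap
 * modulo by the bucket count.
 */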
static inline uint32_t
pvhashidx(pmap_t pmap, vm_map_offset_t va)
{
	uint32_t hashidx = ((uint32_t)(uintptr_t)pmap ^
	    ((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
	    npvhashmask;
	return hashidx;
}

/*
 * Unlinks the pv_hashed_entry_t pvh from the singly linked hash chain;
 * properly deals with the anchor. Must be called with the hash locked;
 * does not unlock it.
 */
static inline void
pmap_pvh_unlink(pv_hashed_entry_t pvh)
{
	pv_hashed_entry_t       curh;
	pv_hashed_entry_t       *pprevh;
	uint32_t                pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh->pmap, PVE_VA(pvh));

	pprevh = pvhash(pvhash_idx);

#if PV_DEBUG
	if (NULL == *pprevh) {
		panic("pvh_unlink null anchor"); /* JK DEBUG */
	}
#endif
	curh = *pprevh;

	while (PV_HASHED_ENTRY_NULL != curh) {
		if (pvh == curh) {
			break;
		}
		pprevh = &curh->nexth;
		curh = curh->nexth;
	}
	if (PV_HASHED_ENTRY_NULL == curh) {
		panic("pmap_pvh_unlink no pvh");
	}
	*pprevh = pvh->nexth;
	return;
}

static inline void
pv_hash_add(pv_hashed_entry_t   pvh_e,
    pv_rooted_entry_t           pv_h)
{
	pv_hashed_entry_t       *hashp;
	uint32_t                pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e));
	LOCK_PV_HASH(pvhash_idx);
	insque(&pvh_e->qlink, &pv_h->qlink);
	hashp = pvhash(pvhash_idx);
#if PV_DEBUG
	if (NULL == hashp) {
		panic("pv_hash_add(%p) null hash bucket", pvh_e);
	}
#endif
	pvh_e->nexth = *hashp;
	*hashp = pvh_e;
	UNLOCK_PV_HASH(pvhash_idx);
}

static inline void
pv_hash_remove(pv_hashed_entry_t pvh_e)
{
	uint32_t                pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e));
	LOCK_PV_HASH(pvhash_idx);
	remque(&pvh_e->qlink);
	pmap_pvh_unlink(pvh_e);
	UNLOCK_PV_HASH(pvhash_idx);
}

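/*
 * TRUE iff 'distance' has a Hamming weight of at most one, i.e. it is
 * zero or a power of two; used below to detect single-bit flips.
 */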
static inline boolean_t
popcnt1(uint64_t distance)
{
	return (distance & (distance - 1)) == 0;
}

/*
 * Routines to handle suppression of/recovery from some forms of pagetable corruption
 * incidents observed in the field. These can be either software induced (wild
 * stores to the mapwindows where applicable, use-after-free errors
 * (typically of pages addressed physically), mis-directed DMAs, etc.), or due
 * to DRAM/memory hierarchy/interconnect errors. Given the theoretical rarity of these errors,
 * the recording mechanism is deliberately not MP-safe. The overarching goal is to
 * still assert on potential software races, but attempt recovery from incidents
 * identifiable as occurring due to issues beyond the control of the pmap module.
 * The latter includes single-bit errors and malformed pagetable entries.
 * We currently limit ourselves to recovery/suppression of one incident per
 * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident
 * are logged.
 * Assertions are not suppressed if kernel debugging is enabled. (DRK 09)
 */

typedef enum {
	PTE_VALID                = 0x0,
	PTE_INVALID              = 0x1,
	PTE_RSVD                 = 0x2,
	PTE_SUPERVISOR           = 0x4,
	PTE_BITFLIP              = 0x8,
	PV_BITFLIP               = 0x10,
	PTE_INVALID_CACHEABILITY = 0x20,
	PTE_NXBITFLIP            = 0x40
} pmap_pagetable_corruption_t;

typedef enum {
	ROOT_PRESENT = 0,
	ROOT_ABSENT = 1
} pmap_pv_assertion_t;

typedef enum {
	PMAP_ACTION_IGNORE      = 0x0,
	PMAP_ACTION_ASSERT      = 0x1,
	PMAP_ACTION_RETRY       = 0x2,
	PMAP_ACTION_RETRY_RELOCK = 0x4
} pmap_pagetable_corruption_action_t;

#define PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
extern uint64_t pmap_pagetable_corruption_interval_abstime;

extern uint32_t pmap_pagetable_corruption_incidents;
#define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8)
typedef struct {
	pmap_pv_assertion_t incident;
	pmap_pagetable_corruption_t reason;
	pmap_pagetable_corruption_action_t action;
	pmap_t  pmap;
	vm_map_offset_t vaddr;
	pt_entry_t pte;
	ppnum_t ppn;
	pmap_t pvpmap;
	vm_map_offset_t pvva;
	uint64_t abstime;
	int adj_ptes_count;
#define PMPTCR_MAX_ADJ_PTES (2)
	uint64_t adj_ptes[PMPTCR_MAX_ADJ_PTES];
} pmap_pagetable_corruption_record_t;

extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
extern uint64_t pmap_pagetable_corruption_last_abstime;
extern thread_call_t    pmap_pagetable_corruption_log_call;
extern boolean_t pmap_pagetable_corruption_timeout;

static inline pmap_pagetable_corruption_action_t
pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason,
    pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep,
    ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva, int adj_pteps_cnt, uint64_t **adj_pteps)
{
	uint32_t pmap_pagetable_corruption_log_index;
	uint64_t curtime = mach_absolute_time();

	if ((curtime - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
		pmap_pagetable_corruption_timeout = TRUE;
		action = PMAP_ACTION_ASSERT;
	} else {
		pmap_pagetable_corruption_last_abstime = curtime;
	}

	pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = curtime;
	if (adj_pteps_cnt > 0 && adj_pteps != NULL) {
		pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].adj_ptes_count = MIN(adj_pteps_cnt, PMPTCR_MAX_ADJ_PTES);
		for (int i = 0; i < pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].adj_ptes_count; i++) {
			pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].adj_ptes[i] = *adj_pteps[i];
		}
	}
	/* Asynchronously log */
	thread_call_enter(pmap_pagetable_corruption_log_call);

	return action;
}

static inline pmap_pagetable_corruption_action_t
pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident)
{
	pmap_pagetable_corruption_action_t      action = PMAP_ACTION_ASSERT;
	pmap_pagetable_corruption_t     suppress_reason = PTE_VALID;
	ppnum_t                 suppress_ppn = 0;
	pt_entry_t cpte = *ptep;
	ppnum_t cpn = pa_index(pte_to_pa(cpte));
	ppnum_t ppn = *ppnp;
	pv_rooted_entry_t       pv_h = pai_to_pvh(ppn_to_pai(ppn));
	pv_rooted_entry_t       pv_e = pv_h;
	uint32_t        bitdex;
	pmap_t pvpmap = pv_h->pmap;
	vm_map_offset_t pvva = PVE_VA(pv_h);
	vm_map_offset_t pve_flags;
	boolean_t ppcd = FALSE;
	boolean_t is_ept;

	/*
	 * Ideally, we'd consult the Mach VM here to definitively determine
	 * the nature of the mapping for this address space and address.
	 * As that would be a layering violation in this context, we
	 * use various heuristics to recover from single bit errors,
	 * malformed pagetable entries etc. These are not intended
	 * to be comprehensive.
	 */

	/* As a precautionary measure, mark A+D */
	pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
	is_ept = is_ept_pmap(pmap);

	/*
	 * Correct potential single bit errors in either (but not both) element
	 * of the PV
	 */
	do {
		if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && PVE_VA(pv_e) == vaddr) ||
		    (pv_e->pmap == pmap && popcnt1(PVE_VA(pv_e) ^ vaddr))) {
			pve_flags = PVE_FLAGS(pv_e);
			pv_e->pmap = pmap;
			pv_h->va_and_flags = vaddr | pve_flags;
			suppress_reason = PV_BITFLIP;
			action = PMAP_ACTION_RETRY;
			goto pmap_cpc_exit;
		}
	} while (((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink))) && (pv_e != pv_h));

	/*
	 * Discover root entries with a Hamming
	 * distance of 1 from the supplied
	 * physical page frame.
	 */
	for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) {
		ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex);
		if (IS_MANAGED_PAGE(npn)) {
			pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn));
			if (PVE_VA(npv_h) == vaddr && npv_h->pmap == pmap) {
				suppress_reason = PTE_BITFLIP;
				suppress_ppn = npn;
				action = PMAP_ACTION_RETRY_RELOCK;
				UNLOCK_PVH(ppn_to_pai(ppn));
				*ppnp = npn;
				goto pmap_cpc_exit;
			}
		}
	}

	if (pmap == kernel_pmap) {
		action = PMAP_ACTION_ASSERT;
		goto pmap_cpc_exit;
	}

	/*
	 * Check for malformed/inconsistent entries.
	 * The first check here isn't useful for EPT PTEs because INTEL_EPT_NCACHE == 0
	 */
	if (!is_ept && ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PAT)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU))) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_INVALID_CACHEABILITY;
	} else if (cpte & INTEL_PTE_RSVD) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_RSVD;
	} else if ((pmap != kernel_pmap) && (!is_ept) && ((cpte & INTEL_PTE_USER) == 0)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_SUPERVISOR;
	}
pmap_cpc_exit:
	PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd));

	if (debug_boot_arg && !ppcd) {
		action = PMAP_ACTION_ASSERT;
	}

	return pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva, 0, NULL);
}

static inline boolean_t
pmap_compressed_pte_corruption_repair(uint64_t pte, uint64_t *pte_addr, uint64_t *ptep, pmap_t pmap,
    vm_map_offset_t vaddr)
{
	uint64_t *adj_pteps[2];
	int pteidx = ((uintptr_t)ptep & INTEL_OFFMASK) / sizeof(pt_entry_t);
	pmap_pagetable_corruption_action_t action = PMAP_ACTION_IGNORE;

	/*
	 * Grab pointers to PTEs on either side of the PTE in question, unless we're at the start of
	 * a PT (grab pointers to the next and next-next PTEs) or the end of a PT (grab the previous
	 * 2 PTEs).
	 */
	if (pteidx == 0) {
		adj_pteps[0] = ptep + 1;
		adj_pteps[1] = ptep + 2;
	} else if (pteidx == (NPTPG - 1)) {
		adj_pteps[0] = ptep - 2;
		adj_pteps[1] = ptep - 1;
	} else {
		adj_pteps[0] = ptep - 1;
		adj_pteps[1] = ptep + 1;
	}

	/*
	 * Since the compressed PTE no longer has a PV entry associated, we cannot pass in the pv data to
	 * pmap_pagetable_corruption_log, so instead supply adjacent PTEs for logging.
	 */
	if (pmap_pagetable_corruption_log(ROOT_ABSENT, (pte & INTEL_PTE_NX) ? PTE_NXBITFLIP : PTE_BITFLIP,
	    action, pmap, vaddr, ptep, (ppnum_t)~0UL, 0, 0, sizeof(adj_pteps) / sizeof(adj_pteps[0]),
	    adj_pteps) != PMAP_ACTION_ASSERT) {
		/* Correct the flipped bit(s) and continue */
		pmap_store_pte(is_ept_pmap(pmap), ptep, pte & INTEL_PTE_COMPRESSED_MASK);
		pmap->corrected_compressed_ptes_count++;
		return TRUE; /* Returning TRUE to indicate this is now a valid compressed PTE (we hope) */
	}

	panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted? Adjacent PTEs: 0x%llx@%p, 0x%llx@%p",
	    pte_addr, pte, pte & ~INTEL_PTE_COMPRESSED_MASK, *adj_pteps[0], adj_pteps[0], *adj_pteps[1], adj_pteps[1]);
	/*NOTREACHED*/
}

/*
 * Remove pv list entry.
 * Called with pv_head_table entry locked.
 * Returns pv entry to be freed (or NULL).
 */
static inline __attribute__((always_inline)) pv_hashed_entry_t
pmap_pv_remove(pmap_t           pmap,
    vm_map_offset_t  vaddr,
    ppnum_t          *ppnp,
    pt_entry_t       *pte,
    boolean_t        *was_altacct)
{
	pv_hashed_entry_t       pvh_e;
	pv_rooted_entry_t       pv_h;
	pv_hashed_entry_t       *pprevh;
	uint32_t                pvhash_idx;
	uint32_t                pv_cnt;
	ppnum_t                 ppn;

	*was_altacct = FALSE;
pmap_pv_remove_retry:
	ppn = *ppnp;
	pvh_e = PV_HASHED_ENTRY_NULL;
	pv_h = pai_to_pvh(ppn_to_pai(ppn));

	if (__improbable(pv_h->pmap == PMAP_NULL)) {
		pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
		if (pac == PMAP_ACTION_IGNORE) {
			goto pmap_pv_remove_exit;
		} else if (pac == PMAP_ACTION_ASSERT) {
			panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list, priors: %d", pmap, vaddr, ppn, *pte, ppnp, pte, pmap_pagetable_corruption_incidents);
		} else if (pac == PMAP_ACTION_RETRY_RELOCK) {
			LOCK_PVH(ppn_to_pai(*ppnp));
			pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
			goto pmap_pv_remove_retry;
		} else if (pac == PMAP_ACTION_RETRY) {
			goto pmap_pv_remove_retry;
		}
	}

	if (PVE_VA(pv_h) == vaddr && pv_h->pmap == pmap) {
		*was_altacct = IS_ALTACCT_PAGE(ppn_to_pai(*ppnp), pv_h);
		/*
		 * Header is the pv_rooted_entry.
		 * We can't free that. If there is a queued
		 * entry after this one we remove that
		 * from the ppn queue, we remove it from the hash chain
		 * and copy it to the rooted entry. Then free it instead.
		 */
		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
		if (pv_h != (pv_rooted_entry_t) pvh_e) {
			/*
			 * Entry queued to root, remove this from hash
			 * and install as new root.
			 */
			CHK_NPVHASH();
			pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e));
			LOCK_PV_HASH(pvhash_idx);
			remque(&pvh_e->qlink);
			pprevh = pvhash(pvhash_idx);
			if (PV_HASHED_ENTRY_NULL == *pprevh) {
				panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x): "
				    "empty hash, removing rooted, priors: %d",
				    pmap, vaddr, ppn, pmap_pagetable_corruption_incidents);
			}
			pmap_pvh_unlink(pvh_e);
			UNLOCK_PV_HASH(pvhash_idx);
			pv_h->pmap = pvh_e->pmap;
			pv_h->va_and_flags = pvh_e->va_and_flags;
			/* dispose of pvh_e */
		} else {
			/* none queued after rooted */
			pv_h->pmap = PMAP_NULL;
			pvh_e = PV_HASHED_ENTRY_NULL;
		}
	} else {
		/*
		 * Not removing the rooted pv: find it on the hash chain, remove it
		 * from the ppn queue and the hash chain, and free it.
		 */
		CHK_NPVHASH();
		pvhash_idx = pvhashidx(pmap, vaddr);
		LOCK_PV_HASH(pvhash_idx);
		pprevh = pvhash(pvhash_idx);
		if (PV_HASHED_ENTRY_NULL == *pprevh) {
			panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash, priors: %d",
			    pmap, vaddr, ppn, *pte, pte, pmap_pagetable_corruption_incidents);
		}
		pvh_e = *pprevh;
		pmap_pv_hashlist_walks++;
		pv_cnt = 0;
		while (PV_HASHED_ENTRY_NULL != pvh_e) {
			pv_cnt++;
			if (pvh_e->pmap == pmap &&
			    PVE_VA(pvh_e) == vaddr &&
			    pvh_e->ppn == ppn) {
				break;
			}
			pprevh = &pvh_e->nexth;
			pvh_e = pvh_e->nexth;
		}

		if (PV_HASHED_ENTRY_NULL == pvh_e) {
			pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);

			if (pac == PMAP_ACTION_ASSERT) {
				panic("Possible memory corruption: pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx, priors: %d", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, PVE_VA(pv_h), pmap_pagetable_corruption_incidents);
			} else {
				UNLOCK_PV_HASH(pvhash_idx);
				if (pac == PMAP_ACTION_RETRY_RELOCK) {
					LOCK_PVH(ppn_to_pai(*ppnp));
					pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
					goto pmap_pv_remove_retry;
				} else if (pac == PMAP_ACTION_RETRY) {
					goto pmap_pv_remove_retry;
				} else if (pac == PMAP_ACTION_IGNORE) {
					goto pmap_pv_remove_exit;
				}
			}
		}

		*was_altacct = IS_ALTACCT_PAGE(ppn_to_pai(*ppnp), pvh_e);

		pmap_pv_hashlist_cnts += pv_cnt;
		if (pmap_pv_hashlist_max < pv_cnt) {
			pmap_pv_hashlist_max = pv_cnt;
		}
		*pprevh = pvh_e->nexth;
		remque(&pvh_e->qlink);
		UNLOCK_PV_HASH(pvhash_idx);
	}
pmap_pv_remove_exit:
	return pvh_e;
}

static inline __attribute__((always_inline)) boolean_t
pmap_pv_is_altacct(
	pmap_t          pmap,
	vm_map_offset_t vaddr,
	ppnum_t         ppn)
{
	pv_hashed_entry_t       pvh_e;
	pv_rooted_entry_t       pv_h;
	uint32_t                pvhash_idx;
	boolean_t               is_altacct;

	pvh_e = PV_HASHED_ENTRY_NULL;
	pv_h = pai_to_pvh(ppn_to_pai(ppn));

	if (__improbable(pv_h->pmap == PMAP_NULL)) {
		return FALSE;
	}

	if (PVE_VA(pv_h) == vaddr && pv_h->pmap == pmap) {
		/*
		 * Header is the pv_rooted_entry.
		 */
		return IS_ALTACCT_PAGE(ppn, pv_h);
	}

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pmap, vaddr);
	LOCK_PV_HASH(pvhash_idx);
	pvh_e = *(pvhash(pvhash_idx));
	while (PV_HASHED_ENTRY_NULL != pvh_e) {
		if (pvh_e->pmap == pmap &&
		    PVE_VA(pvh_e) == vaddr &&
		    pvh_e->ppn == ppn) {
			break;
		}
		pvh_e = pvh_e->nexth;
	}
	if (PV_HASHED_ENTRY_NULL == pvh_e) {
		is_altacct = FALSE;
	} else {
		is_altacct = IS_ALTACCT_PAGE(ppn, pvh_e);
	}
	UNLOCK_PV_HASH(pvhash_idx);

	return is_altacct;
}

static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_credit(pmap, task_ledgers.tkm_private, (ledger_amount_t)bytes);
}

static inline void
PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_debit(pmap, task_ledgers.tkm_private, (ledger_amount_t)bytes);
}

extern boolean_t        pmap_initialized;       /* Has pmap_init completed? */
#define valid_page(x) (pmap_initialized && pmap_valid_page(x))

int             phys_attribute_test(
	ppnum_t         phys,
	int             bits);
void            phys_attribute_clear(
	ppnum_t         phys,
	int             bits,
	unsigned int    options,
	void            *arg);

//#define PCID_DEBUG 1
#if     PCID_DEBUG
#define pmap_pcid_log(fmt, args...)                                     \
	do {                                                            \
	        kprintf(fmt, ##args);                                   \
	        printf(fmt, ##args);                                    \
	} while (0)
#else
#define pmap_pcid_log(fmt, args...)
#endif
void    pmap_pcid_configure(void);

/*
 * Atomic 64-bit compare and exchange of a page table entry.
 */

#include <machine/atomic.h>
static inline boolean_t
pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new)
{
	return __c11_atomic_compare_exchange_strong((_Atomic pt_entry_t *)entryp, &old, new,
	           memory_order_acq_rel_smp, memory_order_relaxed);
}

#if DEVELOPMENT || DEBUG
extern uint32_t pmap_update_clear_pte_count;
extern uint32_t pmap_update_invalid_pte_count;
#endif

static inline void
pmap_update_pte(boolean_t is_ept, pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits, bool oldpte_invalid_ok)
{
	pt_entry_t npte, opte;
	do {
		opte = *mptep;
		if (__improbable(opte == 0)) {
#if DEVELOPMENT || DEBUG
			pmap_update_clear_pte_count++;
#endif
			return;
		} else if (__improbable(!oldpte_invalid_ok && (opte & PTE_VALID_MASK(is_ept)) == 0)) {
#if DEVELOPMENT || DEBUG
			pmap_update_invalid_pte_count++;
#endif
			return;
		}
		npte = opte & ~(pclear_bits);
		npte |= pset_bits;
#if DEVELOPMENT || DEBUG
		if (__improbable(pmap_inject_pte_corruption != 0 && is_ept == FALSE && (npte & PTE_COMPRESSED))) {
			pmap_inject_pte_corruption = 0;
			/* Inject a corruption event */
			npte |= INTEL_PTE_NX;
		}
#endif
	} while (!pmap_cmpx_pte(mptep, opte, npte));

	if (__improbable((is_ept == FALSE) && (npte & PTE_COMPRESSED) && (npte & INTEL_PTE_NX))) {
		pmap_corrupted_pte_detected(mptep, pclear_bits, pset_bits);
	}
}
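
/*
 * Usage sketch (illustrative; assumes the PTE_WRITE() accessor from
 * pmap.h): to downgrade a live mapping to read-only while preserving
 * all other bits, a caller would do something like
 *
 *	pmap_update_pte(is_ept, ptep, PTE_WRITE(is_ept), 0, FALSE);
 *	PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
 *
 * The CAS loop above edits the PTE; the caller remains responsible for
 * the TLB shootdown.
 */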

/*
 * The single pml4 page per pmap is allocated at pmap create time and exists
 * for the duration of the pmap. We allocate this page in kernel vm.
 * This returns the address of the requested pml4 entry in the top level page.
 */
static inline
pml4_entry_t *
pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
{
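	/* Reject non-canonical addresses: the hole between the user and kernel halves of the 48-bit VA space. */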
	if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) &&
	    (vaddr < 0xFFFF800000000000ULL))) {
		return NULL;
	}

#if     DEBUG
	return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG - 1)]);
#else
	return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG - 1)];
#endif
}

static inline pml4_entry_t *
pmap64_user_pml4(pmap_t pmap, vm_map_offset_t vaddr)
{
	if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) &&
	    (vaddr < 0xFFFF800000000000ULL))) {
		return NULL;
	}

#if     DEBUG
	return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_ucr3)[(vaddr >> PML4SHIFT) & (NPML4PG - 1)]);
#else
	return &pmap->pm_upml4[(vaddr >> PML4SHIFT) & (NPML4PG - 1)];
#endif
}

/*
 * Returns address of requested PDPT entry in the physmap.
 */
static inline pdpt_entry_t *
pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
{
	pml4_entry_t    newpf;
	pml4_entry_t    *pml4;
	boolean_t       is_ept;

	pml4 = pmap64_pml4(pmap, vaddr);
	is_ept = is_ept_pmap(pmap);

	if (pml4 && (*pml4 & PTE_VALID_MASK(is_ept))) {
		newpf = *pml4 & PG_FRAME;
		return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf))
		       [(vaddr >> PDPTSHIFT) & (NPDPTPG - 1)];
	}
	return NULL;
}
/*
 * Returns the address of the requested PDE entry in the physmap.
 */
static inline pd_entry_t *
pmap_pde_internal1(vm_map_offset_t vaddr, boolean_t is_ept, pdpt_entry_t *pdpte)
{
	if (*pdpte & PTE_VALID_MASK(is_ept)) {
		pdpt_entry_t    newpf = *pdpte & PG_FRAME;
		return &((pd_entry_t *) PHYSMAP_PTOV(newpf))
		       [(vaddr >> PDSHIFT) & (NPDPG - 1)];
	} else {
		return NULL;
	}
}

static inline pd_entry_t *
pmap_pde_internal0(pmap_t pmap, vm_map_offset_t vaddr, boolean_t is_ept)
{
	pdpt_entry_t    *pdpt;

	pdpt = pmap64_pdpt(pmap, vaddr);
	if (pdpt) {
		return pmap_pde_internal1(vaddr, is_ept, pdpt);
	} else {
		return NULL;
	}
}


static inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_map_offset_t vaddr)
{
	pdpt_entry_t    *pdpt;
	boolean_t       is_ept;

	pdpt = pmap64_pdpt(pmap, vaddr);
	is_ept = is_ept_pmap(pmap);

	if (pdpt) {
		return pmap_pde_internal1(vaddr, is_ept, pdpt);
	} else {
		return NULL;
	}
}


/*
 * Return the address of the mapped pte for vaddr va in pmap pmap.
 *
 * In case the pde maps a superpage, return the pde, which, in this case,
 * is the actual page table entry.
 */


static inline pt_entry_t *
pmap_pte_internal(vm_map_offset_t vaddr, boolean_t is_ept, pd_entry_t *pde)
{
	if (*pde & PTE_VALID_MASK(is_ept)) {
		if (__improbable(*pde & PTE_PS)) {
			return pde;
		}
		pd_entry_t      newpf = *pde & PG_FRAME;

		return &((pt_entry_t *)PHYSMAP_PTOV(newpf))
		       [i386_btop(vaddr) & (ppnum_t)(NPTEPG - 1)];
	} else {
		return NULL;
	}
}

static inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
{
	pd_entry_t      *pde;

	boolean_t       is_ept;

	is_ept = is_ept_pmap(pmap);

	pde = pmap_pde_internal0(pmap, vaddr, is_ept);

	if (pde) {
		return pmap_pte_internal(vaddr, is_ept, pde);
	} else {
		return NULL;
	}
}

extern void     pmap_alias(
	vm_offset_t     ava,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_prot_t       prot,
	unsigned int options);

#if     DEBUG
#define DPRINTF(x...)   kprintf(x)
#else
#define DPRINTF(x...)
#endif

#endif /* MACH_KERNEL_PRIVATE */
#endif /* _I386_PMAP_INTERNAL_ */