/*
 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <i386/proc_reg.h>
#include <i386/cpuid.h>
#include <i386/tsc.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <i386/pmap_internal.h>
#include <i386/pmap_pcid.h>

/*
 * PCID (Process context identifier) aka tagged TLB support.
 * On processors with this feature, unless disabled via the -pmap_pcid_disable
 * boot-arg, the following algorithm is in effect:
 * Each processor maintains an array of tag refcounts indexed by tag.
 * Each address space maintains an array of tags indexed by CPU number.
 * Each address space maintains a coherency vector, indexed by CPU number,
 * indicating whether the TLB state for that address space has a pending
 * invalidation on that CPU.
 * On a context switch, a refcounted tag is lazily assigned to the newly
 * dispatched (CPU, address space) tuple.
 * When an inactive address space is invalidated on a remote CPU, it is marked
 * for invalidation upon the next dispatch. Some invalidations are
 * also processed at the user/kernel boundary.
 * Provisions are made for the case where a CPU is overcommitted, i.e.
 * more active address spaces exist than the number of logical tags
 * provided for by the processor architecture (currently 4096).
 * The algorithm assumes the processor remaps the logical tags
 * to physical TLB context IDs in an LRU fashion for efficiency. (DRK '10)
 */
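
/*
 * Illustrative sketch of the per-CPU dispatch path (an assumed
 * restatement of pmap_pcid_activate() below, not additional code):
 *
 *	pcid_t tag = tpmap->pmap_pcid_cpus[ccpu];
 *	if (tag == PMAP_PCID_INVALID_PCID) {
 *		tag = tpmap->pmap_pcid_cpus[ccpu] = pmap_pcid_allocate_pcid(ccpu);
 *	}
 *
 * with pcid_data[ccpu].cpu_pcid_refcounts[tag] tracking how many address
 * spaces currently share that tag on the given CPU.
 */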

uint32_t        pmap_pcid_ncpus;
boolean_t       pmap_pcid_disabled = FALSE;
bool            invpcid_enabled = false;
static uint32_t INP_MAX = 0;
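/*
 * Per-CPU PCID bookkeeping; the array is 64-byte aligned, presumably to
 * keep each CPU's data cache-line aligned and limit false sharing when
 * CPUs update their tag refcounts concurrently.
 */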
pcid_cdata_t pcid_data[MAX_CPUS] __attribute__((aligned(64)));

void
pmap_pcid_configure(void)
{
	int ccpu = cpu_number();
	uintptr_t cr4 = get_cr4();
	boolean_t pcid_present = FALSE;

	pmap_pcid_log("PCID configure invoked on CPU %d\n", ccpu);
	pmap_assert(ml_get_interrupts_enabled() == FALSE || get_preemption_level() != 0);
	pmap_assert(cpu_mode_is64bit());

	if (PE_parse_boot_argn("-pmap_pcid_disable", &pmap_pcid_disabled, sizeof(pmap_pcid_disabled))) {
		pmap_pcid_log("PMAP: PCID feature disabled\n");
		printf("PMAP: PCID feature disabled, %u\n", pmap_pcid_disabled);
		kprintf("PMAP: PCID feature disabled %u\n", pmap_pcid_disabled);
	}
	/* no_shared_cr3+PCID is currently unsupported */

#if     DEBUG
	if (pmap_pcid_disabled == FALSE) {
		no_shared_cr3 = FALSE;
	} else {
		no_shared_cr3 = TRUE;
	}
#else
	if (no_shared_cr3) {
		pmap_pcid_disabled = TRUE;
	}
#endif
	if (pmap_pcid_disabled || no_shared_cr3) {
		unsigned i;
		/* Reset PCID status, as we may have picked up
		 * strays if discovered prior to platform
		 * expert initialization.
		 */
		for (i = 0; i < real_ncpus; i++) {
			if (cpu_datap(i)) {
				cpu_datap(i)->cpu_pmap_pcid_enabled = FALSE;
			}
		}
		pmap_pcid_ncpus = 0;
		cpu_datap(ccpu)->cpu_pmap_pcid_enabled = FALSE;
		return;
	}
	/* DRKTODO: assert if features haven't been discovered yet. Redundant
	 * invocation of cpu_mode_init and descendants masks this for now.
	 */
	if ((cpuid_features() & CPUID_FEATURE_PCID)) {
		pcid_present = TRUE;
	} else {
		cpu_datap(ccpu)->cpu_pmap_pcid_enabled = FALSE;
		pmap_pcid_log("PMAP: PCID not detected CPU %d\n", ccpu);
		return;
	}
	if ((cr4 & (CR4_PCIDE | CR4_PGE)) == (CR4_PCIDE | CR4_PGE)) {
		cpu_datap(ccpu)->cpu_pmap_pcid_enabled = TRUE;
		pmap_pcid_log("PMAP: PCID already enabled %d\n", ccpu);
		return;
	}
	if (pcid_present == TRUE) {
		if (ccpu == 0) {
			if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_INVPCID) {
				invpcid_enabled = true;
			}
		}
#if DEVELOPMENT || DEBUG
		PE_parse_boot_argn("pmap_inp_max", &INP_MAX, sizeof(INP_MAX));
#endif
		pmap_pcid_log("Pre-PCID:CR0: 0x%lx, CR3: 0x%lx, CR4(CPU %d): 0x%lx\n", get_cr0(), get_cr3_raw(), ccpu, cr4);

		if (cpu_number() >= PMAP_PCID_MAX_CPUS) {
			panic("PMAP_PCID_MAX_CPUS %d", cpu_number());
		}
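		/*
		 * Note: global pages (CR4.PGE) are enabled before PCIDE below;
		 * TLB entries for kernel mappings marked global survive
		 * PCID-qualified CR3 loads, which is what keeps the shared
		 * kernel portion of each address space cheap to retain across
		 * context switches.
		 */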
		if ((get_cr4() & CR4_PGE) == 0) {
			set_cr4(get_cr4() | CR4_PGE);
			pmap_pcid_log("Toggled PGE ON (CPU %d)\n", ccpu);
		}
		set_cr4(get_cr4() | CR4_PCIDE);
		pmap_pcid_log("Post PCID: CR0: 0x%lx, CR3: 0x%lx, CR4(CPU %d): 0x%lx\n", get_cr0(), get_cr3_raw(), ccpu, get_cr4());
		pmap_tlbi_range(0, ~0ULL, true, 0);
		cpu_datap(ccpu)->cpu_pmap_pcid_enabled = TRUE;

		if (OSIncrementAtomic(&pmap_pcid_ncpus) == machine_info.max_cpus) {
			pmap_pcid_log("All PCIDs enabled: real_ncpus: %d, pmap_pcid_ncpus: %d\n", real_ncpus, pmap_pcid_ncpus);
		}
		cpu_datap(ccpu)->cpu_pmap_pcid_coherentp =
		    cpu_datap(ccpu)->cpu_pmap_pcid_coherentp_kernel =
		    &(kernel_pmap->pmap_pcid_coherency_vector[ccpu]);
		cpu_datap(ccpu)->cpu_pcid_data = &pcid_data[ccpu];
		/* PCID 0 is reserved for the kernel pmap; hold a permanent reference */
		cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_refcounts[0] = 1;
	}
}

void
pmap_pcid_initialize(pmap_t p)
{
	unsigned i;
	unsigned nc = sizeof(p->pmap_pcid_cpus) / sizeof(pcid_t);

	pmap_assert(nc >= real_ncpus);
	for (i = 0; i < nc; i++) {
		p->pmap_pcid_cpus[i] = PMAP_PCID_INVALID_PCID;
		/* We assume here that the coherency vector is zeroed by
		 * pmap_create
		 */
	}
}

void
pmap_pcid_initialize_kernel(pmap_t p)
{
	unsigned i;
	unsigned nc = sizeof(p->pmap_pcid_cpus) / sizeof(pcid_t);

	for (i = 0; i < nc; i++) {
		p->pmap_pcid_cpus[i] = 0;
		/* We assume here that the coherency vector is zeroed by
		 * pmap_create
		 */
	}
}

pcid_t
pmap_pcid_allocate_pcid(int ccpu)
{
	int i;
	pcid_ref_t      cur_min = 0xFF;
	uint32_t        cur_min_index = ~1;
	pcid_ref_t      *cpu_pcid_refcounts = &cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_refcounts[0];
	pcid_ref_t      old_count;

	if ((i = cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_free_hint) != 0) {
		if (cpu_pcid_refcounts[i] == 0) {
			(void)__sync_fetch_and_add(&cpu_pcid_refcounts[i], 1);
			cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_free_hint = 0;
			return i;
		}
	}
	/* Linear scan to discover free slot, with hint. Room for optimization
	 * but with intelligent prefetchers this should be
	 * adequately performant, as it is invoked
	 * only on first dispatch of a new address space onto
	 * a given processor. DRKTODO: use larger loads and
	 * zero byte discovery -- any pattern != ~1 should
	 * signify a free slot.
	 */
	for (i = PMAP_PCID_MIN_PCID; i < PMAP_PCID_MAX_PCID; i++) {
		pcid_ref_t cur_refcount = cpu_pcid_refcounts[i];

		pmap_assert(cur_refcount < PMAP_PCID_MAX_REFCOUNT);

		if (cur_refcount == 0) {
			(void)__sync_fetch_and_add(&cpu_pcid_refcounts[i], 1);
			return i;
		} else {
			if (cur_refcount < cur_min) {
				cur_min_index = i;
				cur_min = cur_refcount;
			}
		}
	}
	pmap_assert(cur_min_index > 0 && cur_min_index < PMAP_PCID_MAX_PCID);
	/* Consider "rebalancing" tags actively in highly oversubscribed cases
	 * perhaps selecting tags with lower activity.
	 */

	old_count = __sync_fetch_and_add(&cpu_pcid_refcounts[cur_min_index], 1);
	pmap_assert(old_count < PMAP_PCID_MAX_REFCOUNT);
	return cur_min_index;
}

void
pmap_pcid_deallocate_pcid(int ccpu, pmap_t tpmap)
{
	pcid_t pcid;
	pmap_t lp;
	pcid_ref_t prior_count;

	pcid = tpmap->pmap_pcid_cpus[ccpu];
	pmap_assert(pcid != PMAP_PCID_INVALID_PCID);
	if (pcid == PMAP_PCID_INVALID_PCID) {
		return;
	}

	lp = cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_last_pmap_dispatched[pcid];
	pmap_assert(pcid > 0 && pcid < PMAP_PCID_MAX_PCID);
	pmap_assert(cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_refcounts[pcid] >= 1);

	if (lp == tpmap) {
		(void)__sync_bool_compare_and_swap(&cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_last_pmap_dispatched[pcid], tpmap, PMAP_INVALID);
	}

	if ((prior_count = __sync_fetch_and_sub(&cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_refcounts[pcid], 1)) == 1) {
		cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_free_hint = pcid;
	}
	pmap_assert(prior_count <= PMAP_PCID_MAX_REFCOUNT);
}

void
pmap_destroy_pcid_sync(pmap_t p)
{
	int i;
	pmap_assert(ml_get_interrupts_enabled() == FALSE || get_preemption_level() != 0);
	for (i = 0; i < PMAP_PCID_MAX_CPUS; i++) {
		if (p->pmap_pcid_cpus[i] != PMAP_PCID_INVALID_PCID) {
			pmap_pcid_deallocate_pcid(i, p);
		}
	}
}
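
/*
 * Select the PCID with which a (pmap, thread, CPU) tuple will run.
 * If the target address space has an accessible page zero, the kernel
 * runs on the kernel pmap's PCID instead, unless a copyio window is
 * active on the calling thread; the returned tag may therefore belong
 * to the kernel pmap rather than to the dispatched user pmap.
 */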
pcid_t
pcid_for_pmap_cpu_tuple(pmap_t cpmap, thread_t cthread, int ccpu)
{
	pmap_t active_pmap = cpmap;

	if (__improbable(cpmap->pagezero_accessible)) {
		if ((cthread->machine.specFlags & CopyIOActive) == 0) {
			active_pmap = kernel_pmap;
		}
	}

	return active_pmap->pmap_pcid_cpus[ccpu];
}
int npz = 0;

#if PMAP_ASSERT
#define PCID_RECORD_SIZE 128
uint64_t pcid_record_array[PCID_RECORD_SIZE];
#endif
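/*
 * Compose the user-mode variant of a PCID from a kernel-mode tag p:
 * user ASIDs are offset by PMAP_PCID_MAX_PCID so that they occupy a
 * range disjoint from kernel-mode tags (a zero tag stays zero). Bit 63,
 * when set in a value loaded into CR3 while CR4.PCIDE is enabled,
 * instructs the processor not to flush TLB entries tagged with that PCID.
 */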
#define PMAP_UPCIDP(p) (((p) ? ((p) + PMAP_PCID_MAX_PCID) : 0) | (1ULL << 63))

void
pmap_pcid_activate(pmap_t tpmap, int ccpu, boolean_t nopagezero, boolean_t copyio)
{
	pcid_t          new_pcid = tpmap->pmap_pcid_cpus[ccpu];
	pmap_t          last_pmap;
	boolean_t       pcid_conflict = FALSE, pending_flush = FALSE;
	pcid_cdata_t    *pcdata = cpu_datap(ccpu)->cpu_pcid_data;

	pmap_assert(cpu_datap(ccpu)->cpu_pmap_pcid_enabled);
	if (__improbable(new_pcid == PMAP_PCID_INVALID_PCID)) {
		new_pcid = tpmap->pmap_pcid_cpus[ccpu] = pmap_pcid_allocate_pcid(ccpu);
	}

	pmap_assert(new_pcid != PMAP_PCID_INVALID_PCID);
#ifdef  PCID_ASSERT
	cpu_datap(ccpu)->cpu_last_pcid = cpu_datap(ccpu)->cpu_active_pcid;
#endif
	cpu_datap(ccpu)->cpu_active_pcid = new_pcid;

	pending_flush = (tpmap->pmap_pcid_coherency_vector[ccpu] != 0);
	if (__probable(pending_flush == FALSE)) {
		last_pmap = pcdata->cpu_pcid_last_pmap_dispatched[new_pcid];
		pcid_conflict = ((last_pmap != NULL) && (tpmap != last_pmap));
	}
	if (__improbable(pending_flush || pcid_conflict)) {
		pmap_pcid_validate_cpu(tpmap, ccpu);
	}
	/* Consider making this a unique id */
	pcdata->cpu_pcid_last_pmap_dispatched[new_pcid] = tpmap;

	pmap_assert(new_pcid < PMAP_PCID_MAX_PCID);
	pmap_assert(((tpmap == kernel_pmap) && new_pcid == 0) ||
	    ((new_pcid != PMAP_PCID_INVALID_PCID) && (new_pcid != 0)));
#if     PMAP_ASSERT
	pcid_record_array[ccpu % PCID_RECORD_SIZE] = tpmap->pm_cr3 | new_pcid | (((uint64_t)(!(pending_flush || pcid_conflict))) << 63);
	pml4_entry_t *pml4 = pmap64_pml4(tpmap, 0ULL);
	/* Diagnostic to detect pagetable anchor corruption */
	if (pml4[KERNEL_PML4_INDEX] != kernel_pmap->pm_pml4[KERNEL_PML4_INDEX]) {
		__asm__ volatile ("int3");
	}
#endif  /* PMAP_ASSERT */

	pmap_paddr_t ncr3 = tpmap->pm_cr3;

	if (__improbable(nopagezero)) {
		pending_flush = TRUE;
		if (copyio == FALSE) {
			new_pcid = kernel_pmap->pmap_pcid_cpus[ccpu];
			ncr3 = kernel_pmap->pm_cr3;
		}
		cpu_datap(ccpu)->cpu_kernel_pcid = kernel_pmap->pmap_pcid_cpus[ccpu];
		npz++;
	}

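	/*
	 * Load the new CR3 with the no-flush bit (bit 63) set so that TLB
	 * entries tagged with this PCID are preserved across the write; any
	 * invalidation that is actually required is issued explicitly below,
	 * after the coherency vector has been re-checked.
	 */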
	set_cr3_composed(ncr3, new_pcid, 1ULL);
	cpu_shadowp(ccpu)->cpu_shadowtask_cr3 = ncr3 | new_pcid | (1ULL << 63);

	bool preserve = !pcid_conflict && !pending_flush;
	if (preserve == true) {
		/* We did not previously observe a pending invalidation for this
		 * ASID. However, the load from the coherency vector
		 * could've been reordered ahead of the store to the
		 * active_cr3 field (in the context switch path, our
		 * caller). Re-consult the pending invalidation vector
		 * after the CR3 write. We rely on MOV CR3's documented
		 * serializing property to avoid insertion of an expensive
		 * barrier. (DRK)
		 */
		pending_flush = (tpmap->pmap_pcid_coherency_vector[ccpu] != 0);
		if (__improbable(pending_flush != 0)) {
			pmap_pcid_validate_cpu(tpmap, ccpu);
			preserve = false;
		}
	}

	if (preserve == false) {
		bool gtlbi = (invpcid_enabled == false);
		pmap_tlbi_range(0, ~0ULL, gtlbi, new_pcid);
	}

	uint64_t spcid = PMAP_UPCIDP(new_pcid);
	uint64_t scr3 = tpmap->pm_ucr3 | spcid;

	cpu_datap(ccpu)->cpu_ucr3 = scr3;
	cpu_shadowp(ccpu)->cpu_ucr3 = scr3;

	cpu_datap(ccpu)->cpu_pmap_pcid_coherentp = &(tpmap->pmap_pcid_coherency_vector[ccpu]);
#if DEBUG
	cpu_datap(ccpu)->cpu_pcid_last_cr3 = scr3;
	KERNEL_DEBUG_CONSTANT(0x9c1d0000, tpmap, new_pcid, pending_flush, pcid_conflict, 0);
#endif
}

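/*
 * INVPCID support. The invalidation types and the descriptor layout below
 * follow the Intel SDM definition of the INVPCID instruction: type 0
 * invalidates a single (PCID, address) translation, type 1 all non-global
 * translations tagged with one PCID, type 2 all translations including
 * globals, and type 3 all non-global translations across all PCIDs.
 */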
typedef enum {
	INP_ALLG = 2, INP_ASPACE = 1, INP_SINGLE = 0, INP_ALLNG = 3
} invpcid_type_t;
typedef struct __attribute__((packed)) {
	uint64_t ipcid_and_rsvd;
	uint64_t iaddr;
} invpcid_desc_t;

static inline void
invpcid(invpcid_type_t itype, pcid_t ipcid, uint64_t iaddr)
{
	invpcid_desc_t ipcdt;

	ipcdt.ipcid_and_rsvd = ipcid;
	ipcdt.iaddr = iaddr;

	uint64_t iptype = itype; // promote to work around an assembler bug

	__asm__ volatile ("invpcid %0, %1" :: "m" (ipcdt), "r" (iptype) : "memory");
}


void
pmap_tlbi_range(uint64_t startv, uint64_t endv, bool global, uint16_t pcid)
{
	assert(ml_get_interrupts_enabled() == FALSE ||
	    get_preemption_level() != 0);

	if (invpcid_enabled) {
		if (global) {
			invpcid(INP_ALLG, 0, 0ULL);
		} else {
			/* TODO: separate large page invalidation check */
			if ((endv - startv) >= INP_MAX) {
				invpcid(INP_ASPACE, pcid, 0ULL);
				if (pcid) {
					invpcid(INP_ASPACE, (pcid + PMAP_PCID_MAX_PCID), 0ULL);
				}
			} else {
				uint64_t cv = startv;
				for (; cv < endv; cv += PAGE_SIZE) {
					invpcid(INP_SINGLE, pcid, cv);
					if (pcid) {
						invpcid(INP_SINGLE, (pcid + PMAP_PCID_MAX_PCID), cv);
					}
				}
			}
		}
	} else {
		if (pmap_pcid_ncpus) {
			/* Toggling CR4.PGE invalidates the entire TLB,
			 * including global entries.
			 */
			uintptr_t cr4 = get_cr4();
			if (__improbable((cr4 & CR4_PGE) == 0)) {
				set_cr4(cr4 | CR4_PGE);
			} else {
				set_cr4(cr4 & ~CR4_PGE);
				set_cr4(cr4 | CR4_PGE);
			}
		} else {
			/* Without PCIDs, reloading CR3 flushes all non-global
			 * entries.
			 */
			set_cr3_raw(get_cr3_raw());
		}
	}
	__c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
}
451