xref: /xnu-11417.140.69/osfmk/kern/ecc_logging.c (revision 43a90889846e00bfb5cf1d255cdc0a701a1e05a4)
1 /*
2  * Copyright (c) 2013 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <mach/host_priv.h>
30 #include <mach/host_special_ports.h>
31 #include <mach/memory_error_notification.h>
32 
33 #include <mach/mach_types.h>
34 #include <mach/host_info.h>
35 #include <kern/host.h>
36 #include <kern/locks.h>
37 #include <kern/ecc.h>
38 #include <kern/spl.h>
39 #include <kern/mpsc_queue.h>
40 #include <kern/thread.h>
41 #include <kern/thread_call.h>
42 #include <kern/startup.h>
43 #include <os/log.h>
44 #include <pexpert/pexpert.h>
45 #include <pexpert/device_tree.h>
46 #include <libkern/OSAtomic.h>
47 #include <arm/pmap_public.h>
48 #include <vm/vm_page.h>
49 #include <vm/vm_protos.h>
50 
51 /* New CoreAnalytics ECC logging mechanism */
52 
53 /**
54  * Stubs for targets which do not support ECC.
55  */
56 
/*
 * Stub for targets without ECC support: reports that ECC memory-error
 * logging is unavailable.  Both arguments are ignored.
 *
 * Returns: KERN_NOT_SUPPORTED always.
 */
kern_return_t
ecc_log_memory_error(
	__unused pmap_paddr_t physical_address,
	__unused uint32_t ecc_flags)
{
	return KERN_NOT_SUPPORTED;
}
64 
/*
 * Stub for targets without ECC support (internal variant).  Arguments
 * are ignored.
 *
 * Returns: KERN_NOT_SUPPORTED always.
 */
kern_return_t
ecc_log_memory_error_internal(
	__unused pmap_paddr_t physical_address,
	__unused uint32_t ecc_flags)
{
	return KERN_NOT_SUPPORTED;
}
72 
/*
 * Stub for targets without ECC support: correctable-error (CE) variant.
 * All arguments, including the CE count, are ignored.
 *
 * Returns: KERN_NOT_SUPPORTED always.
 */
kern_return_t
ecc_log_memory_error_ce(
	__unused pmap_paddr_t physical_address,
	__unused uint32_t ecc_flags,
	__unused uint32_t ce_count)
{
	return KERN_NOT_SUPPORTED;
}
81 
82 
/*
 * Stub for targets without ECC support: registration of a platform
 * ECC polling handler is not available, so the handler and its error
 * budget are ignored.
 *
 * Returns: KERN_NOT_SUPPORTED always.
 */
kern_return_t
kern_ecc_poll_register(
	__unused platform_error_handler_ecc_poll_t poll_func,
	__unused uint32_t max_errors)
{
	return KERN_NOT_SUPPORTED;
}
90 
/*
 * Used to report earlier errors that were found after ECC gets enabled.
 * We don't want the VM to panic for these.
 *
 * Stub for targets without ECC support; arguments are ignored.
 *
 * Returns: KERN_FAILURE always (note: unlike the other stubs above,
 * this one returns KERN_FAILURE rather than KERN_NOT_SUPPORTED).
 */
kern_return_t
ecc_log_memory_error_delayed(
	__unused pmap_paddr_t physical_address,
	__unused uint32_t ecc_flags)
{
	return KERN_FAILURE;
}
102 
103 /**
104  * MCC Logging
105  */
106 
107 /**
108  * TODO: rdar://97394997 (Clean up ECC / MCC logging)
109  * We can probably clean some of this up and share some of the code with ECC.
110  */
111 #if XNU_HANDLE_MCC
112 
/* Daemon queue that drains pending MCC events to userspace on a kernel thread. */
static struct mpsc_daemon_queue mcc_memory_error_event_queue;
/* One pending MCC event: queue linkage plus the event payload by value. */
struct _mcc_mem_err_event {
	struct mpsc_queue_chain link;
	mcc_ecc_event_t event;
};
typedef struct _mcc_mem_err_event* mcc_mem_err_event_t;

/* Capacity of the statically-allocated event ring buffer. */
#define MCC_ECC_NUM_ERRORS (1024)
/* Priority of the daemon thread that delivers events to userspace. */
#define MCC_ERROR_EVENT_QUEUE_PRIORITY MAXPRI_USER
/*
 * Ring buffer of event slots.  Events may be produced from the primary
 * interrupt context, where dynamic allocation is not possible, so slots
 * come from this static array.  One slot is always left unused to
 * distinguish "full" from "empty" (producer_idx + 1 == consumer_idx
 * means full).
 */
static struct _mcc_mem_err_event mcc_events[MCC_ECC_NUM_ERRORS];
static atomic_int mcc_events_producer_idx = 0;
static atomic_int mcc_events_consumer_idx = 0;
/* Count of events dropped because the ring buffer was full. */
SCALABLE_COUNTER_DEFINE(mcc_dropped_events);
/* Spinlock serializing producers claiming ring-buffer slots. */
LCK_GRP_DECLARE(mcc_lock_grp, "mcc");
LCK_SPIN_DECLARE(mcc_lock, &mcc_lock_grp);
128 
129 static inline int
mcc_events_next(int idx)130 mcc_events_next(int idx)
131 {
132 	assert(idx < MCC_ECC_NUM_ERRORS);
133 	return (idx + 1) % MCC_ECC_NUM_ERRORS;
134 }
135 
136 /* MCC ECC CoreAnalytics Error Logging */
137 static void
mcc_error_notify_user(mcc_ecc_event_t event)138 mcc_error_notify_user(mcc_ecc_event_t event)
139 {
140 	mach_port_t user_port = MACH_PORT_NULL;
141 	kern_return_t kr;
142 
143 	kr = host_get_memory_error_port(host_priv_self(), &user_port);
144 	assert(kr == KERN_SUCCESS);
145 	if (!IPC_PORT_VALID(user_port)) {
146 		os_log_error(OS_LOG_DEFAULT, "Failed to get memory error port - mcc");
147 		return;
148 	}
149 
150 	mcc_memory_error_notification(user_port, event);
151 
152 	ipc_port_release_send(user_port);
153 }
154 
155 static void
mcc_memory_error_event_queue_invoke(mpsc_queue_chain_t e,mpsc_daemon_queue_t queue __unused)156 mcc_memory_error_event_queue_invoke(mpsc_queue_chain_t e, mpsc_daemon_queue_t queue __unused)
157 {
158 	mcc_mem_err_event_t event;
159 
160 	/* The consumer should never be invoked if there is nothing to consume. */
161 	int mcc_events_consumer_curr_idx = atomic_load(&mcc_events_consumer_idx);
162 	assert(mcc_events_consumer_curr_idx != atomic_load(&mcc_events_producer_idx));
163 
164 	event = mpsc_queue_element(e, struct _mcc_mem_err_event, link);
165 	mcc_error_notify_user(event->event);
166 	int mcc_events_consumer_next_idx = mcc_events_next(mcc_events_consumer_curr_idx);
167 	atomic_store(&mcc_events_consumer_idx, mcc_events_consumer_next_idx);
168 }
169 
170 static mcc_mem_err_event_t
mcc_memory_error_create_event(mcc_ecc_event_t mcc_event)171 mcc_memory_error_create_event(mcc_ecc_event_t mcc_event)
172 {
173 	mcc_mem_err_event_t ret = NULL;
174 
175 	/**
176 	 * @note We are unable to dynamically allocate events, because this function can be called from
177 	 * the primary interrupt context.  Instead, we allocate from a statically sized ring buffer.
178 	 */
179 	const boolean_t interrupts_enabled = ml_set_interrupts_enabled(FALSE);
180 	lck_spin_lock(&mcc_lock);
181 	int mcc_events_producer_curr_idx = atomic_load(&mcc_events_producer_idx);
182 	int mcc_events_producer_next_idx = mcc_events_next(mcc_events_producer_curr_idx);
183 	if (mcc_events_producer_next_idx == atomic_load(&mcc_events_consumer_idx)) {
184 		/**
185 		 * The consumer is running behind the producer, and we're in the primary interrupt context.
186 		 * Drop this event and return NULL to the caller.
187 		 */
188 		counter_inc(&mcc_dropped_events);
189 		ret = NULL;
190 		goto done;
191 	}
192 
193 	mcc_mem_err_event_t event = &mcc_events[mcc_events_producer_curr_idx];
194 	event->event = mcc_event;
195 	atomic_store(&mcc_events_producer_idx, mcc_events_producer_next_idx);
196 	ret = event;
197 
198 done:
199 	lck_spin_unlock(&mcc_lock);
200 	ml_set_interrupts_enabled(interrupts_enabled);
201 	return ret;
202 }
203 
/*
 * One-time startup initialization of the MCC logging daemon queue.
 * The queue is created inactive and then activated explicitly; it runs
 * on its own kernel thread at MCC_ERROR_EVENT_QUEUE_PRIORITY.
 */
__startup_func
static void
mcc_logging_init(void)
{
	mpsc_daemon_queue_init_with_thread(&mcc_memory_error_event_queue,
	    mcc_memory_error_event_queue_invoke, MCC_ERROR_EVENT_QUEUE_PRIORITY,
	    "daemon.mcc_error-events", MPSC_DAEMON_INIT_INACTIVE);

	mpsc_daemon_queue_activate(&mcc_memory_error_event_queue);
}
/* Run after thread_call infrastructure is up so the daemon thread can be created. */
STARTUP(THREAD_CALL, STARTUP_RANK_MIDDLE, mcc_logging_init);
215 
216 #endif /* XNU_HANDLE_MCC */
217 
/*
 * Public entry point for logging an MCC ECC event.  Safe to call from
 * the primary interrupt context: the event is copied into a static
 * ring-buffer slot and handed to a daemon queue for asynchronous
 * delivery to userspace.
 *
 * Returns:
 *   KERN_SUCCESS            event queued for delivery
 *   KERN_RESOURCE_SHORTAGE  ring buffer full; event dropped
 *   KERN_FAILURE            kernel built without XNU_HANDLE_MCC
 */
kern_return_t
mcc_log_memory_error(mcc_ecc_event_t mcc_event __unused)
{
#if XNU_HANDLE_MCC
	mcc_mem_err_event_t event = mcc_memory_error_create_event(mcc_event);
	if (event == NULL) {
		return KERN_RESOURCE_SHORTAGE;
	}
	/* Queue must have been activated by mcc_logging_init at startup. */
	assert(mcc_memory_error_event_queue.mpd_thread != NULL);
	mpsc_daemon_enqueue(&mcc_memory_error_event_queue,
	    &event->link, MPSC_QUEUE_DISABLE_PREEMPTION);
	return KERN_SUCCESS;
#else
	return KERN_FAILURE;
#endif
}
234 
235 #if (DEBUG || DEVELOPMENT)
236 static int
mcc_memory_error_notify_test_run(int64_t in,int64_t * out)237 mcc_memory_error_notify_test_run(int64_t in, int64_t *out)
238 {
239 	printf("Running mcc_memory_error_notify_test for %llu iterations\n", in);
240 	for (uint64_t i = 0; i < in; i++) {
241 		mcc_ecc_event_t event = {.version = MCC_ECC_V1, .status = (uint32_t)i};
242 		/**
243 		 * To accurately test mcc_log_memory_error, we must disable preemption, because it is called
244 		 * from the primary interrupt context.
245 		 */
246 		disable_preemption();
247 		mcc_log_memory_error(event);
248 		enable_preemption();
249 	}
250 
251 	*out = 1;
252 	return 0;
253 }
254 
255 SYSCTL_TEST_REGISTER(mcc_memory_error_notify_test, mcc_memory_error_notify_test_run);
256 #endif /* (DEBUG || DEVELOPMENT) */
257 
258 
259 /* Legacy ECC logging mechanism */
260 
261 /*
262  * ECC data.  Not really KPCs, but this still seems like the
263  * best home for this code.
264  *
265  * Circular buffer of events.  When we fill up, drop data.
266  */
/* Capacity of the legacy ECC event circular buffer. */
#define ECC_EVENT_BUFFER_COUNT  (256)

/* Circular buffer of recorded ECC events, guarded by ecc_data_lock at splhigh. */
struct ecc_event                ecc_data[ECC_EVENT_BUFFER_COUNT];
static uint32_t                 ecc_data_next_read;
static uint32_t                 ecc_data_next_write;
static boolean_t                ecc_data_empty = TRUE; // next read == next write : empty or full?
static LCK_GRP_DECLARE(ecc_data_lock_group, "ecc-data");
static LCK_SPIN_DECLARE(ecc_data_lock, &ecc_data_lock_group);
/* Running total of corrections seen, incremented on every record attempt. */
static uint32_t                 ecc_correction_count;
276 
277 
278 uint32_t
ecc_log_get_correction_count()279 ecc_log_get_correction_count()
280 {
281 	return ecc_correction_count;
282 }
283 
284 kern_return_t
ecc_log_record_event(const struct ecc_event * ev)285 ecc_log_record_event(const struct ecc_event *ev)
286 {
287 	spl_t x;
288 
289 	if (ev->count > ECC_EVENT_INFO_DATA_ENTRIES) {
290 		panic("Count of %u on ecc event is too large.", (unsigned)ev->count);
291 	}
292 
293 	x = splhigh();
294 	lck_spin_lock(&ecc_data_lock);
295 
296 	ecc_correction_count++;
297 
298 	if (ecc_data_next_read == ecc_data_next_write && !ecc_data_empty) {
299 		lck_spin_unlock(&ecc_data_lock);
300 		splx(x);
301 		return KERN_FAILURE;
302 	}
303 
304 	bcopy(ev, &ecc_data[ecc_data_next_write], sizeof(*ev));
305 	ecc_data_next_write++;
306 	ecc_data_next_write %= ECC_EVENT_BUFFER_COUNT;
307 	ecc_data_empty = FALSE;
308 
309 	lck_spin_unlock(&ecc_data_lock);
310 	splx(x);
311 
312 	return KERN_SUCCESS;
313 }
314 
315 
316 kern_return_t
ecc_log_get_next_event(struct ecc_event * ev)317 ecc_log_get_next_event(struct ecc_event *ev)
318 {
319 	spl_t x;
320 
321 	x = splhigh();
322 	lck_spin_lock(&ecc_data_lock);
323 
324 	if (ecc_data_empty) {
325 		assert(ecc_data_next_write == ecc_data_next_read);
326 
327 		lck_spin_unlock(&ecc_data_lock);
328 		splx(x);
329 		return KERN_FAILURE;
330 	}
331 
332 	bcopy(&ecc_data[ecc_data_next_read], ev, sizeof(*ev));
333 	ecc_data_next_read++;
334 	ecc_data_next_read %= ECC_EVENT_BUFFER_COUNT;
335 
336 	if (ecc_data_next_read == ecc_data_next_write) {
337 		ecc_data_empty = TRUE;
338 	}
339 
340 	lck_spin_unlock(&ecc_data_lock);
341 	splx(x);
342 
343 	return KERN_SUCCESS;
344 }
345