xref: /xnu-10002.61.3/osfmk/kern/ecc.h (revision 0f4c859e951fba394238ab619495c4e1d54d0f34)
1 /*
2  * Copyright (c) 2013 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #pragma once
30 
31 #include <mach/kern_return.h>
32 #include <stdint.h>
33 #include <sys/cdefs.h>
34 #include <mach/vm_types.h>
35 
36 __BEGIN_DECLS
37 
#ifdef XNU_KERNEL_PRIVATE
/*
 * Bad-page bookkeeping, visible only inside xnu.  Both symbols are defined
 * in another translation unit.
 * NOTE(review): going by the names, ecc_bad_pages_count is the number of
 * entries reachable through ecc_bad_pages -- confirm against the definition.
 */
extern ppnum_t  *ecc_bad_pages;
extern uint32_t ecc_bad_pages_count;

/* Counters exported through sysctls. */
extern uint32_t vm_ecc_db_pages_count;
extern uint32_t vm_ecc_zero_pages_count;
extern uint32_t vm_ecc_panic_pages_count;
extern uint32_t vm_ecc_max_db_pages;
#endif /* XNU_KERNEL_PRIVATE */
48 
49 /* Old ECC logging mechanism */
50 
/* Number of 64-bit words available in the payload of a struct ecc_event. */
#define ECC_EVENT_INFO_DATA_ENTRIES     8

/*
 * Event record used by the old ECC logging mechanism: a two-byte header
 * followed by a fixed-capacity array of 64-bit payload words.
 */
struct ecc_event {
	/* ID of memory (e.g. L2C), platform-specific. */
	uint8_t         id;
	/* Number of uint64_t's used in data[], starting at index 0. */
	uint8_t         count;
	/* Event-specific data; the array is explicitly 8-byte aligned. */
	uint64_t        data[ECC_EVENT_INFO_DATA_ENTRIES] __attribute__((aligned(8)));
};
57 
#ifdef KERNEL_PRIVATE
/*
 * Entry point of the old logging mechanism: hands one event record to the
 * ECC log.  The record is taken by const pointer, so the callee does not
 * modify it.
 * NOTE(review): the meaning of the individual kern_return_t values is not
 * visible in this header -- check the implementation.
 */
extern kern_return_t
ecc_log_record_event(const struct ecc_event *ev);
#endif /* KERNEL_PRIVATE */
61 
#ifdef XNU_KERNEL_PRIVATE
#include <mach/vm_param.h>

/* Magic byte placed in the low bits of a panic-page signature. */
#define ECC_PANIC_PAGE_MAGIC 0xEC
/* Signature value: bit 63 set, combined with the magic byte. */
#define ECC_PANIC_PAGE_SIGN ((1ULL << 63) | (ECC_PANIC_PAGE_MAGIC))
/*
 * Mask covering bit 63 plus the page-offset bits (PAGE_MASK).
 * NOTE(review): presumably a value v is treated as a panic-page marker when
 * (v & ECC_PANIC_PAGE_MASK) == ECC_PANIC_PAGE_SIGN -- confirm with the users
 * of these macros.
 */
#define ECC_PANIC_PAGE_MASK ((1ULL << 63) | (PAGE_MASK))

/*
 * Consumer side of the old logging mechanism.
 * NOTE(review): by its name and non-const parameter, ecc_log_get_next_event()
 * fills *ev with the next logged event -- confirm against the implementation.
 */
extern kern_return_t
ecc_log_get_next_event(struct ecc_event *ev);

/* Returns a 32-bit correction count kept by the old logging mechanism. */
extern uint32_t
ecc_log_get_correction_count(void);
#endif /* XNU_KERNEL_PRIVATE */
71 
/*
 * Non-zero when either DEVELOPMENT or DEBUG is non-zero, i.e. ECC test
 * support is tied to those two build configurations.
 */
#define ECC_TESTING (DEVELOPMENT || DEBUG)
73 
74 /* New CoreAnalytics ECC logging mechanism */
75 
/* log2 of the page-poison granule. */
#define VM_ECC_PAGE_POISON_GRANULE_SHIFT (7)
/*
 * Page-poison granule derived from the shift above: 1 << 7 == 128.
 * NOTE(review): the unit is not stated here; presumably bytes -- confirm.
 */
#define VM_ECC_PAGE_POISON_GRANULE (1 << VM_ECC_PAGE_POISON_GRANULE_SHIFT)
78 
79 /* Flags to describe ECC memory errors */
80 __options_decl(ecc_flags_t, uint32_t, {
81 	ECC_NONE                        = 0x00000000,
82 	/* An error is correctable (1) or uncorrectable (0). */
83 	ECC_IS_CORRECTABLE              = 0x00000001,
84 	/* The database is corrupt. */
85 	ECC_DB_CORRUPTED                = 0x00000002,
86 	/* The error was injected for testing purposes. */
87 	ECC_IS_TEST_ERROR               = 0x00000004,
88 	/* Do not trigger a CA report, just record to the DB (for testing purposes) */
89 	ECC_DB_ONLY                     = 0x00000008,
90 });
91 
92 /**
93  * ECC versions.
94  */
95 __options_decl(ecc_version_t, uint32_t, {
96 	ECC_V1,
97 
98 	// Metadata
99 	ECC_NUM_VERSIONS
100 });
101 
102 /**
103  * ECC event descriptor.
104  *
105  * @note If a new ECC version has been added (e.g. future hardware must
106  * log new or different data) new fields should be appended to this struct to
107  * represent the new data.  No fields should be deleted from this struct unless
108  * the field corresponds only to hardware that has been deprecated.
109  */
110 typedef struct {
111 	/* Version of this struct. */
112 	ecc_version_t version;
113 	/* Flags describing the reported error. */
114 	ecc_flags_t flags;
115 	/* Physical address of failure */
116 	uint64_t physaddr;
117 	/* Number of CEs reported at physaddr */
118 	uint32_t ce_count;
119 	/* Vendor ID */
120 	uint32_t vendor;
121 	/* Reserved for future extension to report row, column, bank, etc. */
122 	uint32_t reserved[4];
123 } ecc_event_t;
124 _Static_assert(sizeof(ecc_event_t) == 10 * sizeof(uint32_t), "ecc_event_t size must be updated in memory_error_notification.defs");
125 
126 /**
127  * platform_error_handler_ecc_poll_t is the type of callback registered by the
128  * platform error handler that xnu can use to poll for ECC data.
129  */
130 typedef int (*platform_error_handler_ecc_poll_t)(uint64_t *addrs, uint32_t *error_count);
131 kern_return_t kern_ecc_poll_register(platform_error_handler_ecc_poll_t poll_func, uint32_t max_errors);
132 
133 /* Flags to describe MCC memory errors */
134 __options_decl(mcc_flags_t, uint32_t, {
135 	MCC_NONE                        = 0x00000000,
136 	MCC_IS_SINGLE_BIT               = 0x00000001,
137 	MCC_IS_MULTI_BIT                = 0x00000002,
138 });
139 
/**
 * Versions of the MCC ECC event descriptor.
 *
 * New versions go before MCC_ECC_NUM_VERSIONS so that it keeps counting the
 * versions defined above it.
 */
typedef enum {
	MCC_ECC_V1 = 0,

	/* Metadata: number of versions defined above. */
	MCC_ECC_NUM_VERSIONS
} mcc_ecc_version_t;
149 
150 /**
151  * MCC ECC event descriptor.
152  *
153  * @note If a new MCC ECC version has been added, because i.e. future hardware must log new or different data,
154  * new fields should be appended to this struct to represent the new data.  No fields should be
155  * deleted from this struct unless the field corresponds only to hardware that has been deprecated.
156  */
157 typedef struct {
158 	/* Version of this struct. */
159 	mcc_ecc_version_t version;
160 	/* Flags used to describe the error. */
161 	mcc_flags_t flags;
162 	/* Interrupt status at the time of the MCC error. */
163 	uint32_t status;
164 	/* AMCC on which the error occurred. */
165 	uint32_t amcc;
166 	/* Plane of the AMCC on which the error occurred. */
167 	uint32_t plane;
168 	/* MemCache error Bank of first one bit error. */
169 	uint32_t bank;
170 	/* MemCache error Way of first one bit error. */
171 	uint32_t way;
172 	/* MemCache error Index of first one bit error. */
173 	uint32_t index;
174 	/* Indicates whether the error is in upper half cache line or lower half cache line. */
175 	uint32_t bit_off_cl;
176 	/* MemCache one bit error bit offset of first one bit error with in half cache line. */
177 	uint32_t bit_off_within_hcl;
178 } mcc_ecc_event_t;
179 _Static_assert(sizeof(mcc_ecc_event_t) == 10 * sizeof(uint32_t), "ecc_event_t size must be updated in memory_error_notification.defs");
180 
181 #if KERNEL_PRIVATE
182 
183 /**
184  * Logs any memory error.
185  *
186  * This will notify mmaintenanced of the error. The error
187  * will get added to a database of errors and sent to
188  * CoreAnalytics. If ECC_IS_CORRECTABLE == 0,
189  * the address will be added to dramecc.db and will
190  * be retired for the lifetime of the device.
191  *
192  * If it is too early in boot to send a notification directly
193  * to the deamon, the error will be added to an array to be serviced
194  * later by an mpsc_daemon_queue.
195  *
196  * If ECC_IS_CORRECTABLE flag is set with this function, it
197  * assumes one error. If caller wishes to report the CE count
198  * reported by hardware, use ecc_log_memory_error_ce().
199  *
200  * @param physical_address address that the error occured on
201  * @param ecc_flags flags used to describe the error
202  *
203  * @returns KERN_SUCCESS if logging supported by hw, KERN_FAILURE if not
204  */
205 extern kern_return_t ecc_log_memory_error(uint64_t physical_address, ecc_flags_t ecc_flags);
206 extern kern_return_t ecc_log_memory_error_internal(uint64_t physical_address, ecc_flags_t ecc_flags);
207 
208 /*
209  * Used to report delayed errors, scraped after ECC is enabled.
210  */
211 extern kern_return_t ecc_log_memory_error_delayed(uint64_t physical_address, ecc_flags_t ecc_flags);
212 
213 /**
214  * Logs a correctable memory error.
215  *
216  * ECC_IS_CORRECTABLE is implied. Including this flag or not
217  * makes no difference for this function.
218  *
219  * @param physical_address address that the error occured on
220  * @param ecc_flags flags used to describe the error
221  * @param ce_count number of CEs occured on this page reported by HW
222  *
223  * @returns KERN_SUCCESS if logging supported by hw, KERN_FAILURE if not
224  */
225 kern_return_t ecc_log_memory_error_ce(uint64_t physical_address, ecc_flags_t ecc_flags, uint32_t ce_count);
226 
227 /**
228  * Logs an MCC error.
229  *
230  * @param event Event to be logged
231  * @returns KERN_SUCCESS on success, KERN_FAILURE otherwise
232  */
233 kern_return_t
234 mcc_log_memory_error(mcc_ecc_event_t event);
235 
236 #endif /* KERNEL_PRIVATE */
237 
238 __END_DECLS
239