xref: /xnu-11215.81.4/osfmk/kern/ecc.h (revision d4514f0bc1d3f944c22d92e68b646ac3fb40d452)
1 /*
2  * Copyright (c) 2013 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #pragma once
30 
31 #include <mach/kern_return.h>
32 #include <stdint.h>
33 #include <sys/cdefs.h>
34 #include <mach/vm_types.h>
35 
36 __BEGIN_DECLS
37 
#ifdef XNU_KERNEL_PRIVATE
/*
 * Bad-page bookkeeping, defined outside this header.
 * NOTE(review): presumably ecc_bad_pages points at ecc_bad_pages_count
 * page numbers -- confirm at the definition.
 */
extern ppnum_t  *ecc_bad_pages;
extern uint32_t  ecc_bad_pages_count;

/* Counters exposed through sysctls. */
extern uint32_t  vm_ecc_db_pages_count;
extern uint32_t  vm_ecc_zero_pages_count;
extern uint32_t  vm_ecc_panic_pages_count;
extern uint32_t  vm_ecc_max_db_pages;
#endif /* XNU_KERNEL_PRIVATE */
48 
49 /* Old ECC logging mechanism */
50 
/* Capacity, in 64-bit words, of the payload carried by struct ecc_event. */
#define ECC_EVENT_INFO_DATA_ENTRIES     8

/*
 * Event record used by the old ECC logging mechanism.
 */
struct ecc_event {
	/* ID of memory (e.g. L2C), platform-specific. */
	uint8_t         id;
	/* Number of uint64_t entries of data[] in use, starting at index 0. */
	uint8_t         count;
	/* Event-specific data; forced to 8-byte alignment. */
	uint64_t        data[ECC_EVENT_INFO_DATA_ENTRIES] __attribute__((aligned(8)));
};
57 
#ifdef KERNEL_PRIVATE
/*
 * Entry point of the old ECC logging mechanism. Takes a read-only pointer
 * to the event; the implementation is not visible from this header.
 */
extern kern_return_t
ecc_log_record_event(const struct ecc_event *ev);
#endif /* KERNEL_PRIVATE */
61 
#ifdef XNU_KERNEL_PRIVATE
#include <mach/vm_param.h>

/* Magic byte used in the panic-page signature. */
#define ECC_PANIC_PAGE_MAGIC    0xEC
/* Signature value: bit 63 set, OR'ed with the magic byte. */
#define ECC_PANIC_PAGE_SIGN     ((1ULL << 63) | (ECC_PANIC_PAGE_MAGIC))
/* Mask value: bit 63 set, OR'ed with PAGE_MASK. */
#define ECC_PANIC_PAGE_MASK     ((1ULL << 63) | (PAGE_MASK))

/* Old logging mechanism: accessors whose implementations live elsewhere. */
extern kern_return_t
ecc_log_get_next_event(struct ecc_event *ev);

extern uint32_t
ecc_log_get_correction_count(void);
#endif /* XNU_KERNEL_PRIVATE */
71 
/* Evaluates to true when either DEVELOPMENT or DEBUG is non-zero. */
#define ECC_TESTING                         (DEVELOPMENT || DEBUG)

/* New CoreAnalytics ECC logging mechanism */

/* Page-poison granule, expressed as log2 and as the resulting power of two. */
#define VM_ECC_PAGE_POISON_GRANULE_SHIFT    (7)
#define VM_ECC_PAGE_POISON_GRANULE          (1 << VM_ECC_PAGE_POISON_GRANULE_SHIFT)
78 
79 /* Flags to describe ECC memory errors */
80 __options_decl(ecc_flags_t, uint32_t, {
81 	ECC_NONE                        = 0x00000000,
82 	/* An error is correctable (1) or uncorrectable (0). */
83 	ECC_IS_CORRECTABLE              = 0x00000001,
84 	/* The database is corrupt. */
85 	ECC_DB_CORRUPTED                = 0x00000002,
86 	/* The error was injected for testing purposes. */
87 	ECC_IS_TEST_ERROR               = 0x00000004,
88 	/* Do not trigger a CA report, just record to the DB (for testing purposes) */
89 	ECC_DB_ONLY                     = 0x00000008,
90 	/* Filter out the given address from the DB*/
91 	ECC_REMOVE_ADDR                     = 0x00000010
92 });
93 
94 /**
95  * ECC versions.
96  */
97 __options_decl(ecc_version_t, uint32_t, {
98 	ECC_V1,
99 
100 	// Metadata
101 	ECC_NUM_VERSIONS
102 });
103 
104 /**
105  * ECC event descriptor.
106  *
107  * @note If a new ECC version has been added (e.g. future hardware must
108  * log new or different data) new fields should be appended to this struct to
109  * represent the new data.  No fields should be deleted from this struct unless
110  * the field corresponds only to hardware that has been deprecated.
111  */
112 typedef struct {
113 	/* Version of this struct. */
114 	ecc_version_t version;
115 	/* Flags describing the reported error. */
116 	ecc_flags_t flags;
117 	/* Physical address of failure */
118 	uint64_t physaddr;
119 	/* Number of CEs reported at physaddr */
120 	uint32_t ce_count;
121 	/* Vendor ID */
122 	uint32_t vendor;
123 	/* Reserved for future extension to report row, column, bank, etc. */
124 	uint32_t reserved[4];
125 } ecc_event_t;
126 _Static_assert(sizeof(ecc_event_t) == 10 * sizeof(uint32_t), "ecc_event_t size must be updated in memory_error_notification.defs");
127 
128 /**
129  * platform_error_handler_ecc_poll_t is the type of callback registered by the
130  * platform error handler that xnu can use to poll for ECC data.
131  */
132 typedef int (*platform_error_handler_ecc_poll_t)(uint64_t *addrs, uint32_t *error_count);
133 kern_return_t kern_ecc_poll_register(platform_error_handler_ecc_poll_t poll_func, uint32_t max_errors);
134 
135 /* Flags to describe MCC memory errors */
136 __options_decl(mcc_flags_t, uint32_t, {
137 	MCC_NONE                        = 0x00000000,
138 	MCC_IS_SINGLE_BIT               = 0x00000001,
139 	MCC_IS_MULTI_BIT                = 0x00000002,
140 });
141 
/**
 * Versions of the MCC ECC event descriptor.
 */
typedef enum {
	MCC_ECC_V1 = 0,

	/* Metadata: number of versions defined above. */
	MCC_ECC_NUM_VERSIONS
} mcc_ecc_version_t;
151 
152 /**
153  * MCC ECC event descriptor.
154  *
155  * @note If a new MCC ECC version has been added, because i.e. future hardware must log new or different data,
156  * new fields should be appended to this struct to represent the new data.  No fields should be
157  * deleted from this struct unless the field corresponds only to hardware that has been deprecated.
158  */
159 typedef struct {
160 	/* Version of this struct. */
161 	mcc_ecc_version_t version;
162 	/* Flags used to describe the error. */
163 	mcc_flags_t flags;
164 	/* Interrupt status at the time of the MCC error. */
165 	uint32_t status;
166 	/* AMCC on which the error occurred. */
167 	uint32_t amcc;
168 	/* Plane of the AMCC on which the error occurred. */
169 	uint32_t plane;
170 	/* MemCache error Bank of first one bit error. */
171 	uint32_t bank;
172 	/* MemCache error Way of first one bit error. */
173 	uint32_t way;
174 	/* MemCache error Index of first one bit error. */
175 	uint32_t index;
176 	/* Indicates whether the error is in upper half cache line or lower half cache line. */
177 	uint32_t bit_off_cl;
178 	/* MemCache one bit error bit offset of first one bit error with in half cache line. */
179 	uint32_t bit_off_within_hcl;
180 } mcc_ecc_event_t;
181 _Static_assert(sizeof(mcc_ecc_event_t) == 10 * sizeof(uint32_t), "ecc_event_t size must be updated in memory_error_notification.defs");
182 
183 #if KERNEL_PRIVATE
184 
185 /**
186  * Logs any memory error.
187  *
188  * This will notify mmaintenanced of the error. The error
189  * will get added to a database of errors and sent to
190  * CoreAnalytics. If ECC_IS_CORRECTABLE == 0,
191  * the address will be added to dramecc.db and will
192  * be retired for the lifetime of the device.
193  *
194  * If it is too early in boot to send a notification directly
195  * to the deamon, the error will be added to an array to be serviced
196  * later by an mpsc_daemon_queue.
197  *
198  * If ECC_IS_CORRECTABLE flag is set with this function, it
199  * assumes one error. If caller wishes to report the CE count
200  * reported by hardware, use ecc_log_memory_error_ce().
201  *
202  * @param physical_address address that the error occured on
203  * @param ecc_flags flags used to describe the error
204  *
205  * @returns KERN_SUCCESS if logging supported by hw, KERN_FAILURE if not
206  */
207 extern kern_return_t ecc_log_memory_error(uint64_t physical_address, ecc_flags_t ecc_flags);
208 extern kern_return_t ecc_log_memory_error_internal(uint64_t physical_address, ecc_flags_t ecc_flags);
209 
210 /*
211  * Used to report delayed errors, scraped after ECC is enabled.
212  */
213 extern kern_return_t ecc_log_memory_error_delayed(uint64_t physical_address, ecc_flags_t ecc_flags);
214 
215 /**
216  * Logs a correctable memory error.
217  *
218  * ECC_IS_CORRECTABLE is implied. Including this flag or not
219  * makes no difference for this function.
220  *
221  * @param physical_address address that the error occured on
222  * @param ecc_flags flags used to describe the error
223  * @param ce_count number of CEs occured on this page reported by HW
224  *
225  * @returns KERN_SUCCESS if logging supported by hw, KERN_FAILURE if not
226  */
227 kern_return_t ecc_log_memory_error_ce(uint64_t physical_address, ecc_flags_t ecc_flags, uint32_t ce_count);
228 
229 /**
230  * Logs an MCC error.
231  *
232  * @param event Event to be logged
233  * @returns KERN_SUCCESS on success, KERN_FAILURE otherwise
234  */
235 kern_return_t
236 mcc_log_memory_error(mcc_ecc_event_t event);
237 
238 #endif /* KERNEL_PRIVATE */
239 
240 __END_DECLS
241