xref: /xnu-10002.41.9/osfmk/kern/ecc.h (revision 699cd48037512bf4380799317ca44ca453c82f57)
1 /*
2  * Copyright (c) 2013 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #pragma once
30 
31 #include <mach/kern_return.h>
32 #include <stdint.h>
33 #include <sys/cdefs.h>
34 #include <mach/vm_types.h>
35 
36 __BEGIN_DECLS
37 
#ifdef XNU_KERNEL_PRIVATE
/*
 * Bad-page bookkeeping: a ppnum_t pointer and its companion counter.
 * NOTE(review): presumably ecc_bad_pages_count is the number of entries
 * reachable through ecc_bad_pages -- confirm at the definition site.
 */
extern ppnum_t  *ecc_bad_pages;
extern uint32_t ecc_bad_pages_count;

/* Counters exposed through sysctls. */
extern uint32_t vm_ecc_db_pages_count;
extern uint32_t vm_ecc_zero_pages_count;
extern uint32_t vm_ecc_panic_pages_count;
extern uint32_t vm_ecc_max_db_pages;
#endif /* XNU_KERNEL_PRIVATE */
48 
49 /* Old ECC logging mechanism */
50 
/* Capacity of ecc_event::data, in uint64_t entries. */
#define ECC_EVENT_INFO_DATA_ENTRIES     8

/*
 * One record of the old ECC logging mechanism.
 */
struct ecc_event {
	/* Platform-specific ID of the memory concerned (e.g. L2C). */
	uint8_t         id;
	/* Number of uint64_t entries of data[] in use, starting at index 0. */
	uint8_t         count;
	/* Event-specific data; explicitly aligned to 8 bytes. */
	uint64_t        data[ECC_EVENT_INFO_DATA_ENTRIES] __attribute__((aligned(8)));
};
57 
#ifdef KERNEL_PRIVATE
/*
 * Old-mechanism entry point for recording an event.  The event is passed by
 * const pointer, so the callee does not modify *ev.
 */
kern_return_t ecc_log_record_event(const struct ecc_event *ev);
#endif /* KERNEL_PRIVATE */

#ifdef XNU_KERNEL_PRIVATE
/*
 * NOTE(review): presumably copies the next logged event into *ev -- confirm
 * against the implementation.
 */
kern_return_t ecc_log_get_next_event(struct ecc_event *ev);

/*
 * NOTE(review): presumably the running count of corrections -- confirm
 * against the implementation.
 */
uint32_t ecc_log_get_correction_count(void);
#endif /* XNU_KERNEL_PRIVATE */
66 
/* Nonzero when either DEVELOPMENT or DEBUG is nonzero. */
#define ECC_TESTING (DEBUG || DEVELOPMENT)

/* New CoreAnalytics ECC logging mechanism */

/* Page-poison granule: log2 of the size, then the size itself (1 << 7 == 128). */
#define VM_ECC_PAGE_POISON_GRANULE_SHIFT (7)
#define VM_ECC_PAGE_POISON_GRANULE       (1 << VM_ECC_PAGE_POISON_GRANULE_SHIFT)
73 
74 /* Flags to describe ECC memory errors */
75 __options_decl(ecc_flags_t, uint32_t, {
76 	ECC_NONE                        = 0x00000000,
77 	/* An error is correctable (1) or uncorrectable (0). */
78 	ECC_IS_CORRECTABLE              = 0x00000001,
79 	/* The database is corrupt. */
80 	ECC_DB_CORRUPTED                = 0x00000002,
81 	/* The error was injected for testing purposes. */
82 	ECC_IS_TEST_ERROR               = 0x00000004,
83 	/* Do not trigger a CA report, just record to the DB (for testing purposes) */
84 	ECC_DB_ONLY                     = 0x00000008,
85 });
86 
87 /**
88  * ECC versions.
89  */
90 __options_decl(ecc_version_t, uint32_t, {
91 	ECC_V1,
92 
93 	// Metadata
94 	ECC_NUM_VERSIONS
95 });
96 
97 /**
98  * ECC event descriptor.
99  *
100  * @note If a new ECC version has been added (e.g. future hardware must
101  * log new or different data) new fields should be appended to this struct to
102  * represent the new data.  No fields should be deleted from this struct unless
103  * the field corresponds only to hardware that has been deprecated.
104  */
105 typedef struct {
106 	/* Version of this struct. */
107 	ecc_version_t version;
108 	/* Flags describing the reported error. */
109 	ecc_flags_t flags;
110 	/* Physical address of failure */
111 	uint64_t physaddr;
112 	/* Number of CEs reported at physaddr */
113 	uint32_t ce_count;
114 	/* Vendor ID */
115 	uint32_t vendor;
116 	/* Reserved for future extension to report row, column, bank, etc. */
117 	uint32_t reserved[4];
118 } ecc_event_t;
119 _Static_assert(sizeof(ecc_event_t) == 10 * sizeof(uint32_t), "ecc_event_t size must be updated in memory_error_notification.defs");
120 
121 /**
122  * platform_error_handler_ecc_poll_t is the type of callback registered by the
123  * platform error handler that xnu can use to poll for ECC data.
124  */
125 typedef int (*platform_error_handler_ecc_poll_t)(uint64_t *addrs, uint32_t *error_count);
126 kern_return_t kern_ecc_poll_register(platform_error_handler_ecc_poll_t poll_func, uint32_t max_errors);
127 
128 /* Flags to describe MCC memory errors */
129 __options_decl(mcc_flags_t, uint32_t, {
130 	MCC_NONE                        = 0x00000000,
131 	MCC_IS_SINGLE_BIT               = 0x00000001,
132 	MCC_IS_MULTI_BIT                = 0x00000002,
133 });
134 
/**
 * Versions of the MCC ECC event descriptor.
 *
 * NOTE(review): unlike ecc_version_t this enum has no fixed underlying type,
 * while the size assertion on mcc_ecc_event_t expects it to occupy one
 * 32-bit word.
 */
typedef enum {
	MCC_ECC_V1 = 0,

	/* Metadata: one past the last defined version; keep this entry last. */
	MCC_ECC_NUM_VERSIONS
} mcc_ecc_version_t;
144 
145 /**
146  * MCC ECC event descriptor.
147  *
148  * @note If a new MCC ECC version has been added, because i.e. future hardware must log new or different data,
149  * new fields should be appended to this struct to represent the new data.  No fields should be
150  * deleted from this struct unless the field corresponds only to hardware that has been deprecated.
151  */
152 typedef struct {
153 	/* Version of this struct. */
154 	mcc_ecc_version_t version;
155 	/* Flags used to describe the error. */
156 	mcc_flags_t flags;
157 	/* Interrupt status at the time of the MCC error. */
158 	uint32_t status;
159 	/* AMCC on which the error occurred. */
160 	uint32_t amcc;
161 	/* Plane of the AMCC on which the error occurred. */
162 	uint32_t plane;
163 	/* MemCache error Bank of first one bit error. */
164 	uint32_t bank;
165 	/* MemCache error Way of first one bit error. */
166 	uint32_t way;
167 	/* MemCache error Index of first one bit error. */
168 	uint32_t index;
169 	/* Indicates whether the error is in upper half cache line or lower half cache line. */
170 	uint32_t bit_off_cl;
171 	/* MemCache one bit error bit offset of first one bit error with in half cache line. */
172 	uint32_t bit_off_within_hcl;
173 } mcc_ecc_event_t;
174 _Static_assert(sizeof(mcc_ecc_event_t) == 10 * sizeof(uint32_t), "ecc_event_t size must be updated in memory_error_notification.defs");
175 
/*
 * Tested with #ifdef, matching the other KERNEL_PRIVATE guard in this header,
 * so that an undefined macro is never evaluated in an #if expression.
 */
#ifdef KERNEL_PRIVATE

/**
 * Logs any memory error.
 *
 * This will notify mmaintenanced of the error. The error
 * will get added to a database of errors and sent to
 * CoreAnalytics. If ECC_IS_CORRECTABLE == 0,
 * the address will be added to dramecc.db and will
 * be retired for the lifetime of the device.
 *
 * If it is too early in boot to send a notification directly
 * to the daemon, the error will be added to an array to be serviced
 * later by an mpsc_daemon_queue.
 *
 * If the ECC_IS_CORRECTABLE flag is set with this function, it
 * assumes one error. If the caller wishes to report the CE count
 * reported by hardware, use ecc_log_memory_error_ce().
 *
 * @param physical_address address that the error occurred on
 * @param ecc_flags flags used to describe the error
 *
 * @returns KERN_SUCCESS if logging supported by hw, KERN_FAILURE if not
 */
extern kern_return_t ecc_log_memory_error(uint64_t physical_address, ecc_flags_t ecc_flags);

/*
 * Takes the same arguments as ecc_log_memory_error().
 * NOTE(review): how this differs from ecc_log_memory_error() is not visible
 * in this header -- confirm against the implementation before calling it
 * directly.
 */
extern kern_return_t ecc_log_memory_error_internal(uint64_t physical_address, ecc_flags_t ecc_flags);

/*
 * Used to report delayed errors, scraped after ECC is enabled.
 */
extern kern_return_t ecc_log_memory_error_delayed(uint64_t physical_address, ecc_flags_t ecc_flags);

/**
 * Logs a correctable memory error.
 *
 * ECC_IS_CORRECTABLE is implied. Including this flag or not
 * makes no difference for this function.
 *
 * @param physical_address address that the error occurred on
 * @param ecc_flags flags used to describe the error
 * @param ce_count number of CEs that occurred on this page, as reported by HW
 *
 * @returns KERN_SUCCESS if logging supported by hw, KERN_FAILURE if not
 */
kern_return_t ecc_log_memory_error_ce(uint64_t physical_address, ecc_flags_t ecc_flags, uint32_t ce_count);

/**
 * Logs an MCC error.
 *
 * @param event Event to be logged (passed by value)
 * @returns KERN_SUCCESS on success, KERN_FAILURE otherwise
 */
kern_return_t
mcc_log_memory_error(mcc_ecc_event_t event);

#endif /* KERNEL_PRIVATE */
232 
233 __END_DECLS
234