xref: /xnu-12377.61.12/osfmk/arm64/vm_mte_compress.c (revision 4d495c6e23c53686cf65f45067f79024cf5dcee8) !
1 /*
2  * Copyright (c) 2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include "vm_mte_compress.h"
30 
31 #ifndef COMPRESSOR_TESTER
32 // this file is included by the tester but having these includes there conflicts with the SDK includes
33 #include <pexpert/arm64/board_config.h>
34 
35 #include <sys/cdefs.h> // for __improbable()
36 #include <string.h>
37 #include <kern/assert.h>
38 
39 #include <vm/vm_compressor_xnu.h>
40 #endif // COMPRESSOR_TESTER
41 
42 #if HAS_MTE
43 
44 // Run Length Encoding (RLE) algorithm for compressing MTE tags
45 // ======================================
46 // MTE tags are compressed using a simple one-pass RLE (run-length encoding) compression algorithm
47 // The input is a buffer of 512 bytes representing the MTE tags of a 16k page (tag = nibble = 4 bits)
48 // The output is the success status of the compression and the compressed data buffer.
49 // The compressed data is made of a byte-aligned stream of commands, each command encodes either
50 // - A sequence of non-repeating tags (commands 0-3)
51 // - or a repeating run of a single tag (commands 4-F).
52 // Most commands are encoded by a single byte, except commands 1,2,3 which are encoded by a span of more than 1 byte.
53 // The MSB bits of the (first) byte of a command encodes the meaning of the command. There are 16 commands as defined
54 // in the comment table below.
55 // Example:
56 //   RLE: "01 42"   encodes "21 22" - 1 time the tag '1' and 3 times the tag '2'
57 //
58 // Some lengths of tag runs need to be encoded by a sequence of several commands.
59 // For instance "22 22 22 22 22 22 22 22 22 22" - a run of 20 instances of the tag "2" will be
60 // encoded by a sequence of 2 commands: A2 (repeat 16 times tag '2') 52 (repeat 4 times tag '2')
61 // Similarly, a sequence of 4 non-repeating tags for instance "12 34" will be encoded by
62 // a sequence of 2 commands: "12 14 (emit the 3 tags 2,1,4) 03 (emit the single tag 3)"
// In commands that span more than 1 byte, the tags are consumed and emitted LSB 4 bits first, then MSB 4 bits.
64 // Non-repeating commands may also encode short sequences of actually repeating commands for instance
65 // the sequence "33 3" may be encoded as non-repeating: 13 33 or repeating: 43, depending on the
66 // internal state of the compression.
67 //
68 // Considerations for the selection of commands:
69 // - The number of non-repeating tags that commands 0,1,2,3 emit is odd so that the command
70 // would always make a whole number of bytes and no bits are wasted.
71 // - The counts of repeat in the tag-repeating commands were selected to handle the wide
72 // variety of sizes of blocks possible in the input.
73 //
74 // The compressed MTE tags data encoded here is saved in the compressor segments after the WKDM compressed page data.
75 // WKDM compressed data must start at 64 byte alignment, so the size that the compressed MTE
76 // tags effectively take is rounded up to a multiple of 64 bytes. This means that the maximum compression ratio
77 // is 64/512=12.5%. A main goal of this compression is that this would also be the typical compressed
78 // tags size.
79 //
80 // The WKDMd (decompress) instruction caches in the 512 bytes that follow the buffer it decompresses in expectation
81 // of reading the tags data, see instruction documentation.
82 //
83 //   0 - Emit the next 1 nibble            // commands 0..3 are groups of non-repeating nibbles
84 //   1 - Emit the next 3 nibbles           //    encoded by 2 bytes
85 //   2 - Emit the next 5 nibbles           //    encoded by 3 bytes
86 //   3 - Emit the next 7 nibbles           //    encoded by 4 bytes
87 //   4 - Emit the next nibble 3 times      // command 4..15 are runs of the same nibble by varying repetition counts.
88 //   5 - Emit the next nibble 4 times
89 //   6 - Emit the next nibble 5 times
90 //   7 - Emit the next nibble 6 times
91 //   8 - Emit the next nibble 7 times
92 //   9 - Emit the next nibble 8 times
93 //   A(10) - Emit the next nibble 16 times
94 //   B(11) - Emit the next nibble 32 times
95 //   C(12) - Emit the next nibble 64 times
96 //   D(13) - Emit the next nibble 128 times
97 //   E(14) - Emit the next nibble 256 times
98 //   F(15) - Emit the next nibble 512 times
99 
#define CMTE_MAX_NON_REPEATING 7          // most literal (non-repeating) nibbles a single command span can emit (command 3)
#define CMTE_LAST_NON_REPEAT_COMMAND 3    // commands 0..3 are the literal/non-repeat commands
#define CMTE_FIRST_REPEAT_COMMAND 4       // commands 4..15 are the repeat-run commands
#define CMTE_MIN_REPEAT_IN_ONE_COMMAND 3  // shortest run a repeat command can encode (command 4)
#define CMTE_MAX_REPEAT_IN_ONE_COMMAND 512 // longest run a repeat command can encode (command 15)

// map command to the number of tags it emits to output
static const uint16_t cmd_to_count[16] = {
	1, 3, 5, 7, // commands 0,1,2,3 are non-repeat commands, must be odd numbers
	3, 4, 5, 6, 7, 8, // commands 4,5,6,7,8,9 handle incrementing sizes
	16, 32, 64, 128, 256, 512 // commands A to F have longer jumps
};

// For a given non-repeat sequence length, what command should we start to encode with
static const int8_t non_repeat_count_to_cmd[CMTE_MAX_NON_REPEATING + 1] = {
	-1, // 0 count is not valid
	0, 0, // sequence of 1,2 non-repeating tags, use command 0 which handles 1 nibble
	1, 1, // sequence of 3,4 non-repeating tags, use command 1 which handles 3 nibbles (4 non-repeating will be followed by command 0)
	2, 2, // sequence of 5,6 non-repeating tags, use command 2 which handles 5 nibbles (6 non-repeating will be followed by command 0)
	3 // sequence of 7 non-repeating tags, use command 3 which handles 7 nibbles
};
// For a given repeat count, from what command should we start to encode the sequence with
// index is tag repeat count, value is command
static const int8_t repeat_count_to_cmd[CMTE_MAX_REPEAT_IN_ONE_COMMAND + 1] = {
	-1, // 0 repeats is not valid
	-1, -1, // 1,2 repeated are handled by non-repeat commands
	4, 5, 6, 7, 8, 9, // 3-8 repeats handled by commands 4-9
	9, 9, 9, 9, 9, 9, 9, // 9-15 repeats handled by command 9 which does 8 repeats
	10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, // 16-31 repeats handled by command 10 which does 16 repeats
	11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, // (32-47)
	11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, // (48-63) 32-63 repeats handled by command 11 which does 32 repeats
	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // (64-79)
	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // (80-95)
	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // (96-111)
	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // (112-127) 64-127 repeats handled by command 12 which does 64 repeats
	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, // (128-159)
	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, // (160-191)
	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, // (192-223)
	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, // (224-255) 128-255 repeats handled by command 13 which does 128 repeats
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, // ...
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, // (256-319)
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, // ...
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, // (320-383)
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, // ...
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, // (384-447)
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, // ...
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, // (448-511) 256-511 repeats handled by command 14 which does 256 repeats
	15 // 512 repeats and upwards are handled by command 15 which does 512 repeats
};
149 
150 #define CMTE_INVALID_TAG 0xFF
151 
152 // if the compress output is bigger than this, stop trying to compress and go for the memcpy option
153 // This is because in the compressed segments the WKDMc output is aligned to 64 byte so if we passed the (512-64) byte
154 // mark we might as well memcpy the entire 512 bytes and have easy time on decompress
155 #define CMTE_RESIGN_COMPRESS_SIZE (C_MTE_SIZE - C_SEG_OFFSET_ALIGNMENT_BOUNDARY)
156 
157 struct run_length_state {
158 	// current run-length we're handling
159 	uint32_t cur_count; // 0 means we handled it all
160 	uint8_t cur_tag;
161 
162 	// scratch-pad for non-repeating command
163 	uint8_t scratch[CMTE_MAX_NON_REPEATING];
164 	uint32_t scratch_len; // 0 means it's empty
165 	// offset of next byte to write to the output buffer
166 	uint32_t out_offset;
167 };
168 
// RLE-compress one page's worth of packed MTE tags (algorithm described at the top of this file).
// tags_in_buf holds in_size (== C_MTE_SIZE) bytes, 2 tags per byte, low nibble first.
// out_buf receives the compressed command stream; out_size is asserted to be large enough.
// return:
// - C_MTE_SIZE: if the input is incompressible, input copied to output
// - C_MTE_SIZE + 1 + tag: if the entire input has the same tag
// - size (lower than C_MTE_SIZE): input was compressed to size
uint32_t
vm_mte_rle_compress_tags(uint8_t *tags_in_buf, uint32_t in_size, uint8_t *out_buf, __assert_only uint32_t out_size)
{
	// worst case we're going to need this much size.
	// if it's uncompressible, we'll need at least the in size
	assert(out_size >= C_MTE_SIZE && out_size >= in_size);
	assert(in_size == C_MTE_SIZE);

	struct run_length_state s = {};
	struct run_length_state *s_ptr = &s; // avoid using __block which prevents some optimizations

	// append one byte to the compressed stream; bail out of process_tag() (returning false)
	// once the output grows near the resign threshold, at which point compressing is no longer worthwhile
#define OUTPUT_BYTE(b) do { \
	    if (__improbable(s_ptr->out_offset + 1 >= CMTE_RESIGN_COMPRESS_SIZE))  \
	        return false; \
	    out_buf[s_ptr->out_offset++] = (b); \
    } while(false)

	// Flush the just-ended run (s.cur_tag repeated s.cur_count times) into the output,
	// buffering short runs in the scratch pad so they can be merged into one non-repeat command.
	// this is done in a block since it needs to happen in the loop that goes over nibbles and at the end of the loop
	// return false if we ran out of output buffer
	bool (^process_tag)(bool) = ^bool (bool is_final) {
		// if it's the final call, we need to flush the scratch pad no matter what
		while (s_ptr->cur_count > 0 || (is_final && s_ptr->scratch_len > 0)) { // this will do at most 2 loops
			bool do_emit_scratch = false;
			// is it a run shorter than the shortest repeat command && we can add to cur to scratch without overflowing it?
			// for runs longer, always prefer to emit repeat command
			if (s_ptr->cur_count < CMTE_MIN_REPEAT_IN_ONE_COMMAND && s_ptr->scratch_len + s_ptr->cur_count <= CMTE_MAX_NON_REPEATING) {
				// adding to the scratch would not be above the maximum we can emit in one non-repeat command
				for (uint32_t i = 0; i < s_ptr->cur_count; ++i) {
					s_ptr->scratch[s_ptr->scratch_len++] = s_ptr->cur_tag;
				}
				s_ptr->cur_count = 0; // we just handled all of cur so reset it
				if (s_ptr->scratch_len == CMTE_MAX_NON_REPEATING || is_final) {
					// if it reached the size of scratch, need to emit scratch
					do_emit_scratch = true;
				}
			} else {
				// can't add to scratch because that would be beyond the allowed size in the scratch
				// (long run follows): flush whatever literals are pending before emitting repeat commands
				if (s_ptr->scratch_len > 0) {
					do_emit_scratch = true;
				}
			}

			if (do_emit_scratch) {
				// we may emit several non-repeat command, this keeps track of where we are in the scratch buffer
				uint32_t scratch_offset = 0;
				while (s_ptr->scratch_len > 0) {
					// scratch has at least 1. first byte is the (non-repeat) command and the first nibble
					int8_t cmd = non_repeat_count_to_cmd[s_ptr->scratch_len];
					assert(cmd >= 0 && cmd <= CMTE_LAST_NON_REPEAT_COMMAND);
					uint8_t out_byte = (uint8_t)((uint8_t)cmd << 4) | s_ptr->scratch[scratch_offset++];
					OUTPUT_BYTE(out_byte);
					uint32_t consume_count = cmd_to_count[cmd]; // how many more tags to output?
					assert(consume_count <= s_ptr->scratch_len);
					// next bytes are the additional nibbles for the non-repeat command
					// (packed two per byte, low nibble first)
					uint32_t scratch_i = 1; // skip 0 which was already written
					for (; scratch_i < consume_count; scratch_i += 2) {
						out_byte = s_ptr->scratch[scratch_offset++];
						out_byte |= s_ptr->scratch[scratch_offset++] << 4;
						OUTPUT_BYTE(out_byte);
					}
					s_ptr->scratch_len -= consume_count;
				}
			}

			// now emit the cur as a series of repeating commands
			while (s_ptr->cur_count >= CMTE_MIN_REPEAT_IN_ONE_COMMAND) {
				int8_t cmd;
				if (s_ptr->cur_count > CMTE_MAX_REPEAT_IN_ONE_COMMAND) {
					cmd = repeat_count_to_cmd[CMTE_MAX_REPEAT_IN_ONE_COMMAND];
				} else {
					cmd = repeat_count_to_cmd[s_ptr->cur_count];
				}
				// reached a length that's not handled by a repeat command, shouldn't happen
				assert(cmd >= CMTE_FIRST_REPEAT_COMMAND);

				uint8_t out_byte = (uint8_t)((uint8_t)cmd << 4) | s_ptr->cur_tag;
				OUTPUT_BYTE(out_byte);

				// handled some of the length of the current run
				uint32_t consumed_count = cmd_to_count[cmd];
				assert(consumed_count <= s_ptr->cur_count); // consumed too much, shouldn't happen
				s_ptr->cur_count -= consumed_count;
			}
			// there might be still a few repeats left in cur, next iteration of the while loop will handle it via
			// the scratch path
		}
		return true;
	}; // process_tag()

	// first level goes over bytes
	for (uint32_t byte_i = 0; byte_i < in_size; ++byte_i) {
		uint8_t b = tags_in_buf[byte_i];
		uint8_t tag = b & 0xF;
		// second level iterates over 4 bit nibbles (low nibble first, then high)
		for (int nibble_i = 0; nibble_i < 2; ++nibble_i) {
			// third level counts the run length of equal nibbles
			if (tag == s.cur_tag) {
				// add to the current run-length
				++s.cur_count;
			} else {
				// previous run-length finished, process it
				// fourth level decides what to do with each run-length
				if (!process_tag(false)) {
					goto too_long;
				}

				// done handling the just-ended run, now create a new run in cur
				s.cur_tag = tag;
				s.cur_count = 1;
			}

			tag = b >> 4;
		}
	}

	if (s.out_offset == 0 && s.scratch_len == 0) {
		// the entire buffer was a single run of the same nibble value, return special value to indicate that along
		// with the repeating tag (single-tag optimization)
		return C_MTE_SIZE + 1 + s.cur_tag;
	}
	// process anything remaining in cur or in scratch
	if (!process_tag(true)) {
		goto too_long;
	}
	assert(s.scratch_len == 0);

	// normal case - return how many bytes were written to the compressed output
	return (uint32_t)s.out_offset;

too_long:
	// compression resigned: fall back to a raw copy of the tags
	memcpy(out_buf, tags_in_buf, in_size);
	return C_MTE_SIZE;
#undef OUTPUT_BYTE
}
307 
308 // convert the return value of vm_mte_rle_compress_tags() to the actual size written to the buffer
309 uint32_t
vm_mte_compressed_tags_actual_size(uint32_t mte_size)310 vm_mte_compressed_tags_actual_size(uint32_t mte_size)
311 {
312 	if (mte_size <= C_MTE_SIZE) {
313 		return mte_size;
314 	}
315 	return 0;
316 }
317 
// parse an RLE encoded compressed buffer and decompress it into tags_out_buf
// (C_MTE_SIZE bytes, 2 tags per byte, low nibble first)
// in_size is the return value of rle_compress_mte_tags()
// return true on success, false if input is malformed
bool
vm_mte_rle_decompress_tags(uint8_t *in_buf, uint32_t in_size, uint8_t *tags_out_buf, __assert_only uint32_t out_size)
{
	assert(out_size == C_MTE_SIZE);
	if (in_size == C_MTE_SIZE) {
		// incompressible case: the buffer holds the raw tags
		memcpy(tags_out_buf, in_buf, in_size);
		return true;
	}
	if (in_size > C_MTE_SIZE) { // same tag optimization
		assert(in_size <= C_SLOT_C_MTE_SIZE_MAX);
		// in_size was encoded as C_MTE_SIZE + 1 + tag; for tag == 0xF the low-nibble arithmetic
		// wraps to 0xFF which still yields the correct dbl_tag since only the nibbles matter
		uint8_t tag = (in_size & 0xF) - 1;
		uint8_t dbl_tag = tag | (uint8_t)(tag << 4);
		memset(tags_out_buf, dbl_tag, C_MTE_SIZE);
		// possible optimization: don't need to fill the buffer to do the eventual STGMs
		return true;
	}

	uint32_t out_offset = 0;
	// holds a pending low nibble until the high nibble arrives; CMTE_INVALID_TAG means no nibble pending
	uint8_t part_byte = CMTE_INVALID_TAG;
	// emit one tag nibble: buffer it if it's a low nibble, otherwise complete and write the byte.
	// returns false (from the enclosing function) on output overflow
#define OUTPUT_TAG(t) do { \
	if (__improbable(out_offset >= C_MTE_SIZE)) \
	    return false; \
	if (part_byte == CMTE_INVALID_TAG) \
	    part_byte = (t); \
	else { \
	    tags_out_buf[out_offset++] = part_byte | (uint8_t)((t) << 4); \
	    part_byte = CMTE_INVALID_TAG; \
	} \
    } while(false)

	// process commands one by one
	uint32_t in_offset = 0;
	while (in_offset < in_size) {
		uint8_t in_byte = in_buf[in_offset++];
		uint8_t cmd = in_byte >> 4; // high nibble of the first byte is the command
		if (cmd < CMTE_FIRST_REPEAT_COMMAND) {
			// it's a non-repeat command
			uint8_t first_tag = in_byte & 0xF;
			OUTPUT_TAG(first_tag);
			int32_t nonrpt_i = cmd_to_count[cmd] - 1; // number of tags we still need to read
			assert(nonrpt_i % 2 == 0); // sanity
			for (; nonrpt_i > 0; nonrpt_i -= 2) {
				if (__improbable(in_offset >= in_size)) {
					return false; // no more bytes to read (encoding error)
				}
				uint8_t in_byte2 = in_buf[in_offset++];
				OUTPUT_TAG(in_byte2 & 0xF);
				OUTPUT_TAG(in_byte2 >> 4);
			}
		} else {
			// it's a repeat command
			uint8_t tag = in_byte & 0xF;
			uint32_t repeat = cmd_to_count[cmd];
			// sanity check for cmd_to_count table
			assert(repeat >= CMTE_MIN_REPEAT_IN_ONE_COMMAND && repeat <= CMTE_MAX_REPEAT_IN_ONE_COMMAND);
			if (part_byte != CMTE_INVALID_TAG) {
				// we have a byte in progress, need emit one tag of this run to finish this byte before memset
				OUTPUT_TAG(tag);
				repeat -= 1;
			}
			// from here the output is byte-aligned; bulk-fill whole bytes of the run
			uint32_t fill_sz = repeat / 2;
			if (__improbable(out_offset + fill_sz > C_MTE_SIZE)) {
				return false; // overflow
			}
			if (fill_sz > 0) {
				uint8_t dbl_tag = tag | (uint8_t)(tag << 4);
				memset(tags_out_buf + out_offset, dbl_tag, fill_sz);
				out_offset += fill_sz;
			}
			// do we need to add the extra one for an odd repeat count
			if ((repeat % 2) != 0) {
				OUTPUT_TAG(tag);
			}
		}
	}
	// make sure we output exactly the expected amount and no partial byte pending
	//   (first condition can't be reached since we would bail before so this is here for sanity)
	if (__improbable(out_offset != C_MTE_SIZE || part_byte != CMTE_INVALID_TAG)) {
		return false;
	}
	return true;
#undef OUTPUT_TAG
}
404 
405 #if DEVELOPMENT || DEBUG
406 // parse an RLE encoded compressed buffer and increment a commands histogram
407 // this is used by the tester
408 bool
vm_mte_rle_comp_histogram(uint8_t * in_buf,uint32_t in_size,struct comp_histogram * hist)409 vm_mte_rle_comp_histogram(uint8_t *in_buf, uint32_t in_size, struct comp_histogram *hist)
410 {
411 	if (in_size > C_MTE_SIZE) { // same tag optimization
412 		assert(in_size <= C_SLOT_C_MTE_SIZE_MAX);
413 		hist->same_value_count++;
414 		return true;
415 	}
416 	assert(in_size > 0);
417 	hist->comp_size_bins[(in_size - 1) / C_SEG_OFFSET_ALIGNMENT_BOUNDARY]++;
418 
419 	if (in_size == C_MTE_SIZE) { // not compressed
420 		return true;
421 	}
422 
423 	// process commands one by one
424 	uint32_t in_offset = 0;
425 	while (in_offset < in_size) {
426 		uint8_t in_byte = in_buf[in_offset++];
427 		uint8_t cmd = in_byte >> 4;
428 		hist->cmd_bins[cmd]++;
429 		hist->cmd_total++;
430 		if (cmd < CMTE_FIRST_REPEAT_COMMAND) {
431 			// it's a non-repeat command
432 			int32_t nonrpt_i = cmd_to_count[cmd] - 1; // number of tags we still need to read
433 			assert(nonrpt_i % 2 == 0); // sanity
434 			for (; nonrpt_i > 0; nonrpt_i -= 2) {
435 				if (__improbable(in_offset >= in_size)) {
436 					return false; // no more bytes to read (encoding error)
437 				}
438 				in_offset++;
439 			}
440 		} else {
441 			// it's a repeat command, only 1 byte
442 		}
443 	}
444 	return true;
445 }
446 
447 void
vm_mte_rle_runs_histogram(uint8_t * tags_in_buf,uint32_t in_size,struct runs_histogram * hist)448 vm_mte_rle_runs_histogram(uint8_t *tags_in_buf, uint32_t in_size, struct runs_histogram *hist)
449 {
450 	assert(in_size == C_MTE_SIZE);
451 
452 	struct run_length_state s = {};
453 	struct run_length_state *s_ptr = &s; // avoid using __block which prevents some optimizations
454 
455 	// this is done in a block since it needs to happen in the loop that goes over nibbles and at the end of the loop
456 	// return false if we ran out of output buffer
457 	void (^process_tag)() = ^void () {
458 		if (s_ptr->cur_count == 0) {
459 			return; // first time
460 		}
461 		assert(s_ptr->cur_count <= VM_MTE_C_MAX_TAG_RUN);
462 		hist->rh_bins[s_ptr->cur_count]++;
463 	}; // process_tag()
464 
465 	// first level goes over bytes
466 	for (uint32_t byte_i = 0; byte_i < in_size; ++byte_i) {
467 		uint8_t b = tags_in_buf[byte_i];
468 		uint8_t tag = b & 0xF;
469 		// second level iterates over 4 bit nibbles
470 		for (int nibble_i = 0; nibble_i < 2; ++nibble_i) {
471 			// third level counts the run length of equal nibbles
472 			if (tag == s.cur_tag) {
473 				// add to the current run-length
474 				++s.cur_count;
475 			} else {
476 				// previous run-length finished, process it
477 				// fourth level decides what to do with each run-length
478 				process_tag();
479 
480 				// done handling the just-ended run, now create a new run in cur
481 				s.cur_tag = tag;
482 				s.cur_count = 1;
483 			}
484 			tag = b >> 4;
485 		}
486 	}
487 
488 	// process anything remaining in cur or in scratch
489 	process_tag();
490 }
491 
492 
493 #endif /* DEVELOPMENT || DEBUG */
494 
495 #ifndef COMPRESSOR_TESTER // compressor_tags_stats is not defined in the tester
496 
// cumulative count of tagged pages that entered the compressor
SCALABLE_COUNTER_DEFINE(compressor_tagged_pages_compressed);
// different reasons why tagged pages were removed from the compressor
SCALABLE_COUNTER_DEFINE(compressor_tagged_pages_decompressed);
SCALABLE_COUNTER_DEFINE(compressor_tagged_pages_freed);
SCALABLE_COUNTER_DEFINE(compressor_tagged_pages_corrupted);
// current number of bytes taken by compressed tags in the compressor
SCALABLE_COUNTER_DEFINE(compressor_tags_overhead_bytes);
// current number of tagged pages that reside in the compressor
SCALABLE_COUNTER_DEFINE(compressor_tagged_pages);
// current number of tag storage pages composing the compressor pool
SCALABLE_COUNTER_DEFINE(compressor_tag_storage_pages_in_pool);
// current number of non-tag storage pages composing the compressor pool
SCALABLE_COUNTER_DEFINE(compressor_non_tag_storage_pages_in_pool);
// the following is a breakdown of tagged_pages_compressed
#if DEVELOPMENT || DEBUG
SCALABLE_COUNTER_DEFINE(compressor_tags_all_zero);
SCALABLE_COUNTER_DEFINE(compressor_tags_same_value);
SCALABLE_COUNTER_DEFINE(compressor_tags_below_align);
SCALABLE_COUNTER_DEFINE(compressor_tags_above_align);
SCALABLE_COUNTER_DEFINE(compressor_tags_incompressible);
#endif /* DEVELOPMENT || DEBUG */

#if DEVELOPMENT || DEBUG
// don't want to expose in release counters which may indirectly expose tag information
#define debug_counter_inc(c) counter_inc(c)
#else /* DEVELOPMENT || DEBUG */
// compiles to nothing on RELEASE
#define debug_counter_inc(c)
#endif /* DEVELOPMENT || DEBUG */
525 
526 void
vm_mte_tags_stats_compressed(uint32_t size_written)527 vm_mte_tags_stats_compressed(uint32_t size_written)
528 {
529 	counter_inc(&compressor_tagged_pages_compressed);
530 	bool add_overhead = false;
531 
532 	// 64 bytes is the minimum space compressed tags can take since that's the alignment requirement of wkdmc output
533 	// so even if the tags are compressed to 1 byte, it's still going to take 64 bytes in the segment
534 	if (size_written < C_SEG_OFFSET_ALIGNMENT_BOUNDARY) {
535 		debug_counter_inc(&compressor_tags_below_align);
536 		add_overhead = true;
537 	} else if (size_written == C_MTE_SIZE) {
538 		debug_counter_inc(&compressor_tags_incompressible);
539 		add_overhead = true;
540 	} else if (size_written == C_MTE_SIZE + 1) {
541 		// same-value indicator + tag == 0
542 		debug_counter_inc(&compressor_tags_all_zero);
543 	} else if (size_written > C_MTE_SIZE) {
544 		// just same-value indicator
545 		debug_counter_inc(&compressor_tags_same_value);
546 	} else {
547 		debug_counter_inc(&compressor_tags_above_align);
548 		add_overhead = true;
549 	}
550 
551 	if (add_overhead) {
552 		uint64_t rounded = C_SEG_ROUND_TO_ALIGNMENT(size_written);
553 		counter_add(&compressor_tags_overhead_bytes, rounded);
554 	}
555 	counter_inc(&compressor_tagged_pages);
556 }
557 
558 void
vm_mte_tags_stats_removed(uint32_t mte_size,vm_mte_c_tags_removal_reason_t reason)559 vm_mte_tags_stats_removed(uint32_t mte_size, vm_mte_c_tags_removal_reason_t reason)
560 {
561 	switch (reason) {
562 	case VM_MTE_C_TAGS_REMOVAL_DECOMPRESSED:
563 		counter_inc(&compressor_tagged_pages_decompressed);
564 		break;
565 	case VM_MTE_C_TAGS_REMOVAL_FREE:
566 		counter_inc(&compressor_tagged_pages_freed);
567 		break;
568 	case VM_MTE_C_TAGS_REMOVAL_CORRUPT:
569 		counter_inc(&compressor_tagged_pages_corrupted);
570 		break;
571 	default:
572 		panic("Unexpected compressor tags removal reason %u", reason);
573 	}
574 
575 	if (mte_size <= C_MTE_SIZE) { // same-tag optimization doesn't take any buffer space
576 		uint64_t rounded = C_SEG_ROUND_TO_ALIGNMENT(mte_size);
577 		counter_add(&compressor_tags_overhead_bytes, -(uint64_t)rounded);
578 	}
579 	counter_dec(&compressor_tagged_pages);
580 }
581 
582 #endif // COMPRESSOR_TESTER
583 #endif // HAS_MTE
584