1 /*
2 * Copyright (c) 2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include "vm_mte_compress.h"
30
31 #ifndef COMPRESSOR_TESTER
32 // this file is included by the tester but having these includes there conflicts with the SDK includes
33 #include <pexpert/arm64/board_config.h>
34
35 #include <sys/cdefs.h> // for __improbable()
36 #include <string.h>
37 #include <kern/assert.h>
38
39 #include <vm/vm_compressor_xnu.h>
40 #endif // COMPRESSOR_TESTER
41
42 #if HAS_MTE
43
44 // Run Length Encoding (RLE) algorithm for compressing MTE tags
45 // ======================================
46 // MTE tags are compressed using a simple one-pass RLE (run-length encoding) compression algorithm
47 // The input is a buffer of 512 bytes representing the MTE tags of a 16k page (tag = nibble = 4 bits)
48 // The output is the success status of the compression and the compressed data buffer.
49 // The compressed data is made of a byte-aligned stream of commands, each command encodes either
50 // - A sequence of non-repeating tags (commands 0-3)
51 // - or a repeating run of a single tag (commands 4-F).
52 // Most commands are encoded by a single byte, except commands 1,2,3 which are encoded by a span of more than 1 byte.
53 // The MSB bits of the (first) byte of a command encodes the meaning of the command. There are 16 commands as defined
54 // in the comment table below.
55 // Example:
56 // RLE: "01 42" encodes "21 22" - 1 time the tag '1' and 3 times the tag '2'
57 //
58 // Some lengths of tag runs need to be encoded by a sequence of several commands.
59 // For instance "22 22 22 22 22 22 22 22 22 22" - a run of 20 instances of the tag "2" will be
60 // encoded by a sequence of 2 commands: A2 (repeat 16 times tag '2') 52 (repeat 4 times tag '2')
61 // Similarly, a sequence of 4 non-repeating tags for instance "12 34" will be encoded by
62 // a sequence of 2 commands: "12 14 (emit the 3 tags 2,1,4) 03 (emit the single tag 3)"
// In commands that span more than 1 byte, the tags are consumed and emitted LSB 4 bits first, then MSB 4 bits.
64 // Non-repeating commands may also encode short sequences of actually repeating commands for instance
65 // the sequence "33 3" may be encoded as non-repeating: 13 33 or repeating: 43, depending on the
66 // internal state of the compression.
67 //
68 // Considerations for the selection of commands:
69 // - The number of non-repeating tags that commands 0,1,2,3 emit is odd so that the command
70 // would always make a whole number of bytes and no bits are wasted.
71 // - The counts of repeat in the tag-repeating commands were selected to handle the wide
72 // variety of sizes of blocks possible in the input.
73 //
74 // The compressed MTE tags data encoded here is saved in the compressor segments after the WKDM compressed page data.
75 // WKDM compressed data must start at 64 byte alignment, so the size that the compressed MTE
76 // tags effectively take is rounded up to a multiple of 64 bytes. This means that the maximum compression ratio
77 // is 64/512=12.5%. A main goal of this compression is that this would also be the typical compressed
78 // tags size.
79 //
80 // The WKDMd (decompress) instruction caches in the 512 bytes that follow the buffer it decompresses in expectation
81 // of reading the tags data, see instruction documentation.
82 //
83 // 0 - Emit the next 1 nibble // commands 0..3 are groups of non-repeating nibbles
84 // 1 - Emit the next 3 nibbles // encoded by 2 bytes
85 // 2 - Emit the next 5 nibbles // encoded by 3 bytes
86 // 3 - Emit the next 7 nibbles // encoded by 4 bytes
// 4     - Emit the next nibble 3 times    // commands 4..15 are runs of the same nibble with varying repetition counts.
88 // 5 - Emit the next nibble 4 times
89 // 6 - Emit the next nibble 5 times
90 // 7 - Emit the next nibble 6 times
91 // 8 - Emit the next nibble 7 times
92 // 9 - Emit the next nibble 8 times
93 // A(10) - Emit the next nibble 16 times
94 // B(11) - Emit the next nibble 32 times
95 // C(12) - Emit the next nibble 64 times
96 // D(13) - Emit the next nibble 128 times
97 // E(14) - Emit the next nibble 256 times
98 // F(15) - Emit the next nibble 512 times
99
100 #define CMTE_MAX_NON_REPEATING 7
101 #define CMTE_LAST_NON_REPEAT_COMMAND 3
102 #define CMTE_FIRST_REPEAT_COMMAND 4
103 #define CMTE_MIN_REPEAT_IN_ONE_COMMAND 3
104 #define CMTE_MAX_REPEAT_IN_ONE_COMMAND 512
105
// map command to the number of tags it emits to output
// (for non-repeat commands 0-3: how many literal nibbles follow the command;
// for repeat commands 4-F: how many times the single payload nibble is replicated)
static const uint16_t cmd_to_count[16] = {
	1, 3, 5, 7,               // commands 0,1,2,3 are non-repeat commands, must be odd numbers (command nibble + odd payload = whole bytes)
	3, 4, 5, 6, 7, 8,         // commands 4,5,6,7,8,9 handle incrementing sizes
	16, 32, 64, 128, 256, 512 // commands A to F have longer jumps
};
112
// For a given non-repeat sequence length, what command should we start to encode with
// index is the number of pending non-repeating tags (1..CMTE_MAX_NON_REPEATING), value is the command
static const int8_t non_repeat_count_to_cmd[CMTE_MAX_NON_REPEATING + 1] = {
	-1,   // 0 count is not valid
	0, 0, // sequence of 1,2 non-repeating tags, use command 0 which handles 1 nibble
	1, 1, // sequence of 3,4 non-repeating tags, use command 1 which handles 3 nibbles (4 non-repeating will be followed by command 0)
	2, 2, // sequence of 5,6 non-repeating tags, use command 2 which handles 5 nibbles (6 non-repeating will be followed by command 0)
	3     // sequence of 7 non-repeating tags, use command 3 which handles 7 nibbles
};
// For a given repeat count, from what command should we start to encode the sequence with
// index is tag repeat count, value is command
// (each command handles its maximum count; the leftover repeats are handled by
// subsequent commands, or by the scratch/non-repeat path when fewer than 3 remain)
static const int8_t repeat_count_to_cmd[CMTE_MAX_REPEAT_IN_ONE_COMMAND + 1] = {
	-1,     // 0 repeats is not valid
	-1, -1, // 1,2 repeated are handled by non-repeat commands
	4, 5, 6, 7, 8, 9, // 3-8 repeats handled by commands 4-9
	9, 9, 9, 9, 9, 9, 9, // 9-15 repeats handled by command 9 which does 8 repeats
	10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, // 16-31 repeats handled by command 10 which does 16 repeats
	11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, // (32-47)
	11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, // (48-63) 32-63 repeats handled by command 11 which does 32 repeats
	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // (64-79)
	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // (80-95)
	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // (96-111)
	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // (112-127) 64-127 repeats handled by command 12 which does 64 repeats
	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, // (128-159)
	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, // (160-191)
	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, // (192-223)
	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, // (224-255) 128-255 repeats handled by command 13 which does 128 repeats
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, // ...
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, // (256-319)
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, // ...
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, // (320-383)
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, // ...
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, // (384-447)
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, // ...
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, // (448-511) 256-511 repeats handled by command 14 which does 256 repeats
	15 // 512 repeats and upwards are handled by command 15 which does 512 repeats
};
149
150 #define CMTE_INVALID_TAG 0xFF
151
// if the compress output is bigger than this, stop trying to compress and go for the memcpy option
// This is because in the compressed segments the WKDMc output is aligned to 64 bytes, so if we passed the (512-64) byte
// mark we might as well memcpy the entire 512 bytes and have an easy time on decompress
155 #define CMTE_RESIGN_COMPRESS_SIZE (C_MTE_SIZE - C_SEG_OFFSET_ALIGNMENT_BOUNDARY)
156
// State shared between the nibble-scanning loop and the process_tag block
// during one compression pass.
struct run_length_state {
	// current run-length we're handling
	uint32_t cur_count; // 0 means we handled it all
	uint8_t cur_tag;    // tag value (nibble) of the current run

	// scratch-pad for non-repeating command: accumulates tags from short runs
	// until enough are gathered to emit one of the non-repeat commands 0-3
	uint8_t scratch[CMTE_MAX_NON_REPEATING];
	uint32_t scratch_len; // 0 means it's empty
	// offset of next byte to write to the output buffer
	uint32_t out_offset;
};
168
169 // return:
170 // - C_MTE_SIZE: if the input is incompressible, input copied to output
171 // - C_MTE_SIZE + 1 + tag: if the entire input has the same tag
172 // - size (lower than C_MTE_SIZE): input was compressed to size
uint32_t
vm_mte_rle_compress_tags(uint8_t *tags_in_buf, uint32_t in_size, uint8_t *out_buf, __assert_only uint32_t out_size)
{
	// worst case we're going to need this much size.
	// if it's uncompressible, we'll need at least the in size
	assert(out_size >= C_MTE_SIZE && out_size >= in_size);
	assert(in_size == C_MTE_SIZE);

	struct run_length_state s = {};
	struct run_length_state *s_ptr = &s; // avoid using __block which prevents some optimizations

	// Append one byte to the compressed output stream.
	// NOTE: the "return false" exits the enclosing process_tag block (the macro is
	// only expanded inside it), signaling that the output reached the resign
	// threshold and the caller should fall back to the memcpy path.
#define OUTPUT_BYTE(b) do { \
	if (__improbable(s_ptr->out_offset + 1 >= CMTE_RESIGN_COMPRESS_SIZE)) \
	        return false; \
	out_buf[s_ptr->out_offset++] = (b); \
} while(false)

	// Flush the run described by s_ptr->cur_* into the output as commands.
	// this is done in a block since it needs to happen in the loop that goes over nibbles and at the end of the loop
	// return false if we ran out of output buffer
	bool (^process_tag)(bool) = ^bool (bool is_final) {
		// if it's the final call, we need to flush the scratch pad no matter what
		while (s_ptr->cur_count > 0 || (is_final && s_ptr->scratch_len > 0)) { // this will do at most 2 loops
			bool do_emit_scratch = false;
			// is it a run shorter than the shortest repeat command && we can add cur to scratch without overflowing it?
			// for longer runs, always prefer to emit a repeat command
			if (s_ptr->cur_count < CMTE_MIN_REPEAT_IN_ONE_COMMAND && s_ptr->scratch_len + s_ptr->cur_count <= CMTE_MAX_NON_REPEATING) {
				// adding to the scratch would not be above the maximum we can emit in one non-repeat command
				for (uint32_t i = 0; i < s_ptr->cur_count; ++i) {
					s_ptr->scratch[s_ptr->scratch_len++] = s_ptr->cur_tag;
				}
				s_ptr->cur_count = 0; // we just handled all of cur so reset it
				if (s_ptr->scratch_len == CMTE_MAX_NON_REPEATING || is_final) {
					// if it reached the size of scratch, need to emit scratch
					do_emit_scratch = true;
				}
			} else {
				// can't add to scratch because that would be beyond the allowed size in the scratch
				if (s_ptr->scratch_len > 0) {
					do_emit_scratch = true;
				}
			}

			if (do_emit_scratch) {
				// we may emit several non-repeat commands, this keeps track of where we are in the scratch buffer
				uint32_t scratch_offset = 0;
				while (s_ptr->scratch_len > 0) {
					// scratch has at least 1. first byte is the (non-repeat) command and the first nibble
					int8_t cmd = non_repeat_count_to_cmd[s_ptr->scratch_len];
					assert(cmd >= 0 && cmd <= CMTE_LAST_NON_REPEAT_COMMAND);
					uint8_t out_byte = (uint8_t)((uint8_t)cmd << 4) | s_ptr->scratch[scratch_offset++];
					OUTPUT_BYTE(out_byte);
					uint32_t consume_count = cmd_to_count[cmd]; // how many more tags to output?
					assert(consume_count <= s_ptr->scratch_len);
					// next bytes are the additional nibbles for the non-repeat command,
					// packed low-4-bits first to match the decompressor
					uint32_t scratch_i = 1; // skip 0 which was already written
					for (; scratch_i < consume_count; scratch_i += 2) {
						out_byte = s_ptr->scratch[scratch_offset++];
						out_byte |= s_ptr->scratch[scratch_offset++] << 4;
						OUTPUT_BYTE(out_byte);
					}
					s_ptr->scratch_len -= consume_count;
				}
			}

			// now emit the cur as a series of repeating commands
			while (s_ptr->cur_count >= CMTE_MIN_REPEAT_IN_ONE_COMMAND) {
				int8_t cmd;
				if (s_ptr->cur_count > CMTE_MAX_REPEAT_IN_ONE_COMMAND) {
					cmd = repeat_count_to_cmd[CMTE_MAX_REPEAT_IN_ONE_COMMAND];
				} else {
					cmd = repeat_count_to_cmd[s_ptr->cur_count];
				}
				// reached a length that's not handled by a repeat command, shouldn't happen
				assert(cmd >= CMTE_FIRST_REPEAT_COMMAND);

				uint8_t out_byte = (uint8_t)((uint8_t)cmd << 4) | s_ptr->cur_tag;
				OUTPUT_BYTE(out_byte);

				// handled some of the length of the current run
				uint32_t consumed_count = cmd_to_count[cmd];
				assert(consumed_count <= s_ptr->cur_count); // consumed too much, shouldn't happen
				s_ptr->cur_count -= consumed_count;
			}
			// there might be still a few repeats left in cur, the next iteration of the while loop will handle it via
			// the scratch path
		}
		return true;
	}; // process_tag()

	// first level goes over bytes
	for (uint32_t byte_i = 0; byte_i < in_size; ++byte_i) {
		uint8_t b = tags_in_buf[byte_i];
		uint8_t tag = b & 0xF; // low nibble is consumed first, then the high nibble
		// second level iterates over 4 bit nibbles
		for (int nibble_i = 0; nibble_i < 2; ++nibble_i) {
			// third level counts the run length of equal nibbles
			if (tag == s.cur_tag) {
				// add to the current run-length
				++s.cur_count;
			} else {
				// previous run-length finished, process it
				// fourth level decides what to do with each run-length
				if (!process_tag(false)) {
					goto too_long;
				}

				// done handling the just-ended run, now create a new run in cur
				s.cur_tag = tag;
				s.cur_count = 1;
			}

			tag = b >> 4; // set up the high nibble for the second loop iteration
		}
	}

	if (s.out_offset == 0 && s.scratch_len == 0) {
		// the entire buffer was a single run of the same nibble value, return special value to indicate that along
		// with the repeating tag (single-tag optimization)
		return C_MTE_SIZE + 1 + s.cur_tag;
	}
	// process anything remaining in cur or in scratch
	if (!process_tag(true)) {
		goto too_long;
	}
	assert(s.scratch_len == 0);

	// normal case - return how many bytes were written to the compressed output
	return (uint32_t)s.out_offset;

too_long:
	// compression didn't pay off, store the tags verbatim instead
	memcpy(out_buf, tags_in_buf, in_size);
	return C_MTE_SIZE;
#undef OUTPUT_BYTE
}
307
308 // convert the return value of vm_mte_rle_compress_tags() to the actual size written to the buffer
309 uint32_t
vm_mte_compressed_tags_actual_size(uint32_t mte_size)310 vm_mte_compressed_tags_actual_size(uint32_t mte_size)
311 {
312 if (mte_size <= C_MTE_SIZE) {
313 return mte_size;
314 }
315 return 0;
316 }
317
318 // parse an RLE encoded compressed buffer and decompress it
// in_size is the return value of vm_mte_rle_compress_tags()
320 // return true on success, false if input is malformed
bool
vm_mte_rle_decompress_tags(uint8_t *in_buf, uint32_t in_size, uint8_t *tags_out_buf, __assert_only uint32_t out_size)
{
	assert(out_size == C_MTE_SIZE);
	if (in_size == C_MTE_SIZE) {
		// stored uncompressed, just copy it out
		memcpy(tags_out_buf, in_buf, in_size);
		return true;
	}
	if (in_size > C_MTE_SIZE) { // same tag optimization
		assert(in_size <= C_SLOT_C_MTE_SIZE_MAX);
		// the tag is encoded in the size itself: C_MTE_SIZE + 1 + tag.
		// C_MTE_SIZE (512) is a multiple of 16, so (in_size & 0xF) - 1 recovers the tag;
		// for tag 15 the subtraction wraps to 0xFF, both nibbles of which are 0xF,
		// so dbl_tag is still correct
		uint8_t tag = (in_size & 0xF) - 1;
		uint8_t dbl_tag = tag | (uint8_t)(tag << 4);
		memset(tags_out_buf, dbl_tag, C_MTE_SIZE);
		// possible optimization: don't need to fill the buffer to do the eventual STGMs
		return true;
	}

	uint32_t out_offset = 0;
	uint8_t part_byte = CMTE_INVALID_TAG; // low nibble of a half-assembled output byte, or invalid if none pending
	// Append a single tag (nibble) to the output; pairs of tags are packed
	// low-nibble-first into bytes via part_byte.
	// NOTE: "return false" exits vm_mte_rle_decompress_tags() on output overflow.
#define OUTPUT_TAG(t) do { \
	if (__improbable(out_offset >= C_MTE_SIZE)) \
	        return false; \
	if (part_byte == CMTE_INVALID_TAG) \
	        part_byte = (t); \
	else { \
	        tags_out_buf[out_offset++] = part_byte | (uint8_t)((t) << 4); \
	        part_byte = CMTE_INVALID_TAG; \
	} \
} while(false)

	// process commands one by one
	uint32_t in_offset = 0;
	while (in_offset < in_size) {
		uint8_t in_byte = in_buf[in_offset++];
		uint8_t cmd = in_byte >> 4; // command is the high nibble, payload tag is the low nibble
		if (cmd < CMTE_FIRST_REPEAT_COMMAND) {
			// it's a non-repeat command
			uint8_t first_tag = in_byte & 0xF;
			OUTPUT_TAG(first_tag);
			int32_t nonrpt_i = cmd_to_count[cmd] - 1; // number of tags we still need to read
			assert(nonrpt_i % 2 == 0); // sanity - non-repeat counts are odd so the remainder is even
			for (; nonrpt_i > 0; nonrpt_i -= 2) {
				if (__improbable(in_offset >= in_size)) {
					return false; // no more bytes to read (encoding error)
				}
				uint8_t in_byte2 = in_buf[in_offset++];
				OUTPUT_TAG(in_byte2 & 0xF);
				OUTPUT_TAG(in_byte2 >> 4);
			}
		} else {
			// it's a repeat command
			uint8_t tag = in_byte & 0xF;
			uint32_t repeat = cmd_to_count[cmd];
			// sanity check for cmd_to_count table
			assert(repeat >= CMTE_MIN_REPEAT_IN_ONE_COMMAND && repeat <= CMTE_MAX_REPEAT_IN_ONE_COMMAND);
			if (part_byte != CMTE_INVALID_TAG) {
				// we have a byte in progress, need to emit one tag of this run to finish this byte before memset
				OUTPUT_TAG(tag);
				repeat -= 1;
			}
			uint32_t fill_sz = repeat / 2; // whole output bytes we can memset (2 tags per byte)
			if (__improbable(out_offset + fill_sz > C_MTE_SIZE)) {
				return false; // overflow
			}
			if (fill_sz > 0) {
				uint8_t dbl_tag = tag | (uint8_t)(tag << 4);
				memset(tags_out_buf + out_offset, dbl_tag, fill_sz);
				out_offset += fill_sz;
			}
			// do we need to add the extra one for an odd repeat count
			if ((repeat % 2) != 0) {
				OUTPUT_TAG(tag);
			}
		}
	}
	// make sure we output exactly the expected amount and no partial byte pending
	// (first condition can't be reached since we would bail before so this is here for sanity)
	if (__improbable(out_offset != C_MTE_SIZE || part_byte != CMTE_INVALID_TAG)) {
		return false;
	}
	return true;
#undef OUTPUT_TAG
}
404
405 #if DEVELOPMENT || DEBUG
406 // parse an RLE encoded compressed buffer and increment a commands histogram
407 // this is used by the tester
408 bool
vm_mte_rle_comp_histogram(uint8_t * in_buf,uint32_t in_size,struct comp_histogram * hist)409 vm_mte_rle_comp_histogram(uint8_t *in_buf, uint32_t in_size, struct comp_histogram *hist)
410 {
411 if (in_size > C_MTE_SIZE) { // same tag optimization
412 assert(in_size <= C_SLOT_C_MTE_SIZE_MAX);
413 hist->same_value_count++;
414 return true;
415 }
416 assert(in_size > 0);
417 hist->comp_size_bins[(in_size - 1) / C_SEG_OFFSET_ALIGNMENT_BOUNDARY]++;
418
419 if (in_size == C_MTE_SIZE) { // not compressed
420 return true;
421 }
422
423 // process commands one by one
424 uint32_t in_offset = 0;
425 while (in_offset < in_size) {
426 uint8_t in_byte = in_buf[in_offset++];
427 uint8_t cmd = in_byte >> 4;
428 hist->cmd_bins[cmd]++;
429 hist->cmd_total++;
430 if (cmd < CMTE_FIRST_REPEAT_COMMAND) {
431 // it's a non-repeat command
432 int32_t nonrpt_i = cmd_to_count[cmd] - 1; // number of tags we still need to read
433 assert(nonrpt_i % 2 == 0); // sanity
434 for (; nonrpt_i > 0; nonrpt_i -= 2) {
435 if (__improbable(in_offset >= in_size)) {
436 return false; // no more bytes to read (encoding error)
437 }
438 in_offset++;
439 }
440 } else {
441 // it's a repeat command, only 1 byte
442 }
443 }
444 return true;
445 }
446
447 void
vm_mte_rle_runs_histogram(uint8_t * tags_in_buf,uint32_t in_size,struct runs_histogram * hist)448 vm_mte_rle_runs_histogram(uint8_t *tags_in_buf, uint32_t in_size, struct runs_histogram *hist)
449 {
450 assert(in_size == C_MTE_SIZE);
451
452 struct run_length_state s = {};
453 struct run_length_state *s_ptr = &s; // avoid using __block which prevents some optimizations
454
455 // this is done in a block since it needs to happen in the loop that goes over nibbles and at the end of the loop
456 // return false if we ran out of output buffer
457 void (^process_tag)() = ^void () {
458 if (s_ptr->cur_count == 0) {
459 return; // first time
460 }
461 assert(s_ptr->cur_count <= VM_MTE_C_MAX_TAG_RUN);
462 hist->rh_bins[s_ptr->cur_count]++;
463 }; // process_tag()
464
465 // first level goes over bytes
466 for (uint32_t byte_i = 0; byte_i < in_size; ++byte_i) {
467 uint8_t b = tags_in_buf[byte_i];
468 uint8_t tag = b & 0xF;
469 // second level iterates over 4 bit nibbles
470 for (int nibble_i = 0; nibble_i < 2; ++nibble_i) {
471 // third level counts the run length of equal nibbles
472 if (tag == s.cur_tag) {
473 // add to the current run-length
474 ++s.cur_count;
475 } else {
476 // previous run-length finished, process it
477 // fourth level decides what to do with each run-length
478 process_tag();
479
480 // done handling the just-ended run, now create a new run in cur
481 s.cur_tag = tag;
482 s.cur_count = 1;
483 }
484 tag = b >> 4;
485 }
486 }
487
488 // process anything remaining in cur or in scratch
489 process_tag();
490 }
491
492
493 #endif /* DEVELOPMENT || DEBUG */
494
495 #ifndef COMPRESSOR_TESTER // compressor_tags_stats is not defined in the tester
496
497 SCALABLE_COUNTER_DEFINE(compressor_tagged_pages_compressed);
498 // different reasons why tagged pages were removed from the compressor
499 SCALABLE_COUNTER_DEFINE(compressor_tagged_pages_decompressed);
500 SCALABLE_COUNTER_DEFINE(compressor_tagged_pages_freed);
501 SCALABLE_COUNTER_DEFINE(compressor_tagged_pages_corrupted);
502 // current number of bytes taken by compressed tags in the compressor
503 SCALABLE_COUNTER_DEFINE(compressor_tags_overhead_bytes);
504 // current number of tagged pages that reside in the compressor
505 SCALABLE_COUNTER_DEFINE(compressor_tagged_pages);
506 // current number of tag storage pages composing the compressor pool
507 SCALABLE_COUNTER_DEFINE(compressor_tag_storage_pages_in_pool);
508 // current number of non-tag storage pages composing the compressor pool
509 SCALABLE_COUNTER_DEFINE(compressor_non_tag_storage_pages_in_pool);
510 // the following is a breakdown of tagged_pages_compressed
511 #if DEVELOPMENT || DEBUG
512 SCALABLE_COUNTER_DEFINE(compressor_tags_all_zero);
513 SCALABLE_COUNTER_DEFINE(compressor_tags_same_value);
514 SCALABLE_COUNTER_DEFINE(compressor_tags_below_align);
515 SCALABLE_COUNTER_DEFINE(compressor_tags_above_align);
516 SCALABLE_COUNTER_DEFINE(compressor_tags_incompressible);
517 #endif /* DEVELOPMENT || DEBUG */
518
519 #if DEVELOPMENT || DEBUG
520 // don't want to expose in release counters which may indirectly expose tag information
521 #define debug_counter_inc(c) counter_inc(c)
522 #else /* DEVELOPMENT || DEBUG */
523 #define debug_counter_inc(c)
524 #endif /* DEVELOPMENT || DEBUG */
525
526 void
vm_mte_tags_stats_compressed(uint32_t size_written)527 vm_mte_tags_stats_compressed(uint32_t size_written)
528 {
529 counter_inc(&compressor_tagged_pages_compressed);
530 bool add_overhead = false;
531
532 // 64 bytes is the minimum space compressed tags can take since that's the alignment requirement of wkdmc output
533 // so even if the tags are compressed to 1 byte, it's still going to take 64 bytes in the segment
534 if (size_written < C_SEG_OFFSET_ALIGNMENT_BOUNDARY) {
535 debug_counter_inc(&compressor_tags_below_align);
536 add_overhead = true;
537 } else if (size_written == C_MTE_SIZE) {
538 debug_counter_inc(&compressor_tags_incompressible);
539 add_overhead = true;
540 } else if (size_written == C_MTE_SIZE + 1) {
541 // same-value indicator + tag == 0
542 debug_counter_inc(&compressor_tags_all_zero);
543 } else if (size_written > C_MTE_SIZE) {
544 // just same-value indicator
545 debug_counter_inc(&compressor_tags_same_value);
546 } else {
547 debug_counter_inc(&compressor_tags_above_align);
548 add_overhead = true;
549 }
550
551 if (add_overhead) {
552 uint64_t rounded = C_SEG_ROUND_TO_ALIGNMENT(size_written);
553 counter_add(&compressor_tags_overhead_bytes, rounded);
554 }
555 counter_inc(&compressor_tagged_pages);
556 }
557
// Update statistics when a tagged page leaves the compressor.
// mte_size is the value originally returned by vm_mte_rle_compress_tags() for this page.
void
vm_mte_tags_stats_removed(uint32_t mte_size, vm_mte_c_tags_removal_reason_t reason)
{
	// account for why the tagged page was removed
	switch (reason) {
	case VM_MTE_C_TAGS_REMOVAL_DECOMPRESSED:
		counter_inc(&compressor_tagged_pages_decompressed);
		break;
	case VM_MTE_C_TAGS_REMOVAL_FREE:
		counter_inc(&compressor_tagged_pages_freed);
		break;
	case VM_MTE_C_TAGS_REMOVAL_CORRUPT:
		counter_inc(&compressor_tagged_pages_corrupted);
		break;
	default:
		panic("Unexpected compressor tags removal reason %u", reason);
	}

	if (mte_size <= C_MTE_SIZE) { // same-tag optimization doesn't take any buffer space
		// undo the rounded-up overhead vm_mte_tags_stats_compressed() added for this page
		uint64_t rounded = C_SEG_ROUND_TO_ALIGNMENT(mte_size);
		counter_add(&compressor_tags_overhead_bytes, -(uint64_t)rounded);
	}
	counter_dec(&compressor_tagged_pages);
}
581
582 #endif // COMPRESSOR_TESTER
583 #endif // HAS_MTE
584