xref: /xnu-8020.121.3/tests/vm/fault_throughput.c (revision fdd8201d7b966f0c3ea610489d29bd841d358941)
1 /*
2  * Benchmark VM fault throughput.
3  * This test faults memory for a configurable amount of time across a
4  * configurable number of threads. Currently it only measures zero fill faults.
5  * Currently it supports two variants:
6  * 1. Each thread gets its own vm objects to fault in
7  * 2. Threads share vm objects
8  *
9  * We'll add more fault types as we identify problematic user-facing workloads
10  * in macro benchmarks.
11  *
12  * Throughput is reported as pages / second using both wall time and cpu time.
13  * CPU time is a more reliable metric for regression testing, but wall time can
14  * highlight blocking in the VM.
15  *
16  * Running this benchmark directly is not recommended.
17  * Use fault_throughput.lua which provides a nicer interface and outputs
18  * perfdata.
19  */
20 #include <assert.h>
21 #include <ctype.h>
22 #include <errno.h>
23 #include <stdarg.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <strings.h>
27 
28 #include <sys/mman.h>
29 #include <sys/types.h>
30 #include <sys/sysctl.h>
31 
32 /*
33  * TODO: Make this benchmark runnable on linux so we can do a perf comparison.
34  * We're mostly using POSIX APIs, but we'll need to replace
35  * the sysctls with the /proc equivalents, and replace clock_gettime_nsec_np
36  * with the linux equivalent.
37  */
38 #include <mach/mach.h>
39 
40 #include <TargetConditionals.h>
41 
42 #include <pthread.h>
43 #include <stdatomic.h>
44 
45 #include "benchmark/helpers.h"
46 
47 #if (TARGET_OS_OSX || TARGET_OS_SIMULATOR)
48 /*
49  * On non-embedded platforms we coalesce vm objects up to 128 MB, so
50  * we make the objects 128 MB on that platform to ensure they're not
51  * merged with anything else.
52  */
53 const static size_t kVmObjectSize = 128 * (1UL << 20);
54 #else
55 /*
56  * Embedded platforms don't coalesce vm objects. This number
57  * needs to be big enough that faulting it in dwarfs the cost of dequeuing
58  * it from the work queue, but can't be too large or else we won't be able
59  * to allocate one per thread in the separate-objects benchmark.
60  */
61 const static size_t kVmObjectSize = 4 * (1UL << 20);
62 #endif /* (TARGET_OS_OSX || TARGET_OS_SIMULATOR) */
63 static const clockid_t kWallTimeClock = CLOCK_MONOTONIC_RAW;
64 static const clockid_t kThreadCPUTimeClock = CLOCK_THREAD_CPUTIME_ID;
65 /* These globals are set dynamically during test setup based on sysctls. */
66 static uint64_t kCacheLineSize = 0;
67 /* The VM page size */
68 static size_t kPageSize = 0;
69 
70 
/* A contiguous chunk of memory for a worker thread to fault in. */
typedef struct fault_buffer {
	unsigned char* fb_start; /* The start of this buffer. */
	size_t fb_size; /* The size of this buffer in bytes. */
} fault_buffer_t;

/* Which benchmark variant to run (see the file header comment). */
typedef enum test_variant {
	VARIANT_SEPARATE_VM_OBJECTS, /* Each thread faults in its own vm objects. */
	VARIANT_SHARE_VM_OBJECTS /* Threads share vm objects. */
} test_variant_t;

/* State shared between the main thread and all worker threads. */
typedef struct test_globals {
	/* This lock protects: tg_cv, tg_running_count, tg_done, tg_current_iteration, and tg_iterations_completed. */
	pthread_mutex_t tg_lock;
	pthread_cond_t tg_cv;
	/* The number of currently running threads */
	unsigned int tg_running_count;
	/* Set during cleanup to indicate that the benchmark is over. */
	bool tg_done;
	size_t tg_current_iteration;
	size_t tg_iterations_completed;
	unsigned int tg_num_threads;
	test_variant_t tg_variant;
	/* Whether workers bind to cpus (set from the optional first_cpu argument). */
	bool pin_threads;
	/*
	 * An array of memory objects to fault in.
	 * This is basically a workqueue of
	 * contiguous chunks of memory that the worker threads
	 * will fault in.
	 */
	fault_buffer_t *tg_fault_buffer_arr;
	size_t tg_fault_buffer_arr_length;
	/*
	 * To avoid false sharing, we pad the test globals with an extra cache line and place the atomic
	 * next_fault_buffer_index size_t after the cache line.
	 * The padding length is only known at runtime (hw.cachelinesize sysctl),
	 * so the atomic is reached via next_fault_buffer_index_ptr() rather than
	 * a named member.
	 */
	__unused char padding[];
	/*
	 * This field is directly after the padding buffer.
	 * It is used to synchronize access to tg_fault_buffer_arr.
	 */
	//_Atomic size_t tg_next_fault_buffer_index;
} test_globals_t;

/* Per-worker-thread startup arguments. */
typedef struct {
	void *test_globals; /* The test_globals_t for this run. */
	uint32_t cpu_id; /* Target cpu; only meaningful when pin_threads is set. */
} faulting_thread_args_t;

/* Allocated in spawn_worker_threads; freed in join_background_threads. */
static faulting_thread_args_t *faulting_thread_args;

/* Command-line names for the two test variants. */
static const char* kSeparateObjectsArgument = "separate-objects";
static const char* kShareObjectsArgument = "share-objects";

/* Arguments parsed from the command line */
typedef struct test_args {
	uint32_t n_threads;
	uint32_t first_cpu; /* Only meaningful when pin_threads is true. */
	uint64_t duration_seconds;
	test_variant_t variant;
	bool pin_threads;
	bool verbose;
} test_args_t;
133 
134 /*
135  * Fault in the pages in the given buffer.
136  */
137 static void fault_pages(fault_buffer_t *buffer, size_t stride);
138 /* Get a unique fault buffer from the global work queue. */
139 static fault_buffer_t *get_fault_buffer(test_globals_t* globals);
140 /*
141  * Grabs buffers from the global test structure and faults them in, using this
142  * test variant's stride, until there are no more buffers to grab.
143  * Returns the number of microseconds spent on-cpu.
144  */
145 static uint64_t grab_and_fault_pages(test_globals_t* globals);
146 
147 static bool worker_thread_iteration_setup(size_t current_iteration, test_globals_t *globals);
148 static void worker_thread_iteration_complete(test_globals_t *globals);
149 
150 static void parse_arguments(int argc, char **argv, test_args_t *args);
151 /*
152  * Sets up the test globals and spawns the background threads to do the faults.
153  * Returns an array of size `num_threads`
154  * Containing the thread ids of the forked threads.
155  */
156 static pthread_t* setup_test(test_globals_t *globals, const test_args_t *args, size_t memory_size, bool verbose);
157 static test_globals_t *allocate_test_globals(void);
158 /* Initializes variables in the globals array. */
159 static void init_globals(test_globals_t *globals, const test_args_t *args);
160 static inline _Atomic size_t *next_fault_buffer_index_ptr(test_globals_t *globals);
161 /*
162  * Called on the main thread.
163  * Waits for the background threads to be ready, sets up the memory objects,
164  * and then starts a faulting iteration.
165  * Returns the start (wall) time.
166  */
167 static uint64_t start_iteration(test_globals_t* globals, test_variant_t variant, bool verbose);
168 /*
169  * Called on the main thread.
170  * Waits for the background threads to complete the iteration and cleans up.
171  * Returns the total amount of time spent faulting pages in nanoseconds by all threads thus far.
172  */
173 static uint64_t finish_iteration(test_globals_t *globals, uint64_t start_time);
174 /*
175  * Called on the main thread.
176  * Maps buffers and places them in the work queue.
177  */
178 static void setup_memory(test_globals_t* globals, test_variant_t variant);
179 /*
180  * Dump test results as a csv to stdout.
181  * Use fault_throughput.lua to convert to perfdata.
182  */
183 static void output_results(const test_globals_t *globals, double walltime_elapsed_seconds, double cputime_elapsed_seconds);
184 static void cleanup_test(test_globals_t *globals);
185 /*
186  * Join the background threads and return the total microseconds
187  * of cpu time spent faulting across all of the threads.
188  * Takes ownership of the threads array and frees it.
189  */
190 static uint64_t join_background_threads(test_globals_t *globals, pthread_t *threads);
191 static void unmap_fault_buffers(test_globals_t *globals);
192 /*
193  * Get the stride between each vm object in the fault buffer array.
194  */
195 static size_t fault_buffer_stride(const test_globals_t *globals);
196 
197 int
main(int argc,char ** argv)198 main(int argc, char **argv)
199 {
200 	/* How much memory should the test consume (per-core on the system)? */
201 #if (TARGET_OS_OSX || TARGET_OS_SIMULATOR)
202 	static const size_t memory_per_core = kVmObjectSize;
203 #else
204 	static const size_t memory_per_core = 25 * (1UL << 20);
205 #endif /* (TARGET_OS_OSX || TARGET_OS_SIMULATOR) */
206 	const size_t kMemSize = memory_per_core * (size_t) get_ncpu();
207 	test_globals_t *globals = allocate_test_globals();
208 	/* Total wall-time spent faulting in pages. */
209 	uint64_t wall_time_elapsed_ns = 0;
210 	/* Total cpu-time spent faulting in pages */
211 	uint64_t cpu_time_faulting_us = 0;
212 	uint64_t start_time_ns;
213 	test_args_t args;
214 	parse_arguments(argc, argv, &args);
215 	pthread_t* threads = setup_test(globals, &args, kMemSize, args.verbose);
216 
217 	/* Keep doing more iterations until we've hit our (wall) time budget */
218 	while (wall_time_elapsed_ns < args.duration_seconds * kNumNanosecondsInSecond) {
219 		benchmark_log(args.verbose, "----Starting Iteration %lu-----\n", globals->tg_current_iteration + 1);
220 		start_time_ns = start_iteration(globals, args.variant, args.verbose);
221 		wall_time_elapsed_ns += finish_iteration(globals, start_time_ns);
222 		benchmark_log(args.verbose, "----Completed Iteration %lu----\n", globals->tg_current_iteration);
223 	}
224 
225 	benchmark_log(args.verbose, "Hit time budget\nJoining worker threads\n");
226 	cpu_time_faulting_us = join_background_threads(globals, threads);
227 	benchmark_log(args.verbose, "----End Test Output----\n");
228 	output_results(globals, (double) wall_time_elapsed_ns / kNumNanosecondsInSecond,
229 	    (double)cpu_time_faulting_us / kNumMicrosecondsInSecond);
230 	cleanup_test(globals);
231 
232 	return 0;
233 }
234 
235 
/* The main loop for the worker threads. */
static void*
faulting_thread(void* arg)
{
	test_globals_t* globals = ((faulting_thread_args_t *)arg)->test_globals;
	/* Accumulated on-cpu time from grab_and_fault_pages, returned via pthread_join. */
	uint64_t on_cpu_time_faulting = 0;
	size_t current_iteration = 1;

	if (globals->pin_threads) {
		/* Bind this thread to the cpu chosen in spawn_worker_threads. */
		uint32_t cpu_id = ((faulting_thread_args_t *)arg)->cpu_id;
		int err = sysctlbyname("kern.sched_thread_bind_cpu", NULL, 0, &cpu_id, sizeof(cpu_id));
		assert(err == 0);
	}

	while (true) {
		/* Block until the main thread starts the next iteration (or ends the test). */
		bool should_continue = worker_thread_iteration_setup(current_iteration, globals);
		if (!should_continue) {
			break;
		}
		on_cpu_time_faulting += grab_and_fault_pages(globals);
		worker_thread_iteration_complete(globals);
		current_iteration++;
	}
	/* Smuggle the 64-bit time total back through the thread's exit value. */
	return (void*)on_cpu_time_faulting;
}
261 
/*
 * Called on the worker threads before each iteration to synchronize this
 * iteration start with the other threads.
 * Returns true if the iteration should continue, and false if the test is over.
 */
static bool
worker_thread_iteration_setup(size_t current_iteration, test_globals_t *globals)
{
	bool should_continue = false;
	int ret = 0;
	// Gate on the other threads being ready to start
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	globals->tg_running_count++;
	if (globals->tg_running_count == globals->tg_num_threads) {
		// All the worker threads are running.
		// Wake up the main thread so that it can ungate the test.
		ret = pthread_cond_broadcast(&globals->tg_cv);
		assert(ret == 0);
	}
	/*
	 * The main thread will start this iteration by incrementing
	 * tg_current_iteration. Block until that happens.
	 * See start_iteration for the wakeup code.
	 * tg_done short-circuits the wait so workers can exit at test end
	 * (set by join_background_threads).
	 */
	while (!globals->tg_done && globals->tg_current_iteration != current_iteration) {
		ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
		assert(ret == 0);
	}
	should_continue = !globals->tg_done;
	ret = pthread_mutex_unlock(&globals->tg_lock);
	assert(ret == 0);
	return should_continue;
}
296 
/*
 * Called on the worker threads before each iteration finishes to synchronize
 * with the other threads.
 */
static void
worker_thread_iteration_complete(test_globals_t *globals)
{
	int ret;
	// Mark ourselves as done and wait for the other threads to finish
	ret = pthread_mutex_lock(&globals->tg_lock);
	assert(ret == 0);
	globals->tg_running_count--;
	if (globals->tg_running_count == 0) {
		// We're the last one to finish. Mark this iteration as completed and wake everyone up.
		// This broadcast also wakes the main thread blocked in finish_iteration.
		globals->tg_iterations_completed++;
		ret = pthread_cond_broadcast(&globals->tg_cv);
		assert(ret == 0);
	} else {
		// Others are running. Wait for them to finish.
		while (globals->tg_iterations_completed != globals->tg_current_iteration) {
			ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
			assert(ret == 0);
		}
	}
	ret = pthread_mutex_unlock(&globals->tg_lock);
	assert(ret == 0);
}
324 
325 static void
fault_pages(fault_buffer_t * buffer,size_t stride)326 fault_pages(fault_buffer_t *buffer, size_t stride)
327 {
328 	volatile unsigned char val;
329 	for (unsigned char* ptr = buffer->fb_start; ptr < buffer->fb_start + buffer->fb_size; ptr += stride) {
330 		val = *ptr;
331 	}
332 }
333 
334 static fault_buffer_t *
get_fault_buffer(test_globals_t * globals)335 get_fault_buffer(test_globals_t* globals)
336 {
337 	size_t index = atomic_fetch_add_explicit(next_fault_buffer_index_ptr(globals), 1UL, memory_order_acq_rel);
338 	if (index < globals->tg_fault_buffer_arr_length) {
339 		return &globals->tg_fault_buffer_arr[index];
340 	}
341 	return NULL;
342 }
343 
344 static uint64_t
grab_and_fault_pages(test_globals_t * globals)345 grab_and_fault_pages(test_globals_t* globals)
346 {
347 	struct timespec start_time, end_time;
348 	uint64_t nanoseconds_faulting_on_cpu = 0;
349 	int ret;
350 	size_t stride = fault_buffer_stride(globals) * kPageSize;
351 	while (true) {
352 		fault_buffer_t *object = get_fault_buffer(globals);
353 		if (object == NULL) {
354 			break;
355 		}
356 		ret = clock_gettime(kThreadCPUTimeClock, &start_time);
357 		assert(ret == 0);
358 
359 		fault_pages(object, stride);
360 
361 		ret = clock_gettime(kThreadCPUTimeClock, &end_time);
362 		assert(ret == 0);
363 		nanoseconds_faulting_on_cpu += (unsigned long) timespec_difference_us(&end_time, &start_time);
364 	}
365 	return nanoseconds_faulting_on_cpu;
366 }
367 
368 static uint64_t
start_iteration(test_globals_t * globals,test_variant_t variant,bool verbose)369 start_iteration(test_globals_t* globals, test_variant_t variant, bool verbose)
370 {
371 	int ret;
372 	uint64_t start_time;
373 	ret = pthread_mutex_lock(&globals->tg_lock);
374 	assert(ret == 0);
375 	benchmark_log(verbose, "Waiting for workers to catch up before starting next iteration.\n");
376 	/* Wait until all the threads are ready to go to the next iteration */
377 	while (globals->tg_running_count != globals->tg_num_threads) {
378 		ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
379 	}
380 	benchmark_log(verbose, "Workers are all caught up\n");
381 	setup_memory(globals, variant);
382 	benchmark_log(verbose, "Initialized data structures for iteration. Waking workers.\n");
383 	/* Grab a timestamp, tick the current iteration, and wake up the worker threads */
384 	start_time = current_timestamp_ns();
385 	globals->tg_current_iteration++;
386 	ret = pthread_mutex_unlock(&globals->tg_lock);
387 	assert(ret == 0);
388 	ret = pthread_cond_broadcast(&globals->tg_cv);
389 	assert(ret == 0);
390 	return start_time;
391 }
392 
393 static uint64_t
finish_iteration(test_globals_t * globals,uint64_t start_time)394 finish_iteration(test_globals_t* globals, uint64_t start_time)
395 {
396 	int ret;
397 	uint64_t end_time;
398 	ret = pthread_mutex_lock(&globals->tg_lock);
399 	assert(ret == 0);
400 	while (globals->tg_iterations_completed != globals->tg_current_iteration) {
401 		ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock);
402 	}
403 	end_time = current_timestamp_ns();
404 	ret = pthread_mutex_unlock(&globals->tg_lock);
405 	unmap_fault_buffers(globals);
406 	assert(ret == 0);
407 	return end_time - start_time;
408 }
409 
/*
 * Called on the main thread (with tg_lock held by start_iteration).
 * Maps buffers, places them in the work queue, and resets the queue index.
 */
static void
setup_memory(test_globals_t* globals, test_variant_t variant)
{
	size_t stride = fault_buffer_stride(globals);
	for (size_t i = 0; i < globals->tg_fault_buffer_arr_length; i += stride) {
		fault_buffer_t *object = &globals->tg_fault_buffer_arr[i];
		/* mmap_buffer (benchmark/helpers.h) maps a fresh region per slot. */
		object->fb_start = mmap_buffer(kVmObjectSize);
		object->fb_size = kVmObjectSize;
		if (variant == VARIANT_SHARE_VM_OBJECTS) {
			/*
			 * Insert another buffer into the work queue for each thread.
			 * Each buffer starts 1 page past where the previous buffer started into the vm object.
			 * Since each thread strides by the number of threads * the page size they won't fault in the same pages.
			 */
			for (size_t j = 1; j < globals->tg_num_threads; j++) {
				size_t offset = kPageSize * j;
				fault_buffer_t *offset_object = &globals->tg_fault_buffer_arr[i + j];
				offset_object->fb_start = object->fb_start + offset;
				offset_object->fb_size = object->fb_size - offset;
			}
		} else if (variant != VARIANT_SEPARATE_VM_OBJECTS) {
			fprintf(stderr, "Unknown test variant.\n");
			exit(2);
		}
	}
	/* Publish the reset index so workers can start claiming buffers. */
	atomic_store_explicit(next_fault_buffer_index_ptr(globals), 0, memory_order_release);
}
437 
438 static void
unmap_fault_buffers(test_globals_t * globals)439 unmap_fault_buffers(test_globals_t* globals)
440 {
441 	size_t stride = fault_buffer_stride(globals);
442 	for (size_t i = 0; i < globals->tg_fault_buffer_arr_length; i += stride) {
443 		fault_buffer_t *buffer = &globals->tg_fault_buffer_arr[i];
444 		int res = munmap(buffer->fb_start, buffer->fb_size);
445 		assert(res == 0);
446 	}
447 }
448 
/*
 * Allocate the shared test_globals_t plus a cache line of padding and the
 * trailing _Atomic size_t work-queue index (see next_fault_buffer_index_ptr).
 * Lazily initializes kCacheLineSize and kPageSize from sysctls.
 */
static test_globals_t *
allocate_test_globals(void) /* Fix: use (void) to match the prototype. */
{
	test_globals_t *globals = NULL;
	int ret;
	if (kCacheLineSize == 0) {
		size_t cachelinesize_size = sizeof(kCacheLineSize);
		ret = sysctlbyname("hw.cachelinesize", &kCacheLineSize, &cachelinesize_size, NULL, 0);
		assert(ret == 0);
		assert(kCacheLineSize > 0);
	}
	if (kPageSize == 0) {
		size_t pagesize_size = sizeof(kPageSize);
		ret = sysctlbyname("vm.pagesize", &kPageSize, &pagesize_size, NULL, 0);
		assert(ret == 0);
		assert(kPageSize > 0);
	}
	/* Room for the struct, a full cache line of padding, and the atomic index. */
	size_t test_globals_size = sizeof(test_globals_t) + kCacheLineSize + sizeof(_Atomic size_t);
	/* calloc zero-fills, replacing the original malloc + memset pair. */
	globals = calloc(1, test_globals_size);
	assert(globals != NULL);
	return globals;
}
472 
/*
 * Initialize synchronization primitives and copy the parsed arguments into
 * the globals. NB: zeroes the whole struct, so this must run before
 * init_fault_buffer_arr (setup_test calls them in that order).
 */
static void
init_globals(test_globals_t *globals, const test_args_t *args)
{
	pthread_mutexattr_t mutex_attrs;
	pthread_condattr_t cond_attrs;
	int ret;
	memset(globals, 0, sizeof(test_globals_t));

	/* Default attributes; the attr objects are destroyed once init is done. */
	ret = pthread_mutexattr_init(&mutex_attrs);
	assert(ret == 0);
	ret = pthread_mutex_init(&globals->tg_lock, &mutex_attrs);
	assert(ret == 0);
	ret = pthread_condattr_init(&cond_attrs);
	assert(ret == 0);
	ret = pthread_cond_init(&globals->tg_cv, &cond_attrs);
	assert(ret == 0);
	ret = pthread_mutexattr_destroy(&mutex_attrs);
	assert(ret == 0);
	ret = pthread_condattr_destroy(&cond_attrs);
	assert(ret == 0);

	globals->tg_num_threads = args->n_threads;
	globals->tg_variant = args->variant;
	globals->pin_threads = args->pin_threads;
}
498 
499 static void
init_fault_buffer_arr(test_globals_t * globals,const test_args_t * args,size_t memory_size)500 init_fault_buffer_arr(test_globals_t *globals, const test_args_t *args, size_t memory_size)
501 {
502 	if (args->variant == VARIANT_SEPARATE_VM_OBJECTS) {
503 		// This variant creates separate vm objects up to memory size bytes total
504 		globals->tg_fault_buffer_arr_length = memory_size / kVmObjectSize;
505 	} else if (args->variant == VARIANT_SHARE_VM_OBJECTS) {
506 		// This variant creates separate vm objects up to memory size bytes total
507 		// And places a pointer into each vm object for each thread.
508 		globals->tg_fault_buffer_arr_length = memory_size / kVmObjectSize * globals->tg_num_threads;
509 	} else {
510 		fprintf(stderr, "Unsupported test variant.\n");
511 		exit(2);
512 	}
513 	// It doesn't make sense to have more threads than elements in the work queue.
514 	// NB: Since we scale memory_size by ncpus, this can only happen if the user
515 	// tries to run the benchmark with many more threads than cores.
516 	assert(globals->tg_fault_buffer_arr_length >= globals->tg_num_threads);
517 	globals->tg_fault_buffer_arr = calloc(sizeof(fault_buffer_t), globals->tg_fault_buffer_arr_length);
518 	assert(globals->tg_fault_buffer_arr);
519 }
520 
521 static pthread_t *
spawn_worker_threads(test_globals_t * globals,unsigned int num_threads,unsigned int first_cpu)522 spawn_worker_threads(test_globals_t *globals, unsigned int num_threads, unsigned int first_cpu)
523 {
524 	int ret;
525 	pthread_attr_t pthread_attrs;
526 	globals->tg_num_threads = num_threads;
527 	pthread_t* threads = malloc(sizeof(pthread_t) * num_threads);
528 	faulting_thread_args = malloc(sizeof(faulting_thread_args_t) * num_threads);
529 	assert(threads);
530 	ret = pthread_attr_init(&pthread_attrs);
531 	assert(ret == 0);
532 	// Spawn the background threads
533 	for (unsigned int i = 0; i < num_threads; i++) {
534 		if (globals->pin_threads) {
535 			faulting_thread_args[i].cpu_id = (i + first_cpu) % get_ncpu();
536 		}
537 		faulting_thread_args[i].test_globals = globals;
538 		ret = pthread_create(threads + i, &pthread_attrs, faulting_thread, &faulting_thread_args[i]);
539 		assert(ret == 0);
540 	}
541 	ret = pthread_attr_destroy(&pthread_attrs);
542 	assert(ret == 0);
543 	return threads;
544 }
545 
546 static pthread_t*
setup_test(test_globals_t * globals,const test_args_t * args,size_t memory_size,bool verbose)547 setup_test(test_globals_t *globals, const test_args_t *args, size_t memory_size, bool verbose)
548 {
549 	init_globals(globals, args);
550 	init_fault_buffer_arr(globals, args, memory_size);
551 	benchmark_log(verbose, "Initialized global data structures.\n");
552 	pthread_t *workers = spawn_worker_threads(globals, args->n_threads, args->first_cpu);
553 	benchmark_log(verbose, "Spawned workers.\n");
554 	return workers;
555 }
556 
557 static uint64_t
join_background_threads(test_globals_t * globals,pthread_t * threads)558 join_background_threads(test_globals_t *globals, pthread_t *threads)
559 {
560 	// Set the done flag so that the background threads exit
561 	int ret;
562 	uint64_t total_cputime_spent_faulting = 0;
563 	ret = pthread_mutex_lock(&globals->tg_lock);
564 	assert(ret == 0);
565 	globals->tg_done = true;
566 	ret = pthread_cond_broadcast(&globals->tg_cv);
567 	assert(ret == 0);
568 	ret = pthread_mutex_unlock(&globals->tg_lock);
569 	assert(ret == 0);
570 
571 	// Join the background threads
572 	for (unsigned int i = 0; i < globals->tg_num_threads; i++) {
573 		uint64_t cputime_spent_faulting = 0;
574 		ret = pthread_join(threads[i], (void **)&cputime_spent_faulting);
575 		assert(ret == 0);
576 		total_cputime_spent_faulting += cputime_spent_faulting;
577 	}
578 	free(threads);
579 	free(faulting_thread_args);
580 	return total_cputime_spent_faulting;
581 }
582 
583 static void
cleanup_test(test_globals_t * globals)584 cleanup_test(test_globals_t* globals)
585 {
586 	int ret;
587 	ret = pthread_mutex_destroy(&globals->tg_lock);
588 	assert(ret == 0);
589 	ret = pthread_cond_destroy(&globals->tg_cv);
590 	assert(ret == 0);
591 	free(globals->tg_fault_buffer_arr);
592 	free(globals);
593 }
594 
/*
 * Dump test results as a csv to stdout.
 * Use fault_throughput.lua to convert to perfdata.
 */
static void
output_results(const test_globals_t* globals, double walltime_elapsed_seconds, double cputime_elapsed_seconds)
{
	size_t pgsize;
	size_t sysctl_size = sizeof(pgsize);
	int ret = sysctlbyname("vm.pagesize", &pgsize, &sysctl_size, NULL, 0);
	assert(ret == 0);
	/* Count the pages faulted in one iteration, then scale by iteration count. */
	size_t pages_per_iteration = 0;
	const size_t step = fault_buffer_stride(globals);
	for (size_t slot = 0; slot < globals->tg_fault_buffer_arr_length; slot += step) {
		pages_per_iteration += globals->tg_fault_buffer_arr[slot].fb_size / pgsize;
	}
	size_t num_pages = pages_per_iteration * globals->tg_iterations_completed;
	double walltime_throughput = num_pages / walltime_elapsed_seconds;
	double cputime_throughput = num_pages / cputime_elapsed_seconds;
	printf("-----Results-----\n");
	printf("Throughput (pages / wall second), Throughput (pages / CPU second)\n");
	printf("%f,%f\n", walltime_throughput, cputime_throughput);
}
615 
616 static void
print_help(char ** argv)617 print_help(char** argv)
618 {
619 	fprintf(stderr, "%s: <test-variant> [-v] duration num_threads\n", argv[0]);
620 	fprintf(stderr, "\ntest variants:\n");
621 	fprintf(stderr, "	%s	Fault in different vm objects in each thread.\n", kSeparateObjectsArgument);
622 	fprintf(stderr, "	%s		Share vm objects across faulting threads.\n", kShareObjectsArgument);
623 }
624 
/*
 * Parse command line arguments:
 *   <variant> [-v] duration num_threads [first_cpu]
 * Exits with usage output on malformed input.
 */
static void
parse_arguments(int argc, char** argv, test_args_t *args)
{
	int current_argument = 1;
	memset(args, 0, sizeof(test_args_t));
	/* program + variant + duration + threads = 4 minimum; -v and first_cpu are optional. */
	if (argc < 4 || argc > 6) {
		print_help(argv);
		exit(1);
	}
	if (argv[current_argument][0] == '-') {
		if (strcmp(argv[current_argument], "-v") == 0) {
			args->verbose = true;
		} else {
			fprintf(stderr, "Unknown argument %s\n", argv[current_argument]);
			print_help(argv);
			exit(1);
		}
		current_argument++;
	}
	/* Case-insensitive prefix match on the variant name. */
	if (strncasecmp(argv[current_argument], kSeparateObjectsArgument, strlen(kSeparateObjectsArgument)) == 0) {
		args->variant = VARIANT_SEPARATE_VM_OBJECTS;
	} else if (strncasecmp(argv[current_argument], kShareObjectsArgument, strlen(kShareObjectsArgument)) == 0) {
		args->variant = VARIANT_SHARE_VM_OBJECTS;
	} else {
		print_help(argv);
		exit(1);
	}
	current_argument++;

	/* NB: strtol returning 0 (including for non-numeric input) is rejected. */
	long duration = strtol(argv[current_argument++], NULL, 10);
	if (duration == 0) {
		print_help(argv);
		exit(1);
	}
	long num_cores = strtol(argv[current_argument++], NULL, 10);
	if (num_cores == 0) {
		print_help(argv);
		exit(1);
	}
	/* An optional trailing argument pins worker threads starting at this cpu. */
	if (current_argument < argc) {
		long first_cpu = strtol(argv[current_argument++], NULL, 10);
		assert(first_cpu >= 0 && first_cpu < get_ncpu());
		args->pin_threads = true;
		args->first_cpu = (unsigned int) first_cpu;
	} else {
		args->pin_threads = false;
	}

	assert(num_cores > 0 && num_cores <= get_ncpu());
	args->n_threads = (unsigned int) num_cores;
	args->duration_seconds = (unsigned long) duration;
}
677 
678 static inline
679 _Atomic size_t *
next_fault_buffer_index_ptr(test_globals_t * globals)680 next_fault_buffer_index_ptr(test_globals_t *globals)
681 {
682 	return (_Atomic size_t *) (((ptrdiff_t)(globals + 1)) + (int64_t)kCacheLineSize);
683 }
684 static size_t
fault_buffer_stride(const test_globals_t * globals)685 fault_buffer_stride(const test_globals_t *globals)
686 {
687 	size_t stride;
688 	if (globals->tg_variant == VARIANT_SEPARATE_VM_OBJECTS) {
689 		stride = 1;
690 	} else if (globals->tg_variant == VARIANT_SHARE_VM_OBJECTS) {
691 		stride = globals->tg_num_threads;
692 	} else {
693 		fprintf(stderr, "Unknown variant\n");
694 		exit(-1);
695 	}
696 	return stride;
697 }
698