/*
 * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  [email protected]
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	Author:	Avadis Tevanian, Jr.
 *	Date:	1986
 *
 *	Compute various averages.
 */

#include <mach/mach_types.h>

#include <kern/sched.h>
#include <kern/assert.h>
#include <kern/processor.h>
#include <kern/thread.h>
#if CONFIG_TELEMETRY
#include <kern/telemetry.h>
#endif
#include <kern/zalloc_internal.h>

#include <sys/kdebug.h>

uint32_t        avenrun[3] = {0, 0, 0};
uint32_t        mach_factor[3] = {0, 0, 0};

uint32_t        sched_load_average, sched_mach_factor;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
/*
 * Values are scaled by LOAD_SCALE, defined in processor_info.h
 */
#define base(n)         ((n) << SCHED_TICK_SHIFT)
#define frac(n)         (((base(n) - 1) * LOAD_SCALE) / base(n))

static uint32_t         fract[3] = {
	frac(5),                /* 5 second average */
	frac(30),               /* 30 second average */
	frac(60),               /* 1 minute average */
};
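
/*
 * Worked example (assuming LOAD_SCALE == 1000 and SCHED_TICK_SHIFT == 3,
 * their values elsewhere in this tree): for the 5 second average,
 * base(5) == 5 << 3 == 40 ticks, so frac(5) == ((40 - 1) * 1000) / 40
 * == 975, i.e. the old average keeps a weight of 975/1000 each tick.
 * A minimal sketch of that arithmetic, kept out of the build:
 */
#if 0
static_assert((((5 << 3) - 1) * 1000) / (5 << 3) == 975);
#endif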

#undef base
#undef frac

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

static unsigned int             sched_nrun;

typedef void    (*sched_avg_comp_t)(
	void                    *param);

static struct sched_average {
	sched_avg_comp_t        comp;
	void                    *param;
	int                     period; /* in seconds */
	uint64_t                deadline;
} sched_average[] = {
	{ compute_averunnable, &sched_nrun, 5, 0 },
	{ compute_stack_target, NULL, 5, 1 },
	{ compute_pageout_gc_throttle, NULL, 1, 0 },
	{ compute_pmap_gc_throttle, NULL, 60, 0 },
	{ compute_zone_working_set_size, NULL, ZONE_WSS_UPDATE_PERIOD, 0 },
#if CONFIG_TELEMETRY
	{ compute_telemetry, NULL, 1, 0 },
#endif
	{ NULL, NULL, 0, 0 }
};

typedef struct sched_average    *sched_average_t;

/*
 * Scheduler load calculation algorithm
 *
 * The scheduler load values provide an estimate of the number of runnable
 * timeshare threads in the system in various priority bands. The load
 * ultimately affects the priority shifts applied to all threads in a band,
 * causing them to timeshare with other threads in the system. The load is
 * maintained in buckets, with each bucket corresponding to a priority band.
 *
 * Each runnable thread on the system contributes its load to its priority
 * band and to the bands above it. The contribution of a thread to the bands
 * above it is not strictly 1:1 and is weighted based on the priority band
 * of the thread. The rules of thread load contribution to each of its higher
 * bands are as follows:
 *
 * - DF threads: Up to (2 * NCPUs) threads
 * - UT threads: Up to NCPUs threads
 * - BG threads: Up to 1 thread
 *
 * To calculate the load values, the various run buckets are sampled (every
 * sched_load_compute_interval_abs) and the weighted contributions of the
 * lower bucket threads are added. The resultant value is plugged into an
 * exponentially weighted moving average formula:
 *      new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count
 *      (where alpha < 1)
 * The calculations for the scheduler load are done using fixpoint math with
 * a scale factor of 16 to avoid expensive divides and floating point
 * operations. The final load values are a smooth curve representative of
 * the actual number of runnable threads in a priority band.
 */
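
/*
 * A minimal sketch (not part of the build) of the fixed-point EWMA update
 * described above, assuming the 6:10 split over a scale of 1 << 4 defined
 * below; the real update is inlined in compute_sched_load().
 */
#if 0
static uint32_t
sched_load_ewma_sketch(uint32_t old_scaled, uint32_t sample)
{
	/* new = (6 * old + 10 * (sample << 4)) >> 4; result stays scaled by 16 */
	return ((old_scaled * 6) + ((sample << 4) * 10)) >> 4;
}
#endif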

/* Maintains the current (scaled for fixpoint) load in various buckets */
uint32_t sched_load[TH_BUCKET_MAX];

/*
 * Alpha factor for the EWMA algorithm. The current values are chosen as
 * 6:10 ("old load":"new samples") to make sure the scheduler reacts fast
 * enough to changing system load but does not see too many spikes from bursty
 * activity. The current values ensure that the scheduler would converge
 * to the latest load in 2-3 sched_load_compute_interval_abs intervals
 * (which amounts to ~30-45ms with current values).
 */
#define SCHED_LOAD_EWMA_ALPHA_OLD      6
#define SCHED_LOAD_EWMA_ALPHA_NEW      10
#define SCHED_LOAD_EWMA_ALPHA_SHIFT    4
static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT));

/* For fixpoint EWMA, round up the load to make it converge */
#define SCHED_LOAD_EWMA_ROUNDUP(load)   (((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0)

/* Macro to convert scaled sched load to a real load value */
#define SCHED_LOAD_EWMA_UNSCALE(load)   (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load))
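
/*
 * Worked example: a scaled load of 31 (0b11111) has bit 3 set, so
 * SCHED_LOAD_EWMA_ROUNDUP(31) == 1 and SCHED_LOAD_EWMA_UNSCALE(31) ==
 * (31 >> 4) + 1 == 2, while 16..23 unscale to 1 without the round-up.
 */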

/*
 * Routine to capture the latest runnable counts and update sched_load (only used for non-clutch schedulers)
 */
void
compute_sched_load(void)
{
	/*
	 * Retrieve a snapshot of the current run counts.
	 *
	 * Why not a bcopy()? Because we need atomic word-sized reads of
	 * sched_run_buckets, not a byte-by-byte copy.
	 */
	uint32_t ncpus = processor_avail_count;
	uint32_t load_now[TH_BUCKET_MAX];

	load_now[TH_BUCKET_RUN]      = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
	load_now[TH_BUCKET_FIXPRI]   = os_atomic_load(&sched_run_buckets[TH_BUCKET_FIXPRI], relaxed);
	load_now[TH_BUCKET_SHARE_FG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_FG], relaxed);
	load_now[TH_BUCKET_SHARE_DF] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_DF], relaxed);
	load_now[TH_BUCKET_SHARE_UT] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_UT], relaxed);
	load_now[TH_BUCKET_SHARE_BG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_BG], relaxed);

	assert(load_now[TH_BUCKET_RUN] >= 0);
	assert(load_now[TH_BUCKET_FIXPRI] >= 0);

	uint32_t nthreads = load_now[TH_BUCKET_RUN];
	uint32_t nfixpri  = load_now[TH_BUCKET_FIXPRI];

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) | DBG_FUNC_NONE,
	    load_now[TH_BUCKET_FIXPRI], (load_now[TH_BUCKET_SHARE_FG] + load_now[TH_BUCKET_SHARE_DF]),
	    load_now[TH_BUCKET_SHARE_BG], load_now[TH_BUCKET_SHARE_UT], 0);

	/*
	 * Compute the timeshare priority conversion factor based on loading.
	 * Because our counters may be incremented and accessed
	 * concurrently with respect to each other, we may have
	 * windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut)
	 * is broken, so truncate values in these cases.
	 */
	uint32_t timeshare_threads = (nthreads - nfixpri);
	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
		if (load_now[i] > timeshare_threads) {
			load_now[i] = timeshare_threads;
		}
	}

	/*
	 * Default threads contribute up to (NCPUS * 2) of load to FG threads
	 */
	if (load_now[TH_BUCKET_SHARE_DF] <= (ncpus * 2)) {
		load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_DF];
	} else {
		load_now[TH_BUCKET_SHARE_FG] += (ncpus * 2);
	}

	/*
	 * Utility threads contribute up to NCPUS of load to FG & DF threads
	 */
	if (load_now[TH_BUCKET_SHARE_UT] <= ncpus) {
		load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_UT];
		load_now[TH_BUCKET_SHARE_DF] += load_now[TH_BUCKET_SHARE_UT];
	} else {
		load_now[TH_BUCKET_SHARE_FG] += ncpus;
		load_now[TH_BUCKET_SHARE_DF] += ncpus;
	}

	/*
	 * BG threads contribute up to 1 thread worth of load to FG, DF and UT threads
	 */
	if (load_now[TH_BUCKET_SHARE_BG] > 0) {
		load_now[TH_BUCKET_SHARE_FG] += 1;
		load_now[TH_BUCKET_SHARE_DF] += 1;
		load_now[TH_BUCKET_SHARE_UT] += 1;
	}
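
	/*
	 * Worked example with hypothetical numbers, ignoring the truncation
	 * step above: with ncpus == 4 and FG/DF/UT/BG sampled at 2/10/3/2,
	 * DF adds min(10, 8) == 8 to FG; UT adds 3 to FG and DF; BG adds 1
	 * to FG, DF and UT. The weighted loads become FG == 14, DF == 14,
	 * UT == 4, BG == 2.
	 */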

	/*
	 * The conversion factor consists of two components:
	 * a fixed value based on the absolute time unit (sched_fixed_shift),
	 * and a dynamic portion based on load (sched_load_shifts).
	 *
	 * Zero load results in an out-of-range shift count.
	 */

	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
		uint32_t bucket_load = 0;

		if (load_now[i] > ncpus) {
			/* Normalize the load to number of CPUs */
			if (ncpus > 1) {
				bucket_load = load_now[i] / ncpus;
			} else {
				bucket_load = load_now[i];
			}

			if (bucket_load > MAX_LOAD) {
				bucket_load = MAX_LOAD;
			}
		}
		/* Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load */
		sched_load[i] = (sched_load[i] * SCHED_LOAD_EWMA_ALPHA_OLD) + ((bucket_load << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW);
		sched_load[i] = sched_load[i] >> SCHED_LOAD_EWMA_ALPHA_SHIFT;
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD_EFFECTIVE) | DBG_FUNC_NONE,
	    SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_FG]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_DF]),
	    SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_UT]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_BG]), 0);
}

void
compute_averages(uint64_t stdelta)
{
	uint32_t nthreads = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) - 1;
	uint32_t ncpus = processor_avail_count;

	/* Update the global pri_shifts based on the latest values */
	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
		uint32_t bucket_load = SCHED_LOAD_EWMA_UNSCALE(sched_load[i]);
		uint32_t shift = sched_fixed_shift - sched_load_shifts[bucket_load];

		if (shift > SCHED_PRI_SHIFT_MAX) {
			sched_pri_shifts[i] = INT8_MAX;
		} else {
			sched_pri_shifts[i] = shift;
		}
	}

	/*
	 * Sample total running threads for the load average calculation.
	 */
	sched_nrun = nthreads;

	/*
	 * Load average and mach factor calculations for
	 * the interfaces that report them.
	 */
	uint32_t average_now = nthreads * LOAD_SCALE;
	uint32_t factor_now;

	if (nthreads > ncpus) {
		factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1);
	} else {
		factor_now = (ncpus - nthreads) * LOAD_SCALE;
	}

	/*
	 * For those statistics that formerly relied on being recomputed
	 * on timer ticks, advance by the approximate number of corresponding
	 * elapsed intervals, thus compensating for potential idle intervals.
	 */
	for (uint32_t index = 0; index < stdelta; index++) {
		sched_mach_factor = ((sched_mach_factor << 2) + factor_now) / 5;
		sched_load_average = ((sched_load_average << 2) + average_now) / 5;
	}
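
	/*
	 * Each iteration above moves the running value one fifth of the way
	 * toward the current sample: new == (4 * old + now) / 5, so after
	 * e.g. three compensated intervals the old value retains
	 * (4/5)^3 ~= 51% of its offset from factor_now/average_now.
	 */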

	/*
	 * Compute old-style Mach load averages.
	 */
	for (uint32_t index = 0; index < stdelta; index++) {
		for (uint32_t i = 0; i < 3; i++) {
			mach_factor[i] = ((mach_factor[i] * fract[i]) +
			    (factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;

			avenrun[i] = ((avenrun[i] * fract[i]) +
			    (average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
		}
	}
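
	/*
	 * With fract[0] == 975 (the 5 second constant computed above, assuming
	 * LOAD_SCALE == 1000), each interval keeps 97.5% of the old average
	 * and blends in 2.5% of the new sample:
	 *      avenrun[0] = (avenrun[0] * 975 + average_now * 25) / 1000;
	 */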

	/*
	 * Compute averages in other components.
	 */
	uint64_t abstime = mach_absolute_time();

	for (sched_average_t avg = sched_average; avg->comp != NULL; ++avg) {
		if (abstime >= avg->deadline) {
			uint64_t period_abs = (avg->period * sched_one_second_interval);
			uint64_t ninvokes = 1;

			ninvokes += (abstime - avg->deadline) / period_abs;
			ninvokes = MIN(ninvokes, SCHED_TICK_MAX_DELTA);

			for (uint32_t index = 0; index < ninvokes; index++) {
				(*avg->comp)(avg->param);
			}
			avg->deadline = abstime + period_abs;
		}
	}
}
367