xref: /xnu-8792.81.2/osfmk/kern/timer_call.c (revision 19c3b8c28c31cb8130e034cfb5df6bf9ba342d90)
1 /*
2  * Copyright (c) 1993-2008 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Timer interrupt callout module.
30  */
31 
32 #include <mach/mach_types.h>
33 
34 #include <kern/clock.h>
35 #include <kern/counter.h>
36 #include <kern/smp.h>
37 #include <kern/processor.h>
38 #include <kern/timer_call.h>
39 #include <kern/timer_queue.h>
40 #include <kern/thread.h>
41 #include <kern/thread_group.h>
42 #include <kern/policy_internal.h>
43 
44 #include <sys/kdebug.h>
45 
46 #if CONFIG_DTRACE
47 #include <mach/sdt.h>
48 #endif
49 
50 
51 #if DEBUG
52 #define TIMER_ASSERT    1
53 #endif
54 
55 //#define TIMER_ASSERT	1
56 //#define TIMER_DBG	1
57 
58 #if TIMER_DBG
59 #define DBG(x...) kprintf("DBG: " x);
60 #else
61 #define DBG(x...)
62 #endif
63 
64 #if TIMER_TRACE
65 #define TIMER_KDEBUG_TRACE      KERNEL_DEBUG_CONSTANT_IST
66 #else
67 #define TIMER_KDEBUG_TRACE(x...)
68 #endif
69 
70 LCK_GRP_DECLARE(timer_call_lck_grp, "timer_call");
71 LCK_GRP_DECLARE(timer_longterm_lck_grp, "timer_longterm");
72 
73 /* Timer queue lock must be acquired with interrupts disabled (under splclock()) */
74 #define timer_queue_lock_spin(queue) lck_ticket_lock(&(queue)->lock_data, LCK_GRP_NULL)
75 #define timer_queue_unlock(queue)    lck_ticket_unlock(&(queue)->lock_data)
76 
77 /*
78  * The longterm timer object is a global structure holding all timers
79  * beyond the short-term, local timer queue threshold. The boot processor
80  * is responsible for moving each timer to its local timer queue
81  * if and when that timer becomes due within the threshold.
82  */
83 
84 /* Sentinel for "no time set": */
85 #define TIMER_LONGTERM_NONE             EndOfAllTime
 * The default threshold is the delta above which a timer is "long-term" */
87 #if defined(__x86_64__)
88 #define TIMER_LONGTERM_THRESHOLD        (1ULL * NSEC_PER_SEC)   /* 1 sec */
89 #else
90 #define TIMER_LONGTERM_THRESHOLD        TIMER_LONGTERM_NONE     /* disabled */
91 #endif
92 
93 /*
94  * The scan_limit throttles processing of the longterm queue.
95  * If the scan time exceeds this limit, we terminate, unlock
96  * and defer for scan_interval. This prevents unbounded holding of
97  * timer queue locks with interrupts masked.
98  */
99 #define TIMER_LONGTERM_SCAN_LIMIT       (100ULL * NSEC_PER_USEC)        /* 100 us */
100 #define TIMER_LONGTERM_SCAN_INTERVAL    (100ULL * NSEC_PER_USEC)        /* 100 us */
101 /* Sentinel for "scan limit exceeded": */
102 #define TIMER_LONGTERM_SCAN_AGAIN       0
103 
104 /*
105  * In a similar way to the longterm queue's scan limit, the following bounds the
106  * amount of time spent processing regular timers.
107  */
108 TUNABLE_WRITEABLE(uint64_t, timer_scan_limit_us, "timer_scan_limit_us", 400);
109 TUNABLE_WRITEABLE(uint64_t, timer_scan_interval_us, "timer_scan_interval_us", 40);
110 static uint64_t timer_scan_limit_abs = 0;
111 static uint64_t timer_scan_interval_abs = 0;
112 
113 /*
114  * Count of times scanning the queue was aborted early (to avoid long
115  * scan times).
116  */
117 SCALABLE_COUNTER_DEFINE(timer_scan_pauses_cnt);
118 
119 /*
120  * Count of times scanning the queue was aborted early resulting in a
121  * postponed hard deadline.
122  */
123 SCALABLE_COUNTER_DEFINE(timer_scan_postpones_cnt);
124 
125 #define MAX_TIMER_SCAN_LIMIT    (30000ULL * NSEC_PER_USEC)  /* 30 ms */
126 #define MIN_TIMER_SCAN_LIMIT    (   50ULL * NSEC_PER_USEC)  /* 50 us */
127 #define MAX_TIMER_SCAN_INTERVAL ( 2000ULL * NSEC_PER_USEC)  /*  2 ms */
128 #define MIN_TIMER_SCAN_INTERVAL (   20ULL * NSEC_PER_USEC)  /* 20 us */
129 
/*
 * Per-threshold state for the longterm timer machinery: the currently
 * programmed escalation deadline, its timer, and latency statistics.
 */
typedef struct {
	uint64_t        interval;       /* longterm timer interval */
	uint64_t        margin;         /* fudge factor (10% of interval) */
	uint64_t        deadline;       /* first/soonest longterm deadline */
	uint64_t        preempted;      /* sooner timer has pre-empted */
	timer_call_t    call;           /* first/soonest longterm timer call */
	uint64_t        deadline_set;   /* next timer set */
	timer_call_data_t timer;        /* timer used by threshold management */
	                                /* Stats: */
	uint64_t        scans;          /*   num threshold timer scans */
	uint64_t        preempts;       /*   num threshold reductions */
	uint64_t        latency;        /*   average threshold latency */
	uint64_t        latency_min;    /*   minimum threshold latency */
	uint64_t        latency_max;    /*   maximum threshold latency */
} threshold_t;
145 
/*
 * The global longterm timer state: the single unordered queue of
 * long-term timers, its threshold management state, scan throttling
 * parameters, and counters.
 */
typedef struct {
	mpqueue_head_t  queue;          /* longterm timer list */
	uint64_t        enqueues;       /* num timers queued */
	uint64_t        dequeues;       /* num timers dequeued */
	uint64_t        escalates;      /* num timers becoming shortterm */
	uint64_t        scan_time;      /* last time the list was scanned */
	threshold_t     threshold;      /* longterm timer threshold */
	uint64_t        scan_limit;     /* maximum scan time */
	uint64_t        scan_interval;  /* interval between LT "escalation" scans */
	uint64_t        scan_pauses;    /* num scans exceeding time limit */
} timer_longterm_t;

/* Single global instance; remaining fields are zero until timer_longterm_init() */
timer_longterm_t                timer_longterm = {
	.scan_limit = TIMER_LONGTERM_SCAN_LIMIT,
	.scan_interval = TIMER_LONGTERM_SCAN_INTERVAL,
};
162 
163 static mpqueue_head_t           *timer_longterm_queue = NULL;
164 
165 static void                     timer_longterm_init(void);
166 static void                     timer_longterm_callout(
167 	timer_call_param_t      p0,
168 	timer_call_param_t      p1);
169 extern void                     timer_longterm_scan(
170 	timer_longterm_t        *tlp,
171 	uint64_t                now);
172 static void                     timer_longterm_update(
173 	timer_longterm_t *tlp);
174 static void                     timer_longterm_update_locked(
175 	timer_longterm_t *tlp);
176 static mpqueue_head_t *         timer_longterm_enqueue_unlocked(
177 	timer_call_t            call,
178 	uint64_t                now,
179 	uint64_t                deadline,
180 	mpqueue_head_t **       old_queue,
181 	uint64_t                soft_deadline,
182 	uint64_t                ttd,
183 	timer_call_param_t      param1,
184 	uint32_t                callout_flags);
185 static void                     timer_longterm_dequeued_locked(
186 	timer_call_t            call);
187 
188 uint64_t past_deadline_timers;
189 uint64_t past_deadline_deltas;
190 uint64_t past_deadline_longest;
191 uint64_t past_deadline_shortest = ~0ULL;
192 enum {PAST_DEADLINE_TIMER_ADJUSTMENT_NS = 10 * 1000};
193 
194 uint64_t past_deadline_timer_adjustment;
195 
196 static boolean_t timer_call_enter_internal(timer_call_t call, timer_call_param_t param1, uint64_t deadline, uint64_t leeway, uint32_t flags, boolean_t ratelimited);
197 boolean_t       mach_timer_coalescing_enabled = TRUE;
198 
199 mpqueue_head_t  *timer_call_enqueue_deadline_unlocked(
200 	timer_call_t            call,
201 	mpqueue_head_t          *queue,
202 	uint64_t                deadline,
203 	uint64_t                soft_deadline,
204 	uint64_t                ttd,
205 	timer_call_param_t      param1,
206 	uint32_t                flags);
207 
208 mpqueue_head_t  *timer_call_dequeue_unlocked(
209 	timer_call_t            call);
210 
211 timer_coalescing_priority_params_t tcoal_prio_params;
212 
213 #if TCOAL_PRIO_STATS
214 int32_t nc_tcl, rt_tcl, bg_tcl, kt_tcl, fp_tcl, ts_tcl, qos_tcl;
215 #define TCOAL_PRIO_STAT(x) (x++)
216 #else
217 #define TCOAL_PRIO_STAT(x)
218 #endif
219 
/*
 * Convert the boot-time nanosecond coalescing parameters and scan
 * tunables into mach absolute-time units, caching the results in
 * tcoal_prio_params and the timer-scan globals. Called once at init.
 */
static void
timer_call_init_abstime(void)
{
	int i;
	uint64_t result;
	timer_coalescing_priority_params_ns_t * tcoal_prio_params_init = timer_call_get_priority_params();
	nanoseconds_to_absolutetime(PAST_DEADLINE_TIMER_ADJUSTMENT_NS, &past_deadline_timer_adjustment);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->idle_entry_timer_processing_hdeadline_threshold_ns, &result);
	/* NOTE(review): thresholds are narrowed to 32-bit abstime here -- assumes they fit; confirm */
	tcoal_prio_params.idle_entry_timer_processing_hdeadline_threshold_abstime = (uint32_t)result;
	nanoseconds_to_absolutetime(tcoal_prio_params_init->interrupt_timer_coalescing_ilat_threshold_ns, &result);
	tcoal_prio_params.interrupt_timer_coalescing_ilat_threshold_abstime = (uint32_t)result;
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_resort_threshold_ns, &result);
	tcoal_prio_params.timer_resort_threshold_abstime = (uint32_t)result;
	/* Coalescing shifts are dimensionless: copied through unchanged */
	tcoal_prio_params.timer_coalesce_rt_shift = tcoal_prio_params_init->timer_coalesce_rt_shift;
	tcoal_prio_params.timer_coalesce_bg_shift = tcoal_prio_params_init->timer_coalesce_bg_shift;
	tcoal_prio_params.timer_coalesce_kt_shift = tcoal_prio_params_init->timer_coalesce_kt_shift;
	tcoal_prio_params.timer_coalesce_fp_shift = tcoal_prio_params_init->timer_coalesce_fp_shift;
	tcoal_prio_params.timer_coalesce_ts_shift = tcoal_prio_params_init->timer_coalesce_ts_shift;

	/* Per-class maximum coalescing windows (realtime, background,
	 * kernel, fixed-pri, timeshare), nanoseconds -> abstime. */
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_rt_ns_max,
	    &tcoal_prio_params.timer_coalesce_rt_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_bg_ns_max,
	    &tcoal_prio_params.timer_coalesce_bg_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_kt_ns_max,
	    &tcoal_prio_params.timer_coalesce_kt_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_fp_ns_max,
	    &tcoal_prio_params.timer_coalesce_fp_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_ts_ns_max,
	    &tcoal_prio_params.timer_coalesce_ts_abstime_max);

	/* Per latency-QoS-tier scale factors, windows and rate-limit flags */
	for (i = 0; i < NUM_LATENCY_QOS_TIERS; i++) {
		tcoal_prio_params.latency_qos_scale[i] = tcoal_prio_params_init->latency_qos_scale[i];
		nanoseconds_to_absolutetime(tcoal_prio_params_init->latency_qos_ns_max[i],
		    &tcoal_prio_params.latency_qos_abstime_max[i]);
		tcoal_prio_params.latency_tier_rate_limited[i] = tcoal_prio_params_init->latency_tier_rate_limited[i];
	}

	/* Regular-queue scan throttling tunables (see TUNABLE_WRITEABLE above) */
	nanoseconds_to_absolutetime(timer_scan_limit_us * NSEC_PER_USEC, &timer_scan_limit_abs);
	nanoseconds_to_absolutetime(timer_scan_interval_us * NSEC_PER_USEC, &timer_scan_interval_abs);
}
260 
261 
/*
 * Module initialization: set up the longterm timer machinery, then
 * precompute the absolute-time constants used throughout this file.
 */
void
timer_call_init(void)
{
	timer_longterm_init();
	timer_call_init_abstime();
}
268 
269 
/*
 * Initialize a timer callout queue: list head, priority queue and
 * ticket lock (via mpqueue_init).
 */
void
timer_call_queue_init(mpqueue_head_t *queue)
{
	DBG("timer_call_queue_init(%p)\n", queue);
	mpqueue_init(queue, &timer_call_lck_grp, LCK_ATTR_NULL);
}
276 
277 
/*
 * Initialize a caller-provided timer call with its callout function
 * and first parameter.
 *
 * The compound-literal assignment intentionally zeroes every field not
 * listed (deadlines, flags, queue linkage), so the entry starts in the
 * "not queued" state (tc_queue == NULL).
 */
void
timer_call_setup(
	timer_call_t                    call,
	timer_call_func_t               func,
	timer_call_param_t              param0)
{
	DBG("timer_call_setup(%p,%p,%p)\n", call, func, param0);

	*call = (struct timer_call) {
		.tc_func = func,
		.tc_param0 = param0,
		.tc_async_dequeue = false,
	};

	simple_lock_init(&(call)->tc_lock, 0);
}
294 
295 timer_call_t
timer_call_alloc(timer_call_func_t func,timer_call_param_t param0)296 timer_call_alloc(
297 	timer_call_func_t       func,
298 	timer_call_param_t      param0)
299 {
300 	timer_call_t call;
301 
302 	call = kalloc_type(struct timer_call, Z_ZERO | Z_WAITOK | Z_NOFAIL);
303 	timer_call_setup(call, func, param0);
304 	return call;
305 }
306 
/*
 * Release a timer call previously obtained from timer_call_alloc().
 * NOTE(review): presumably the caller must have cancelled/dequeued the
 * callout first -- confirm against callers.
 */
void
timer_call_free(
	timer_call_t            call)
{
	kfree_type(struct timer_call, call);
}
313 
314 static mpqueue_head_t*
mpqueue_for_timer_call(timer_call_t entry)315 mpqueue_for_timer_call(timer_call_t entry)
316 {
317 	queue_t queue_entry_is_on = entry->tc_queue;
318 	/* 'cast' the queue back to the orignal mpqueue */
319 	return __container_of(queue_entry_is_on, struct mpqueue_head, head);
320 }
321 
322 
/*
 * Remove a timer entry from the queue it is on: from the deadline
 * priority queue (unless it is the unordered longterm queue) and from
 * the linked list; clear its queue pointer and decrement the queue's
 * count. Returns the queue it was removed from.
 *
 * Locking: requires the entry lock AND the owning queue's lock held
 * (asserted under TIMER_ASSERT), interrupts disabled.
 */
static __inline__ mpqueue_head_t *
timer_call_entry_dequeue(
	timer_call_t            entry)
{
	mpqueue_head_t  *old_mpqueue = mpqueue_for_timer_call(entry);

	/* The entry was always on a queue */
	assert(old_mpqueue != NULL);

#if TIMER_ASSERT
	if (!hw_lock_held((hw_lock_t)&entry->tc_lock)) {
		panic("_call_entry_dequeue() "
		    "entry %p is not locked\n", entry);
	}

	/*
	 * XXX The queue lock is actually a mutex in spin mode
	 *     but there's no way to test for it being held
	 *     so we pretend it's a spinlock!
	 */
	if (!hw_lock_held((hw_lock_t)&old_mpqueue->lock_data)) {
		panic("_call_entry_dequeue() "
		    "queue %p is not locked\n", old_mpqueue);
	}
#endif /* TIMER_ASSERT */

	/* The longterm queue is unordered and has no priority-queue linkage */
	if (old_mpqueue != timer_longterm_queue) {
		priority_queue_remove(&old_mpqueue->mpq_pqhead,
		    &entry->tc_pqlink);
	}

	remqueue(&entry->tc_qlink);

	/* Mark the entry "not queued"; see locking case 1a */
	entry->tc_queue = NULL;

	old_mpqueue->count--;

	return old_mpqueue;
}
362 
/*
 * (Re)insert a timer entry on a per-CPU queue with the given deadline.
 * Handles three cases: entry already on the same queue (in-place
 * priority adjust), entry moving between queues, and entry not queued.
 * Updates the queue's cached earliest_soft_deadline and counts.
 * Returns the previous queue (NULL if the entry was not queued).
 *
 * Locking: requires the entry lock and the new queue's lock held
 * (asserted under TIMER_ASSERT); the longterm queue is never involved.
 */
static __inline__ mpqueue_head_t *
timer_call_entry_enqueue_deadline(
	timer_call_t                    entry,
	mpqueue_head_t                  *new_mpqueue,
	uint64_t                        deadline)
{
	mpqueue_head_t  *old_mpqueue = mpqueue_for_timer_call(entry);

#if TIMER_ASSERT
	if (!hw_lock_held((hw_lock_t)&entry->tc_lock)) {
		panic("_call_entry_enqueue_deadline() "
		    "entry %p is not locked\n", entry);
	}

	/* XXX More lock pretense:  */
	if (!hw_lock_held((hw_lock_t)&new_mpqueue->lock_data)) {
		panic("_call_entry_enqueue_deadline() "
		    "queue %p is not locked\n", new_mpqueue);
	}

	/* Callers only move an entry between queues after dequeueing it,
	 * so a queued entry must already be on the target queue. */
	if (old_mpqueue != NULL && old_mpqueue != new_mpqueue) {
		panic("_call_entry_enqueue_deadline() "
		    "old_mpqueue %p != new_mpqueue", old_mpqueue);
	}
#endif /* TIMER_ASSERT */

	/* no longterm queue involved */
	assert(new_mpqueue != timer_longterm_queue);
	assert(old_mpqueue != timer_longterm_queue);

	if (old_mpqueue == new_mpqueue) {
		/* optimize the same-queue case to avoid a full re-insert */
		uint64_t old_deadline = entry->tc_pqlink.deadline;
		entry->tc_pqlink.deadline = deadline;

		/* Sift the entry up or down in the min-heap as appropriate */
		if (old_deadline < deadline) {
			priority_queue_entry_increased(&new_mpqueue->mpq_pqhead,
			    &entry->tc_pqlink);
		} else {
			priority_queue_entry_decreased(&new_mpqueue->mpq_pqhead,
			    &entry->tc_pqlink);
		}
	} else {
		if (old_mpqueue != NULL) {
			/* Moving queues: detach from old priority queue and
			 * relink the list entry onto the new queue's tail */
			priority_queue_remove(&old_mpqueue->mpq_pqhead,
			    &entry->tc_pqlink);

			re_queue_tail(&new_mpqueue->head, &entry->tc_qlink);
		} else {
			enqueue_tail(&new_mpqueue->head, &entry->tc_qlink);
		}

		entry->tc_queue = &new_mpqueue->head;
		entry->tc_pqlink.deadline = deadline;

		priority_queue_insert(&new_mpqueue->mpq_pqhead, &entry->tc_pqlink);
	}


	/* For efficiency, track the earliest soft deadline on the queue,
	 * so that fuzzy decisions can be made without lock acquisitions.
	 */

	timer_call_t thead = priority_queue_min(&new_mpqueue->mpq_pqhead, struct timer_call, tc_pqlink);

	/* Ratelimited timers are treated by their hard deadline */
	new_mpqueue->earliest_soft_deadline = thead->tc_flags & TIMER_CALL_RATELIMITED ? thead->tc_pqlink.deadline : thead->tc_soft_deadline;

	if (old_mpqueue) {
		old_mpqueue->count--;
	}
	new_mpqueue->count++;

	return old_mpqueue;
}
437 
438 static __inline__ void
timer_call_entry_enqueue_tail(timer_call_t entry,mpqueue_head_t * queue)439 timer_call_entry_enqueue_tail(
440 	timer_call_t                    entry,
441 	mpqueue_head_t                  *queue)
442 {
443 	/* entry is always dequeued before this call */
444 	assert(entry->tc_queue == NULL);
445 
446 	/*
447 	 * this is only used for timer_longterm_queue, which is unordered
448 	 * and thus needs no priority queueing
449 	 */
450 	assert(queue == timer_longterm_queue);
451 
452 	enqueue_tail(&queue->head, &entry->tc_qlink);
453 
454 	entry->tc_queue = &queue->head;
455 
456 	queue->count++;
457 	return;
458 }
459 
460 /*
461  * Remove timer entry from its queue but don't change the queue pointer
462  * and set the async_dequeue flag. This is locking case 2b.
463  */
/*
 * Asynchronous dequeue (locking case 2b): caller holds the queue lock
 * but could NOT acquire the entry lock. The entry is unlinked and the
 * queue count dropped, but entry->tc_queue is left untouched (it is
 * protected by the entry lock we don't own); tc_async_dequeue is set
 * so the lock owner can later observe case 1c and clear the pointer.
 */
static __inline__ void
timer_call_entry_dequeue_async(
	timer_call_t            entry)
{
	mpqueue_head_t  *old_mpqueue = mpqueue_for_timer_call(entry);
	if (old_mpqueue) {
		old_mpqueue->count--;

		/* The unordered longterm queue has no priority-queue linkage */
		if (old_mpqueue != timer_longterm_queue) {
			priority_queue_remove(&old_mpqueue->mpq_pqhead,
			    &entry->tc_pqlink);
		}

		remqueue(&entry->tc_qlink);
		entry->tc_async_dequeue = true;
	}
	return;
}
482 
483 #if TIMER_ASSERT
484 unsigned timer_call_enqueue_deadline_unlocked_async1;
485 unsigned timer_call_enqueue_deadline_unlocked_async2;
486 #endif
487 /*
488  * Assumes call_entry and queues unlocked, interrupts disabled.
489  */
/*
 * Enqueue (or re-arm) a timer call on the given queue with the given
 * deadline, resolving any async-dequeue collision (locking case 1c)
 * and migrating the entry off a different old queue if necessary.
 * The callout parameters (soft deadline, flags, param1, ttd) are set
 * under the target queue's lock, per locking rule 3.
 * Returns the queue the entry was previously on, or NULL.
 *
 * Entry and queue locks are taken here: assumes both are unlocked on
 * entry and that interrupts are disabled.
 */
__inline__ mpqueue_head_t *
timer_call_enqueue_deadline_unlocked(
	timer_call_t                    call,
	mpqueue_head_t                  *queue,
	uint64_t                        deadline,
	uint64_t                        soft_deadline,
	uint64_t                        ttd,
	timer_call_param_t              param1,
	uint32_t                        callout_flags)
{
	DBG("timer_call_enqueue_deadline_unlocked(%p,%p,)\n", call, queue);

	/* Entry lock first, then queue lock: the canonical order (rule 0) */
	simple_lock(&call->tc_lock, LCK_GRP_NULL);

	mpqueue_head_t  *old_queue = mpqueue_for_timer_call(call);

	if (old_queue != NULL) {
		timer_queue_lock_spin(old_queue);
		if (call->tc_async_dequeue) {
			/* collision (1c): timer already dequeued, clear flag */
#if TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_async_dequeue,
			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
			    0x1c, 0);
			timer_call_enqueue_deadline_unlocked_async1++;
#endif
			call->tc_async_dequeue = false;
			call->tc_queue = NULL;
		} else if (old_queue != queue) {
			/* Still queued elsewhere: remove before re-enqueueing */
			timer_call_entry_dequeue(call);
#if TIMER_ASSERT
			timer_call_enqueue_deadline_unlocked_async2++;
#endif
		}
		/* Keep longterm bookkeeping consistent if leaving that queue */
		if (old_queue == timer_longterm_queue) {
			timer_longterm_dequeued_locked(call);
		}
		/* Swap queue locks if the target differs from the old queue */
		if (old_queue != queue) {
			timer_queue_unlock(old_queue);
			timer_queue_lock_spin(queue);
		}
	} else {
		timer_queue_lock_spin(queue);
	}

	/* Program callout parameters under the target queue lock (rule 3) */
	call->tc_soft_deadline = soft_deadline;
	call->tc_flags = callout_flags;
	call->tc_param1 = param1;
	call->tc_ttd = ttd;

	timer_call_entry_enqueue_deadline(call, queue, deadline);
	timer_queue_unlock(queue);
	simple_unlock(&call->tc_lock);

	return old_queue;
}
549 
550 #if TIMER_ASSERT
551 unsigned timer_call_dequeue_unlocked_async1;
552 unsigned timer_call_dequeue_unlocked_async2;
553 #endif
/*
 * Dequeue a timer call, taking the entry lock then the queue lock, and
 * resolving any pending async-dequeue collision (locking case 1c).
 * Returns the queue the entry was on, or NULL if it was not queued.
 * Assumes both locks are free on entry and interrupts are disabled.
 */
mpqueue_head_t *
timer_call_dequeue_unlocked(
	timer_call_t            call)
{
	DBG("timer_call_dequeue_unlocked(%p)\n", call);

	simple_lock(&call->tc_lock, LCK_GRP_NULL);

	mpqueue_head_t  *old_queue = mpqueue_for_timer_call(call);

#if TIMER_ASSERT
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    call->tc_async_dequeue,
	    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
	    0, 0);
#endif
	if (old_queue != NULL) {
		timer_queue_lock_spin(old_queue);
		if (call->tc_async_dequeue) {
			/* collision (1c): timer already dequeued, clear flag */
#if TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_async_dequeue,
			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
			    0x1c, 0);
			timer_call_dequeue_unlocked_async1++;
#endif
			call->tc_async_dequeue = false;
			call->tc_queue = NULL;
		} else {
			timer_call_entry_dequeue(call);
		}
		/* Keep longterm bookkeeping consistent if leaving that queue */
		if (old_queue == timer_longterm_queue) {
			timer_longterm_dequeued_locked(call);
		}
		timer_queue_unlock(old_queue);
	}
	simple_unlock(&call->tc_lock);
	return old_queue;
}
598 
599 uint64_t
timer_call_past_deadline_timer_handle(uint64_t deadline,uint64_t ctime)600 timer_call_past_deadline_timer_handle(uint64_t deadline, uint64_t ctime)
601 {
602 	uint64_t delta = (ctime - deadline);
603 
604 	past_deadline_timers++;
605 	past_deadline_deltas += delta;
606 	if (delta > past_deadline_longest) {
607 		past_deadline_longest = deadline;
608 	}
609 	if (delta < past_deadline_shortest) {
610 		past_deadline_shortest = delta;
611 	}
612 
613 	return ctime + past_deadline_timer_adjustment;
614 }
615 
616 /*
617  * Timer call entry locking model
618  * ==============================
619  *
620  * Timer call entries are linked on per-cpu timer queues which are protected
621  * by the queue lock and the call entry lock. The locking protocol is:
622  *
623  *  0) The canonical locking order is timer call entry followed by queue.
624  *
625  *  1) With only the entry lock held, entry.queue is valid:
626  *    1a) NULL: the entry is not queued, or
627  *    1b) non-NULL: this queue must be locked before the entry is modified.
628  *        After locking the queue, the call.async_dequeue flag must be checked:
629  *    1c) TRUE: the entry was removed from the queue by another thread
630  *	        and we must NULL the entry.queue and reset this flag, or
631  *    1d) FALSE: (ie. queued), the entry can be manipulated.
632  *
633  *  2) If a queue lock is obtained first, the queue is stable:
634  *    2a) If a try-lock of a queued entry succeeds, the call can be operated on
635  *	  and dequeued.
636  *    2b) If a try-lock fails, it indicates that another thread is attempting
637  *        to change the entry and move it to a different position in this queue
638  *        or to different queue. The entry can be dequeued but it should not be
639  *        operated upon since it is being changed. Furthermore, we don't null
640  *	  the entry.queue pointer (protected by the entry lock we don't own).
641  *	  Instead, we set the async_dequeue flag -- see (1c).
642  *    2c) Same as 2b but occurring when a longterm timer is matured.
643  *  3) A callout's parameters (deadline, flags, parameters, soft deadline &c.)
644  *     should be manipulated with the appropriate timer queue lock held,
645  *     to prevent queue traversal observations from observing inconsistent
646  *     updates to an in-flight callout.
647  */
648 
649 /*
650  * In the debug case, we assert that the timer call locking protocol
651  * is being obeyed.
652  */
653 
/*
 * Common implementation for the timer_call_enter*() entry points.
 * Computes the coalescing slop for the deadline, handles already-past
 * deadlines, applies rate-limiting flags, then enqueues the callout on
 * either the longterm queue or an assigned per-CPU queue.
 * Returns TRUE if the callout was previously queued.
 * Runs with interrupts disabled (splclock).
 */
static boolean_t
timer_call_enter_internal(
	timer_call_t            call,
	timer_call_param_t      param1,
	uint64_t                deadline,
	uint64_t                leeway,
	uint32_t                flags,
	boolean_t               ratelimited)
{
	mpqueue_head_t          *queue = NULL;
	mpqueue_head_t          *old_queue;
	spl_t                   s;
	uint64_t                slop;
	uint32_t                urgency;
	uint64_t                sdeadline, ttd;

	assert(call->tc_func != NULL);
	s = splclock();

	/* The caller's requested deadline is the "soft" deadline; the hard
	 * deadline is padded with coalescing slop below. */
	sdeadline = deadline;
	uint64_t ctime = mach_absolute_time();

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_ENTER | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    VM_KERNEL_ADDRHIDE(param1), deadline, flags, 0);

	urgency = (flags & TIMER_CALL_URGENCY_MASK);

	boolean_t slop_ratelimited = FALSE;
	slop = timer_call_slop(deadline, ctime, urgency, current_thread(), &slop_ratelimited);

	/* An explicit caller-specified leeway can only widen the slop */
	if ((flags & TIMER_CALL_LEEWAY) != 0 && leeway > slop) {
		slop = leeway;
	}

	/* Saturate rather than overflow when padding the deadline */
	if (UINT64_MAX - deadline <= slop) {
		deadline = UINT64_MAX;
	} else {
		deadline += slop;
	}

	/* Deadline already in the past: account for it and pull the
	 * deadline to just after "now" */
	if (__improbable(deadline < ctime)) {
		deadline = timer_call_past_deadline_timer_handle(deadline, ctime);
		sdeadline = deadline;
	}

	if (ratelimited || slop_ratelimited) {
		flags |= TIMER_CALL_RATELIMITED;
	} else {
		flags &= ~TIMER_CALL_RATELIMITED;
	}

	/* time-to-deadline, from the soft deadline */
	ttd =  sdeadline - ctime;
#if CONFIG_DTRACE
	DTRACE_TMR7(callout__create, timer_call_func_t, call->tc_func,
	    timer_call_param_t, call->tc_param0, uint32_t, flags,
	    (deadline - sdeadline),
	    (ttd >> 32), (unsigned) (ttd & 0xFFFFFFFF), call);
#endif

	/* Program timer callout parameters under the appropriate per-CPU or
	 * longterm queue lock. The callout may have been previously enqueued
	 * and in-flight on this or another timer queue.
	 */
	if (!ratelimited && !slop_ratelimited) {
		queue = timer_longterm_enqueue_unlocked(call, ctime, deadline, &old_queue, sdeadline, ttd, param1, flags);
	}

	/* Not taken by the longterm queue: assign a per-CPU queue */
	if (queue == NULL) {
		queue = timer_queue_assign(deadline);
		old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline, sdeadline, ttd, param1, flags);
	}

#if TIMER_TRACE
	call->tc_entry_time = ctime;
#endif

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_ENTER | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    (old_queue != NULL), deadline, queue->count, 0);

	splx(s);

	return old_queue != NULL;
}
741 
742 /*
743  * timer_call_*()
744  *	return boolean indicating whether the call was previously queued.
745  */
/*
 * Arm a timer call with no callout parameter, no explicit leeway and
 * no rate limiting. Returns TRUE if the call was previously queued.
 */
boolean_t
timer_call_enter(
	timer_call_t            call,
	uint64_t                deadline,
	uint32_t                flags)
{
	return timer_call_enter_internal(call, NULL, deadline, 0, flags, FALSE);
}
754 
/*
 * Arm a timer call, additionally supplying the second callout
 * parameter. Returns TRUE if the call was previously queued.
 */
boolean_t
timer_call_enter1(
	timer_call_t            call,
	timer_call_param_t      param1,
	uint64_t                deadline,
	uint32_t                flags)
{
	return timer_call_enter_internal(call, param1, deadline, 0, flags, FALSE);
}
764 
/*
 * Arm a timer call with an explicit leeway (honored only when
 * TIMER_CALL_LEEWAY is set in flags and it exceeds the computed slop)
 * and an explicit rate-limited hint.
 * Returns TRUE if the call was previously queued.
 */
boolean_t
timer_call_enter_with_leeway(
	timer_call_t            call,
	timer_call_param_t      param1,
	uint64_t                deadline,
	uint64_t                leeway,
	uint32_t                flags,
	boolean_t               ratelimited)
{
	return timer_call_enter_internal(call, param1, deadline, leeway, flags, ratelimited);
}
776 
/*
 * Cancel a timer call: dequeue it and, if it was queued, refresh the
 * old queue's earliest_soft_deadline cache and notify the hardware
 * layer via timer_queue_cancel(). Returns TRUE if the call had been
 * queued (i.e. the cancel actually removed something).
 */
boolean_t
timer_call_cancel(
	timer_call_t            call)
{
	mpqueue_head_t          *old_queue;
	spl_t                   s;

	s = splclock();

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_CANCEL | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    call->tc_pqlink.deadline, call->tc_soft_deadline, call->tc_flags, 0);

	old_queue = timer_call_dequeue_unlocked(call);

	if (old_queue != NULL) {
		timer_queue_lock_spin(old_queue);

		/* The removed call may have been the queue's earliest timer:
		 * recompute the head and the cached soft deadline */
		timer_call_t new_head = priority_queue_min(&old_queue->mpq_pqhead, struct timer_call, tc_pqlink);

		if (new_head) {
			timer_queue_cancel(old_queue, call->tc_pqlink.deadline, new_head->tc_pqlink.deadline);
			old_queue->earliest_soft_deadline = new_head->tc_flags & TIMER_CALL_RATELIMITED ? new_head->tc_pqlink.deadline : new_head->tc_soft_deadline;
		} else {
			/* Queue now empty */
			timer_queue_cancel(old_queue, call->tc_pqlink.deadline, UINT64_MAX);
			old_queue->earliest_soft_deadline = UINT64_MAX;
		}

		timer_queue_unlock(old_queue);
	}
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_CANCEL | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    VM_KERNEL_UNSLIDE_OR_PERM(old_queue),
	    call->tc_pqlink.deadline - mach_absolute_time(),
	    call->tc_pqlink.deadline - call->tc_entry_time, 0);
	splx(s);

#if CONFIG_DTRACE
	DTRACE_TMR6(callout__cancel, timer_call_func_t, call->tc_func,
	    timer_call_param_t, call->tc_param0, uint32_t, call->tc_flags, 0,
	    (call->tc_ttd >> 32), (unsigned) (call->tc_ttd & 0xFFFFFFFF));
#endif /* CONFIG_DTRACE */

	return old_queue != NULL;
}
824 
825 static uint32_t timer_queue_shutdown_lock_skips;
826 static uint32_t timer_queue_shutdown_discarded;
827 
/*
 * Drain a timer queue (e.g. when its CPU goes offline): every callout
 * is removed and re-queued on a freshly assigned queue; TIMER_CALL_LOCAL
 * callouts are discarded instead of migrated. The queue lock is dropped
 * and retaken on each iteration so contended entries (locking case 2b)
 * can be skipped via async dequeue and retried.
 */
void
timer_queue_shutdown(
	mpqueue_head_t          *queue)
{
	timer_call_t            call;
	mpqueue_head_t          *new_queue;
	spl_t                   s;


	DBG("timer_queue_shutdown(%p)\n", queue);

	s = splclock();

	while (TRUE) {
		timer_queue_lock_spin(queue);

		call = qe_queue_first(&queue->head, struct timer_call, tc_qlink);

		/* Queue drained: exit with the lock held; unlocked below */
		if (call == NULL) {
			break;
		}

		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
			/*
			 * case (2b) lock order inversion, dequeue and skip
			 * Don't change the call_entry queue back-pointer
			 * but set the async_dequeue field.
			 */
			timer_queue_shutdown_lock_skips++;
			timer_call_entry_dequeue_async(call);
#if TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_async_dequeue,
			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
			    0x2b, 0);
#endif
			timer_queue_unlock(queue);
			continue;
		}

		boolean_t call_local = ((call->tc_flags & TIMER_CALL_LOCAL) != 0);

		/* remove entry from old queue */
		timer_call_entry_dequeue(call);
		timer_queue_unlock(queue);

		if (call_local == FALSE) {
			/* and queue it on new, discarding LOCAL timers */
			new_queue = timer_queue_assign(call->tc_pqlink.deadline);
			timer_queue_lock_spin(new_queue);
			timer_call_entry_enqueue_deadline(
				call, new_queue, call->tc_pqlink.deadline);
			timer_queue_unlock(new_queue);
		} else {
			timer_queue_shutdown_discarded++;
		}

		/*
		 * NOTE(review): this assert fires for any LOCAL timer found
		 * at shutdown, despite the discard branch above counting
		 * them -- it appears to assert LOCAL timers never reach a
		 * queue being shut down; confirm intent.
		 */
		assert(call_local == FALSE);
		simple_unlock(&call->tc_lock);
	}

	timer_queue_unlock(queue);
	splx(s);
}
894 
895 
/* Count of timers async-dequeued by the expiry scan after losing the
 * per-call trylock (lock-order inversion case 2b). */
static uint32_t timer_queue_expire_lock_skips;
/*
 * timer_queue_expire_with_options:
 *
 * Fire every timer on 'queue' whose soft deadline is at or before
 * 'deadline', invoking each callout with both the call lock and the
 * queue lock dropped.  Returns the deadline of the earliest remaining
 * timer, a pause-retry deadline if the scan was aborted for taking too
 * long, or UINT64_MAX if the queue ends up empty.
 *
 * Called with interrupts disabled.
 */
uint64_t
timer_queue_expire_with_options(
	mpqueue_head_t          *queue,
	uint64_t                deadline,
	boolean_t               rescan)
{
	timer_call_t    call = NULL;
	uint32_t tc_iterations = 0;
	DBG("timer_queue_expire(%p,)\n", queue);

	/* 'rescan' means look at every timer in the list, instead of
	 * early-exiting when the head of the list expires in the future.
	 * when 'rescan' is true, iterate by linked list instead of priority queue.
	 *
	 * TODO: if we keep a deadline ordered and soft-deadline ordered
	 * priority queue, then it's no longer necessary to do that
	 */

	uint64_t cur_deadline = deadline;

	/* Force an early return if this time limit is hit. */
	const uint64_t time_limit = deadline + timer_scan_limit_abs;

	/* Next deadline if the time limit is hit */
	uint64_t time_limit_deadline = 0;

	timer_queue_lock_spin(queue);

	while (!queue_empty(&queue->head)) {
		/* Skip the clock read on the very first iteration. */
		if (++tc_iterations > 1) {
			const uint64_t now = mach_absolute_time();

			/*
			 * Abort the scan if it's taking too long to avoid long
			 * periods with interrupts disabled.
			 * Scanning will restart after a short pause
			 * (timer_scan_interval_abs) if there's a hard deadline
			 * pending.
			 */
			if (rescan == FALSE && now > time_limit) {
				TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				    DECR_TIMER_PAUSE | DBG_FUNC_NONE,
				    queue->count, tc_iterations - 1,
				    0, 0, 0);

				counter_inc(&timer_scan_pauses_cnt);
				time_limit_deadline = now + timer_scan_interval_abs;
				break;
			}

			/*
			 * Upon processing one or more timer calls, refresh the
			 * deadline to account for time elapsed in the callout
			 */
			cur_deadline = now;
		}

		/* 'call' is NULL after a callout or a skip: pick the next
		 * candidate by hard deadline (normal) or list order (rescan). */
		if (call == NULL) {
			if (rescan == FALSE) {
				call = priority_queue_min(&queue->mpq_pqhead, struct timer_call, tc_pqlink);
			} else {
				call = qe_queue_first(&queue->head, struct timer_call, tc_qlink);
			}
		}

		if (call->tc_soft_deadline <= cur_deadline) {
			timer_call_func_t               func;
			timer_call_param_t              param0, param1;

			TCOAL_DEBUG(0xDDDD0000, queue->earliest_soft_deadline, call->tc_soft_deadline, 0, 0, 0);
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_EXPIRE | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_soft_deadline,
			    call->tc_pqlink.deadline,
			    call->tc_entry_time, 0);

			/* Rate-limited timers fire at their hard deadline; if
			 * that is still in the future, stop the normal scan
			 * (or keep walking when rescanning). */
			if ((call->tc_flags & TIMER_CALL_RATELIMITED) &&
			    (call->tc_pqlink.deadline > cur_deadline)) {
				if (rescan == FALSE) {
					break;
				}
			}

			if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
				/* case (2b) lock inversion, dequeue and skip */
				timer_queue_expire_lock_skips++;
				timer_call_entry_dequeue_async(call);
				call = NULL;
				continue;
			}

			timer_call_entry_dequeue(call);

			/* Snapshot the callout before dropping the locks. */
			func = call->tc_func;
			param0 = call->tc_param0;
			param1 = call->tc_param1;

			simple_unlock(&call->tc_lock);
			timer_queue_unlock(queue);

			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_CALLOUT | DBG_FUNC_START,
			    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
			    VM_KERNEL_ADDRHIDE(param0),
			    VM_KERNEL_ADDRHIDE(param1),
			    0);

#if CONFIG_DTRACE
			DTRACE_TMR7(callout__start, timer_call_func_t, func,
			    timer_call_param_t, param0, unsigned, call->tc_flags,
			    0, (call->tc_ttd >> 32),
			    (unsigned) (call->tc_ttd & 0xFFFFFFFF), call);
#endif
			/* Maintain time-to-deadline in per-processor data
			 * structure for thread wakeup deadline statistics.
			 */
			uint64_t *ttdp = &current_processor()->timer_call_ttd;
			*ttdp = call->tc_ttd;
			(*func)(param0, param1);
			*ttdp = 0;
#if CONFIG_DTRACE
			DTRACE_TMR4(callout__end, timer_call_func_t, func,
			    param0, param1, call);
#endif

			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_CALLOUT | DBG_FUNC_END,
			    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
			    VM_KERNEL_ADDRHIDE(param0),
			    VM_KERNEL_ADDRHIDE(param1),
			    0);
			call = NULL;
			timer_queue_lock_spin(queue);
		} else {
			if (__probable(rescan == FALSE)) {
				break;
			} else {
				int64_t skew = call->tc_pqlink.deadline - call->tc_soft_deadline;
				assert(call->tc_pqlink.deadline >= call->tc_soft_deadline);

				/* DRK: On a latency quality-of-service level change,
				 * re-sort potentially rate-limited timers. The platform
				 * layer determines which timers require
				 * this. In the absence of the per-callout
				 * synchronization requirement, a global resort could
				 * be more efficient. The re-sort effectively
				 * annuls all timer adjustments, i.e. the "soft
				 * deadline" is the sort key.
				 */

				if (timer_resort_threshold(skew)) {
					if (__probable(simple_lock_try(&call->tc_lock, LCK_GRP_NULL))) {
						/* TODO: don't need to dequeue before enqueue */
						timer_call_entry_dequeue(call);
						timer_call_entry_enqueue_deadline(call, queue, call->tc_soft_deadline);
						simple_unlock(&call->tc_lock);
						call = NULL;
					}
				}
				if (call) {
					call = qe_queue_next(&queue->head, call, struct timer_call, tc_qlink);

					if (call == NULL) {
						break;
					}
				}
			}
		}
	}

	/* Compute the return deadline from the earliest remaining timer. */
	call = priority_queue_min(&queue->mpq_pqhead, struct timer_call, tc_pqlink);

	if (call) {
		/*
		 * Even if the time limit has been hit, it doesn't mean a hard
		 * deadline will be missed - the next hard deadline may be in
		 * future.
		 */
		if (time_limit_deadline > call->tc_pqlink.deadline) {
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_POSTPONE | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_pqlink.deadline,
			    time_limit_deadline,
			    0, 0);
			counter_inc(&timer_scan_postpones_cnt);
			cur_deadline = time_limit_deadline;
		} else {
			cur_deadline = call->tc_pqlink.deadline;
		}
		queue->earliest_soft_deadline = (call->tc_flags & TIMER_CALL_RATELIMITED) ? call->tc_pqlink.deadline: call->tc_soft_deadline;
	} else {
		queue->earliest_soft_deadline = cur_deadline = UINT64_MAX;
	}

	timer_queue_unlock(queue);

	return cur_deadline;
}
1097 
1098 uint64_t
timer_queue_expire(mpqueue_head_t * queue,uint64_t deadline)1099 timer_queue_expire(
1100 	mpqueue_head_t          *queue,
1101 	uint64_t                deadline)
1102 {
1103 	return timer_queue_expire_with_options(queue, deadline, FALSE);
1104 }
1105 
/* Set via boot-arg; when non-zero, timer migration is suppressed. */
extern int serverperfmode;
/* Count of timers async-dequeued during migration after losing the
 * per-call trylock (lock-order inversion case 2b). */
static uint32_t timer_queue_migrate_lock_skips;
1108 /*
1109  * timer_queue_migrate() is called by timer_queue_migrate_cpu()
1110  * to move timer requests from the local processor (queue_from)
1111  * to a target processor's (queue_to).
1112  */
/*
 * Returns the number of timers moved, or:
 *  -1  target queue empty
 *  -2  source queue empty
 *  -3  source queue holds a non-migratable TIMER_CALL_LOCAL timer
 *  -4  serverperfmode set (migration disabled)
 *   0  source's earliest deadline precedes the target's (no move)
 */
int
timer_queue_migrate(mpqueue_head_t *queue_from, mpqueue_head_t *queue_to)
{
	timer_call_t    call;
	timer_call_t    head_to;
	int             timers_migrated = 0;

	DBG("timer_queue_migrate(%p,%p)\n", queue_from, queue_to);

	assert(!ml_get_interrupts_enabled());
	assert(queue_from != queue_to);

	if (serverperfmode) {
		/*
		 * if we're running a high end server
		 * avoid migrations... they add latency
		 * and don't save us power under typical
		 * server workloads
		 */
		return -4;
	}

	/*
	 * Take both local (from) and target (to) timer queue locks while
	 * moving the timers from the local queue to the target processor.
	 * We assume that the target is always the boot processor.
	 * But only move if all of the following is true:
	 *  - the target queue is non-empty
	 *  - the local queue is non-empty
	 *  - the local queue's first deadline is later than the target's
	 *  - the local queue contains no non-migrateable "local" call
	 * so that we need not have the target resync.
	 */

	timer_queue_lock_spin(queue_to);

	head_to = priority_queue_min(&queue_to->mpq_pqhead, struct timer_call, tc_pqlink);

	if (head_to == NULL) {
		timers_migrated = -1;
		goto abort1;
	}

	timer_queue_lock_spin(queue_from);

	call = priority_queue_min(&queue_from->mpq_pqhead, struct timer_call, tc_pqlink);

	if (call == NULL) {
		timers_migrated = -2;
		goto abort2;
	}

	/* Moving would make the target's next deadline later; don't. */
	if (call->tc_pqlink.deadline < head_to->tc_pqlink.deadline) {
		timers_migrated = 0;
		goto abort2;
	}

	/* perform scan for non-migratable timers */
	qe_foreach_element(call, &queue_from->head, tc_qlink) {
		if (call->tc_flags & TIMER_CALL_LOCAL) {
			timers_migrated = -3;
			goto abort2;
		}
	}

	/* migration loop itself -- both queues are locked */
	qe_foreach_element_safe(call, &queue_from->head, tc_qlink) {
		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
			/* case (2b) lock order inversion, dequeue only */
#ifdef TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
			    0,
			    0x2b, 0);
#endif
			timer_queue_migrate_lock_skips++;
			timer_call_entry_dequeue_async(call);
			continue;
		}
		timer_call_entry_dequeue(call);
		timer_call_entry_enqueue_deadline(
			call, queue_to, call->tc_pqlink.deadline);
		timers_migrated++;
		simple_unlock(&call->tc_lock);
	}
	/* Source queue has been emptied (modulo async dequeues). */
	queue_from->earliest_soft_deadline = UINT64_MAX;
abort2:
	timer_queue_unlock(queue_from);
abort1:
	timer_queue_unlock(queue_to);

	return timers_migrated;
}
1208 
1209 void
timer_queue_trace_cpu(int ncpu)1210 timer_queue_trace_cpu(int ncpu)
1211 {
1212 	timer_call_nosync_cpu(
1213 		ncpu,
1214 		(void (*)(void *))timer_queue_trace,
1215 		(void*) timer_queue_cpu(ncpu));
1216 }
1217 
/*
 * timer_queue_trace:
 *
 * Emit kdebug records describing every timer currently on 'queue'
 * (soft deadline, hard deadline, entry time, callout function),
 * bracketed by START/END records carrying the queue count.
 * No-op unless kdebug tracing is enabled.
 */
void
timer_queue_trace(
	mpqueue_head_t                  *queue)
{
	timer_call_t    call;
	spl_t           s;

	if (!kdebug_enable) {
		return;
	}

	s = splclock();
	timer_queue_lock_spin(queue);

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_QUEUE | DBG_FUNC_START,
	    queue->count, mach_absolute_time(), 0, 0, 0);

	/* One record per queued timer. */
	qe_foreach_element(call, &queue->head, tc_qlink) {
		TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
		    DECR_TIMER_QUEUE | DBG_FUNC_NONE,
		    call->tc_soft_deadline,
		    call->tc_pqlink.deadline,
		    call->tc_entry_time,
		    VM_KERNEL_UNSLIDE(call->tc_func),
		    0);
	}

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_QUEUE | DBG_FUNC_END,
	    queue->count, mach_absolute_time(), 0, 0, 0);

	timer_queue_unlock(queue);
	splx(s);
}
1253 
1254 void
timer_longterm_dequeued_locked(timer_call_t call)1255 timer_longterm_dequeued_locked(timer_call_t call)
1256 {
1257 	timer_longterm_t        *tlp = &timer_longterm;
1258 
1259 	tlp->dequeues++;
1260 	if (call == tlp->threshold.call) {
1261 		tlp->threshold.call = NULL;
1262 	}
1263 }
1264 
1265 /*
1266  * Place a timer call in the longterm list
1267  * and adjust the next timer callout deadline if the new timer is first.
1268  */
/*
 * Returns the longterm queue the timer was placed on, or NULL if the
 * timer does not qualify (local, longterm disabled, or deadline within
 * the shortterm threshold).  On success *old_queue is set to the queue
 * the timer was removed from (if any), mirroring timer_call_enqueue's
 * contract.  Called with interrupts disabled.
 */
mpqueue_head_t *
timer_longterm_enqueue_unlocked(timer_call_t    call,
    uint64_t        now,
    uint64_t        deadline,
    mpqueue_head_t  **old_queue,
    uint64_t        soft_deadline,
    uint64_t        ttd,
    timer_call_param_t      param1,
    uint32_t        callout_flags)
{
	timer_longterm_t        *tlp = &timer_longterm;
	boolean_t               update_required = FALSE;
	uint64_t                longterm_threshold;

	longterm_threshold = now + tlp->threshold.interval;

	/*
	 * Return NULL without doing anything if:
	 *  - this timer is local, or
	 *  - the longterm mechanism is disabled, or
	 *  - this deadline is too short.
	 */
	if ((callout_flags & TIMER_CALL_LOCAL) != 0 ||
	    (tlp->threshold.interval == TIMER_LONGTERM_NONE) ||
	    (deadline <= longterm_threshold)) {
		return NULL;
	}

	/*
	 * Remove timer from its current queue, if any.
	 */
	*old_queue = timer_call_dequeue_unlocked(call);

	/*
	 * Lock the longterm queue, queue timer and determine
	 * whether an update is necessary.
	 * Lock order: call lock first, then queue lock.
	 */
	assert(!ml_get_interrupts_enabled());
	simple_lock(&call->tc_lock, LCK_GRP_NULL);
	timer_queue_lock_spin(timer_longterm_queue);
	call->tc_pqlink.deadline = deadline;
	call->tc_param1 = param1;
	call->tc_ttd = ttd;
	call->tc_soft_deadline = soft_deadline;
	call->tc_flags = callout_flags;
	timer_call_entry_enqueue_tail(call, timer_longterm_queue);

	tlp->enqueues++;

	/*
	 * We'll need to update the currently set threshold timer
	 * if the new deadline is sooner and no sooner update is in flight.
	 */
	if (deadline < tlp->threshold.deadline &&
	    deadline < tlp->threshold.preempted) {
		tlp->threshold.preempted = deadline;
		tlp->threshold.call = call;
		update_required = TRUE;
	}
	timer_queue_unlock(timer_longterm_queue);
	simple_unlock(&call->tc_lock);

	if (update_required) {
		/*
		 * Note: this call expects that calling the master cpu
		 * alone does not involve locking the topo lock.
		 */
		timer_call_nosync_cpu(
			master_cpu,
			(void (*)(void *))timer_longterm_update,
			(void *)tlp);
	}

	return timer_longterm_queue;
}
1344 
1345 /*
1346  * Scan for timers below the longterm threshold.
1347  * Move these to the local timer queue (of the boot processor on which the
1348  * calling thread is running).
1349  * Both the local (boot) queue and the longterm queue are locked.
1350  * The scan is similar to the timer migrate sequence but is performed by
1351  * successively examining each timer on the longterm queue:
1352  *  - if within the short-term threshold
1353  *    - enter on the local queue (unless being deleted),
1354  *  - otherwise:
1355  *    - if sooner, deadline becomes the next threshold deadline.
1356  * The total scan time is limited to TIMER_LONGTERM_SCAN_LIMIT. Should this be
1357  * exceeded, we abort and reschedule again so that we don't shut others from
1358  * the timer queues. Longterm timers firing late is not critical.
1359  */
/*
 * timer_longterm_scan:
 *
 * See the block comment above: walks the longterm queue, escalating
 * timers now within the shortterm threshold onto the master cpu's
 * queue and tracking the earliest remaining deadline in
 * tlp->threshold.  Aborts (marking TIMER_LONGTERM_SCAN_AGAIN) if the
 * walk exceeds tlp->scan_limit.  Must run on the master cpu with
 * interrupts disabled; takes the master queue lock for the duration
 * (the longterm queue is already locked by the caller).
 */
void
timer_longterm_scan(timer_longterm_t    *tlp,
    uint64_t            time_start)
{
	timer_call_t    call;
	uint64_t        threshold = TIMER_LONGTERM_NONE;
	uint64_t        deadline;
	uint64_t        time_limit = time_start + tlp->scan_limit;
	mpqueue_head_t  *timer_master_queue;

	assert(!ml_get_interrupts_enabled());
	assert(cpu_number() == master_cpu);

	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
		threshold = time_start + tlp->threshold.interval;
	}

	/* Reset; re-derived from the timers left on the queue below. */
	tlp->threshold.deadline = TIMER_LONGTERM_NONE;
	tlp->threshold.call = NULL;

	if (queue_empty(&timer_longterm_queue->head)) {
		return;
	}

	timer_master_queue = timer_queue_cpu(master_cpu);
	timer_queue_lock_spin(timer_master_queue);

	qe_foreach_element_safe(call, &timer_longterm_queue->head, tc_qlink) {
		deadline = call->tc_soft_deadline;
		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
			/* case (2c) lock order inversion, dequeue only */
#ifdef TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
			    0,
			    0x2c, 0);
#endif
			timer_call_entry_dequeue_async(call);
			continue;
		}
		if (deadline < threshold) {
			/*
			 * This timer needs moving (escalating)
			 * to the local (boot) processor's queue.
			 */
#ifdef TIMER_ASSERT
			if (deadline < time_start) {
				TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				    DECR_TIMER_OVERDUE | DBG_FUNC_NONE,
				    VM_KERNEL_UNSLIDE_OR_PERM(call),
				    deadline,
				    time_start,
				    threshold,
				    0);
			}
#endif
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ESCALATE | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_pqlink.deadline,
			    call->tc_entry_time,
			    VM_KERNEL_UNSLIDE(call->tc_func),
			    0);
			tlp->escalates++;
			timer_call_entry_dequeue(call);
			timer_call_entry_enqueue_deadline(
				call, timer_master_queue, call->tc_pqlink.deadline);
			/*
			 * A side-effect of the following call is to update
			 * the actual hardware deadline if required.
			 */
			(void) timer_queue_assign(deadline);
		} else {
			/* Still longterm: track the earliest such deadline. */
			if (deadline < tlp->threshold.deadline) {
				tlp->threshold.deadline = deadline;
				tlp->threshold.call = call;
			}
		}
		simple_unlock(&call->tc_lock);

		/* Abort scan if we're taking too long. */
		if (mach_absolute_time() > time_limit) {
			tlp->threshold.deadline = TIMER_LONGTERM_SCAN_AGAIN;
			tlp->scan_pauses++;
			DBG("timer_longterm_scan() paused %llu, qlen: %llu\n",
			    time_limit, tlp->queue.count);
			break;
		}
	}

	timer_queue_unlock(timer_master_queue);
}
1454 
1455 void
timer_longterm_callout(timer_call_param_t p0,__unused timer_call_param_t p1)1456 timer_longterm_callout(timer_call_param_t p0, __unused timer_call_param_t p1)
1457 {
1458 	timer_longterm_t        *tlp = (timer_longterm_t *) p0;
1459 
1460 	timer_longterm_update(tlp);
1461 }
1462 
/*
 * timer_longterm_update_locked:
 *
 * Core of the threshold-timer update: either consume a pending preempt
 * (a sooner deadline posted by timer_longterm_enqueue_unlocked) or
 * perform a full scan of the longterm queue, then derive the next
 * threshold timer deadline (margin- and latency-adjusted, clamped to
 * no sooner than scan_interval from now).  Caller holds the longterm
 * queue lock and runs on the master cpu.
 */
void
timer_longterm_update_locked(timer_longterm_t *tlp)
{
	uint64_t        latency;

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_UPDATE | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
	    tlp->threshold.deadline,
	    tlp->threshold.preempted,
	    tlp->queue.count, 0);

	tlp->scan_time = mach_absolute_time();
	if (tlp->threshold.preempted != TIMER_LONGTERM_NONE) {
		/* A sooner deadline was posted: adopt it without scanning. */
		tlp->threshold.preempts++;
		tlp->threshold.deadline = tlp->threshold.preempted;
		tlp->threshold.preempted = TIMER_LONGTERM_NONE;
		/*
		 * Note: in the unlikely event that a pre-empted timer has
		 * itself been cancelled, we'll simply re-scan later at the
		 * time of the preempted/cancelled timer.
		 */
	} else {
		tlp->threshold.scans++;

		/*
		 * Maintain a moving average of our wakeup latency.
		 * Clamp latency to 0 and ignore above threshold interval.
		 */
		if (tlp->scan_time > tlp->threshold.deadline_set) {
			latency = tlp->scan_time - tlp->threshold.deadline_set;
		} else {
			latency = 0;
		}
		if (latency < tlp->threshold.interval) {
			tlp->threshold.latency_min =
			    MIN(tlp->threshold.latency_min, latency);
			tlp->threshold.latency_max =
			    MAX(tlp->threshold.latency_max, latency);
			/* Exponential moving average, 1% weight per update. */
			tlp->threshold.latency =
			    (tlp->threshold.latency * 99 + latency) / 100;
		}

		timer_longterm_scan(tlp, tlp->scan_time);
	}

	tlp->threshold.deadline_set = tlp->threshold.deadline;
	/* The next deadline timer to be set is adjusted */
	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE &&
	    tlp->threshold.deadline != TIMER_LONGTERM_SCAN_AGAIN) {
		tlp->threshold.deadline_set -= tlp->threshold.margin;
		tlp->threshold.deadline_set -= tlp->threshold.latency;
	}

	/* Throttle next scan time */
	uint64_t scan_clamp = mach_absolute_time() + tlp->scan_interval;
	if (tlp->threshold.deadline_set < scan_clamp) {
		tlp->threshold.deadline_set = scan_clamp;
	}

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_UPDATE | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
	    tlp->threshold.deadline,
	    tlp->threshold.scans,
	    tlp->queue.count, 0);
}
1530 
/*
 * timer_longterm_update:
 *
 * Locking wrapper around timer_longterm_update_locked(): takes the
 * longterm queue lock at splclock, runs the update, and (re)arms the
 * threshold timer if a deadline remains.  Must execute on the master
 * cpu; panics otherwise.
 */
void
timer_longterm_update(timer_longterm_t *tlp)
{
	spl_t   s = splclock();

	timer_queue_lock_spin(timer_longterm_queue);

	if (cpu_number() != master_cpu) {
		/* NOTE(review): panic string names
		 * "timer_longterm_update_master()" which does not match this
		 * function's name -- presumably a stale identifier. */
		panic("timer_longterm_update_master() on non-boot cpu");
	}

	timer_longterm_update_locked(tlp);

	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
		timer_call_enter(
			&tlp->threshold.timer,
			tlp->threshold.deadline_set,
			TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
	}

	timer_queue_unlock(timer_longterm_queue);
	splx(s);
}
1554 
/*
 * timer_longterm_init:
 *
 * One-time setup of the longterm timer machinery: choose the threshold
 * interval (default, disabled for serverperfmode, or overridden by the
 * "longterm" boot-arg in milliseconds), initialize the longterm queue,
 * and register the threshold timer callout.
 */
void
timer_longterm_init(void)
{
	uint32_t                longterm;
	timer_longterm_t        *tlp = &timer_longterm;

	DBG("timer_longterm_init() tlp: %p, queue: %p\n", tlp, &tlp->queue);

	/*
	 * Set the longterm timer threshold. Defaults to TIMER_LONGTERM_THRESHOLD
	 * or TIMER_LONGTERM_NONE (disabled) for server;
	 * overridden longterm boot-arg
	 */
	tlp->threshold.interval = serverperfmode ? TIMER_LONGTERM_NONE
	    : TIMER_LONGTERM_THRESHOLD;
	if (PE_parse_boot_argn("longterm", &longterm, sizeof(longterm))) {
		tlp->threshold.interval = (longterm == 0) ?
		    TIMER_LONGTERM_NONE :
		    longterm * NSEC_PER_MSEC;
	}
	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
		printf("Longterm timer threshold: %llu ms\n",
		    tlp->threshold.interval / NSEC_PER_MSEC);
		kprintf("Longterm timer threshold: %llu ms\n",
		    tlp->threshold.interval / NSEC_PER_MSEC);
		/* interval holds nanoseconds up to here; converted in place
		 * to mach absolute time units for all later comparisons. */
		nanoseconds_to_absolutetime(tlp->threshold.interval,
		    &tlp->threshold.interval);
		tlp->threshold.margin = tlp->threshold.interval / 10;
		tlp->threshold.latency_min = EndOfAllTime;
		tlp->threshold.latency_max = 0;
	}

	tlp->threshold.preempted = TIMER_LONGTERM_NONE;
	tlp->threshold.deadline = TIMER_LONGTERM_NONE;

	mpqueue_init(&tlp->queue, &timer_longterm_lck_grp, LCK_ATTR_NULL);

	timer_call_setup(&tlp->threshold.timer,
	    timer_longterm_callout, (timer_call_param_t) tlp);

	timer_longterm_queue = &tlp->queue;
}
1597 
/* OID selectors for timer_sysctl_get()/timer_sysctl_set(); the
 * LONG_TERM_* entries address the longterm mechanism, the SCAN_*
 * entries the per-queue expiry scan limits. */
enum {
	THRESHOLD, QCOUNT,
	ENQUEUES, DEQUEUES, ESCALATES, SCANS, PREEMPTS,
	LATENCY, LATENCY_MIN, LATENCY_MAX, LONG_TERM_SCAN_LIMIT,
	LONG_TERM_SCAN_INTERVAL, LONG_TERM_SCAN_PAUSES,
	SCAN_LIMIT, SCAN_INTERVAL, SCAN_PAUSES, SCAN_POSTPONES,
};
/*
 * timer_sysctl_get:
 *
 * Read-side sysctl handler: returns the statistic or tunable selected
 * by 'oid' (see the enum above).  THRESHOLD is reported in ms;
 * SCAN_LIMIT/SCAN_INTERVAL are reported in ns.  Unknown oids yield 0.
 */
uint64_t
timer_sysctl_get(int oid)
{
	timer_longterm_t        *tlp = &timer_longterm;

	switch (oid) {
	case THRESHOLD:
		return (tlp->threshold.interval == TIMER_LONGTERM_NONE) ?
		       0 : tlp->threshold.interval / NSEC_PER_MSEC;
	case QCOUNT:
		return tlp->queue.count;
	case ENQUEUES:
		return tlp->enqueues;
	case DEQUEUES:
		return tlp->dequeues;
	case ESCALATES:
		return tlp->escalates;
	case SCANS:
		return tlp->threshold.scans;
	case PREEMPTS:
		return tlp->threshold.preempts;
	case LATENCY:
		return tlp->threshold.latency;
	case LATENCY_MIN:
		return tlp->threshold.latency_min;
	case LATENCY_MAX:
		return tlp->threshold.latency_max;
	case LONG_TERM_SCAN_LIMIT:
		return tlp->scan_limit;
	case LONG_TERM_SCAN_INTERVAL:
		return tlp->scan_interval;
	case LONG_TERM_SCAN_PAUSES:
		return tlp->scan_pauses;
	case SCAN_LIMIT:
		return timer_scan_limit_us * NSEC_PER_USEC;
	case SCAN_INTERVAL:
		return timer_scan_interval_us * NSEC_PER_USEC;
	case SCAN_PAUSES:
		return counter_load(&timer_scan_pauses_cnt);
	case SCAN_POSTPONES:
		return counter_load(&timer_scan_postpones_cnt);

	default:
		return 0;
	}
}
1651 
1652 /*
1653  * timer_master_scan() is the inverse of timer_longterm_scan()
1654  * since it un-escalates timers to the longterm queue.
1655  */
/*
 * Walk the master cpu's queue and move every non-LOCAL timer whose
 * deadline now lies beyond the longterm threshold back onto the
 * longterm queue, updating tlp->threshold with the earliest such
 * deadline.  Caller holds the longterm queue lock with interrupts
 * disabled; the master queue lock is taken here.
 */
static void
timer_master_scan(timer_longterm_t      *tlp,
    uint64_t              now)
{
	timer_call_t    call;
	uint64_t        threshold;
	uint64_t        deadline;
	mpqueue_head_t  *timer_master_queue;

	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
		threshold = now + tlp->threshold.interval;
	} else {
		threshold = TIMER_LONGTERM_NONE;
	}

	timer_master_queue = timer_queue_cpu(master_cpu);
	timer_queue_lock_spin(timer_master_queue);

	qe_foreach_element_safe(call, &timer_master_queue->head, tc_qlink) {
		deadline = call->tc_pqlink.deadline;
		/* LOCAL timers must stay on this cpu's queue. */
		if ((call->tc_flags & TIMER_CALL_LOCAL) != 0) {
			continue;
		}
		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
			/* case (2c) lock order inversion, dequeue only */
			timer_call_entry_dequeue_async(call);
			continue;
		}
		if (deadline > threshold) {
			/* move from master to longterm */
			timer_call_entry_dequeue(call);
			timer_call_entry_enqueue_tail(call, timer_longterm_queue);
			if (deadline < tlp->threshold.deadline) {
				tlp->threshold.deadline = deadline;
				tlp->threshold.call = call;
			}
		}
		simple_unlock(&call->tc_lock);
	}
	timer_queue_unlock(timer_master_queue);
}
1697 
/*
 * Apply a new longterm threshold (milliseconds; 0 disables the
 * mechanism).  Runs on the master cpu via timer_call_cpu().  After
 * changing the threshold, timers are re-distributed between the
 * longterm and master queues, the threshold timer is re-armed, and
 * all longterm statistics are reset.
 */
static void
timer_sysctl_set_threshold(void* valp)
{
	uint64_t value =        (uint64_t)valp;
	timer_longterm_t        *tlp = &timer_longterm;
	spl_t                   s = splclock();
	boolean_t               threshold_increase;

	timer_queue_lock_spin(timer_longterm_queue);

	timer_call_cancel(&tlp->threshold.timer);

	/*
	 * Set the new threshold and note whether it's increasing.
	 */
	if (value == 0) {
		tlp->threshold.interval = TIMER_LONGTERM_NONE;
		threshold_increase = TRUE;
		/* NOTE(review): redundant -- the timer was already
		 * cancelled unconditionally just above. */
		timer_call_cancel(&tlp->threshold.timer);
	} else {
		uint64_t        old_interval = tlp->threshold.interval;
		tlp->threshold.interval = value * NSEC_PER_MSEC;
		nanoseconds_to_absolutetime(tlp->threshold.interval,
		    &tlp->threshold.interval);
		tlp->threshold.margin = tlp->threshold.interval / 10;
		if (old_interval == TIMER_LONGTERM_NONE) {
			threshold_increase = FALSE;
		} else {
			threshold_increase = (tlp->threshold.interval > old_interval);
		}
	}

	if (threshold_increase /* or removal */) {
		/* Escalate timers from the longterm queue */
		timer_longterm_scan(tlp, mach_absolute_time());
	} else { /* decrease or addition  */
		/*
		 * We scan the local/master queue for timers now longterm.
		 * To be strictly correct, we should scan all processor queues
		 * but timer migration results in most timers gravitating to the
		 * master processor in any case.
		 */
		timer_master_scan(tlp, mach_absolute_time());
	}

	/* Set new timer accordingly */
	tlp->threshold.deadline_set = tlp->threshold.deadline;
	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
		tlp->threshold.deadline_set -= tlp->threshold.margin;
		tlp->threshold.deadline_set -= tlp->threshold.latency;
		timer_call_enter(
			&tlp->threshold.timer,
			tlp->threshold.deadline_set,
			TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
	}

	/* Reset stats */
	tlp->enqueues = 0;
	tlp->dequeues = 0;
	tlp->escalates = 0;
	tlp->scan_pauses = 0;
	tlp->threshold.scans = 0;
	tlp->threshold.preempts = 0;
	tlp->threshold.latency = 0;
	tlp->threshold.latency_min = EndOfAllTime;
	tlp->threshold.latency_max = 0;

	timer_queue_unlock(timer_longterm_queue);
	splx(s);
}
1768 
1769 int
timer_sysctl_set(int oid,uint64_t value)1770 timer_sysctl_set(int oid, uint64_t value)
1771 {
1772 	switch (oid) {
1773 	case THRESHOLD:
1774 		timer_call_cpu(
1775 			master_cpu,
1776 			timer_sysctl_set_threshold,
1777 			(void *) value);
1778 		return KERN_SUCCESS;
1779 	case LONG_TERM_SCAN_LIMIT:
1780 		timer_longterm.scan_limit = value;
1781 		return KERN_SUCCESS;
1782 	case LONG_TERM_SCAN_INTERVAL:
1783 		timer_longterm.scan_interval = value;
1784 		return KERN_SUCCESS;
1785 	case SCAN_LIMIT:
1786 		if (value > MAX_TIMER_SCAN_LIMIT ||
1787 		    value < MIN_TIMER_SCAN_LIMIT) {
1788 			return KERN_INVALID_ARGUMENT;
1789 		}
1790 		timer_scan_limit_us = value / NSEC_PER_USEC;
1791 		nanoseconds_to_absolutetime(timer_scan_limit_us * NSEC_PER_USEC,
1792 		    &timer_scan_limit_abs);
1793 		return KERN_SUCCESS;
1794 	case SCAN_INTERVAL:
1795 		if (value > MAX_TIMER_SCAN_INTERVAL ||
1796 		    value < MIN_TIMER_SCAN_INTERVAL) {
1797 			return KERN_INVALID_ARGUMENT;
1798 		}
1799 		timer_scan_interval_us = value / NSEC_PER_USEC;
1800 		nanoseconds_to_absolutetime(timer_scan_interval_us * NSEC_PER_USEC,
1801 		    &timer_scan_interval_abs);
1802 		return KERN_SUCCESS;
1803 	default:
1804 		return KERN_INVALID_ARGUMENT;
1805 	}
1806 }
1807 
1808 
1809 /* Select timer coalescing window based on per-task quality-of-service hints */
1810 static boolean_t
tcoal_qos_adjust(thread_t t,int32_t * tshift,uint64_t * tmax_abstime,boolean_t * pratelimited)1811 tcoal_qos_adjust(thread_t t, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited)
1812 {
1813 	uint32_t latency_qos;
1814 	boolean_t adjusted = FALSE;
1815 	task_t ctask = get_threadtask(t);
1816 
1817 	if (ctask) {
1818 		latency_qos = proc_get_effective_thread_policy(t, TASK_POLICY_LATENCY_QOS);
1819 
1820 		assert(latency_qos <= NUM_LATENCY_QOS_TIERS);
1821 
1822 		if (latency_qos) {
1823 			*tshift = tcoal_prio_params.latency_qos_scale[latency_qos - 1];
1824 			*tmax_abstime = tcoal_prio_params.latency_qos_abstime_max[latency_qos - 1];
1825 			*pratelimited = tcoal_prio_params.latency_tier_rate_limited[latency_qos - 1];
1826 			adjusted = TRUE;
1827 		}
1828 	}
1829 	return adjusted;
1830 }
1831 
1832 
/* Adjust timer deadlines based on priority of the thread and the
 * urgency value provided at timeout establishment. With this mechanism,
 * timers are no longer necessarily sorted in order of soft deadline
 * on a given timer queue, i.e. they may be differentially skewed.
 * In the current scheme, this could lead to fewer pending timers
 * processed than is technically possible when the HW deadline arrives.
 *
 * On return, *tshift is the power-of-two shift applied by timer_call_slop()
 * to the time-to-deadline (negative values widen rather than narrow it),
 * *tmax_abstime caps the resulting leeway, and *pratelimited may be set by
 * tcoal_qos_adjust() when the selected latency QoS tier is rate-limited.
 */
static void
timer_compute_leeway(thread_t cthread, int32_t urgency, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited)
{
	int16_t tpri = cthread->sched_pri;
	if ((urgency & TIMER_CALL_USER_MASK) != 0) {
		bool tg_critical = false;
#if CONFIG_THREAD_GROUPS
		tg_critical = (thread_group_get_flags(thread_group_get(cthread))) & THREAD_GROUP_FLAGS_CRITICAL;
#endif /* CONFIG_THREAD_GROUPS */
		/* Real-time priority, explicit critical urgency, or membership in
		 * a critical thread group all pin the timer to the RT window. */
		bool timer_critical = (tpri >= BASEPRI_RTQUEUES) ||
		    (urgency == TIMER_CALL_USER_CRITICAL) ||
		    tg_critical;
		if (timer_critical) {
			*tshift = tcoal_prio_params.timer_coalesce_rt_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_rt_abstime_max;
			TCOAL_PRIO_STAT(rt_tcl);
		} else if (proc_get_effective_thread_policy(cthread, TASK_POLICY_DARWIN_BG) ||
		    (urgency == TIMER_CALL_USER_BACKGROUND)) {
			/* Determine if timer should be subjected to a lower QoS */
			if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
				if (*tmax_abstime > tcoal_prio_params.timer_coalesce_bg_abstime_max) {
					/* QoS window already wider than the BG window:
					 * keep the QoS-derived values. */
					return;
				} else {
					/* BG window dominates: fall through to BG values
					 * and drop any QoS-derived rate limit. */
					*pratelimited = FALSE;
				}
			}
			*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
			TCOAL_PRIO_STAT(bg_tcl);
		} else if (tpri >= MINPRI_KERNEL) {
			/* Kernel-priority threads. */
			*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
			TCOAL_PRIO_STAT(kt_tcl);
		} else if (cthread->sched_mode == TH_MODE_FIXED) {
			/* Fixed-priority (non-timeshare) threads. */
			*tshift = tcoal_prio_params.timer_coalesce_fp_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_fp_abstime_max;
			TCOAL_PRIO_STAT(fp_tcl);
		} else if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
			/* Latency QoS tier supplied the window directly. */
			TCOAL_PRIO_STAT(qos_tcl);
		} else if (cthread->sched_mode == TH_MODE_TIMESHARE) {
			*tshift = tcoal_prio_params.timer_coalesce_ts_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_ts_abstime_max;
			TCOAL_PRIO_STAT(ts_tcl);
		} else {
			/* No class matched: leave caller-provided defaults. */
			TCOAL_PRIO_STAT(nc_tcl);
		}
	} else if (urgency == TIMER_CALL_SYS_BACKGROUND) {
		*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
		*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
		TCOAL_PRIO_STAT(bg_tcl);
	} else {
		/* All other kernel-originated timers use the kernel-thread window. */
		*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
		*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
		TCOAL_PRIO_STAT(kt_tcl);
	}
}
1896 
1897 
/* "User idle" heuristic level, 0..128 (range enforced by
 * timer_set_user_idle_level()); timer_call_slop() scales its computed
 * leeway by level/128 on top of the coalescing-window adjustment. */
int timer_user_idle_level;
1899 
1900 uint64_t
timer_call_slop(uint64_t deadline,uint64_t now,uint32_t flags,thread_t cthread,boolean_t * pratelimited)1901 timer_call_slop(uint64_t deadline, uint64_t now, uint32_t flags, thread_t cthread, boolean_t *pratelimited)
1902 {
1903 	int32_t tcs_shift = 0;
1904 	uint64_t tcs_max_abstime = 0;
1905 	uint64_t adjval;
1906 	uint32_t urgency = (flags & TIMER_CALL_URGENCY_MASK);
1907 
1908 	if (mach_timer_coalescing_enabled &&
1909 	    (deadline > now) && (urgency != TIMER_CALL_SYS_CRITICAL)) {
1910 		timer_compute_leeway(cthread, urgency, &tcs_shift, &tcs_max_abstime, pratelimited);
1911 
1912 		if (tcs_shift >= 0) {
1913 			adjval =  MIN((deadline - now) >> tcs_shift, tcs_max_abstime);
1914 		} else {
1915 			adjval =  MIN((deadline - now) << (-tcs_shift), tcs_max_abstime);
1916 		}
1917 		/* Apply adjustments derived from "user idle level" heuristic */
1918 		adjval += (adjval * timer_user_idle_level) >> 7;
1919 		return adjval;
1920 	} else {
1921 		return 0;
1922 	}
1923 }
1924 
1925 int
timer_get_user_idle_level(void)1926 timer_get_user_idle_level(void)
1927 {
1928 	return timer_user_idle_level;
1929 }
1930 
1931 kern_return_t
timer_set_user_idle_level(int ilevel)1932 timer_set_user_idle_level(int ilevel)
1933 {
1934 	boolean_t do_reeval = FALSE;
1935 
1936 	if ((ilevel < 0) || (ilevel > 128)) {
1937 		return KERN_INVALID_ARGUMENT;
1938 	}
1939 
1940 	if (ilevel < timer_user_idle_level) {
1941 		do_reeval = TRUE;
1942 	}
1943 
1944 	timer_user_idle_level = ilevel;
1945 
1946 	if (do_reeval) {
1947 		ml_timer_evaluate();
1948 	}
1949 
1950 	return KERN_SUCCESS;
1951 }
1952 
1953 #pragma mark - running timers
1954 
/* Running timers carry no real timer_call flags; these stand-in flags are
 * reported in trace events (see running_timer_setup()) so tools see the
 * equivalent of a local, critical timer call. */
#define RUNNING_TIMER_FAKE_FLAGS (TIMER_CALL_SYS_CRITICAL | \
    TIMER_CALL_LOCAL)
1957 
1958 /*
1959  * timer_call_trace_* functions mimic the tracing behavior from the normal
1960  * timer_call subsystem, so tools continue to function.
1961  */
1962 
/*
 * Emit the ENTER start kdebug event and the DTrace callout__create probe
 * for arming a running timer, mirroring normal timer_call tracing.
 * NOTE(review): this function also emits DECR_TIMER_ENTER | DBG_FUNC_END,
 * and timer_call_trace_enter_after() emits the same END event again —
 * looks duplicated; confirm against trace consumers before changing.
 */
static void
timer_call_trace_enter_before(struct timer_call *call, uint64_t deadline,
    uint32_t flags, uint64_t now)
{
#pragma unused(call, deadline, flags, now)
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_ADDRHIDE(call->tc_param1),
	    deadline, flags, 0);
#if CONFIG_DTRACE
	/* ttd: time-to-deadline relative to 'now'. */
	uint64_t ttd = deadline - now;
	DTRACE_TMR7(callout__create, timer_call_func_t, call->tc_func,
	    timer_call_param_t, call->tc_param0, uint32_t, flags, 0,
	    (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF), NULL);
#endif /* CONFIG_DTRACE */
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline, 0, 0);
}
1980 
/* Emit the ENTER end kdebug event once the timer state has been updated. */
static void
timer_call_trace_enter_after(struct timer_call *call, uint64_t deadline)
{
#pragma unused(call, deadline)
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline, 0, 0);
}
1988 
/*
 * Emit CANCEL kdebug events and the DTrace callout__cancel probe for a
 * running timer being cleared. Reads the timer's current deadline; note
 * running_timer_clear() overwrites the deadline with EndOfAllTime before
 * calling here, so the traced deltas are relative to that sentinel.
 */
static void
timer_call_trace_cancel(struct timer_call *call)
{
#pragma unused(call)
	__unused uint64_t deadline = call->tc_pqlink.deadline;
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CANCEL | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), deadline, 0,
	    call->tc_flags, 0);
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CANCEL | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline - mach_absolute_time(),
	    deadline - call->tc_entry_time, 0);
#if CONFIG_DTRACE
#if TIMER_TRACE
	uint64_t ttd = deadline - call->tc_entry_time;
#else
	/* Without TIMER_TRACE, tc_entry_time is not maintained. */
	uint64_t ttd = UINT64_MAX;
#endif /* TIMER_TRACE */
	DTRACE_TMR6(callout__cancel, timer_call_func_t, call->tc_func,
	    timer_call_param_t, call->tc_param0, uint32_t, call->tc_flags, 0,
	    (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF));
#endif /* CONFIG_DTRACE */
}
2011 
/*
 * Emit the CALLOUT start kdebug event and the DTrace callout__start probe
 * immediately before a running timer's handler is invoked.
 */
static void
timer_call_trace_expire_entry(struct timer_call *call)
{
#pragma unused(call)
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CALLOUT | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(call->tc_func),
	    VM_KERNEL_ADDRHIDE(call->tc_param0),
	    VM_KERNEL_ADDRHIDE(call->tc_param1),
	    0);
#if CONFIG_DTRACE
#if TIMER_TRACE
	uint64_t ttd = call->tc_pqlink.deadline - call->tc_entry_time;
#else /* TIMER_TRACE */
	/* Without TIMER_TRACE, tc_entry_time is not maintained. */
	uint64_t ttd = UINT64_MAX;
#endif /* TIMER_TRACE */
	DTRACE_TMR7(callout__start, timer_call_func_t, call->tc_func,
	    timer_call_param_t, call->tc_param0, unsigned, call->tc_flags,
	    0, (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF), NULL);
#endif /* CONFIG_DTRACE */
}
2032 
/*
 * Emit the DTrace callout__end probe and the CALLOUT end kdebug event
 * immediately after a running timer's handler returns.
 */
static void
timer_call_trace_expire_return(struct timer_call *call)
{
#pragma unused(call)
#if CONFIG_DTRACE
	DTRACE_TMR4(callout__end, timer_call_func_t, call->tc_func,
	    call->tc_param0, call->tc_param1, NULL);
#endif /* CONFIG_DTRACE */
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CALLOUT | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    VM_KERNEL_UNSLIDE(call->tc_func),
	    VM_KERNEL_ADDRHIDE(call->tc_param0),
	    VM_KERNEL_ADDRHIDE(call->tc_param1),
	    0);
}
2048 
/*
 * Set a new deadline for a running timer on this processor.
 *
 * Must be called with interrupts disabled. The caller is responsible for
 * resyncing the hardware deadline afterwards (running_timers_sync());
 * running_timer_enter() does both.
 */
void
running_timer_setup(processor_t processor, enum running_timer timer,
    void *param, uint64_t deadline, uint64_t now)
{
	assert(timer < RUNNING_TIMER_MAX);
	assert(ml_get_interrupts_enabled() == FALSE);

	struct timer_call *call = &processor->running_timers[timer];

	timer_call_trace_enter_before(call, deadline, RUNNING_TIMER_FAKE_FLAGS,
	    now);

	/* Delegate already-past deadlines to the past-deadline handler,
	 * which supplies the deadline actually programmed. */
	if (__improbable(deadline < now)) {
		deadline = timer_call_past_deadline_timer_handle(deadline, now);
	}

	call->tc_pqlink.deadline = deadline;
#if TIMER_TRACE
	call->tc_entry_time = now;
#endif /* TIMER_TRACE */
	call->tc_param1 = param;

	timer_call_trace_enter_after(call, deadline);
}
2076 
/*
 * Resync the hardware timer deadline after running-timer state changes,
 * so the soonest pending deadline takes effect.
 */
void
running_timers_sync(void)
{
	timer_resync_deadlines();
}
2082 
/*
 * Arm a running timer on this processor and resync the hardware deadline
 * in one step (running_timer_setup() + running_timers_sync()).
 */
void
running_timer_enter(processor_t processor, unsigned int timer,
    void *param, uint64_t deadline, uint64_t now)
{
	running_timer_setup(processor, timer, param, deadline, now);
	running_timers_sync();
}
2090 
2091 /*
2092  * Call the callback for any running timers that fired for this processor.
2093  * Returns true if any timers were past their deadline.
2094  */
2095 bool
running_timers_expire(processor_t processor,uint64_t now)2096 running_timers_expire(processor_t processor, uint64_t now)
2097 {
2098 	bool expired = false;
2099 
2100 	if (!processor->running_timers_active) {
2101 		return expired;
2102 	}
2103 
2104 	for (int i = 0; i < RUNNING_TIMER_MAX; i++) {
2105 		struct timer_call *call = &processor->running_timers[i];
2106 
2107 		uint64_t deadline = call->tc_pqlink.deadline;
2108 		if (deadline > now) {
2109 			continue;
2110 		}
2111 
2112 		expired = true;
2113 		timer_call_trace_expire_entry(call);
2114 		call->tc_func(call->tc_param0, call->tc_param1);
2115 		timer_call_trace_expire_return(call);
2116 	}
2117 
2118 	return expired;
2119 }
2120 
2121 void
running_timer_clear(processor_t processor,enum running_timer timer)2122 running_timer_clear(processor_t processor, enum running_timer timer)
2123 {
2124 	struct timer_call *call = &processor->running_timers[timer];
2125 	uint64_t deadline = call->tc_pqlink.deadline;
2126 	if (deadline == EndOfAllTime) {
2127 		return;
2128 	}
2129 
2130 	call->tc_pqlink.deadline = EndOfAllTime;
2131 #if TIMER_TRACE
2132 	call->tc_entry_time = 0;
2133 #endif /* TIMER_TRACE */
2134 	timer_call_trace_cancel(call);
2135 }
2136 
/*
 * Disarm a running timer and resync the hardware deadline in one step
 * (running_timer_clear() + running_timers_sync()).
 */
void
running_timer_cancel(processor_t processor, unsigned int timer)
{
	running_timer_clear(processor, timer);
	running_timers_sync();
}
2143 
2144 uint64_t
running_timers_deadline(processor_t processor)2145 running_timers_deadline(processor_t processor)
2146 {
2147 	if (!processor->running_timers_active) {
2148 		return EndOfAllTime;
2149 	}
2150 
2151 	uint64_t deadline = EndOfAllTime;
2152 	for (int i = 0; i < RUNNING_TIMER_MAX; i++) {
2153 		uint64_t candidate =
2154 		    processor->running_timers[i].tc_pqlink.deadline;
2155 		if (candidate != 0 && candidate < deadline) {
2156 			deadline = candidate;
2157 		}
2158 	}
2159 
2160 	return deadline;
2161 }
2162 
/*
 * Begin honoring this processor's running timers and resync so their
 * deadlines take effect in hardware.
 */
void
running_timers_activate(processor_t processor)
{
	processor->running_timers_active = true;
	running_timers_sync();
}
2169 
/*
 * Stop honoring this processor's running timers (they are ignored by
 * running_timers_expire()/running_timers_deadline()) and resync the
 * hardware deadline. Must currently be active.
 */
void
running_timers_deactivate(processor_t processor)
{
	assert(processor->running_timers_active == true);
	processor->running_timers_active = false;
	running_timers_sync();
}
2177