xref: /xnu-12377.41.6/osfmk/kern/timer_call.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 1993-2008 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Timer interrupt callout module.
30  */
31 
32 #include <mach/mach_types.h>
33 
34 #include <kern/clock.h>
35 #include <kern/counter.h>
36 #include <kern/smp.h>
37 #include <kern/processor.h>
38 #include <kern/timer_call.h>
39 #include <kern/timer_queue.h>
40 #include <kern/thread.h>
41 #include <kern/thread_group.h>
42 #include <kern/policy_internal.h>
43 
44 #include <sys/kdebug.h>
45 
46 #if CONFIG_DTRACE
47 #include <mach/sdt.h>
48 #endif
49 
50 
51 #if DEBUG
52 #define TIMER_ASSERT    1
53 #endif
54 
55 //#define TIMER_ASSERT	1
56 //#define TIMER_DBG	1
57 
58 #if TIMER_DBG
59 #define DBG(x...) kprintf("DBG: " x);
60 #else
61 #define DBG(x...)
62 #endif
63 
64 #if TIMER_TRACE
65 #define TIMER_KDEBUG_TRACE      KERNEL_DEBUG_CONSTANT_IST
66 #else
67 #define TIMER_KDEBUG_TRACE(x...)
68 #endif
69 
LCK_GRP_DECLARE(timer_call_lck_grp, "timer_call");
LCK_GRP_DECLARE(timer_longterm_lck_grp, "timer_longterm");
LCK_GRP_DECLARE(timer_queue_lck_grp, "timer_queue");

/* Timer queue lock must be acquired with interrupts disabled (under splclock()) */
#define timer_queue_lock_spin(queue) lck_ticket_lock(&(queue)->lock_data, &timer_queue_lck_grp)
#define timer_queue_unlock(queue)    lck_ticket_unlock(&(queue)->lock_data)

/*
 * The longterm timer object is a global structure holding all timers
 * beyond the short-term, local timer queue threshold. The boot processor
 * is responsible for moving each timer to its local timer queue
 * if and when that timer becomes due within the threshold.
 */

/* Sentinel for "no time set": */
#define TIMER_LONGTERM_NONE             EndOfAllTime
/* The default threshold is the delta above which a timer is "long-term" */
#if defined(__x86_64__)
#define TIMER_LONGTERM_THRESHOLD        (1ULL * NSEC_PER_SEC)   /* 1 sec */
#else
#define TIMER_LONGTERM_THRESHOLD        TIMER_LONGTERM_NONE     /* disabled */
#endif

/*
 * The scan_limit throttles processing of the longterm queue.
 * If the scan time exceeds this limit, we terminate, unlock
 * and defer for scan_interval. This prevents unbounded holding of
 * timer queue locks with interrupts masked.
 */
#define TIMER_LONGTERM_SCAN_LIMIT       (100ULL * NSEC_PER_USEC)        /* 100 us */
#define TIMER_LONGTERM_SCAN_INTERVAL    (100ULL * NSEC_PER_USEC)        /* 100 us */
/* Sentinel for "scan limit exceeded": */
#define TIMER_LONGTERM_SCAN_AGAIN       0

/*
 * In a similar way to the longterm queue's scan limit, the following bounds the
 * amount of time spent processing regular timers. This limit is also obeyed by
 * thread_call_delayed_timer().
 */
TUNABLE_WRITEABLE(uint64_t, timer_scan_limit_us, "timer_scan_limit_us", 400);
TUNABLE_WRITEABLE(uint64_t, timer_scan_interval_us, "timer_scan_interval_us", 40);
/* Absolute-time translations of the tunables above; computed in timer_call_init_abstime() */
uint64_t timer_scan_limit_abs = 0;
static uint64_t timer_scan_interval_abs = 0;

/*
 * Count of times scanning the queue was aborted early (to avoid long
 * scan times).
 */
SCALABLE_COUNTER_DEFINE(timer_scan_pauses_cnt);

/*
 * Count of times scanning the queue was aborted early resulting in a
 * postponed hard deadline.
 */
SCALABLE_COUNTER_DEFINE(timer_scan_postpones_cnt);

/* Bounds for the timer_scan_*_us tunables — presumably enforced where the
 * tunables are consumed (not visible in this chunk); verify at point of use. */
#define MAX_TIMER_SCAN_LIMIT    (30000ULL * NSEC_PER_USEC)  /* 30 ms */
#define MIN_TIMER_SCAN_LIMIT    (   50ULL * NSEC_PER_USEC)  /* 50 us */
#define MAX_TIMER_SCAN_INTERVAL ( 2000ULL * NSEC_PER_USEC)  /*  2 ms */
#define MIN_TIMER_SCAN_INTERVAL (   20ULL * NSEC_PER_USEC)  /* 20 us */
131 
/*
 * Threshold management state for the longterm queue: tracks the
 * first/soonest longterm deadline plus scan statistics.
 */
typedef struct {
	uint64_t        interval;       /* longterm timer interval */
	uint64_t        margin;         /* fudge factor (10% of interval) */
	uint64_t        deadline;       /* first/soonest longterm deadline */
	uint64_t        preempted;      /* sooner timer has pre-empted */
	timer_call_t    call;           /* first/soonest longterm timer call */
	uint64_t        deadline_set;   /* next timer set */
	timer_call_data_t timer;        /* timer used by threshold management */
	                                /* Stats: */
	uint64_t        scans;          /*   num threshold timer scans */
	uint64_t        preempts;       /*   num threshold reductions */
	uint64_t        latency;        /*   average threshold latency */
	uint64_t        latency_min;    /*   minimum threshold latency */
	uint64_t        latency_max;    /*   maximum threshold latency */
} threshold_t;
147 
/*
 * Global longterm-timer state: the (unordered) list of longterm timers,
 * enqueue/dequeue/escalation statistics, and the scan throttling state.
 */
typedef struct {
	mpqueue_head_t  queue;          /* longterm timer list */
	uint64_t        enqueues;       /* num timers queued */
	uint64_t        dequeues;       /* num timers dequeued */
	uint64_t        escalates;      /* num timers becoming shortterm */
	uint64_t        scan_time;      /* last time the list was scanned */
	threshold_t     threshold;      /* longterm timer threshold */
	uint64_t        scan_limit;     /* maximum scan time */
	uint64_t        scan_interval;  /* interval between LT "escalation" scans */
	uint64_t        scan_pauses;    /* num scans exceeding time limit */
} timer_longterm_t;
159 
/* The single global longterm-timer instance; scan bounds default to 100us. */
timer_longterm_t                timer_longterm = {
	.scan_limit = TIMER_LONGTERM_SCAN_LIMIT,
	.scan_interval = TIMER_LONGTERM_SCAN_INTERVAL,
};
164 
static mpqueue_head_t           *timer_longterm_queue = NULL;

/* Forward declarations for the longterm timer machinery (defined below). */
static void                     timer_longterm_init(void);
static void                     timer_longterm_callout(
	timer_call_param_t      p0,
	timer_call_param_t      p1);
extern void                     timer_longterm_scan(
	timer_longterm_t        *tlp,
	uint64_t                now);
static void                     timer_longterm_update(
	timer_longterm_t *tlp);
static void                     timer_longterm_update_locked(
	timer_longterm_t *tlp);
static mpqueue_head_t *         timer_longterm_enqueue_unlocked(
	timer_call_t            call,
	uint64_t                now,
	uint64_t                deadline,
	mpqueue_head_t **       old_queue,
	uint64_t                soft_deadline,
	uint64_t                ttd,
	timer_call_param_t      param1,
	uint32_t                callout_flags);
static void                     timer_longterm_dequeued_locked(
	timer_call_t            call);

/* Statistics on timers entered with deadlines already in the past
 * (maintained by timer_call_past_deadline_timer_handle()). */
uint64_t past_deadline_timers;
uint64_t past_deadline_deltas;
uint64_t past_deadline_longest;
uint64_t past_deadline_shortest = ~0ULL;
/* Nanoseconds by which a past deadline is pushed into the future. */
enum {PAST_DEADLINE_TIMER_ADJUSTMENT_NS = 10 * 1000};

/* Abstime translation of the above; computed in timer_call_init_abstime(). */
uint64_t past_deadline_timer_adjustment;

static boolean_t timer_call_enter_internal(timer_call_t call, timer_call_param_t param1, uint64_t deadline, uint64_t leeway, uint32_t flags, boolean_t ratelimited);
boolean_t       mach_timer_coalescing_enabled = TRUE;

mpqueue_head_t  *timer_call_enqueue_deadline_unlocked(
	timer_call_t            call,
	mpqueue_head_t          *queue,
	uint64_t                deadline,
	uint64_t                soft_deadline,
	uint64_t                ttd,
	timer_call_param_t      param1,
	uint32_t                flags);

mpqueue_head_t  *timer_call_dequeue_unlocked(
	timer_call_t            call);

/* Coalescing parameters in abstime units; filled in timer_call_init_abstime(). */
timer_coalescing_priority_params_t tcoal_prio_params;

#if TCOAL_PRIO_STATS
int32_t nc_tcl, rt_tcl, bg_tcl, kt_tcl, fp_tcl, ts_tcl, qos_tcl;
#define TCOAL_PRIO_STAT(x) (x++)
#else
#define TCOAL_PRIO_STAT(x)
#endif
221 
/*
 * Convert the timer coalescing parameters (expressed in nanoseconds by
 * timer_call_get_priority_params()) into mach absolute-time units,
 * populating the global tcoal_prio_params. Also converts the past-deadline
 * adjustment and the timer scan limit/interval tunables.
 */
static void
timer_call_init_abstime(void)
{
	int i;
	uint64_t result;
	timer_coalescing_priority_params_ns_t * tcoal_prio_params_init = timer_call_get_priority_params();
	nanoseconds_to_absolutetime(PAST_DEADLINE_TIMER_ADJUSTMENT_NS, &past_deadline_timer_adjustment);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->idle_entry_timer_processing_hdeadline_threshold_ns, &result);
	/* NOTE(review): the three thresholds below are truncated to 32 bits —
	 * presumably the abstime fields are uint32_t and the configured values
	 * are known to fit; confirm against the struct definition. */
	tcoal_prio_params.idle_entry_timer_processing_hdeadline_threshold_abstime = (uint32_t)result;
	nanoseconds_to_absolutetime(tcoal_prio_params_init->interrupt_timer_coalescing_ilat_threshold_ns, &result);
	tcoal_prio_params.interrupt_timer_coalescing_ilat_threshold_abstime = (uint32_t)result;
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_resort_threshold_ns, &result);
	tcoal_prio_params.timer_resort_threshold_abstime = (uint32_t)result;
	/* Coalescing window shifts are dimensionless; copy through unchanged. */
	tcoal_prio_params.timer_coalesce_rt_shift = tcoal_prio_params_init->timer_coalesce_rt_shift;
	tcoal_prio_params.timer_coalesce_bg_shift = tcoal_prio_params_init->timer_coalesce_bg_shift;
	tcoal_prio_params.timer_coalesce_kt_shift = tcoal_prio_params_init->timer_coalesce_kt_shift;
	tcoal_prio_params.timer_coalesce_fp_shift = tcoal_prio_params_init->timer_coalesce_fp_shift;
	tcoal_prio_params.timer_coalesce_ts_shift = tcoal_prio_params_init->timer_coalesce_ts_shift;

	/* Per-class maximum coalescing windows: ns -> abstime. */
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_rt_ns_max,
	    &tcoal_prio_params.timer_coalesce_rt_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_bg_ns_max,
	    &tcoal_prio_params.timer_coalesce_bg_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_kt_ns_max,
	    &tcoal_prio_params.timer_coalesce_kt_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_fp_ns_max,
	    &tcoal_prio_params.timer_coalesce_fp_abstime_max);
	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_ts_ns_max,
	    &tcoal_prio_params.timer_coalesce_ts_abstime_max);

	/* Per-latency-QoS-tier scale factors, windows, and rate-limit flags. */
	for (i = 0; i < NUM_LATENCY_QOS_TIERS; i++) {
		tcoal_prio_params.latency_qos_scale[i] = tcoal_prio_params_init->latency_qos_scale[i];
		nanoseconds_to_absolutetime(tcoal_prio_params_init->latency_qos_ns_max[i],
		    &tcoal_prio_params.latency_qos_abstime_max[i]);
		tcoal_prio_params.latency_tier_rate_limited[i] = tcoal_prio_params_init->latency_tier_rate_limited[i];
	}

	nanoseconds_to_absolutetime(timer_scan_limit_us * NSEC_PER_USEC, &timer_scan_limit_abs);
	nanoseconds_to_absolutetime(timer_scan_interval_us * NSEC_PER_USEC, &timer_scan_interval_abs);
}
262 
263 
/*
 * Module initialization: set up the longterm timer machinery, then
 * precompute the absolute-time forms of the tunable parameters.
 */
void
timer_call_init(void)
{
	timer_longterm_init();
	timer_call_init_abstime();
}
270 
271 
/*
 * Initialize a per-cpu timer queue (its list head, priority queue,
 * and ticket lock).
 */
void
timer_call_queue_init(mpqueue_head_t *queue)
{
	DBG("timer_call_queue_init(%p)\n", queue);
	mpqueue_init(queue, &timer_call_lck_grp, LCK_ATTR_NULL);
}
278 
279 
/*
 * Initialize a caller-provided timer call with its callout function
 * and first parameter. The entry starts off unqueued.
 */
void
timer_call_setup(
	timer_call_t                    call,
	timer_call_func_t               func,
	timer_call_param_t              param0)
{
	DBG("timer_call_setup(%p,%p,%p)\n", call, func, param0);

	/* Compound-literal assignment zeroes every field not listed. */
	*call = (struct timer_call) {
		.tc_func = func,
		.tc_param0 = param0,
		.tc_async_dequeue = false,
	};

	simple_lock_init(&(call)->tc_lock, 0);
}
296 
297 timer_call_t
timer_call_alloc(timer_call_func_t func,timer_call_param_t param0)298 timer_call_alloc(
299 	timer_call_func_t       func,
300 	timer_call_param_t      param0)
301 {
302 	timer_call_t call;
303 
304 	call = kalloc_type(struct timer_call, Z_ZERO | Z_WAITOK | Z_NOFAIL);
305 	timer_call_setup(call, func, param0);
306 	return call;
307 }
308 
/*
 * Release a timer call previously obtained from timer_call_alloc().
 * The caller is responsible for ensuring it is cancelled/unqueued.
 */
void
timer_call_free(
	timer_call_t            call)
{
	kfree_type(struct timer_call, call);
}
315 
316 static mpqueue_head_t*
mpqueue_for_timer_call(timer_call_t entry)317 mpqueue_for_timer_call(timer_call_t entry)
318 {
319 	queue_t queue_entry_is_on = entry->tc_queue;
320 	/* 'cast' the queue back to the orignal mpqueue */
321 	return __container_of(queue_entry_is_on, struct mpqueue_head, head);
322 }
323 
324 
/*
 * Unlink a timer call from the queue it is currently on.
 * Requires both the entry lock and that queue's lock to be held
 * (locking case 1b/1d). Returns the queue it was removed from.
 */
static __inline__ mpqueue_head_t *
timer_call_entry_dequeue(
	timer_call_t            entry)
{
	mpqueue_head_t  *old_mpqueue = mpqueue_for_timer_call(entry);

	/* The entry was always on a queue */
	assert(old_mpqueue != NULL);

#if TIMER_ASSERT
	if (!hw_lock_held((hw_lock_t)&entry->tc_lock)) {
		panic("_call_entry_dequeue() "
		    "entry %p is not locked\n", entry);
	}

	/*
	 * XXX The queue lock is actually a mutex in spin mode
	 *     but there's no way to test for it being held
	 *     so we pretend it's a spinlock!
	 */
	if (!hw_lock_held((hw_lock_t)&old_mpqueue->lock_data)) {
		panic("_call_entry_dequeue() "
		    "queue %p is not locked\n", old_mpqueue);
	}
#endif /* TIMER_ASSERT */

	/* The longterm queue is unordered and not mirrored in a priority queue. */
	if (old_mpqueue != timer_longterm_queue) {
		priority_queue_remove(&old_mpqueue->mpq_pqhead,
		    &entry->tc_pqlink);
	}

	remqueue(&entry->tc_qlink);

	/* Mark the entry unqueued (locking case 1a). */
	entry->tc_queue = NULL;

	old_mpqueue->count--;

	return old_mpqueue;
}
364 
/*
 * Place a timer call on new_mpqueue at the given deadline, handling both
 * the "already on this queue" (re-sort in place) and "moving/newly queued"
 * (full insert) cases. Requires the entry lock and new_mpqueue's lock.
 * Neither queue may be the longterm queue. Also refreshes the queue's
 * cached earliest_soft_deadline. Returns the previous queue (may be NULL).
 */
static __inline__ mpqueue_head_t *
timer_call_entry_enqueue_deadline(
	timer_call_t                    entry,
	mpqueue_head_t                  *new_mpqueue,
	uint64_t                        deadline)
{
	mpqueue_head_t  *old_mpqueue = mpqueue_for_timer_call(entry);

#if TIMER_ASSERT
	if (!hw_lock_held((hw_lock_t)&entry->tc_lock)) {
		panic("_call_entry_enqueue_deadline() "
		    "entry %p is not locked\n", entry);
	}

	/* XXX More lock pretense:  */
	if (!hw_lock_held((hw_lock_t)&new_mpqueue->lock_data)) {
		panic("_call_entry_enqueue_deadline() "
		    "queue %p is not locked\n", new_mpqueue);
	}

	if (old_mpqueue != NULL && old_mpqueue != new_mpqueue) {
		panic("_call_entry_enqueue_deadline() "
		    "old_mpqueue %p != new_mpqueue", old_mpqueue);
	}
#endif /* TIMER_ASSERT */

	/* no longterm queue involved */
	assert(new_mpqueue != timer_longterm_queue);
	assert(old_mpqueue != timer_longterm_queue);

	if (old_mpqueue == new_mpqueue) {
		/* optimize the same-queue case to avoid a full re-insert */
		uint64_t old_deadline = entry->tc_pqlink.deadline;
		entry->tc_pqlink.deadline = deadline;

		/* Re-heapify in the direction the key moved. */
		if (old_deadline < deadline) {
			priority_queue_entry_increased(&new_mpqueue->mpq_pqhead,
			    &entry->tc_pqlink);
		} else {
			priority_queue_entry_decreased(&new_mpqueue->mpq_pqhead,
			    &entry->tc_pqlink);
		}
	} else {
		if (old_mpqueue != NULL) {
			priority_queue_remove(&old_mpqueue->mpq_pqhead,
			    &entry->tc_pqlink);

			re_queue_tail(&new_mpqueue->head, &entry->tc_qlink);
		} else {
			enqueue_tail(&new_mpqueue->head, &entry->tc_qlink);
		}

		entry->tc_queue = &new_mpqueue->head;
		entry->tc_pqlink.deadline = deadline;

		priority_queue_insert(&new_mpqueue->mpq_pqhead, &entry->tc_pqlink);
	}


	/* For efficiency, track the earliest soft deadline on the queue,
	 * so that fuzzy decisions can be made without lock acquisitions.
	 */

	timer_call_t thead = priority_queue_min(&new_mpqueue->mpq_pqhead, struct timer_call, tc_pqlink);

	/* Rate-limited timers contribute their hard deadline instead. */
	new_mpqueue->earliest_soft_deadline = thead->tc_flags & TIMER_CALL_RATELIMITED ? thead->tc_pqlink.deadline : thead->tc_soft_deadline;

	if (old_mpqueue) {
		old_mpqueue->count--;
	}
	new_mpqueue->count++;

	return old_mpqueue;
}
439 
440 static __inline__ void
timer_call_entry_enqueue_tail(timer_call_t entry,mpqueue_head_t * queue)441 timer_call_entry_enqueue_tail(
442 	timer_call_t                    entry,
443 	mpqueue_head_t                  *queue)
444 {
445 	/* entry is always dequeued before this call */
446 	assert(entry->tc_queue == NULL);
447 
448 	/*
449 	 * this is only used for timer_longterm_queue, which is unordered
450 	 * and thus needs no priority queueing
451 	 */
452 	assert(queue == timer_longterm_queue);
453 
454 	enqueue_tail(&queue->head, &entry->tc_qlink);
455 
456 	entry->tc_queue = &queue->head;
457 
458 	queue->count++;
459 	return;
460 }
461 
462 /*
463  * Remove timer entry from its queue but don't change the queue pointer
464  * and set the async_dequeue flag. This is locking case 2b.
465  */
/*
 * Remove timer entry from its queue but don't change the queue pointer
 * and set the async_dequeue flag. This is locking case 2b: the queue
 * lock is held but the entry lock could not be acquired, so the owner
 * of the entry lock will observe async_dequeue later (case 1c).
 */
static __inline__ void
timer_call_entry_dequeue_async(
	timer_call_t            entry)
{
	mpqueue_head_t  *old_mpqueue = mpqueue_for_timer_call(entry);
	if (old_mpqueue) {
		old_mpqueue->count--;

		/* Longterm queue entries are not mirrored in a priority queue. */
		if (old_mpqueue != timer_longterm_queue) {
			priority_queue_remove(&old_mpqueue->mpq_pqhead,
			    &entry->tc_pqlink);
		}

		remqueue(&entry->tc_qlink);
		/* Deliberately leave tc_queue set — we don't own the entry lock. */
		entry->tc_async_dequeue = true;
	}
	return;
}
484 
#if TIMER_ASSERT
/* Debug counters for async-dequeue collision handling below. */
unsigned timer_call_enqueue_deadline_unlocked_async1;
unsigned timer_call_enqueue_deadline_unlocked_async2;
#endif
/*
 * Lock the entry and the relevant queue(s), resolve any pending async
 * dequeue (case 1c), migrate the entry off a different old queue if
 * needed, update the callout parameters under the queue lock, and
 * enqueue at the given deadline. Returns the previous queue, or NULL
 * if the entry was not queued.
 *
 * Assumes call_entry and queues unlocked, interrupts disabled.
 */
__inline__ mpqueue_head_t *
timer_call_enqueue_deadline_unlocked(
	timer_call_t                    call,
	mpqueue_head_t                  *queue,
	uint64_t                        deadline,
	uint64_t                        soft_deadline,
	uint64_t                        ttd,
	timer_call_param_t              param1,
	uint32_t                        callout_flags)
{
	DBG("timer_call_enqueue_deadline_unlocked(%p,%p,)\n", call, queue);

	simple_lock(&call->tc_lock, LCK_GRP_NULL);

	mpqueue_head_t  *old_queue = mpqueue_for_timer_call(call);

	if (old_queue != NULL) {
		timer_queue_lock_spin(old_queue);
		if (call->tc_async_dequeue) {
			/* collision (1c): timer already dequeued, clear flag */
#if TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_async_dequeue,
			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
			    0x1c, 0);
			timer_call_enqueue_deadline_unlocked_async1++;
#endif
			call->tc_async_dequeue = false;
			call->tc_queue = NULL;
		} else if (old_queue != queue) {
			/* Moving queues: unlink from the old one while we hold its lock. */
			timer_call_entry_dequeue(call);
#if TIMER_ASSERT
			timer_call_enqueue_deadline_unlocked_async2++;
#endif
		}
		if (old_queue == timer_longterm_queue) {
			timer_longterm_dequeued_locked(call);
		}
		if (old_queue != queue) {
			/* Drop the old queue's lock before taking the target's. */
			timer_queue_unlock(old_queue);
			timer_queue_lock_spin(queue);
		}
	} else {
		timer_queue_lock_spin(queue);
	}

	/* Program callout parameters under the target queue lock (rule 3). */
	call->tc_soft_deadline = soft_deadline;
	call->tc_flags = callout_flags;
	call->tc_param1 = param1;
	call->tc_ttd = ttd;

	timer_call_entry_enqueue_deadline(call, queue, deadline);
	timer_queue_unlock(queue);
	simple_unlock(&call->tc_lock);

	return old_queue;
}
551 
#if TIMER_ASSERT
/* Debug counters for async-dequeue collision handling below. */
unsigned timer_call_dequeue_unlocked_async1;
unsigned timer_call_dequeue_unlocked_async2;
#endif
/*
 * Lock the entry and its queue (if any), resolve a pending async
 * dequeue (case 1c) or unlink the entry. Returns the queue the entry
 * was on, or NULL if it was not queued.
 *
 * Assumes call_entry and queues unlocked, interrupts disabled.
 */
mpqueue_head_t *
timer_call_dequeue_unlocked(
	timer_call_t            call)
{
	DBG("timer_call_dequeue_unlocked(%p)\n", call);

	simple_lock(&call->tc_lock, LCK_GRP_NULL);

	mpqueue_head_t  *old_queue = mpqueue_for_timer_call(call);

#if TIMER_ASSERT
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    call->tc_async_dequeue,
	    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
	    0, 0);
#endif
	if (old_queue != NULL) {
		timer_queue_lock_spin(old_queue);
		if (call->tc_async_dequeue) {
			/* collision (1c): timer already dequeued, clear flag */
#if TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_async_dequeue,
			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
			    0x1c, 0);
			timer_call_dequeue_unlocked_async1++;
#endif
			call->tc_async_dequeue = false;
			call->tc_queue = NULL;
		} else {
			timer_call_entry_dequeue(call);
		}
		if (old_queue == timer_longterm_queue) {
			timer_longterm_dequeued_locked(call);
		}
		timer_queue_unlock(old_queue);
	}
	simple_unlock(&call->tc_lock);
	return old_queue;
}
600 
601 uint64_t
timer_call_past_deadline_timer_handle(uint64_t deadline,uint64_t ctime)602 timer_call_past_deadline_timer_handle(uint64_t deadline, uint64_t ctime)
603 {
604 	uint64_t delta = (ctime - deadline);
605 
606 	past_deadline_timers++;
607 	past_deadline_deltas += delta;
608 	if (delta > past_deadline_longest) {
609 		past_deadline_longest = deadline;
610 	}
611 	if (delta < past_deadline_shortest) {
612 		past_deadline_shortest = delta;
613 	}
614 
615 	return ctime + past_deadline_timer_adjustment;
616 }
617 
618 /*
619  * Timer call entry locking model
620  * ==============================
621  *
622  * Timer call entries are linked on per-cpu timer queues which are protected
623  * by the queue lock and the call entry lock. The locking protocol is:
624  *
625  *  0) The canonical locking order is timer call entry followed by queue.
626  *
627  *  1) With only the entry lock held, entry.queue is valid:
628  *    1a) NULL: the entry is not queued, or
629  *    1b) non-NULL: this queue must be locked before the entry is modified.
630  *        After locking the queue, the call.async_dequeue flag must be checked:
631  *    1c) TRUE: the entry was removed from the queue by another thread
632  *	        and we must NULL the entry.queue and reset this flag, or
633  *    1d) FALSE: (ie. queued), the entry can be manipulated.
634  *
635  *  2) If a queue lock is obtained first, the queue is stable:
636  *    2a) If a try-lock of a queued entry succeeds, the call can be operated on
637  *	  and dequeued.
638  *    2b) If a try-lock fails, it indicates that another thread is attempting
639  *        to change the entry and move it to a different position in this queue
640  *        or to different queue. The entry can be dequeued but it should not be
641  *        operated upon since it is being changed. Furthermore, we don't null
642  *	  the entry.queue pointer (protected by the entry lock we don't own).
643  *	  Instead, we set the async_dequeue flag -- see (1c).
644  *    2c) Same as 2b but occurring when a longterm timer is matured.
645  *  3) A callout's parameters (deadline, flags, parameters, soft deadline &c.)
646  *     should be manipulated with the appropriate timer queue lock held,
647  *     to prevent queue traversal observations from observing inconsistent
648  *     updates to an in-flight callout.
649  */
650 
651 /*
652  * In the debug case, we assert that the timer call locking protocol
653  * is being obeyed.
654  */
655 
/*
 * Common implementation for the timer_call_enter*() entry points.
 * Computes a coalescing slop for the deadline (bounded below by any
 * caller-supplied leeway), handles already-past deadlines, and enqueues
 * the callout on the longterm queue when eligible, else on a per-cpu
 * queue chosen by timer_queue_assign(). Runs at splclock().
 *
 * Returns TRUE if the call was previously queued.
 */
static boolean_t
timer_call_enter_internal(
	timer_call_t            call,
	timer_call_param_t      param1,
	uint64_t                deadline,
	uint64_t                leeway,
	uint32_t                flags,
	boolean_t               ratelimited)
{
	mpqueue_head_t          *queue = NULL;
	mpqueue_head_t          *old_queue;
	spl_t                   s;
	uint64_t                slop;
	uint32_t                urgency;
	uint64_t                sdeadline, ttd;

	assert(call->tc_func != NULL);
	s = splclock();

	/* The soft deadline is the caller's requested time, pre-slop. */
	sdeadline = deadline;
	uint64_t ctime = mach_absolute_time();

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_ENTER | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    VM_KERNEL_ADDRHIDE(param1), deadline, flags, 0);

	urgency = (flags & TIMER_CALL_URGENCY_MASK);

	boolean_t slop_ratelimited = FALSE;
	slop = timer_call_slop(deadline, ctime, urgency, current_thread(), &slop_ratelimited);

	/* An explicit leeway can only widen the coalescing window. */
	if ((flags & TIMER_CALL_LEEWAY) != 0 && leeway > slop) {
		slop = leeway;
	}

	/* Saturate rather than overflow when adding the slop. */
	if (UINT64_MAX - deadline <= slop) {
		deadline = UINT64_MAX;
	} else {
		deadline += slop;
	}

	if (__improbable(deadline < ctime)) {
		/* Already expired: push slightly into the future and account it. */
		deadline = timer_call_past_deadline_timer_handle(deadline, ctime);
		sdeadline = deadline;
	}

	if (ratelimited || slop_ratelimited) {
		flags |= TIMER_CALL_RATELIMITED;
	} else {
		flags &= ~TIMER_CALL_RATELIMITED;
	}

	/* Time-to-deadline, measured from the soft deadline. */
	ttd =  sdeadline - ctime;
#if CONFIG_DTRACE
	DTRACE_TMR7(callout__create, timer_call_func_t, call->tc_func,
	    timer_call_param_t, call->tc_param0, uint32_t, flags,
	    (deadline - sdeadline),
	    (ttd >> 32), (unsigned) (ttd & 0xFFFFFFFF), call);
#endif

	/* Program timer callout parameters under the appropriate per-CPU or
	 * longterm queue lock. The callout may have been previously enqueued
	 * and in-flight on this or another timer queue.
	 */
	if (!ratelimited && !slop_ratelimited) {
		/* Returns NULL if the timer does not qualify as longterm. */
		queue = timer_longterm_enqueue_unlocked(call, ctime, deadline, &old_queue, sdeadline, ttd, param1, flags);
	}

	if (queue == NULL) {
		queue = timer_queue_assign(deadline);
		old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline, sdeadline, ttd, param1, flags);
	}

#if TIMER_TRACE
	call->tc_entry_time = ctime;
#endif

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_ENTER | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    (old_queue != NULL), deadline, queue->count, 0);

	splx(s);

	return old_queue != NULL;
}
743 
744 /*
745  * timer_call_*()
746  *	return boolean indicating whether the call was previously queued.
747  */
748 boolean_t
timer_call_enter(timer_call_t call,uint64_t deadline,uint32_t flags)749 timer_call_enter(
750 	timer_call_t            call,
751 	uint64_t                deadline,
752 	uint32_t                flags)
753 {
754 	return timer_call_enter_internal(call, NULL, deadline, 0, flags, FALSE);
755 }
756 
757 boolean_t
timer_call_enter1(timer_call_t call,timer_call_param_t param1,uint64_t deadline,uint32_t flags)758 timer_call_enter1(
759 	timer_call_t            call,
760 	timer_call_param_t      param1,
761 	uint64_t                deadline,
762 	uint32_t                flags)
763 {
764 	return timer_call_enter_internal(call, param1, deadline, 0, flags, FALSE);
765 }
766 
767 boolean_t
timer_call_enter_with_leeway(timer_call_t call,timer_call_param_t param1,uint64_t deadline,uint64_t leeway,uint32_t flags,boolean_t ratelimited)768 timer_call_enter_with_leeway(
769 	timer_call_t            call,
770 	timer_call_param_t      param1,
771 	uint64_t                deadline,
772 	uint64_t                leeway,
773 	uint32_t                flags,
774 	boolean_t               ratelimited)
775 {
776 	return timer_call_enter_internal(call, param1, deadline, leeway, flags, ratelimited);
777 }
778 
/*
 * Cancel a timer call: dequeue it (if queued) and, since removing it may
 * change the queue's minimum deadline, re-derive the queue's cached
 * earliest_soft_deadline and notify the hardware layer via
 * timer_queue_cancel(). Runs at splclock().
 *
 * Returns TRUE if the call was previously queued.
 */
boolean_t
timer_call_cancel(
	timer_call_t            call)
{
	mpqueue_head_t          *old_queue;
	spl_t                   s;

	s = splclock();

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_CANCEL | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    call->tc_pqlink.deadline, call->tc_soft_deadline, call->tc_flags, 0);

	old_queue = timer_call_dequeue_unlocked(call);

	if (old_queue != NULL) {
		timer_queue_lock_spin(old_queue);

		/* The new head (if any) determines the queue's next deadline. */
		timer_call_t new_head = priority_queue_min(&old_queue->mpq_pqhead, struct timer_call, tc_pqlink);

		if (new_head) {
			timer_queue_cancel(old_queue, call->tc_pqlink.deadline, new_head->tc_pqlink.deadline);
			old_queue->earliest_soft_deadline = new_head->tc_flags & TIMER_CALL_RATELIMITED ? new_head->tc_pqlink.deadline : new_head->tc_soft_deadline;
		} else {
			/* Queue is now empty. */
			timer_queue_cancel(old_queue, call->tc_pqlink.deadline, UINT64_MAX);
			old_queue->earliest_soft_deadline = UINT64_MAX;
		}

		timer_queue_unlock(old_queue);
	}
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_CANCEL | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    VM_KERNEL_UNSLIDE_OR_PERM(old_queue),
	    call->tc_pqlink.deadline - mach_absolute_time(),
	    call->tc_pqlink.deadline - call->tc_entry_time, 0);
	splx(s);

#if CONFIG_DTRACE
	DTRACE_TMR6(callout__cancel, timer_call_func_t, call->tc_func,
	    timer_call_param_t, call->tc_param0, uint32_t, call->tc_flags, 0,
	    (call->tc_ttd >> 32), (unsigned) (call->tc_ttd & 0xFFFFFFFF));
#endif /* CONFIG_DTRACE */

	return old_queue != NULL;
}
826 
/* Cumulative counters across all shutdowns (per-call counts are traced). */
static uint32_t timer_queue_shutdown_lock_skips;
static uint32_t timer_queue_shutdown_discarded;

/*
 * Drain a per-cpu timer queue (e.g. when its CPU goes offline): move every
 * timer onto new_queue, discarding TIMER_CALL_LOCAL timers. Entries whose
 * lock cannot be taken (lock-order inversion, case 2b) are async-dequeued
 * and the scan restarts. Runs at splclock().
 */
void
timer_queue_shutdown(
	__kdebug_only int target_cpu,
	mpqueue_head_t          *queue,
	mpqueue_head_t          *new_queue)
{
	DBG("timer_queue_shutdown(%p)\n", queue);

	__kdebug_only int ntimers_moved = 0, lock_skips = 0, shutdown_discarded = 0;

	spl_t s = splclock();

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    DECR_TIMER_SHUTDOWN | DBG_FUNC_START,
	    target_cpu,
	    queue->earliest_soft_deadline, 0,
	    0, 0);

	while (TRUE) {
		timer_queue_lock_spin(queue);

		timer_call_t call = qe_queue_first(&queue->head,
		    struct timer_call, tc_qlink);

		if (call == NULL) {
			/* Queue drained; exit with the queue lock still held. */
			break;
		}

		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
			/*
			 * case (2b) lock order inversion, dequeue and skip
			 * Don't change the call_entry queue back-pointer
			 * but set the async_dequeue field.
			 */
			lock_skips++;
			timer_queue_shutdown_lock_skips++;
			timer_call_entry_dequeue_async(call);
#if TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_async_dequeue,
			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
			    0x2b, 0);
#endif
			timer_queue_unlock(queue);
			continue;
		}

		boolean_t call_local = ((call->tc_flags & TIMER_CALL_LOCAL) != 0);

		/* remove entry from old queue */
		timer_call_entry_dequeue(call);
		timer_queue_unlock(queue);

		if (call_local == FALSE) {
			/* and queue it on new, discarding LOCAL timers */
			timer_queue_lock_spin(new_queue);
			timer_call_entry_enqueue_deadline(
				call, new_queue, call->tc_pqlink.deadline);
			timer_queue_unlock(new_queue);
			ntimers_moved++;
		} else {
			shutdown_discarded++;
			timer_queue_shutdown_discarded++;
		}

		/*
		 * NOTE(review): this assertion fires (DEBUG builds) whenever a
		 * LOCAL timer is discarded just above, even though the discard
		 * path keeps statistics for that case — presumably LOCAL timers
		 * are expected to be gone before shutdown reaches here, with the
		 * discard path as a release-build fallback; confirm.
		 */
		assert(call_local == FALSE);
		simple_unlock(&call->tc_lock);
	}

	/* The break above leaves the (now empty) queue locked. */
	timer_queue_unlock(queue);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    DECR_TIMER_SHUTDOWN | DBG_FUNC_END,
	    target_cpu, ntimers_moved, lock_skips, shutdown_discarded, 0);

	splx(s);
}
909 
910 
/* Count of expiry passes that skipped a timer due to tc_lock contention (debug stat). */
static uint32_t timer_queue_expire_lock_skips;
/*
 * timer_queue_expire_with_options:
 *
 * Fire expired timers on 'queue': any timer whose soft deadline is at or
 * before 'deadline' is dequeued and its function invoked with the queue
 * unlocked. Returns the deadline the caller should program next
 * (UINT64_MAX if the queue ends up empty).
 *
 * 'rescan' means look at every timer in the list, instead of
 * early-exiting when the head of the list expires in the future.
 * when 'rescan' is true, iterate by linked list instead of priority queue.
 *
 * TODO: if we keep a deadline ordered and soft-deadline ordered
 * priority queue, then it's no longer necessary to do that
 */
uint64_t
timer_queue_expire_with_options(
	mpqueue_head_t          *queue,
	uint64_t                deadline,
	boolean_t               rescan)
{
	timer_call_t    call = NULL;
	uint32_t tc_iterations = 0;
	DBG("timer_queue_expire(%p,)\n", queue);

	uint64_t cur_deadline = deadline;

	/* Force an early return if this time limit is hit. */
	const uint64_t time_limit = deadline + timer_scan_limit_abs;

	/* Next deadline if the time limit is hit */
	uint64_t time_limit_deadline = 0;

	timer_queue_lock_spin(queue);

	while (!queue_empty(&queue->head)) {
		/* The clock is only re-read from the second iteration on. */
		if (++tc_iterations > 1) {
			const uint64_t now = mach_absolute_time();

			/*
			 * Abort the scan if it's taking too long to avoid long
			 * periods with interrupts disabled.
			 * Scanning will restart after a short pause
			 * (timer_scan_interval_abs) if there's a hard deadline
			 * pending.
			 */
			if (rescan == FALSE && now > time_limit) {
				TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				    DECR_TIMER_PAUSE | DBG_FUNC_NONE,
				    queue->count, tc_iterations - 1,
				    0, 0, 0);

				counter_inc(&timer_scan_pauses_cnt);
				time_limit_deadline = now + timer_scan_interval_abs;
				break;
			}

			/*
			 * Upon processing one or more timer calls, refresh the
			 * deadline to account for time elapsed in the callout
			 */
			cur_deadline = now;
		}

		/* Pick the next candidate: soonest hard deadline normally, list head on rescan. */
		if (call == NULL) {
			if (rescan == FALSE) {
				call = priority_queue_min(&queue->mpq_pqhead, struct timer_call, tc_pqlink);
			} else {
				call = qe_queue_first(&queue->head, struct timer_call, tc_qlink);
			}
		}

		if (call->tc_soft_deadline <= cur_deadline) {
			timer_call_func_t               func;
			timer_call_param_t              param0, param1;

			TCOAL_DEBUG(0xDDDD0000, queue->earliest_soft_deadline, call->tc_soft_deadline, 0, 0, 0);
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_EXPIRE | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_soft_deadline,
			    call->tc_pqlink.deadline,
			    call->tc_entry_time, 0);

			/*
			 * A rate-limited timer only fires at its hard deadline;
			 * if that is still in the future, stop the normal scan
			 * (a rescan pass continues, to allow re-sorting below).
			 */
			if ((call->tc_flags & TIMER_CALL_RATELIMITED) &&
			    (call->tc_pqlink.deadline > cur_deadline)) {
				if (rescan == FALSE) {
					break;
				}
			}

			if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
				/* case (2b) lock inversion, dequeue and skip */
				timer_queue_expire_lock_skips++;
				timer_call_entry_dequeue_async(call);
				call = NULL;
				continue;
			}

			timer_call_entry_dequeue(call);

			/* Snapshot the callout before dropping the locks. */
			func = call->tc_func;
			param0 = call->tc_param0;
			param1 = call->tc_param1;

			/* The callout runs with both the timer and queue locks dropped. */
			simple_unlock(&call->tc_lock);
			timer_queue_unlock(queue);

			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_CALLOUT | DBG_FUNC_START,
			    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
			    VM_KERNEL_ADDRHIDE(param0),
			    VM_KERNEL_ADDRHIDE(param1),
			    0);

#if CONFIG_DTRACE
			DTRACE_TMR7(callout__start, timer_call_func_t, func,
			    timer_call_param_t, param0, unsigned, call->tc_flags,
			    0, (call->tc_ttd >> 32),
			    (unsigned) (call->tc_ttd & 0xFFFFFFFF), call);
#endif
			/* Maintain time-to-deadline in per-processor data
			 * structure for thread wakeup deadline statistics.
			 */
			uint64_t *ttdp = &current_processor()->timer_call_ttd;
			*ttdp = call->tc_ttd;
			(*func)(param0, param1);
			*ttdp = 0;
#if CONFIG_DTRACE
			DTRACE_TMR4(callout__end, timer_call_func_t, func,
			    param0, param1, call);
#endif

			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_CALLOUT | DBG_FUNC_END,
			    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
			    VM_KERNEL_ADDRHIDE(param0),
			    VM_KERNEL_ADDRHIDE(param1),
			    0);
			call = NULL;
			timer_queue_lock_spin(queue);
		} else {
			if (__probable(rescan == FALSE)) {
				break;
			} else {
				int64_t skew = call->tc_pqlink.deadline - call->tc_soft_deadline;
				assert(call->tc_pqlink.deadline >= call->tc_soft_deadline);

				/* DRK: On a latency quality-of-service level change,
				 * re-sort potentially rate-limited timers. The platform
				 * layer determines which timers require
				 * this. In the absence of the per-callout
				 * synchronization requirement, a global resort could
				 * be more efficient. The re-sort effectively
				 * annuls all timer adjustments, i.e. the "soft
				 * deadline" is the sort key.
				 */

				if (timer_resort_threshold(skew)) {
					if (__probable(simple_lock_try(&call->tc_lock, LCK_GRP_NULL))) {
						/* TODO: don't need to dequeue before enqueue */
						timer_call_entry_dequeue(call);
						timer_call_entry_enqueue_deadline(call, queue, call->tc_soft_deadline);
						simple_unlock(&call->tc_lock);
						call = NULL;
					}
				}
				if (call) {
					/* Rescan pass: advance to the next list entry. */
					call = qe_queue_next(&queue->head, call, struct timer_call, tc_qlink);

					if (call == NULL) {
						break;
					}
				}
			}
		}
	}

	/* Derive the next programmed deadline and refresh earliest_soft_deadline. */
	call = priority_queue_min(&queue->mpq_pqhead, struct timer_call, tc_pqlink);

	if (call) {
		/*
		 * Even if the time limit has been hit, it doesn't mean a hard
		 * deadline will be missed - the next hard deadline may be in
		 * future.
		 */
		if (time_limit_deadline > call->tc_pqlink.deadline) {
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_POSTPONE | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_pqlink.deadline,
			    time_limit_deadline,
			    0, 0);
			counter_inc(&timer_scan_postpones_cnt);
			cur_deadline = time_limit_deadline;
		} else {
			cur_deadline = call->tc_pqlink.deadline;
		}
		queue->earliest_soft_deadline = (call->tc_flags & TIMER_CALL_RATELIMITED) ? call->tc_pqlink.deadline: call->tc_soft_deadline;
	} else {
		queue->earliest_soft_deadline = cur_deadline = UINT64_MAX;
	}

	timer_queue_unlock(queue);

	return cur_deadline;
}
1112 
1113 uint64_t
timer_queue_expire(mpqueue_head_t * queue,uint64_t deadline)1114 timer_queue_expire(
1115 	mpqueue_head_t          *queue,
1116 	uint64_t                deadline)
1117 {
1118 	return timer_queue_expire_with_options(queue, deadline, FALSE);
1119 }
1120 
/* Count of migrate passes that skipped a timer due to tc_lock contention (debug stat). */
static uint32_t timer_queue_migrate_lock_skips;
/*
 * timer_queue_migrate() is called by timer_queue_migrate_cpu()
 * to move timer requests from the local processor (queue_from)
 * to a target processor's queue (queue_to).
 */
/*
 * Returns the number of timers migrated, or a code describing why no
 * migration was performed:
 *  -1: target queue empty
 *  -2: local queue empty
 *   0: local queue's first deadline is earlier than the target's
 *  -3: local queue holds a non-migratable TIMER_CALL_LOCAL timer
 *  -4: serverperfmode set, migrations avoided
 */
int
timer_queue_migrate(mpqueue_head_t *queue_from, mpqueue_head_t *queue_to)
{
	timer_call_t    call;
	timer_call_t    head_to;
	int             timers_migrated = 0;

	DBG("timer_queue_migrate(%p,%p)\n", queue_from, queue_to);

	assert(!ml_get_interrupts_enabled());
	assert(queue_from != queue_to);

	if (serverperfmode) {
		/*
		 * if we're running a high end server
		 * avoid migrations... they add latency
		 * and don't save us power under typical
		 * server workloads
		 */
		return -4;
	}

	/*
	 * Take both local (from) and target (to) timer queue locks while
	 * moving the timers from the local queue to the target processor.
	 * We assume that the target is always the boot processor.
	 * But only move if all of the following is true:
	 *  - the target queue is non-empty
	 *  - the local queue is non-empty
	 *  - the local queue's first deadline is later than the target's
	 *  - the local queue contains no non-migrateable "local" call
	 * so that we need not have the target resync.
	 */

	timer_queue_lock_spin(queue_to);

	head_to = priority_queue_min(&queue_to->mpq_pqhead, struct timer_call, tc_pqlink);

	if (head_to == NULL) {
		timers_migrated = -1;
		goto abort1;
	}

	timer_queue_lock_spin(queue_from);

	call = priority_queue_min(&queue_from->mpq_pqhead, struct timer_call, tc_pqlink);

	if (call == NULL) {
		timers_migrated = -2;
		goto abort2;
	}

	if (call->tc_pqlink.deadline < head_to->tc_pqlink.deadline) {
		timers_migrated = 0;
		goto abort2;
	}

	/* perform scan for non-migratable timers */
	qe_foreach_element(call, &queue_from->head, tc_qlink) {
		if (call->tc_flags & TIMER_CALL_LOCAL) {
			timers_migrated = -3;
			goto abort2;
		}
	}

	/* migration loop itself -- both queues are locked */
	qe_foreach_element_safe(call, &queue_from->head, tc_qlink) {
		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
			/* case (2b) lock order inversion, dequeue only */
#ifdef TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
			    0,
			    0x2b, 0);
#endif
			timer_queue_migrate_lock_skips++;
			timer_call_entry_dequeue_async(call);
			continue;
		}
		timer_call_entry_dequeue(call);
		timer_call_entry_enqueue_deadline(
			call, queue_to, call->tc_pqlink.deadline);
		timers_migrated++;
		simple_unlock(&call->tc_lock);
	}
	/* Source queue is drained of migratable timers. */
	queue_from->earliest_soft_deadline = UINT64_MAX;
abort2:
	timer_queue_unlock(queue_from);
abort1:
	timer_queue_unlock(queue_to);

	return timers_migrated;
}
1222 
1223 void
timer_queue_trace_cpu(int ncpu)1224 timer_queue_trace_cpu(int ncpu)
1225 {
1226 	timer_call_nosync_cpu(
1227 		ncpu,
1228 		(void (*)(void *))timer_queue_trace,
1229 		(void*) timer_queue_cpu(ncpu));
1230 }
1231 
1232 void
timer_queue_trace(mpqueue_head_t * queue)1233 timer_queue_trace(
1234 	mpqueue_head_t                  *queue)
1235 {
1236 	timer_call_t    call;
1237 	spl_t           s;
1238 
1239 	if (!kdebug_enable) {
1240 		return;
1241 	}
1242 
1243 	s = splclock();
1244 	timer_queue_lock_spin(queue);
1245 
1246 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1247 	    DECR_TIMER_QUEUE | DBG_FUNC_START,
1248 	    queue->count, mach_absolute_time(), 0, 0, 0);
1249 
1250 	qe_foreach_element(call, &queue->head, tc_qlink) {
1251 		TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1252 		    DECR_TIMER_QUEUE | DBG_FUNC_NONE,
1253 		    call->tc_soft_deadline,
1254 		    call->tc_pqlink.deadline,
1255 		    call->tc_entry_time,
1256 		    VM_KERNEL_UNSLIDE(call->tc_func),
1257 		    0);
1258 	}
1259 
1260 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1261 	    DECR_TIMER_QUEUE | DBG_FUNC_END,
1262 	    queue->count, mach_absolute_time(), 0, 0, 0);
1263 
1264 	timer_queue_unlock(queue);
1265 	splx(s);
1266 }
1267 
1268 void
timer_longterm_dequeued_locked(timer_call_t call)1269 timer_longterm_dequeued_locked(timer_call_t call)
1270 {
1271 	timer_longterm_t        *tlp = &timer_longterm;
1272 
1273 	tlp->dequeues++;
1274 	if (call == tlp->threshold.call) {
1275 		tlp->threshold.call = NULL;
1276 	}
1277 }
1278 
1279 /*
1280  * Place a timer call in the longterm list
1281  * and adjust the next timer callout deadline if the new timer is first.
1282  */
1283 mpqueue_head_t *
timer_longterm_enqueue_unlocked(timer_call_t call,uint64_t now,uint64_t deadline,mpqueue_head_t ** old_queue,uint64_t soft_deadline,uint64_t ttd,timer_call_param_t param1,uint32_t callout_flags)1284 timer_longterm_enqueue_unlocked(timer_call_t    call,
1285     uint64_t        now,
1286     uint64_t        deadline,
1287     mpqueue_head_t  **old_queue,
1288     uint64_t        soft_deadline,
1289     uint64_t        ttd,
1290     timer_call_param_t      param1,
1291     uint32_t        callout_flags)
1292 {
1293 	timer_longterm_t        *tlp = &timer_longterm;
1294 	boolean_t               update_required = FALSE;
1295 	uint64_t                longterm_threshold;
1296 
1297 	longterm_threshold = now + tlp->threshold.interval;
1298 
1299 	/*
1300 	 * Return NULL without doing anything if:
1301 	 *  - this timer is local, or
1302 	 *  - the longterm mechanism is disabled, or
1303 	 *  - this deadline is too short.
1304 	 */
1305 	if ((callout_flags & TIMER_CALL_LOCAL) != 0 ||
1306 	    (tlp->threshold.interval == TIMER_LONGTERM_NONE) ||
1307 	    (deadline <= longterm_threshold)) {
1308 		return NULL;
1309 	}
1310 
1311 	/*
1312 	 * Remove timer from its current queue, if any.
1313 	 */
1314 	*old_queue = timer_call_dequeue_unlocked(call);
1315 
1316 	/*
1317 	 * Lock the longterm queue, queue timer and determine
1318 	 * whether an update is necessary.
1319 	 */
1320 	assert(!ml_get_interrupts_enabled());
1321 	simple_lock(&call->tc_lock, LCK_GRP_NULL);
1322 	timer_queue_lock_spin(timer_longterm_queue);
1323 	call->tc_pqlink.deadline = deadline;
1324 	call->tc_param1 = param1;
1325 	call->tc_ttd = ttd;
1326 	call->tc_soft_deadline = soft_deadline;
1327 	call->tc_flags = callout_flags;
1328 	timer_call_entry_enqueue_tail(call, timer_longterm_queue);
1329 
1330 	tlp->enqueues++;
1331 
1332 	/*
1333 	 * We'll need to update the currently set threshold timer
1334 	 * if the new deadline is sooner and no sooner update is in flight.
1335 	 */
1336 	if (deadline < tlp->threshold.deadline &&
1337 	    deadline < tlp->threshold.preempted) {
1338 		tlp->threshold.preempted = deadline;
1339 		tlp->threshold.call = call;
1340 		update_required = TRUE;
1341 	}
1342 	timer_queue_unlock(timer_longterm_queue);
1343 	simple_unlock(&call->tc_lock);
1344 
1345 	if (update_required) {
1346 		/*
1347 		 * Note: this call expects that calling the master cpu
1348 		 * alone does not involve locking the topo lock.
1349 		 */
1350 		timer_call_nosync_cpu(
1351 			master_cpu,
1352 			(void (*)(void *))timer_longterm_update,
1353 			(void *)tlp);
1354 	}
1355 
1356 	return timer_longterm_queue;
1357 }
1358 
1359 /*
1360  * Scan for timers below the longterm threshold.
1361  * Move these to the local timer queue (of the boot processor on which the
1362  * calling thread is running).
1363  * Both the local (boot) queue and the longterm queue are locked.
1364  * The scan is similar to the timer migrate sequence but is performed by
1365  * successively examining each timer on the longterm queue:
1366  *  - if within the short-term threshold
1367  *    - enter on the local queue (unless being deleted),
1368  *  - otherwise:
1369  *    - if sooner, deadline becomes the next threshold deadline.
1370  * The total scan time is limited to TIMER_LONGTERM_SCAN_LIMIT. Should this be
1371  * exceeded, we abort and reschedule again so that we don't shut others from
1372  * the timer queues. Longterm timers firing late is not critical.
1373  */
1374 void
timer_longterm_scan(timer_longterm_t * tlp,uint64_t time_start)1375 timer_longterm_scan(timer_longterm_t    *tlp,
1376     uint64_t            time_start)
1377 {
1378 	timer_call_t    call;
1379 	uint64_t        threshold = TIMER_LONGTERM_NONE;
1380 	uint64_t        deadline;
1381 	uint64_t        time_limit = time_start + tlp->scan_limit;
1382 	mpqueue_head_t  *timer_master_queue;
1383 
1384 	assert(!ml_get_interrupts_enabled());
1385 	assert(cpu_number() == master_cpu);
1386 
1387 	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
1388 		threshold = time_start + tlp->threshold.interval;
1389 	}
1390 
1391 	tlp->threshold.deadline = TIMER_LONGTERM_NONE;
1392 	tlp->threshold.call = NULL;
1393 
1394 	if (queue_empty(&timer_longterm_queue->head)) {
1395 		return;
1396 	}
1397 
1398 	timer_master_queue = timer_queue_cpu(master_cpu);
1399 	timer_queue_lock_spin(timer_master_queue);
1400 
1401 	qe_foreach_element_safe(call, &timer_longterm_queue->head, tc_qlink) {
1402 		deadline = call->tc_soft_deadline;
1403 		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
1404 			/* case (2c) lock order inversion, dequeue only */
1405 #ifdef TIMER_ASSERT
1406 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1407 			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
1408 			    VM_KERNEL_UNSLIDE_OR_PERM(call),
1409 			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
1410 			    0,
1411 			    0x2c, 0);
1412 #endif
1413 			timer_call_entry_dequeue_async(call);
1414 			continue;
1415 		}
1416 		if (deadline < threshold) {
1417 			/*
1418 			 * This timer needs moving (escalating)
1419 			 * to the local (boot) processor's queue.
1420 			 */
1421 #ifdef TIMER_ASSERT
1422 			if (deadline < time_start) {
1423 				TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1424 				    DECR_TIMER_OVERDUE | DBG_FUNC_NONE,
1425 				    VM_KERNEL_UNSLIDE_OR_PERM(call),
1426 				    deadline,
1427 				    time_start,
1428 				    threshold,
1429 				    0);
1430 			}
1431 #endif
1432 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1433 			    DECR_TIMER_ESCALATE | DBG_FUNC_NONE,
1434 			    VM_KERNEL_UNSLIDE_OR_PERM(call),
1435 			    call->tc_pqlink.deadline,
1436 			    call->tc_entry_time,
1437 			    VM_KERNEL_UNSLIDE(call->tc_func),
1438 			    0);
1439 			tlp->escalates++;
1440 			timer_call_entry_dequeue(call);
1441 			timer_call_entry_enqueue_deadline(
1442 				call, timer_master_queue, call->tc_pqlink.deadline);
1443 			/*
1444 			 * A side-effect of the following call is to update
1445 			 * the actual hardware deadline if required.
1446 			 */
1447 			(void) timer_queue_assign(deadline);
1448 		} else {
1449 			if (deadline < tlp->threshold.deadline) {
1450 				tlp->threshold.deadline = deadline;
1451 				tlp->threshold.call = call;
1452 			}
1453 		}
1454 		simple_unlock(&call->tc_lock);
1455 
1456 		/* Abort scan if we're taking too long. */
1457 		if (mach_absolute_time() > time_limit) {
1458 			tlp->threshold.deadline = TIMER_LONGTERM_SCAN_AGAIN;
1459 			tlp->scan_pauses++;
1460 			DBG("timer_longterm_scan() paused %llu, qlen: %llu\n",
1461 			    time_limit, tlp->queue.count);
1462 			break;
1463 		}
1464 	}
1465 
1466 	timer_queue_unlock(timer_master_queue);
1467 }
1468 
1469 void
timer_longterm_callout(timer_call_param_t p0,__unused timer_call_param_t p1)1470 timer_longterm_callout(timer_call_param_t p0, __unused timer_call_param_t p1)
1471 {
1472 	timer_longterm_t        *tlp = (timer_longterm_t *) p0;
1473 
1474 	timer_longterm_update(tlp);
1475 }
1476 
/*
 * timer_longterm_update_locked:
 * Recompute the longterm threshold deadline; called on the master cpu
 * with the longterm queue locked (see timer_longterm_update()). If a
 * sooner ("preempted") deadline was posted, adopt it; otherwise rescan
 * the longterm queue. Also maintains a moving average of the threshold
 * timer's wakeup latency and clamps the next firing to at least
 * scan_interval from now.
 */
void
timer_longterm_update_locked(timer_longterm_t *tlp)
{
	uint64_t        latency;

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_UPDATE | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
	    tlp->threshold.deadline,
	    tlp->threshold.preempted,
	    tlp->queue.count, 0);

	tlp->scan_time = mach_absolute_time();
	if (tlp->threshold.preempted != TIMER_LONGTERM_NONE) {
		tlp->threshold.preempts++;
		tlp->threshold.deadline = tlp->threshold.preempted;
		tlp->threshold.preempted = TIMER_LONGTERM_NONE;
		/*
		 * Note: in the unlikely event that a pre-empted timer has
		 * itself been cancelled, we'll simply re-scan later at the
		 * time of the preempted/cancelled timer.
		 */
	} else {
		tlp->threshold.scans++;

		/*
		 * Maintain a moving average of our wakeup latency.
		 * Clamp latency to 0 and ignore above threshold interval.
		 */
		if (tlp->scan_time > tlp->threshold.deadline_set) {
			latency = tlp->scan_time - tlp->threshold.deadline_set;
		} else {
			latency = 0;
		}
		if (latency < tlp->threshold.interval) {
			tlp->threshold.latency_min =
			    MIN(tlp->threshold.latency_min, latency);
			tlp->threshold.latency_max =
			    MAX(tlp->threshold.latency_max, latency);
			/* Exponential moving average, 1% weight on the new sample. */
			tlp->threshold.latency =
			    (tlp->threshold.latency * 99 + latency) / 100;
		}

		timer_longterm_scan(tlp, tlp->scan_time);
	}

	tlp->threshold.deadline_set = tlp->threshold.deadline;
	/* The next deadline timer to be set is adjusted */
	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE &&
	    tlp->threshold.deadline != TIMER_LONGTERM_SCAN_AGAIN) {
		/* Fire early by margin + average latency so escalation is timely. */
		tlp->threshold.deadline_set -= tlp->threshold.margin;
		tlp->threshold.deadline_set -= tlp->threshold.latency;
	}

	/* Throttle next scan time */
	uint64_t scan_clamp = mach_absolute_time() + tlp->scan_interval;
	if (tlp->threshold.deadline_set < scan_clamp) {
		tlp->threshold.deadline_set = scan_clamp;
	}

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_UPDATE | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
	    tlp->threshold.deadline,
	    tlp->threshold.scans,
	    tlp->queue.count, 0);
}
1544 
1545 void
timer_longterm_update(timer_longterm_t * tlp)1546 timer_longterm_update(timer_longterm_t *tlp)
1547 {
1548 	spl_t   s = splclock();
1549 
1550 	timer_queue_lock_spin(timer_longterm_queue);
1551 
1552 	if (cpu_number() != master_cpu) {
1553 		panic("timer_longterm_update_master() on non-boot cpu");
1554 	}
1555 
1556 	timer_longterm_update_locked(tlp);
1557 
1558 	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
1559 		timer_call_enter(
1560 			&tlp->threshold.timer,
1561 			tlp->threshold.deadline_set,
1562 			TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
1563 	}
1564 
1565 	timer_queue_unlock(timer_longterm_queue);
1566 	splx(s);
1567 }
1568 
1569 void
timer_longterm_init(void)1570 timer_longterm_init(void)
1571 {
1572 	uint32_t                longterm;
1573 	timer_longterm_t        *tlp = &timer_longterm;
1574 
1575 	DBG("timer_longterm_init() tlp: %p, queue: %p\n", tlp, &tlp->queue);
1576 
1577 	/*
1578 	 * Set the longterm timer threshold. Defaults to TIMER_LONGTERM_THRESHOLD
1579 	 * or TIMER_LONGTERM_NONE (disabled) for server;
1580 	 * overridden longterm boot-arg
1581 	 */
1582 	tlp->threshold.interval = serverperfmode ? TIMER_LONGTERM_NONE
1583 	    : TIMER_LONGTERM_THRESHOLD;
1584 	if (PE_parse_boot_argn("longterm", &longterm, sizeof(longterm))) {
1585 		tlp->threshold.interval = (longterm == 0) ?
1586 		    TIMER_LONGTERM_NONE :
1587 		    longterm * NSEC_PER_MSEC;
1588 	}
1589 	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
1590 		printf("Longterm timer threshold: %llu ms\n",
1591 		    tlp->threshold.interval / NSEC_PER_MSEC);
1592 		kprintf("Longterm timer threshold: %llu ms\n",
1593 		    tlp->threshold.interval / NSEC_PER_MSEC);
1594 		nanoseconds_to_absolutetime(tlp->threshold.interval,
1595 		    &tlp->threshold.interval);
1596 		tlp->threshold.margin = tlp->threshold.interval / 10;
1597 		tlp->threshold.latency_min = EndOfAllTime;
1598 		tlp->threshold.latency_max = 0;
1599 	}
1600 
1601 	tlp->threshold.preempted = TIMER_LONGTERM_NONE;
1602 	tlp->threshold.deadline = TIMER_LONGTERM_NONE;
1603 
1604 	mpqueue_init(&tlp->queue, &timer_longterm_lck_grp, LCK_ATTR_NULL);
1605 
1606 	timer_call_setup(&tlp->threshold.timer,
1607 	    timer_longterm_callout, (timer_call_param_t) tlp);
1608 
1609 	timer_longterm_queue = &tlp->queue;
1610 }
1611 
/* Selector values used by timer_sysctl_get()/timer_sysctl_set() to pick
 * which timer statistic or tunable a sysctl reads or writes. */
enum {
	THRESHOLD, QCOUNT,
	ENQUEUES, DEQUEUES, ESCALATES, SCANS, PREEMPTS,
	LATENCY, LATENCY_MIN, LATENCY_MAX, LONG_TERM_SCAN_LIMIT,
	LONG_TERM_SCAN_INTERVAL, LONG_TERM_SCAN_PAUSES,
	SCAN_LIMIT, SCAN_INTERVAL, SCAN_PAUSES, SCAN_POSTPONES,
};
1619 uint64_t
timer_sysctl_get(int oid)1620 timer_sysctl_get(int oid)
1621 {
1622 	timer_longterm_t        *tlp = &timer_longterm;
1623 
1624 	switch (oid) {
1625 	case THRESHOLD:
1626 		return (tlp->threshold.interval == TIMER_LONGTERM_NONE) ?
1627 		       0 : tlp->threshold.interval / NSEC_PER_MSEC;
1628 	case QCOUNT:
1629 		return tlp->queue.count;
1630 	case ENQUEUES:
1631 		return tlp->enqueues;
1632 	case DEQUEUES:
1633 		return tlp->dequeues;
1634 	case ESCALATES:
1635 		return tlp->escalates;
1636 	case SCANS:
1637 		return tlp->threshold.scans;
1638 	case PREEMPTS:
1639 		return tlp->threshold.preempts;
1640 	case LATENCY:
1641 		return tlp->threshold.latency;
1642 	case LATENCY_MIN:
1643 		return tlp->threshold.latency_min;
1644 	case LATENCY_MAX:
1645 		return tlp->threshold.latency_max;
1646 	case LONG_TERM_SCAN_LIMIT:
1647 		return tlp->scan_limit;
1648 	case LONG_TERM_SCAN_INTERVAL:
1649 		return tlp->scan_interval;
1650 	case LONG_TERM_SCAN_PAUSES:
1651 		return tlp->scan_pauses;
1652 	case SCAN_LIMIT:
1653 		return timer_scan_limit_us * NSEC_PER_USEC;
1654 	case SCAN_INTERVAL:
1655 		return timer_scan_interval_us * NSEC_PER_USEC;
1656 	case SCAN_PAUSES:
1657 		return counter_load(&timer_scan_pauses_cnt);
1658 	case SCAN_POSTPONES:
1659 		return counter_load(&timer_scan_postpones_cnt);
1660 
1661 	default:
1662 		return 0;
1663 	}
1664 }
1665 
1666 /*
1667  * timer_master_scan() is the inverse of timer_longterm_scan()
1668  * since it un-escalates timers to the longterm queue.
1669  */
1670 static void
timer_master_scan(timer_longterm_t * tlp,uint64_t now)1671 timer_master_scan(timer_longterm_t      *tlp,
1672     uint64_t              now)
1673 {
1674 	timer_call_t    call;
1675 	uint64_t        threshold;
1676 	uint64_t        deadline;
1677 	mpqueue_head_t  *timer_master_queue;
1678 
1679 	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
1680 		threshold = now + tlp->threshold.interval;
1681 	} else {
1682 		threshold = TIMER_LONGTERM_NONE;
1683 	}
1684 
1685 	timer_master_queue = timer_queue_cpu(master_cpu);
1686 	timer_queue_lock_spin(timer_master_queue);
1687 
1688 	qe_foreach_element_safe(call, &timer_master_queue->head, tc_qlink) {
1689 		deadline = call->tc_pqlink.deadline;
1690 		if ((call->tc_flags & TIMER_CALL_LOCAL) != 0) {
1691 			continue;
1692 		}
1693 		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
1694 			/* case (2c) lock order inversion, dequeue only */
1695 			timer_call_entry_dequeue_async(call);
1696 			continue;
1697 		}
1698 		if (deadline > threshold) {
1699 			/* move from master to longterm */
1700 			timer_call_entry_dequeue(call);
1701 			timer_call_entry_enqueue_tail(call, timer_longterm_queue);
1702 			if (deadline < tlp->threshold.deadline) {
1703 				tlp->threshold.deadline = deadline;
1704 				tlp->threshold.call = call;
1705 			}
1706 		}
1707 		simple_unlock(&call->tc_lock);
1708 	}
1709 	timer_queue_unlock(timer_master_queue);
1710 }
1711 
/*
 * timer_sysctl_set_threshold:
 * Runs on the master cpu (dispatched via timer_call_cpu() from
 * timer_sysctl_set()) to apply a new longterm threshold of 'valp'
 * milliseconds (0 disables the longterm mechanism). Rescans the
 * appropriate queue for the new threshold, re-arms the threshold
 * timer, and resets the longterm statistics.
 */
static void
timer_sysctl_set_threshold(void* valp)
{
	uint64_t value =        (uint64_t)valp;
	timer_longterm_t        *tlp = &timer_longterm;
	spl_t                   s = splclock();
	boolean_t               threshold_increase;

	timer_queue_lock_spin(timer_longterm_queue);

	timer_call_cancel(&tlp->threshold.timer);

	/*
	 * Set the new threshold and note whether it's increasing.
	 */
	if (value == 0) {
		tlp->threshold.interval = TIMER_LONGTERM_NONE;
		threshold_increase = TRUE;
		/* NOTE(review): the timer was already cancelled above;
		 * this second cancel is redundant but harmless. */
		timer_call_cancel(&tlp->threshold.timer);
	} else {
		uint64_t        old_interval = tlp->threshold.interval;
		tlp->threshold.interval = value * NSEC_PER_MSEC;
		nanoseconds_to_absolutetime(tlp->threshold.interval,
		    &tlp->threshold.interval);
		tlp->threshold.margin = tlp->threshold.interval / 10;
		if (old_interval == TIMER_LONGTERM_NONE) {
			threshold_increase = FALSE;
		} else {
			threshold_increase = (tlp->threshold.interval > old_interval);
		}
	}

	if (threshold_increase /* or removal */) {
		/* Escalate timers from the longterm queue */
		timer_longterm_scan(tlp, mach_absolute_time());
	} else { /* decrease or addition  */
		/*
		 * We scan the local/master queue for timers now longterm.
		 * To be strictly correct, we should scan all processor queues
		 * but timer migration results in most timers gravitating to the
		 * master processor in any case.
		 */
		timer_master_scan(tlp, mach_absolute_time());
	}

	/* Set new timer accordingly */
	tlp->threshold.deadline_set = tlp->threshold.deadline;
	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
		/* Fire early by margin + average latency, as in timer_longterm_update_locked(). */
		tlp->threshold.deadline_set -= tlp->threshold.margin;
		tlp->threshold.deadline_set -= tlp->threshold.latency;
		timer_call_enter(
			&tlp->threshold.timer,
			tlp->threshold.deadline_set,
			TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
	}

	/* Reset stats */
	tlp->enqueues = 0;
	tlp->dequeues = 0;
	tlp->escalates = 0;
	tlp->scan_pauses = 0;
	tlp->threshold.scans = 0;
	tlp->threshold.preempts = 0;
	tlp->threshold.latency = 0;
	tlp->threshold.latency_min = EndOfAllTime;
	tlp->threshold.latency_max = 0;

	timer_queue_unlock(timer_longterm_queue);
	splx(s);
}
1782 
1783 int
timer_sysctl_set(__unused int oid,__unused uint64_t value)1784 timer_sysctl_set(__unused int oid, __unused uint64_t value)
1785 {
1786 	if (support_bootcpu_shutdown) {
1787 		return KERN_NOT_SUPPORTED;
1788 	}
1789 
1790 	switch (oid) {
1791 	case THRESHOLD:
1792 		timer_call_cpu(
1793 			master_cpu,
1794 			timer_sysctl_set_threshold,
1795 			(void *) value);
1796 		return KERN_SUCCESS;
1797 	case LONG_TERM_SCAN_LIMIT:
1798 		timer_longterm.scan_limit = value;
1799 		return KERN_SUCCESS;
1800 	case LONG_TERM_SCAN_INTERVAL:
1801 		timer_longterm.scan_interval = value;
1802 		return KERN_SUCCESS;
1803 	case SCAN_LIMIT:
1804 		if (value > MAX_TIMER_SCAN_LIMIT ||
1805 		    value < MIN_TIMER_SCAN_LIMIT) {
1806 			return KERN_INVALID_ARGUMENT;
1807 		}
1808 		timer_scan_limit_us = value / NSEC_PER_USEC;
1809 		nanoseconds_to_absolutetime(timer_scan_limit_us * NSEC_PER_USEC,
1810 		    &timer_scan_limit_abs);
1811 		return KERN_SUCCESS;
1812 	case SCAN_INTERVAL:
1813 		if (value > MAX_TIMER_SCAN_INTERVAL ||
1814 		    value < MIN_TIMER_SCAN_INTERVAL) {
1815 			return KERN_INVALID_ARGUMENT;
1816 		}
1817 		timer_scan_interval_us = value / NSEC_PER_USEC;
1818 		nanoseconds_to_absolutetime(timer_scan_interval_us * NSEC_PER_USEC,
1819 		    &timer_scan_interval_abs);
1820 		return KERN_SUCCESS;
1821 	default:
1822 		return KERN_INVALID_ARGUMENT;
1823 	}
1824 }
1825 
1826 
1827 /* Select timer coalescing window based on per-task quality-of-service hints */
1828 static boolean_t
tcoal_qos_adjust(thread_t t,int32_t * tshift,uint64_t * tmax_abstime,boolean_t * pratelimited)1829 tcoal_qos_adjust(thread_t t, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited)
1830 {
1831 	uint32_t latency_qos;
1832 	boolean_t adjusted = FALSE;
1833 	task_t ctask = get_threadtask(t);
1834 
1835 	if (ctask) {
1836 		latency_qos = proc_get_effective_thread_policy(t, TASK_POLICY_LATENCY_QOS);
1837 
1838 		assert(latency_qos <= NUM_LATENCY_QOS_TIERS);
1839 
1840 		if (latency_qos) {
1841 			*tshift = tcoal_prio_params.latency_qos_scale[latency_qos - 1];
1842 			*tmax_abstime = tcoal_prio_params.latency_qos_abstime_max[latency_qos - 1];
1843 			*pratelimited = tcoal_prio_params.latency_tier_rate_limited[latency_qos - 1];
1844 			adjusted = TRUE;
1845 		}
1846 	}
1847 	return adjusted;
1848 }
1849 
1850 
/* Adjust timer deadlines based on priority of the thread and the
 * urgency value provided at timeout establishment. With this mechanism,
 * timers are no longer necessarily sorted in order of soft deadline
 * on a given timer queue, i.e. they may be differentially skewed.
 * In the current scheme, this could lead to fewer pending timers
 * processed than is technically possible when the HW deadline arrives.
 *
 * Outputs: *tshift (signed shift applied to the remaining time by
 * timer_call_slop()), *tmax_abstime (cap on the resulting leeway, in
 * abstime units) and *pratelimited (whether the chosen QoS tier is
 * rate-limited).  The branch order below establishes precedence
 * (critical > background > kernel > fixed > QoS > timeshare); do not
 * reorder.
 */
static void
timer_compute_leeway(thread_t cthread, int32_t urgency, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited)
{
	int16_t tpri = cthread->sched_pri;
	if ((urgency & TIMER_CALL_USER_MASK) != 0) {
		/* User-urgency timer: classify by thread/urgency criticality. */
		bool tg_critical = false;
#if CONFIG_THREAD_GROUPS
		/* Thread groups marked critical/strict also force the RT window. */
		uint32_t tg_flags = thread_group_get_flags(thread_group_get(cthread));
		tg_critical = tg_flags & (THREAD_GROUP_FLAGS_CRITICAL | THREAD_GROUP_FLAGS_STRICT_TIMERS);
#endif /* CONFIG_THREAD_GROUPS */
		bool timer_critical = (tpri >= BASEPRI_RTQUEUES) ||
		    (urgency == TIMER_CALL_USER_CRITICAL) ||
		    tg_critical;
		if (timer_critical) {
			/* Realtime/critical: tightest coalescing window. */
			*tshift = tcoal_prio_params.timer_coalesce_rt_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_rt_abstime_max;
			TCOAL_PRIO_STAT(rt_tcl);
		} else if (proc_get_effective_thread_policy(cthread, TASK_POLICY_DARWIN_BG) ||
		    (urgency == TIMER_CALL_USER_BACKGROUND)) {
			/* Determine if timer should be subjected to a lower QoS */
			if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
				if (*tmax_abstime > tcoal_prio_params.timer_coalesce_bg_abstime_max) {
					/* QoS tier permits a wider window than BG: keep it. */
					return;
				} else {
					/* BG window dominates; clear the QoS rate limit. */
					*pratelimited = FALSE;
				}
			}
			*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
			TCOAL_PRIO_STAT(bg_tcl);
		} else if (tpri >= MINPRI_KERNEL) {
			/* Kernel-priority thread. */
			*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
			TCOAL_PRIO_STAT(kt_tcl);
		} else if (cthread->sched_mode == TH_MODE_FIXED) {
			/* Fixed-priority (non-kernel) thread. */
			*tshift = tcoal_prio_params.timer_coalesce_fp_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_fp_abstime_max;
			TCOAL_PRIO_STAT(fp_tcl);
		} else if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
			/* Per-task latency-QoS tier supplied the window. */
			TCOAL_PRIO_STAT(qos_tcl);
		} else if (cthread->sched_mode == TH_MODE_TIMESHARE) {
			/* Default timeshare thread. */
			*tshift = tcoal_prio_params.timer_coalesce_ts_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_ts_abstime_max;
			TCOAL_PRIO_STAT(ts_tcl);
		} else {
			/* No class matched: outputs left at caller's defaults. */
			TCOAL_PRIO_STAT(nc_tcl);
		}
	} else if (urgency == TIMER_CALL_SYS_BACKGROUND) {
		/* System background timers share the BG window. */
		*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
		*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
		TCOAL_PRIO_STAT(bg_tcl);
	} else {
		/* All other system-urgency timers use the kernel-thread window. */
		*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
		*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
		TCOAL_PRIO_STAT(kt_tcl);
	}
}
1915 
1916 
1917 int timer_user_idle_level;
1918 
1919 uint64_t
timer_call_slop(uint64_t deadline,uint64_t now,uint32_t flags,thread_t cthread,boolean_t * pratelimited)1920 timer_call_slop(uint64_t deadline, uint64_t now, uint32_t flags, thread_t cthread, boolean_t *pratelimited)
1921 {
1922 	int32_t tcs_shift = 0;
1923 	uint64_t tcs_max_abstime = 0;
1924 	uint64_t adjval;
1925 	uint32_t urgency = (flags & TIMER_CALL_URGENCY_MASK);
1926 
1927 	if (mach_timer_coalescing_enabled &&
1928 	    (deadline > now) && (urgency != TIMER_CALL_SYS_CRITICAL)) {
1929 		timer_compute_leeway(cthread, urgency, &tcs_shift, &tcs_max_abstime, pratelimited);
1930 
1931 		if (tcs_shift >= 0) {
1932 			adjval =  MIN((deadline - now) >> tcs_shift, tcs_max_abstime);
1933 		} else {
1934 			adjval =  MIN((deadline - now) << (-tcs_shift), tcs_max_abstime);
1935 		}
1936 		/* Apply adjustments derived from "user idle level" heuristic */
1937 		adjval += (adjval * timer_user_idle_level) >> 7;
1938 		return adjval;
1939 	} else {
1940 		return 0;
1941 	}
1942 }
1943 
/* Return the current user idle level (0..128); see timer_user_idle_level. */
int
timer_get_user_idle_level(void)
{
	return timer_user_idle_level;
}
1949 
1950 kern_return_t
timer_set_user_idle_level(int ilevel)1951 timer_set_user_idle_level(int ilevel)
1952 {
1953 	boolean_t do_reeval = FALSE;
1954 
1955 	if ((ilevel < 0) || (ilevel > 128)) {
1956 		return KERN_INVALID_ARGUMENT;
1957 	}
1958 
1959 	if (ilevel < timer_user_idle_level) {
1960 		do_reeval = TRUE;
1961 	}
1962 
1963 	timer_user_idle_level = ilevel;
1964 
1965 	if (do_reeval) {
1966 		ml_timer_evaluate();
1967 	}
1968 
1969 	return KERN_SUCCESS;
1970 }
1971 
#pragma mark - running timers

/*
 * Flags value reported in trace events for running timers (see
 * running_timer_setup()), standing in for per-call flags that running
 * timers do not carry.
 */
#define RUNNING_TIMER_FAKE_FLAGS (TIMER_CALL_SYS_CRITICAL | \
    TIMER_CALL_LOCAL)

/*
 * timer_call_trace_* functions mimic the tracing behavior from the normal
 * timer_call subsystem, so tools continue to function.
 */
1981 
/*
 * Emit the kdebug ENTER start event and DTrace callout__create probe for a
 * running timer about to be (re)armed.
 *
 * NOTE(review): this function also emits DECR_TIMER_ENTER | DBG_FUNC_END,
 * and timer_call_trace_enter_after() emits an identical END event again
 * after the deadline is stored — confirm the duplicate END is intentional.
 */
static void
timer_call_trace_enter_before(struct timer_call *call, uint64_t deadline,
    uint32_t flags, uint64_t now)
{
#pragma unused(call, deadline, flags, now)
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_ADDRHIDE(call->tc_param1),
	    deadline, flags, 0);
#if CONFIG_DTRACE
	/* ttd: time-to-deadline, split across two 32-bit probe arguments. */
	uint64_t ttd = deadline - now;
	DTRACE_TMR7(callout__create, timer_call_func_t, call->tc_func,
	    timer_call_param_t, call->tc_param0, uint32_t, flags, 0,
	    (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF), NULL);
#endif /* CONFIG_DTRACE */
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline, 0, 0);
}
1999 
/*
 * Emit the kdebug ENTER end event once the running timer's new deadline
 * has been stored (paired with timer_call_trace_enter_before()).
 */
static void
timer_call_trace_enter_after(struct timer_call *call, uint64_t deadline)
{
#pragma unused(call, deadline)
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline, 0, 0);
}
2007 
/*
 * Emit kdebug CANCEL start/end events and the DTrace callout__cancel probe
 * for a running timer being cleared.
 *
 * NOTE(review): running_timer_clear() overwrites tc_pqlink.deadline with
 * EndOfAllTime (and zeroes tc_entry_time) *before* calling this, so the
 * deltas traced below are computed from the cleared values and may wrap —
 * confirm this matches tooling expectations.
 */
static void
timer_call_trace_cancel(struct timer_call *call)
{
#pragma unused(call)
	__unused uint64_t deadline = call->tc_pqlink.deadline;
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CANCEL | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), deadline, 0,
	    call->tc_flags, 0);
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CANCEL | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline - mach_absolute_time(),
	    deadline - call->tc_entry_time, 0);
#if CONFIG_DTRACE
#if TIMER_TRACE
	uint64_t ttd = deadline - call->tc_entry_time;
#else
	/* Without TIMER_TRACE there is no entry timestamp to subtract. */
	uint64_t ttd = UINT64_MAX;
#endif /* TIMER_TRACE */
	DTRACE_TMR6(callout__cancel, timer_call_func_t, call->tc_func,
	    timer_call_param_t, call->tc_param0, uint32_t, call->tc_flags, 0,
	    (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF));
#endif /* CONFIG_DTRACE */
}
2030 
/*
 * Emit the kdebug CALLOUT start event and DTrace callout__start probe just
 * before a running timer's handler is invoked (see running_timers_expire()).
 */
static void
timer_call_trace_expire_entry(struct timer_call *call)
{
#pragma unused(call)
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CALLOUT | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(call->tc_func),
	    VM_KERNEL_ADDRHIDE(call->tc_param0),
	    VM_KERNEL_ADDRHIDE(call->tc_param1),
	    0);
#if CONFIG_DTRACE
#if TIMER_TRACE
	/* ttd: the originally-requested time-to-deadline for this arming. */
	uint64_t ttd = call->tc_pqlink.deadline - call->tc_entry_time;
#else /* TIMER_TRACE */
	uint64_t ttd = UINT64_MAX;
#endif /* TIMER_TRACE */
	DTRACE_TMR7(callout__start, timer_call_func_t, call->tc_func,
	    timer_call_param_t, call->tc_param0, unsigned, call->tc_flags,
	    0, (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF), NULL);
#endif /* CONFIG_DTRACE */
}
2051 
/*
 * Emit the DTrace callout__end probe and kdebug CALLOUT end event after a
 * running timer's handler returns (paired with
 * timer_call_trace_expire_entry()).
 */
static void
timer_call_trace_expire_return(struct timer_call *call)
{
#pragma unused(call)
#if CONFIG_DTRACE
	DTRACE_TMR4(callout__end, timer_call_func_t, call->tc_func,
	    call->tc_param0, call->tc_param1, NULL);
#endif /* CONFIG_DTRACE */
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CALLOUT | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    VM_KERNEL_UNSLIDE(call->tc_func),
	    VM_KERNEL_ADDRHIDE(call->tc_param0),
	    VM_KERNEL_ADDRHIDE(call->tc_param1),
	    0);
}
2067 
/*
 * Set a new deadline for a running timer on this processor.
 *
 * Must be called with interrupts disabled.  Only the deadline, entry time
 * (when TIMER_TRACE) and tc_param1 are written here; tc_func/tc_param0 are
 * assumed to have been initialized elsewhere — TODO confirm.  The hardware
 * deadline is NOT re-evaluated: callers must follow up with
 * running_timers_sync() (running_timer_enter() does both).
 */
void
running_timer_setup(processor_t processor, enum running_timer timer,
    void *param, uint64_t deadline, uint64_t now)
{
	assert(timer < RUNNING_TIMER_MAX);
	assert(ml_get_interrupts_enabled() == FALSE);

	struct timer_call *call = &processor->running_timers[timer];

	timer_call_trace_enter_before(call, deadline, RUNNING_TIMER_FAKE_FLAGS,
	    now);

	if (__improbable(deadline < now)) {
		/* Deadline already passed: substitute a near-future deadline. */
		deadline = timer_call_past_deadline_timer_handle(deadline, now);
	}

	call->tc_pqlink.deadline = deadline;
#if TIMER_TRACE
	call->tc_entry_time = now;
#endif /* TIMER_TRACE */
	call->tc_param1 = param;

	timer_call_trace_enter_after(call, deadline);
}
2095 
/*
 * Propagate running-timer deadline changes to the hardware timer by
 * resyncing decrementer deadlines.  Call after mutating any running
 * timer's deadline (see running_timer_enter()/running_timer_cancel()).
 */
void
running_timers_sync(void)
{
	timer_resync_deadlines();
}
2101 
/*
 * Arm a running timer and immediately resync hardware deadlines.
 * Convenience wrapper: running_timer_setup() followed by
 * running_timers_sync().  Same interrupts-disabled requirement as
 * running_timer_setup(); `timer` is range-checked there.
 */
void
running_timer_enter(processor_t processor, unsigned int timer,
    void *param, uint64_t deadline, uint64_t now)
{
	running_timer_setup(processor, timer, param, deadline, now);
	running_timers_sync();
}
2109 
2110 /*
2111  * Call the callback for any running timers that fired for this processor.
2112  * Returns true if any timers were past their deadline.
2113  */
2114 bool
running_timers_expire(processor_t processor,uint64_t now)2115 running_timers_expire(processor_t processor, uint64_t now)
2116 {
2117 	bool expired = false;
2118 
2119 	if (!processor->running_timers_active) {
2120 		return expired;
2121 	}
2122 
2123 	for (int i = 0; i < RUNNING_TIMER_MAX; i++) {
2124 		struct timer_call *call = &processor->running_timers[i];
2125 
2126 		uint64_t deadline = call->tc_pqlink.deadline;
2127 		if (deadline > now) {
2128 			continue;
2129 		}
2130 
2131 		expired = true;
2132 		timer_call_trace_expire_entry(call);
2133 		call->tc_func(call->tc_param0, call->tc_param1);
2134 		timer_call_trace_expire_return(call);
2135 	}
2136 
2137 	return expired;
2138 }
2139 
/*
 * Disarm a running timer by setting its deadline to EndOfAllTime.
 * No-op if the timer is already cleared.  Does NOT resync hardware
 * deadlines; use running_timer_cancel() for that.
 *
 * NOTE(review): timer_call_trace_cancel() is invoked *after* the deadline
 * and entry time are overwritten, so the cancel trace records the cleared
 * values rather than the original deadline — confirm intended.
 */
void
running_timer_clear(processor_t processor, enum running_timer timer)
{
	struct timer_call *call = &processor->running_timers[timer];
	uint64_t deadline = call->tc_pqlink.deadline;
	if (deadline == EndOfAllTime) {
		/* Already disarmed. */
		return;
	}

	call->tc_pqlink.deadline = EndOfAllTime;
#if TIMER_TRACE
	call->tc_entry_time = 0;
#endif /* TIMER_TRACE */
	timer_call_trace_cancel(call);
}
2155 
/*
 * Disarm a running timer and resync hardware deadlines.
 * Convenience wrapper: running_timer_clear() followed by
 * running_timers_sync().
 */
void
running_timer_cancel(processor_t processor, unsigned int timer)
{
	running_timer_clear(processor, timer);
	running_timers_sync();
}
2162 
2163 uint64_t
running_timers_deadline(processor_t processor)2164 running_timers_deadline(processor_t processor)
2165 {
2166 	if (!processor->running_timers_active) {
2167 		return EndOfAllTime;
2168 	}
2169 
2170 	uint64_t deadline = EndOfAllTime;
2171 	for (int i = 0; i < RUNNING_TIMER_MAX; i++) {
2172 		uint64_t candidate =
2173 		    processor->running_timers[i].tc_pqlink.deadline;
2174 		if (candidate != 0 && candidate < deadline) {
2175 			deadline = candidate;
2176 		}
2177 	}
2178 
2179 	return deadline;
2180 }
2181 
/*
 * Enable this processor's running timers and resync hardware deadlines so
 * their deadlines are considered again (see running_timers_deadline()).
 */
void
running_timers_activate(processor_t processor)
{
	processor->running_timers_active = true;
	running_timers_sync();
}
2188 
/*
 * Disable this processor's running timers (they must currently be active)
 * and resync hardware deadlines so they are no longer considered.
 */
void
running_timers_deactivate(processor_t processor)
{
	assert(processor->running_timers_active == true);
	processor->running_timers_active = false;
	running_timers_sync();
}
2196