1 /*
2  * Copyright (c) 1993-2008 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Timer interrupt callout module.
30  */
31 
32 #include <mach/mach_types.h>
33 
34 #include <kern/clock.h>
35 #include <kern/counter.h>
36 #include <kern/smp.h>
37 #include <kern/processor.h>
38 #include <kern/timer_call.h>
39 #include <kern/timer_queue.h>
40 #include <kern/thread.h>
41 #include <kern/thread_group.h>
42 #include <kern/policy_internal.h>
43 
44 #include <sys/kdebug.h>
45 
46 #if CONFIG_DTRACE
47 #include <mach/sdt.h>
48 #endif
49 
50 
51 #if DEBUG
52 #define TIMER_ASSERT    1
53 #endif
54 
55 //#define TIMER_ASSERT	1
56 //#define TIMER_DBG	1
57 
58 #if TIMER_DBG
59 #define DBG(x...) kprintf("DBG: " x);
60 #else
61 #define DBG(x...)
62 #endif
63 
64 #if TIMER_TRACE
65 #define TIMER_KDEBUG_TRACE      KERNEL_DEBUG_CONSTANT_IST
66 #else
67 #define TIMER_KDEBUG_TRACE(x...)
68 #endif
69 
70 LCK_GRP_DECLARE(timer_call_lck_grp, "timer_call");
71 LCK_GRP_DECLARE(timer_longterm_lck_grp, "timer_longterm");
72 LCK_GRP_DECLARE(timer_queue_lck_grp, "timer_queue");
73 
74 /* Timer queue lock must be acquired with interrupts disabled (under splclock()) */
75 #define timer_queue_lock_spin(queue) lck_ticket_lock(&(queue)->lock_data, &timer_queue_lck_grp)
76 #define timer_queue_unlock(queue)    lck_ticket_unlock(&(queue)->lock_data)
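/*
 * Illustrative sketch (not verbatim from any caller): a typical sequence for
 * touching a timer queue from thread context, per the rule above, would be:
 *
 *	spl_t s = splclock();
 *	timer_queue_lock_spin(queue);
 *	... examine or modify the queue ...
 *	timer_queue_unlock(queue);
 *	splx(s);
 */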
77 
78 /*
79  * The longterm timer object is a global structure holding all timers
80  * beyond the short-term, local timer queue threshold. The boot processor
81  * is responsible for moving each timer to its local timer queue
82  * if and when that timer becomes due within the threshold.
83  */
84 
85 /* Sentinel for "no time set": */
86 #define TIMER_LONGTERM_NONE             EndOfAllTime
87 /* The default threshold is the delta above which a timer is "long-term" */
88 #if defined(__x86_64__)
89 #define TIMER_LONGTERM_THRESHOLD        (1ULL * NSEC_PER_SEC)   /* 1 sec */
90 #else
91 #define TIMER_LONGTERM_THRESHOLD        TIMER_LONGTERM_NONE     /* disabled */
92 #endif
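/*
 * Example (hypothetical deadlines): with the 1 sec threshold above, a timer
 * armed for now + 5 sec is parked on the global longterm queue, while one
 * armed for now + 500 msec stays on a per-cpu queue; see
 * timer_longterm_enqueue_unlocked(), which returns NULL whenever
 * deadline <= now + threshold.interval.
 */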
93 
94 /*
95  * The scan_limit throttles processing of the longterm queue.
96  * If the scan time exceeds this limit, we terminate, unlock
97  * and defer for scan_interval. This prevents unbounded holding of
98  * timer queue locks with interrupts masked.
99  */
100 #define TIMER_LONGTERM_SCAN_LIMIT       (100ULL * NSEC_PER_USEC)        /* 100 us */
101 #define TIMER_LONGTERM_SCAN_INTERVAL    (100ULL * NSEC_PER_USEC)        /* 100 us */
102 /* Sentinel for "scan limit exceeded": */
103 #define TIMER_LONGTERM_SCAN_AGAIN       0
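/*
 * Example (with the defaults above): a longterm scan that exceeds the 100 us
 * limit stops early, records TIMER_LONGTERM_SCAN_AGAIN in threshold.deadline,
 * and the threshold timer is then re-armed roughly scan_interval (100 us)
 * later by timer_longterm_update().
 */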
104 
105 /*
106  * In a similar way to the longterm queue's scan limit, the following bounds the
107  * amount of time spent processing regular timers.
108  */
109 TUNABLE_WRITEABLE(uint64_t, timer_scan_limit_us, "timer_scan_limit_us", 400);
110 TUNABLE_WRITEABLE(uint64_t, timer_scan_interval_us, "timer_scan_interval_us", 40);
111 static uint64_t timer_scan_limit_abs = 0;
112 static uint64_t timer_scan_interval_abs = 0;
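/*
 * Example (hypothetical values): both limits are boot-args, so a kernel can
 * be booted with e.g. "timer_scan_limit_us=1000 timer_scan_interval_us=100"
 * (microseconds); timer_call_init_abstime() converts them into the
 * absolute-time variables above.
 */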
113 
114 /*
115  * Count of times scanning the queue was aborted early (to avoid long
116  * scan times).
117  */
118 SCALABLE_COUNTER_DEFINE(timer_scan_pauses_cnt);
119 
120 /*
121  * Count of times scanning the queue was aborted early resulting in a
122  * postponed hard deadline.
123  */
124 SCALABLE_COUNTER_DEFINE(timer_scan_postpones_cnt);
125 
126 #define MAX_TIMER_SCAN_LIMIT    (30000ULL * NSEC_PER_USEC)  /* 30 ms */
127 #define MIN_TIMER_SCAN_LIMIT    (   50ULL * NSEC_PER_USEC)  /* 50 us */
128 #define MAX_TIMER_SCAN_INTERVAL ( 2000ULL * NSEC_PER_USEC)  /*  2 ms */
129 #define MIN_TIMER_SCAN_INTERVAL (   20ULL * NSEC_PER_USEC)  /* 20 us */
130 
131 typedef struct {
132 	uint64_t        interval;       /* longterm timer interval */
133 	uint64_t        margin;         /* fudge factor (10% of interval) */
134 	uint64_t        deadline;       /* first/soonest longterm deadline */
135 	uint64_t        preempted;      /* sooner timer has pre-empted */
136 	timer_call_t    call;           /* first/soonest longterm timer call */
137 	uint64_t        deadline_set;   /* next timer set */
138 	timer_call_data_t timer;        /* timer used by threshold management */
139 	                                /* Stats: */
140 	uint64_t        scans;          /*   num threshold timer scans */
141 	uint64_t        preempts;       /*   num threshold reductions */
142 	uint64_t        latency;        /*   average threshold latency */
143 	uint64_t        latency_min;    /*   minimum threshold latency */
144 	uint64_t        latency_max;    /*   maximum threshold latency */
145 } threshold_t;
146 
147 typedef struct {
148 	mpqueue_head_t  queue;          /* longterm timer list */
149 	uint64_t        enqueues;       /* num timers queued */
150 	uint64_t        dequeues;       /* num timers dequeued */
151 	uint64_t        escalates;      /* num timers becoming shortterm */
152 	uint64_t        scan_time;      /* last time the list was scanned */
153 	threshold_t     threshold;      /* longterm timer threshold */
154 	uint64_t        scan_limit;     /* maximum scan time */
155 	uint64_t        scan_interval;  /* interval between LT "escalation" scans */
156 	uint64_t        scan_pauses;    /* num scans exceeding time limit */
157 } timer_longterm_t;
158 
159 timer_longterm_t                timer_longterm = {
160 	.scan_limit = TIMER_LONGTERM_SCAN_LIMIT,
161 	.scan_interval = TIMER_LONGTERM_SCAN_INTERVAL,
162 };
163 
164 static mpqueue_head_t           *timer_longterm_queue = NULL;
165 
166 static void                     timer_longterm_init(void);
167 static void                     timer_longterm_callout(
168 	timer_call_param_t      p0,
169 	timer_call_param_t      p1);
170 extern void                     timer_longterm_scan(
171 	timer_longterm_t        *tlp,
172 	uint64_t                now);
173 static void                     timer_longterm_update(
174 	timer_longterm_t *tlp);
175 static void                     timer_longterm_update_locked(
176 	timer_longterm_t *tlp);
177 static mpqueue_head_t *         timer_longterm_enqueue_unlocked(
178 	timer_call_t            call,
179 	uint64_t                now,
180 	uint64_t                deadline,
181 	mpqueue_head_t **       old_queue,
182 	uint64_t                soft_deadline,
183 	uint64_t                ttd,
184 	timer_call_param_t      param1,
185 	uint32_t                callout_flags);
186 static void                     timer_longterm_dequeued_locked(
187 	timer_call_t            call);
188 
189 uint64_t past_deadline_timers;
190 uint64_t past_deadline_deltas;
191 uint64_t past_deadline_longest;
192 uint64_t past_deadline_shortest = ~0ULL;
193 enum {PAST_DEADLINE_TIMER_ADJUSTMENT_NS = 10 * 1000};
194 
195 uint64_t past_deadline_timer_adjustment;
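/*
 * Example: with PAST_DEADLINE_TIMER_ADJUSTMENT_NS (10 us), a timer entered
 * with a deadline that has already passed is instead armed for
 * mach_absolute_time() + ~10 us; see timer_call_past_deadline_timer_handle().
 */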
196 
197 static boolean_t timer_call_enter_internal(timer_call_t call, timer_call_param_t param1, uint64_t deadline, uint64_t leeway, uint32_t flags, boolean_t ratelimited);
198 boolean_t       mach_timer_coalescing_enabled = TRUE;
199 
200 mpqueue_head_t  *timer_call_enqueue_deadline_unlocked(
201 	timer_call_t            call,
202 	mpqueue_head_t          *queue,
203 	uint64_t                deadline,
204 	uint64_t                soft_deadline,
205 	uint64_t                ttd,
206 	timer_call_param_t      param1,
207 	uint32_t                flags);
208 
209 mpqueue_head_t  *timer_call_dequeue_unlocked(
210 	timer_call_t            call);
211 
212 timer_coalescing_priority_params_t tcoal_prio_params;
213 
214 #if TCOAL_PRIO_STATS
215 int32_t nc_tcl, rt_tcl, bg_tcl, kt_tcl, fp_tcl, ts_tcl, qos_tcl;
216 #define TCOAL_PRIO_STAT(x) (x++)
217 #else
218 #define TCOAL_PRIO_STAT(x)
219 #endif
220 
221 static void
222 timer_call_init_abstime(void)
223 {
224 	int i;
225 	uint64_t result;
226 	timer_coalescing_priority_params_ns_t * tcoal_prio_params_init = timer_call_get_priority_params();
227 	nanoseconds_to_absolutetime(PAST_DEADLINE_TIMER_ADJUSTMENT_NS, &past_deadline_timer_adjustment);
228 	nanoseconds_to_absolutetime(tcoal_prio_params_init->idle_entry_timer_processing_hdeadline_threshold_ns, &result);
229 	tcoal_prio_params.idle_entry_timer_processing_hdeadline_threshold_abstime = (uint32_t)result;
230 	nanoseconds_to_absolutetime(tcoal_prio_params_init->interrupt_timer_coalescing_ilat_threshold_ns, &result);
231 	tcoal_prio_params.interrupt_timer_coalescing_ilat_threshold_abstime = (uint32_t)result;
232 	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_resort_threshold_ns, &result);
233 	tcoal_prio_params.timer_resort_threshold_abstime = (uint32_t)result;
234 	tcoal_prio_params.timer_coalesce_rt_shift = tcoal_prio_params_init->timer_coalesce_rt_shift;
235 	tcoal_prio_params.timer_coalesce_bg_shift = tcoal_prio_params_init->timer_coalesce_bg_shift;
236 	tcoal_prio_params.timer_coalesce_kt_shift = tcoal_prio_params_init->timer_coalesce_kt_shift;
237 	tcoal_prio_params.timer_coalesce_fp_shift = tcoal_prio_params_init->timer_coalesce_fp_shift;
238 	tcoal_prio_params.timer_coalesce_ts_shift = tcoal_prio_params_init->timer_coalesce_ts_shift;
239 
240 	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_rt_ns_max,
241 	    &tcoal_prio_params.timer_coalesce_rt_abstime_max);
242 	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_bg_ns_max,
243 	    &tcoal_prio_params.timer_coalesce_bg_abstime_max);
244 	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_kt_ns_max,
245 	    &tcoal_prio_params.timer_coalesce_kt_abstime_max);
246 	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_fp_ns_max,
247 	    &tcoal_prio_params.timer_coalesce_fp_abstime_max);
248 	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_ts_ns_max,
249 	    &tcoal_prio_params.timer_coalesce_ts_abstime_max);
250 
251 	for (i = 0; i < NUM_LATENCY_QOS_TIERS; i++) {
252 		tcoal_prio_params.latency_qos_scale[i] = tcoal_prio_params_init->latency_qos_scale[i];
253 		nanoseconds_to_absolutetime(tcoal_prio_params_init->latency_qos_ns_max[i],
254 		    &tcoal_prio_params.latency_qos_abstime_max[i]);
255 		tcoal_prio_params.latency_tier_rate_limited[i] = tcoal_prio_params_init->latency_tier_rate_limited[i];
256 	}
257 
258 	nanoseconds_to_absolutetime(timer_scan_limit_us * NSEC_PER_USEC, &timer_scan_limit_abs);
259 	nanoseconds_to_absolutetime(timer_scan_interval_us * NSEC_PER_USEC, &timer_scan_interval_abs);
260 }
261 
262 
263 void
264 timer_call_init(void)
265 {
266 	timer_longterm_init();
267 	timer_call_init_abstime();
268 }
269 
270 
271 void
272 timer_call_queue_init(mpqueue_head_t *queue)
273 {
274 	DBG("timer_call_queue_init(%p)\n", queue);
275 	mpqueue_init(queue, &timer_call_lck_grp, LCK_ATTR_NULL);
276 }
277 
278 
279 void
280 timer_call_setup(
281 	timer_call_t                    call,
282 	timer_call_func_t               func,
283 	timer_call_param_t              param0)
284 {
285 	DBG("timer_call_setup(%p,%p,%p)\n", call, func, param0);
286 
287 	*call = (struct timer_call) {
288 		.tc_func = func,
289 		.tc_param0 = param0,
290 		.tc_async_dequeue = false,
291 	};
292 
293 	simple_lock_init(&(call)->tc_lock, 0);
294 }
295 
296 timer_call_t
297 timer_call_alloc(
298 	timer_call_func_t       func,
299 	timer_call_param_t      param0)
300 {
301 	timer_call_t call;
302 
303 	call = kalloc_type(struct timer_call, Z_ZERO | Z_WAITOK | Z_NOFAIL);
304 	timer_call_setup(call, func, param0);
305 	return call;
306 }
307 
308 void
309 timer_call_free(
310 	timer_call_t            call)
311 {
312 	kfree_type(struct timer_call, call);
313 }
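/*
 * Illustrative life-cycle sketch (my_timer_func and my_param0 are
 * hypothetical names introduced only for this example):
 *
 *	timer_call_t t = timer_call_alloc(my_timer_func, my_param0);
 *	uint64_t deadline;
 *	clock_interval_to_deadline(100, NSEC_PER_MSEC, &deadline);
 *	(void) timer_call_enter(t, deadline, TIMER_CALL_SYS_NORMAL);
 *	...
 *	(void) timer_call_cancel(t);	returns FALSE if the callout already fired
 *	timer_call_free(t);		only safe once the timer is not queued
 */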
314 
315 static mpqueue_head_t*
316 mpqueue_for_timer_call(timer_call_t entry)
317 {
318 	queue_t queue_entry_is_on = entry->tc_queue;
319 	/* 'cast' the queue back to the original mpqueue */
320 	return __container_of(queue_entry_is_on, struct mpqueue_head, head);
321 }
322 
323 
324 static __inline__ mpqueue_head_t *
325 timer_call_entry_dequeue(
326 	timer_call_t            entry)
327 {
328 	mpqueue_head_t  *old_mpqueue = mpqueue_for_timer_call(entry);
329 
330 	/* The entry was always on a queue */
331 	assert(old_mpqueue != NULL);
332 
333 #if TIMER_ASSERT
334 	if (!hw_lock_held((hw_lock_t)&entry->tc_lock)) {
335 		panic("_call_entry_dequeue() "
336 		    "entry %p is not locked\n", entry);
337 	}
338 
339 	/*
340 	 * XXX The queue lock is actually a mutex in spin mode
341 	 *     but there's no way to test for it being held
342 	 *     so we pretend it's a spinlock!
343 	 */
344 	if (!hw_lock_held((hw_lock_t)&old_mpqueue->lock_data)) {
345 		panic("_call_entry_dequeue() "
346 		    "queue %p is not locked\n", old_mpqueue);
347 	}
348 #endif /* TIMER_ASSERT */
349 
350 	if (old_mpqueue != timer_longterm_queue) {
351 		priority_queue_remove(&old_mpqueue->mpq_pqhead,
352 		    &entry->tc_pqlink);
353 	}
354 
355 	remqueue(&entry->tc_qlink);
356 
357 	entry->tc_queue = NULL;
358 
359 	old_mpqueue->count--;
360 
361 	return old_mpqueue;
362 }
363 
364 static __inline__ mpqueue_head_t *
365 timer_call_entry_enqueue_deadline(
366 	timer_call_t                    entry,
367 	mpqueue_head_t                  *new_mpqueue,
368 	uint64_t                        deadline)
369 {
370 	mpqueue_head_t  *old_mpqueue = mpqueue_for_timer_call(entry);
371 
372 #if TIMER_ASSERT
373 	if (!hw_lock_held((hw_lock_t)&entry->tc_lock)) {
374 		panic("_call_entry_enqueue_deadline() "
375 		    "entry %p is not locked\n", entry);
376 	}
377 
378 	/* XXX More lock pretense:  */
379 	if (!hw_lock_held((hw_lock_t)&new_mpqueue->lock_data)) {
380 		panic("_call_entry_enqueue_deadline() "
381 		    "queue %p is not locked\n", new_mpqueue);
382 	}
383 
384 	if (old_mpqueue != NULL && old_mpqueue != new_mpqueue) {
385 		panic("_call_entry_enqueue_deadline() "
386 		    "old_mpqueue %p != new_mpqueue", old_mpqueue);
387 	}
388 #endif /* TIMER_ASSERT */
389 
390 	/* no longterm queue involved */
391 	assert(new_mpqueue != timer_longterm_queue);
392 	assert(old_mpqueue != timer_longterm_queue);
393 
394 	if (old_mpqueue == new_mpqueue) {
395 		/* optimize the same-queue case to avoid a full re-insert */
396 		uint64_t old_deadline = entry->tc_pqlink.deadline;
397 		entry->tc_pqlink.deadline = deadline;
398 
399 		if (old_deadline < deadline) {
400 			priority_queue_entry_increased(&new_mpqueue->mpq_pqhead,
401 			    &entry->tc_pqlink);
402 		} else {
403 			priority_queue_entry_decreased(&new_mpqueue->mpq_pqhead,
404 			    &entry->tc_pqlink);
405 		}
406 	} else {
407 		if (old_mpqueue != NULL) {
408 			priority_queue_remove(&old_mpqueue->mpq_pqhead,
409 			    &entry->tc_pqlink);
410 
411 			re_queue_tail(&new_mpqueue->head, &entry->tc_qlink);
412 		} else {
413 			enqueue_tail(&new_mpqueue->head, &entry->tc_qlink);
414 		}
415 
416 		entry->tc_queue = &new_mpqueue->head;
417 		entry->tc_pqlink.deadline = deadline;
418 
419 		priority_queue_insert(&new_mpqueue->mpq_pqhead, &entry->tc_pqlink);
420 	}
421 
422 
423 	/* For efficiency, track the earliest soft deadline on the queue,
424 	 * so that fuzzy decisions can be made without lock acquisitions.
425 	 */
426 
427 	timer_call_t thead = priority_queue_min(&new_mpqueue->mpq_pqhead, struct timer_call, tc_pqlink);
428 
429 	new_mpqueue->earliest_soft_deadline = thead->tc_flags & TIMER_CALL_RATELIMITED ? thead->tc_pqlink.deadline : thead->tc_soft_deadline;
430 
431 	if (old_mpqueue) {
432 		old_mpqueue->count--;
433 	}
434 	new_mpqueue->count++;
435 
436 	return old_mpqueue;
437 }
438 
439 static __inline__ void
440 timer_call_entry_enqueue_tail(
441 	timer_call_t                    entry,
442 	mpqueue_head_t                  *queue)
443 {
444 	/* entry is always dequeued before this call */
445 	assert(entry->tc_queue == NULL);
446 
447 	/*
448 	 * this is only used for timer_longterm_queue, which is unordered
449 	 * and thus needs no priority queueing
450 	 */
451 	assert(queue == timer_longterm_queue);
452 
453 	enqueue_tail(&queue->head, &entry->tc_qlink);
454 
455 	entry->tc_queue = &queue->head;
456 
457 	queue->count++;
458 	return;
459 }
460 
461 /*
462  * Remove timer entry from its queue but don't change the queue pointer
463  * and set the async_dequeue flag. This is locking case 2b.
464  */
465 static __inline__ void
466 timer_call_entry_dequeue_async(
467 	timer_call_t            entry)
468 {
469 	mpqueue_head_t  *old_mpqueue = mpqueue_for_timer_call(entry);
470 	if (old_mpqueue) {
471 		old_mpqueue->count--;
472 
473 		if (old_mpqueue != timer_longterm_queue) {
474 			priority_queue_remove(&old_mpqueue->mpq_pqhead,
475 			    &entry->tc_pqlink);
476 		}
477 
478 		remqueue(&entry->tc_qlink);
479 		entry->tc_async_dequeue = true;
480 	}
481 	return;
482 }
483 
484 #if TIMER_ASSERT
485 unsigned timer_call_enqueue_deadline_unlocked_async1;
486 unsigned timer_call_enqueue_deadline_unlocked_async2;
487 #endif
488 /*
489  * Assumes call_entry and queues unlocked, interrupts disabled.
490  */
491 __inline__ mpqueue_head_t *
492 timer_call_enqueue_deadline_unlocked(
493 	timer_call_t                    call,
494 	mpqueue_head_t                  *queue,
495 	uint64_t                        deadline,
496 	uint64_t                        soft_deadline,
497 	uint64_t                        ttd,
498 	timer_call_param_t              param1,
499 	uint32_t                        callout_flags)
500 {
501 	DBG("timer_call_enqueue_deadline_unlocked(%p,%p,)\n", call, queue);
502 
503 	simple_lock(&call->tc_lock, LCK_GRP_NULL);
504 
505 	mpqueue_head_t  *old_queue = mpqueue_for_timer_call(call);
506 
507 	if (old_queue != NULL) {
508 		timer_queue_lock_spin(old_queue);
509 		if (call->tc_async_dequeue) {
510 			/* collision (1c): timer already dequeued, clear flag */
511 #if TIMER_ASSERT
512 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
513 			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
514 			    VM_KERNEL_UNSLIDE_OR_PERM(call),
515 			    call->tc_async_dequeue,
516 			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
517 			    0x1c, 0);
518 			timer_call_enqueue_deadline_unlocked_async1++;
519 #endif
520 			call->tc_async_dequeue = false;
521 			call->tc_queue = NULL;
522 		} else if (old_queue != queue) {
523 			timer_call_entry_dequeue(call);
524 #if TIMER_ASSERT
525 			timer_call_enqueue_deadline_unlocked_async2++;
526 #endif
527 		}
528 		if (old_queue == timer_longterm_queue) {
529 			timer_longterm_dequeued_locked(call);
530 		}
531 		if (old_queue != queue) {
532 			timer_queue_unlock(old_queue);
533 			timer_queue_lock_spin(queue);
534 		}
535 	} else {
536 		timer_queue_lock_spin(queue);
537 	}
538 
539 	call->tc_soft_deadline = soft_deadline;
540 	call->tc_flags = callout_flags;
541 	call->tc_param1 = param1;
542 	call->tc_ttd = ttd;
543 
544 	timer_call_entry_enqueue_deadline(call, queue, deadline);
545 	timer_queue_unlock(queue);
546 	simple_unlock(&call->tc_lock);
547 
548 	return old_queue;
549 }
550 
551 #if TIMER_ASSERT
552 unsigned timer_call_dequeue_unlocked_async1;
553 unsigned timer_call_dequeue_unlocked_async2;
554 #endif
555 mpqueue_head_t *
556 timer_call_dequeue_unlocked(
557 	timer_call_t            call)
558 {
559 	DBG("timer_call_dequeue_unlocked(%p)\n", call);
560 
561 	simple_lock(&call->tc_lock, LCK_GRP_NULL);
562 
563 	mpqueue_head_t  *old_queue = mpqueue_for_timer_call(call);
564 
565 #if TIMER_ASSERT
566 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
567 	    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
568 	    VM_KERNEL_UNSLIDE_OR_PERM(call),
569 	    call->tc_async_dequeue,
570 	    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
571 	    0, 0);
572 #endif
573 	if (old_queue != NULL) {
574 		timer_queue_lock_spin(old_queue);
575 		if (call->tc_async_dequeue) {
576 			/* collision (1c): timer already dequeued, clear flag */
577 #if TIMER_ASSERT
578 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
579 			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
580 			    VM_KERNEL_UNSLIDE_OR_PERM(call),
581 			    call->tc_async_dequeue,
582 			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
583 			    0x1c, 0);
584 			timer_call_dequeue_unlocked_async1++;
585 #endif
586 			call->tc_async_dequeue = false;
587 			call->tc_queue = NULL;
588 		} else {
589 			timer_call_entry_dequeue(call);
590 		}
591 		if (old_queue == timer_longterm_queue) {
592 			timer_longterm_dequeued_locked(call);
593 		}
594 		timer_queue_unlock(old_queue);
595 	}
596 	simple_unlock(&call->tc_lock);
597 	return old_queue;
598 }
599 
600 uint64_t
601 timer_call_past_deadline_timer_handle(uint64_t deadline, uint64_t ctime)
602 {
603 	uint64_t delta = (ctime - deadline);
604 
605 	past_deadline_timers++;
606 	past_deadline_deltas += delta;
607 	if (delta > past_deadline_longest) {
608 		past_deadline_longest = delta;
609 	}
610 	if (delta < past_deadline_shortest) {
611 		past_deadline_shortest = delta;
612 	}
613 
614 	return ctime + past_deadline_timer_adjustment;
615 }
616 
617 /*
618  * Timer call entry locking model
619  * ==============================
620  *
621  * Timer call entries are linked on per-cpu timer queues which are protected
622  * by the queue lock and the call entry lock. The locking protocol is:
623  *
624  *  0) The canonical locking order is timer call entry followed by queue.
625  *
626  *  1) With only the entry lock held, entry.queue is valid:
627  *    1a) NULL: the entry is not queued, or
628  *    1b) non-NULL: this queue must be locked before the entry is modified.
629  *        After locking the queue, the call.async_dequeue flag must be checked:
630  *    1c) TRUE: the entry was removed from the queue by another thread
631  *	        and we must NULL the entry.queue and reset this flag, or
632  *    1d) FALSE: (ie. queued), the entry can be manipulated.
633  *
634  *  2) If a queue lock is obtained first, the queue is stable:
635  *    2a) If a try-lock of a queued entry succeeds, the call can be operated on
636  *	  and dequeued.
637  *    2b) If a try-lock fails, it indicates that another thread is attempting
638  *        to change the entry and move it to a different position in this queue
639  *        or to different queue. The entry can be dequeued but it should not be
640  *        operated upon since it is being changed. Furthermore, we don't null
641  *	  the entry.queue pointer (protected by the entry lock we don't own).
642  *	  Instead, we set the async_dequeue flag -- see (1c).
643  *    2c) Same as 2b but occurring when a longterm timer is matured.
644  *  3) A callout's parameters (deadline, flags, parameters, soft deadline &c.)
645  *     should be manipulated with the appropriate timer queue lock held,
646  *     to prevent queue traversal observations from observing inconsistent
647  *     updates to an in-flight callout.
648  */
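/*
 * Illustrative sketch of case (2b), condensed from the queue-side paths below
 * (timer_queue_shutdown()/timer_queue_migrate()); when the try-lock fails,
 * the entry is only unlinked and the owner cleans up later via case (1c):
 *
 *	timer_queue_lock_spin(queue);
 *	qe_foreach_element_safe(call, &queue->head, tc_qlink) {
 *		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
 *			timer_call_entry_dequeue_async(call);
 *			continue;
 *		}
 *		... operate on the locked entry ...
 *		simple_unlock(&call->tc_lock);
 *	}
 *	timer_queue_unlock(queue);
 */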
649 
650 /*
651  * In the debug case, we assert that the timer call locking protocol
652  * is being obeyed.
653  */
654 
655 static boolean_t
656 timer_call_enter_internal(
657 	timer_call_t            call,
658 	timer_call_param_t      param1,
659 	uint64_t                deadline,
660 	uint64_t                leeway,
661 	uint32_t                flags,
662 	boolean_t               ratelimited)
663 {
664 	mpqueue_head_t          *queue = NULL;
665 	mpqueue_head_t          *old_queue;
666 	spl_t                   s;
667 	uint64_t                slop;
668 	uint32_t                urgency;
669 	uint64_t                sdeadline, ttd;
670 
671 	assert(call->tc_func != NULL);
672 	s = splclock();
673 
674 	sdeadline = deadline;
675 	uint64_t ctime = mach_absolute_time();
676 
677 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
678 	    DECR_TIMER_ENTER | DBG_FUNC_START,
679 	    VM_KERNEL_UNSLIDE_OR_PERM(call),
680 	    VM_KERNEL_ADDRHIDE(param1), deadline, flags, 0);
681 
682 	urgency = (flags & TIMER_CALL_URGENCY_MASK);
683 
684 	boolean_t slop_ratelimited = FALSE;
685 	slop = timer_call_slop(deadline, ctime, urgency, current_thread(), &slop_ratelimited);
686 
687 	if ((flags & TIMER_CALL_LEEWAY) != 0 && leeway > slop) {
688 		slop = leeway;
689 	}
690 
691 	if (UINT64_MAX - deadline <= slop) {
692 		deadline = UINT64_MAX;
693 	} else {
694 		deadline += slop;
695 	}
696 
697 	if (__improbable(deadline < ctime)) {
698 		deadline = timer_call_past_deadline_timer_handle(deadline, ctime);
699 		sdeadline = deadline;
700 	}
701 
702 	if (ratelimited || slop_ratelimited) {
703 		flags |= TIMER_CALL_RATELIMITED;
704 	} else {
705 		flags &= ~TIMER_CALL_RATELIMITED;
706 	}
707 
708 	ttd =  sdeadline - ctime;
709 #if CONFIG_DTRACE
710 	DTRACE_TMR7(callout__create, timer_call_func_t, call->tc_func,
711 	    timer_call_param_t, call->tc_param0, uint32_t, flags,
712 	    (deadline - sdeadline),
713 	    (ttd >> 32), (unsigned) (ttd & 0xFFFFFFFF), call);
714 #endif
715 
716 	/* Program timer callout parameters under the appropriate per-CPU or
717 	 * longterm queue lock. The callout may have been previously enqueued
718 	 * and in-flight on this or another timer queue.
719 	 */
720 	if (!ratelimited && !slop_ratelimited) {
721 		queue = timer_longterm_enqueue_unlocked(call, ctime, deadline, &old_queue, sdeadline, ttd, param1, flags);
722 	}
723 
724 	if (queue == NULL) {
725 		queue = timer_queue_assign(deadline);
726 		old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline, sdeadline, ttd, param1, flags);
727 	}
728 
729 #if TIMER_TRACE
730 	call->tc_entry_time = ctime;
731 #endif
732 
733 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
734 	    DECR_TIMER_ENTER | DBG_FUNC_END,
735 	    VM_KERNEL_UNSLIDE_OR_PERM(call),
736 	    (old_queue != NULL), deadline, queue->count, 0);
737 
738 	splx(s);
739 
740 	return old_queue != NULL;
741 }
742 
743 /*
744  * timer_call_*()
745  *	return boolean indicating whether the call was previously queued.
746  */
747 boolean_t
748 timer_call_enter(
749 	timer_call_t            call,
750 	uint64_t                deadline,
751 	uint32_t                flags)
752 {
753 	return timer_call_enter_internal(call, NULL, deadline, 0, flags, FALSE);
754 }
755 
756 boolean_t
757 timer_call_enter1(
758 	timer_call_t            call,
759 	timer_call_param_t      param1,
760 	uint64_t                deadline,
761 	uint32_t                flags)
762 {
763 	return timer_call_enter_internal(call, param1, deadline, 0, flags, FALSE);
764 }
765 
766 boolean_t
767 timer_call_enter_with_leeway(
768 	timer_call_t            call,
769 	timer_call_param_t      param1,
770 	uint64_t                deadline,
771 	uint64_t                leeway,
772 	uint32_t                flags,
773 	boolean_t               ratelimited)
774 {
775 	return timer_call_enter_internal(call, param1, deadline, leeway, flags, ratelimited);
776 }
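/*
 * Example (hypothetical 5 ms leeway): a caller can allow extra coalescing by
 * passing TIMER_CALL_LEEWAY, e.g.
 *
 *	uint64_t leeway;
 *	nanoseconds_to_absolutetime(5 * NSEC_PER_MSEC, &leeway);
 *	(void) timer_call_enter_with_leeway(call, NULL, deadline, leeway,
 *	    TIMER_CALL_SYS_NORMAL | TIMER_CALL_LEEWAY, FALSE);
 */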
777 
778 boolean_t
779 timer_call_cancel(
780 	timer_call_t            call)
781 {
782 	mpqueue_head_t          *old_queue;
783 	spl_t                   s;
784 
785 	s = splclock();
786 
787 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
788 	    DECR_TIMER_CANCEL | DBG_FUNC_START,
789 	    VM_KERNEL_UNSLIDE_OR_PERM(call),
790 	    call->tc_pqlink.deadline, call->tc_soft_deadline, call->tc_flags, 0);
791 
792 	old_queue = timer_call_dequeue_unlocked(call);
793 
794 	if (old_queue != NULL) {
795 		timer_queue_lock_spin(old_queue);
796 
797 		timer_call_t new_head = priority_queue_min(&old_queue->mpq_pqhead, struct timer_call, tc_pqlink);
798 
799 		if (new_head) {
800 			timer_queue_cancel(old_queue, call->tc_pqlink.deadline, new_head->tc_pqlink.deadline);
801 			old_queue->earliest_soft_deadline = new_head->tc_flags & TIMER_CALL_RATELIMITED ? new_head->tc_pqlink.deadline : new_head->tc_soft_deadline;
802 		} else {
803 			timer_queue_cancel(old_queue, call->tc_pqlink.deadline, UINT64_MAX);
804 			old_queue->earliest_soft_deadline = UINT64_MAX;
805 		}
806 
807 		timer_queue_unlock(old_queue);
808 	}
809 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
810 	    DECR_TIMER_CANCEL | DBG_FUNC_END,
811 	    VM_KERNEL_UNSLIDE_OR_PERM(call),
812 	    VM_KERNEL_UNSLIDE_OR_PERM(old_queue),
813 	    call->tc_pqlink.deadline - mach_absolute_time(),
814 	    call->tc_pqlink.deadline - call->tc_entry_time, 0);
815 	splx(s);
816 
817 #if CONFIG_DTRACE
818 	DTRACE_TMR6(callout__cancel, timer_call_func_t, call->tc_func,
819 	    timer_call_param_t, call->tc_param0, uint32_t, call->tc_flags, 0,
820 	    (call->tc_ttd >> 32), (unsigned) (call->tc_ttd & 0xFFFFFFFF));
821 #endif /* CONFIG_DTRACE */
822 
823 	return old_queue != NULL;
824 }
825 
826 static uint32_t timer_queue_shutdown_lock_skips;
827 static uint32_t timer_queue_shutdown_discarded;
828 
829 void
830 timer_queue_shutdown(
831 	mpqueue_head_t          *queue)
832 {
833 	timer_call_t            call;
834 	mpqueue_head_t          *new_queue;
835 	spl_t                   s;
836 
837 
838 	DBG("timer_queue_shutdown(%p)\n", queue);
839 
840 	s = splclock();
841 
842 	while (TRUE) {
843 		timer_queue_lock_spin(queue);
844 
845 		call = qe_queue_first(&queue->head, struct timer_call, tc_qlink);
846 
847 		if (call == NULL) {
848 			break;
849 		}
850 
851 		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
852 			/*
853 			 * case (2b) lock order inversion, dequeue and skip
854 			 * Don't change the call_entry queue back-pointer
855 			 * but set the async_dequeue field.
856 			 */
857 			timer_queue_shutdown_lock_skips++;
858 			timer_call_entry_dequeue_async(call);
859 #if TIMER_ASSERT
860 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
861 			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
862 			    VM_KERNEL_UNSLIDE_OR_PERM(call),
863 			    call->tc_async_dequeue,
864 			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
865 			    0x2b, 0);
866 #endif
867 			timer_queue_unlock(queue);
868 			continue;
869 		}
870 
871 		boolean_t call_local = ((call->tc_flags & TIMER_CALL_LOCAL) != 0);
872 
873 		/* remove entry from old queue */
874 		timer_call_entry_dequeue(call);
875 		timer_queue_unlock(queue);
876 
877 		if (call_local == FALSE) {
878 			/* and queue it on new, discarding LOCAL timers */
879 			new_queue = timer_queue_assign(call->tc_pqlink.deadline);
880 			timer_queue_lock_spin(new_queue);
881 			timer_call_entry_enqueue_deadline(
882 				call, new_queue, call->tc_pqlink.deadline);
883 			timer_queue_unlock(new_queue);
884 		} else {
885 			timer_queue_shutdown_discarded++;
886 		}
887 
888 		assert(call_local == FALSE);
889 		simple_unlock(&call->tc_lock);
890 	}
891 
892 	timer_queue_unlock(queue);
893 	splx(s);
894 }
895 
896 
897 static uint32_t timer_queue_expire_lock_skips;
898 uint64_t
899 timer_queue_expire_with_options(
900 	mpqueue_head_t          *queue,
901 	uint64_t                deadline,
902 	boolean_t               rescan)
903 {
904 	timer_call_t    call = NULL;
905 	uint32_t tc_iterations = 0;
906 	DBG("timer_queue_expire(%p,)\n", queue);
907 
908 	/* 'rescan' means look at every timer in the list, instead of
909 	 * early-exiting when the head of the list expires in the future.
910 	 * when 'rescan' is true, iterate by linked list instead of priority queue.
911 	 *
912 	 * TODO: if we keep a deadline ordered and soft-deadline ordered
913 	 * priority queue, then it's no longer necessary to do that
914 	 */
915 
916 	uint64_t cur_deadline = deadline;
917 
918 	/* Force an early return if this time limit is hit. */
919 	const uint64_t time_limit = deadline + timer_scan_limit_abs;
920 
921 	/* Next deadline if the time limit is hit */
922 	uint64_t time_limit_deadline = 0;
923 
924 	timer_queue_lock_spin(queue);
925 
926 	while (!queue_empty(&queue->head)) {
927 		if (++tc_iterations > 1) {
928 			const uint64_t now = mach_absolute_time();
929 
930 			/*
931 			 * Abort the scan if it's taking too long to avoid long
932 			 * periods with interrupts disabled.
933 			 * Scanning will restart after a short pause
934 			 * (timer_scan_interval_abs) if there's a hard deadline
935 			 * pending.
936 			 */
937 			if (rescan == FALSE && now > time_limit) {
938 				TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
939 				    DECR_TIMER_PAUSE | DBG_FUNC_NONE,
940 				    queue->count, tc_iterations - 1,
941 				    0, 0, 0);
942 
943 				counter_inc(&timer_scan_pauses_cnt);
944 				time_limit_deadline = now + timer_scan_interval_abs;
945 				break;
946 			}
947 
948 			/*
949 			 * Upon processing one or more timer calls, refresh the
950 			 * deadline to account for time elapsed in the callout
951 			 */
952 			cur_deadline = now;
953 		}
954 
955 		if (call == NULL) {
956 			if (rescan == FALSE) {
957 				call = priority_queue_min(&queue->mpq_pqhead, struct timer_call, tc_pqlink);
958 			} else {
959 				call = qe_queue_first(&queue->head, struct timer_call, tc_qlink);
960 			}
961 		}
962 
963 		if (call->tc_soft_deadline <= cur_deadline) {
964 			timer_call_func_t               func;
965 			timer_call_param_t              param0, param1;
966 
967 			TCOAL_DEBUG(0xDDDD0000, queue->earliest_soft_deadline, call->tc_soft_deadline, 0, 0, 0);
968 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
969 			    DECR_TIMER_EXPIRE | DBG_FUNC_NONE,
970 			    VM_KERNEL_UNSLIDE_OR_PERM(call),
971 			    call->tc_soft_deadline,
972 			    call->tc_pqlink.deadline,
973 			    call->tc_entry_time, 0);
974 
975 			if ((call->tc_flags & TIMER_CALL_RATELIMITED) &&
976 			    (call->tc_pqlink.deadline > cur_deadline)) {
977 				if (rescan == FALSE) {
978 					break;
979 				}
980 			}
981 
982 			if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
983 				/* case (2b) lock inversion, dequeue and skip */
984 				timer_queue_expire_lock_skips++;
985 				timer_call_entry_dequeue_async(call);
986 				call = NULL;
987 				continue;
988 			}
989 
990 			timer_call_entry_dequeue(call);
991 
992 			func = call->tc_func;
993 			param0 = call->tc_param0;
994 			param1 = call->tc_param1;
995 
996 			simple_unlock(&call->tc_lock);
997 			timer_queue_unlock(queue);
998 
999 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1000 			    DECR_TIMER_CALLOUT | DBG_FUNC_START,
1001 			    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
1002 			    VM_KERNEL_ADDRHIDE(param0),
1003 			    VM_KERNEL_ADDRHIDE(param1),
1004 			    0);
1005 
1006 #if CONFIG_DTRACE
1007 			DTRACE_TMR7(callout__start, timer_call_func_t, func,
1008 			    timer_call_param_t, param0, unsigned, call->tc_flags,
1009 			    0, (call->tc_ttd >> 32),
1010 			    (unsigned) (call->tc_ttd & 0xFFFFFFFF), call);
1011 #endif
1012 			/* Maintain time-to-deadline in per-processor data
1013 			 * structure for thread wakeup deadline statistics.
1014 			 */
1015 			uint64_t *ttdp = &current_processor()->timer_call_ttd;
1016 			*ttdp = call->tc_ttd;
1017 			(*func)(param0, param1);
1018 			*ttdp = 0;
1019 #if CONFIG_DTRACE
1020 			DTRACE_TMR4(callout__end, timer_call_func_t, func,
1021 			    param0, param1, call);
1022 #endif
1023 
1024 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1025 			    DECR_TIMER_CALLOUT | DBG_FUNC_END,
1026 			    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
1027 			    VM_KERNEL_ADDRHIDE(param0),
1028 			    VM_KERNEL_ADDRHIDE(param1),
1029 			    0);
1030 			call = NULL;
1031 			timer_queue_lock_spin(queue);
1032 		} else {
1033 			if (__probable(rescan == FALSE)) {
1034 				break;
1035 			} else {
1036 				int64_t skew = call->tc_pqlink.deadline - call->tc_soft_deadline;
1037 				assert(call->tc_pqlink.deadline >= call->tc_soft_deadline);
1038 
1039 				/* DRK: On a latency quality-of-service level change,
1040 				 * re-sort potentially rate-limited timers. The platform
1041 				 * layer determines which timers require
1042 				 * this. In the absence of the per-callout
1043 				 * synchronization requirement, a global resort could
1044 				 * be more efficient. The re-sort effectively
1045 				 * annuls all timer adjustments, i.e. the "soft
1046 				 * deadline" is the sort key.
1047 				 */
1048 
1049 				if (timer_resort_threshold(skew)) {
1050 					if (__probable(simple_lock_try(&call->tc_lock, LCK_GRP_NULL))) {
1051 						/* TODO: don't need to dequeue before enqueue */
1052 						timer_call_entry_dequeue(call);
1053 						timer_call_entry_enqueue_deadline(call, queue, call->tc_soft_deadline);
1054 						simple_unlock(&call->tc_lock);
1055 						call = NULL;
1056 					}
1057 				}
1058 				if (call) {
1059 					call = qe_queue_next(&queue->head, call, struct timer_call, tc_qlink);
1060 
1061 					if (call == NULL) {
1062 						break;
1063 					}
1064 				}
1065 			}
1066 		}
1067 	}
1068 
1069 	call = priority_queue_min(&queue->mpq_pqhead, struct timer_call, tc_pqlink);
1070 
1071 	if (call) {
1072 		/*
1073 		 * Even if the time limit has been hit, it doesn't mean a hard
1074 		 * deadline will be missed - the next hard deadline may be in
1075 		 * future.
1076 		 */
1077 		if (time_limit_deadline > call->tc_pqlink.deadline) {
1078 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1079 			    DECR_TIMER_POSTPONE | DBG_FUNC_NONE,
1080 			    VM_KERNEL_UNSLIDE_OR_PERM(call),
1081 			    call->tc_pqlink.deadline,
1082 			    time_limit_deadline,
1083 			    0, 0);
1084 			counter_inc(&timer_scan_postpones_cnt);
1085 			cur_deadline = time_limit_deadline;
1086 		} else {
1087 			cur_deadline = call->tc_pqlink.deadline;
1088 		}
1089 		queue->earliest_soft_deadline = (call->tc_flags & TIMER_CALL_RATELIMITED) ? call->tc_pqlink.deadline: call->tc_soft_deadline;
1090 	} else {
1091 		queue->earliest_soft_deadline = cur_deadline = UINT64_MAX;
1092 	}
1093 
1094 	timer_queue_unlock(queue);
1095 
1096 	return cur_deadline;
1097 }
1098 
1099 uint64_t
1100 timer_queue_expire(
1101 	mpqueue_head_t          *queue,
1102 	uint64_t                deadline)
1103 {
1104 	return timer_queue_expire_with_options(queue, deadline, FALSE);
1105 }
1106 
1107 extern int serverperfmode;
1108 static uint32_t timer_queue_migrate_lock_skips;
1109 /*
1110  * timer_queue_migrate() is called by timer_queue_migrate_cpu()
1111  * to move timer requests from the local processor (queue_from)
1112  * to a target processor's (queue_to).
1113  */
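/*
 * Return value summary: a non-negative result is the number of timers moved;
 * negative values are early-abort codes (-1 target queue empty, -2 local
 * queue empty, -3 a TIMER_CALL_LOCAL timer present locally, -4 serverperfmode
 * is set).
 */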
1114 int
1115 timer_queue_migrate(mpqueue_head_t *queue_from, mpqueue_head_t *queue_to)
1116 {
1117 	timer_call_t    call;
1118 	timer_call_t    head_to;
1119 	int             timers_migrated = 0;
1120 
1121 	DBG("timer_queue_migrate(%p,%p)\n", queue_from, queue_to);
1122 
1123 	assert(!ml_get_interrupts_enabled());
1124 	assert(queue_from != queue_to);
1125 
1126 	if (serverperfmode) {
1127 		/*
1128 		 * if we're running a high end server
1129 		 * avoid migrations... they add latency
1130 		 * and don't save us power under typical
1131 		 * server workloads
1132 		 */
1133 		return -4;
1134 	}
1135 
1136 	/*
1137 	 * Take both local (from) and target (to) timer queue locks while
1138 	 * moving the timers from the local queue to the target processor.
1139 	 * We assume that the target is always the boot processor.
1140 	 * But only move if all of the following is true:
1141 	 *  - the target queue is non-empty
1142 	 *  - the local queue is non-empty
1143 	 *  - the local queue's first deadline is later than the target's
1144 	 *  - the local queue contains no non-migratable "local" call
1145 	 * so that we need not have the target resync.
1146 	 */
1147 
1148 	timer_queue_lock_spin(queue_to);
1149 
1150 	head_to = priority_queue_min(&queue_to->mpq_pqhead, struct timer_call, tc_pqlink);
1151 
1152 	if (head_to == NULL) {
1153 		timers_migrated = -1;
1154 		goto abort1;
1155 	}
1156 
1157 	timer_queue_lock_spin(queue_from);
1158 
1159 	call = priority_queue_min(&queue_from->mpq_pqhead, struct timer_call, tc_pqlink);
1160 
1161 	if (call == NULL) {
1162 		timers_migrated = -2;
1163 		goto abort2;
1164 	}
1165 
1166 	if (call->tc_pqlink.deadline < head_to->tc_pqlink.deadline) {
1167 		timers_migrated = 0;
1168 		goto abort2;
1169 	}
1170 
1171 	/* perform scan for non-migratable timers */
1172 	qe_foreach_element(call, &queue_from->head, tc_qlink) {
1173 		if (call->tc_flags & TIMER_CALL_LOCAL) {
1174 			timers_migrated = -3;
1175 			goto abort2;
1176 		}
1177 	}
1178 
1179 	/* migration loop itself -- both queues are locked */
1180 	qe_foreach_element_safe(call, &queue_from->head, tc_qlink) {
1181 		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
1182 			/* case (2b) lock order inversion, dequeue only */
1183 #ifdef TIMER_ASSERT
1184 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1185 			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
1186 			    VM_KERNEL_UNSLIDE_OR_PERM(call),
1187 			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
1188 			    0,
1189 			    0x2b, 0);
1190 #endif
1191 			timer_queue_migrate_lock_skips++;
1192 			timer_call_entry_dequeue_async(call);
1193 			continue;
1194 		}
1195 		timer_call_entry_dequeue(call);
1196 		timer_call_entry_enqueue_deadline(
1197 			call, queue_to, call->tc_pqlink.deadline);
1198 		timers_migrated++;
1199 		simple_unlock(&call->tc_lock);
1200 	}
1201 	queue_from->earliest_soft_deadline = UINT64_MAX;
1202 abort2:
1203 	timer_queue_unlock(queue_from);
1204 abort1:
1205 	timer_queue_unlock(queue_to);
1206 
1207 	return timers_migrated;
1208 }
1209 
1210 void
1211 timer_queue_trace_cpu(int ncpu)
1212 {
1213 	timer_call_nosync_cpu(
1214 		ncpu,
1215 		(void (*)(void *))timer_queue_trace,
1216 		(void*) timer_queue_cpu(ncpu));
1217 }
1218 
1219 void
1220 timer_queue_trace(
1221 	mpqueue_head_t                  *queue)
1222 {
1223 	timer_call_t    call;
1224 	spl_t           s;
1225 
1226 	if (!kdebug_enable) {
1227 		return;
1228 	}
1229 
1230 	s = splclock();
1231 	timer_queue_lock_spin(queue);
1232 
1233 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1234 	    DECR_TIMER_QUEUE | DBG_FUNC_START,
1235 	    queue->count, mach_absolute_time(), 0, 0, 0);
1236 
1237 	qe_foreach_element(call, &queue->head, tc_qlink) {
1238 		TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1239 		    DECR_TIMER_QUEUE | DBG_FUNC_NONE,
1240 		    call->tc_soft_deadline,
1241 		    call->tc_pqlink.deadline,
1242 		    call->tc_entry_time,
1243 		    VM_KERNEL_UNSLIDE(call->tc_func),
1244 		    0);
1245 	}
1246 
1247 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1248 	    DECR_TIMER_QUEUE | DBG_FUNC_END,
1249 	    queue->count, mach_absolute_time(), 0, 0, 0);
1250 
1251 	timer_queue_unlock(queue);
1252 	splx(s);
1253 }
1254 
1255 void
1256 timer_longterm_dequeued_locked(timer_call_t call)
1257 {
1258 	timer_longterm_t        *tlp = &timer_longterm;
1259 
1260 	tlp->dequeues++;
1261 	if (call == tlp->threshold.call) {
1262 		tlp->threshold.call = NULL;
1263 	}
1264 }
1265 
1266 /*
1267  * Place a timer call in the longterm list
1268  * and adjust the next timer callout deadline if the new timer is first.
1269  */
1270 mpqueue_head_t *
1271 timer_longterm_enqueue_unlocked(timer_call_t    call,
1272     uint64_t        now,
1273     uint64_t        deadline,
1274     mpqueue_head_t  **old_queue,
1275     uint64_t        soft_deadline,
1276     uint64_t        ttd,
1277     timer_call_param_t      param1,
1278     uint32_t        callout_flags)
1279 {
1280 	timer_longterm_t        *tlp = &timer_longterm;
1281 	boolean_t               update_required = FALSE;
1282 	uint64_t                longterm_threshold;
1283 
1284 	longterm_threshold = now + tlp->threshold.interval;
1285 
1286 	/*
1287 	 * Return NULL without doing anything if:
1288 	 *  - this timer is local, or
1289 	 *  - the longterm mechanism is disabled, or
1290 	 *  - this deadline is too short.
1291 	 */
1292 	if ((callout_flags & TIMER_CALL_LOCAL) != 0 ||
1293 	    (tlp->threshold.interval == TIMER_LONGTERM_NONE) ||
1294 	    (deadline <= longterm_threshold)) {
1295 		return NULL;
1296 	}
1297 
1298 	/*
1299 	 * Remove timer from its current queue, if any.
1300 	 */
1301 	*old_queue = timer_call_dequeue_unlocked(call);
1302 
1303 	/*
1304 	 * Lock the longterm queue, queue timer and determine
1305 	 * whether an update is necessary.
1306 	 */
1307 	assert(!ml_get_interrupts_enabled());
1308 	simple_lock(&call->tc_lock, LCK_GRP_NULL);
1309 	timer_queue_lock_spin(timer_longterm_queue);
1310 	call->tc_pqlink.deadline = deadline;
1311 	call->tc_param1 = param1;
1312 	call->tc_ttd = ttd;
1313 	call->tc_soft_deadline = soft_deadline;
1314 	call->tc_flags = callout_flags;
1315 	timer_call_entry_enqueue_tail(call, timer_longterm_queue);
1316 
1317 	tlp->enqueues++;
1318 
1319 	/*
1320 	 * We'll need to update the currently set threshold timer
1321 	 * if the new deadline is sooner and no sooner update is in flight.
1322 	 */
1323 	if (deadline < tlp->threshold.deadline &&
1324 	    deadline < tlp->threshold.preempted) {
1325 		tlp->threshold.preempted = deadline;
1326 		tlp->threshold.call = call;
1327 		update_required = TRUE;
1328 	}
1329 	timer_queue_unlock(timer_longterm_queue);
1330 	simple_unlock(&call->tc_lock);
1331 
1332 	if (update_required) {
1333 		/*
1334 		 * Note: this call expects that calling the master cpu
1335 		 * alone does not involve locking the topo lock.
1336 		 */
1337 		timer_call_nosync_cpu(
1338 			master_cpu,
1339 			(void (*)(void *))timer_longterm_update,
1340 			(void *)tlp);
1341 	}
1342 
1343 	return timer_longterm_queue;
1344 }
1345 
1346 /*
1347  * Scan for timers below the longterm threshold.
1348  * Move these to the local timer queue (of the boot processor on which the
1349  * calling thread is running).
1350  * Both the local (boot) queue and the longterm queue are locked.
1351  * The scan is similar to the timer migrate sequence but is performed by
1352  * successively examining each timer on the longterm queue:
1353  *  - if within the short-term threshold
1354  *    - enter on the local queue (unless being deleted),
1355  *  - otherwise:
1356  *    - if sooner, deadline becomes the next threshold deadline.
1357  * The total scan time is limited to TIMER_LONGTERM_SCAN_LIMIT. Should this be
1358  * exceeded, we abort and reschedule the scan so that we don't lock others out of
1359  * the timer queues. Longterm timers firing late is not critical.
1360  */
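/*
 * Worked example (hypothetical values): with a 1 sec threshold and
 * time_start = T, an entry whose soft deadline is T + 700 msec is escalated
 * onto the boot cpu's queue, whereas one due at T + 3 sec only updates
 * tlp->threshold.deadline if it is the earliest non-escalated deadline seen.
 */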
1361 void
1362 timer_longterm_scan(timer_longterm_t    *tlp,
1363     uint64_t            time_start)
1364 {
1365 	timer_call_t    call;
1366 	uint64_t        threshold = TIMER_LONGTERM_NONE;
1367 	uint64_t        deadline;
1368 	uint64_t        time_limit = time_start + tlp->scan_limit;
1369 	mpqueue_head_t  *timer_master_queue;
1370 
1371 	assert(!ml_get_interrupts_enabled());
1372 	assert(cpu_number() == master_cpu);
1373 
1374 	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
1375 		threshold = time_start + tlp->threshold.interval;
1376 	}
1377 
1378 	tlp->threshold.deadline = TIMER_LONGTERM_NONE;
1379 	tlp->threshold.call = NULL;
1380 
1381 	if (queue_empty(&timer_longterm_queue->head)) {
1382 		return;
1383 	}
1384 
1385 	timer_master_queue = timer_queue_cpu(master_cpu);
1386 	timer_queue_lock_spin(timer_master_queue);
1387 
1388 	qe_foreach_element_safe(call, &timer_longterm_queue->head, tc_qlink) {
1389 		deadline = call->tc_soft_deadline;
1390 		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
1391 			/* case (2c) lock order inversion, dequeue only */
1392 #ifdef TIMER_ASSERT
1393 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1394 			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
1395 			    VM_KERNEL_UNSLIDE_OR_PERM(call),
1396 			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
1397 			    0,
1398 			    0x2c, 0);
1399 #endif
1400 			timer_call_entry_dequeue_async(call);
1401 			continue;
1402 		}
1403 		if (deadline < threshold) {
1404 			/*
1405 			 * This timer needs moving (escalating)
1406 			 * to the local (boot) processor's queue.
1407 			 */
1408 #ifdef TIMER_ASSERT
1409 			if (deadline < time_start) {
1410 				TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1411 				    DECR_TIMER_OVERDUE | DBG_FUNC_NONE,
1412 				    VM_KERNEL_UNSLIDE_OR_PERM(call),
1413 				    deadline,
1414 				    time_start,
1415 				    threshold,
1416 				    0);
1417 			}
1418 #endif
1419 			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1420 			    DECR_TIMER_ESCALATE | DBG_FUNC_NONE,
1421 			    VM_KERNEL_UNSLIDE_OR_PERM(call),
1422 			    call->tc_pqlink.deadline,
1423 			    call->tc_entry_time,
1424 			    VM_KERNEL_UNSLIDE(call->tc_func),
1425 			    0);
1426 			tlp->escalates++;
1427 			timer_call_entry_dequeue(call);
1428 			timer_call_entry_enqueue_deadline(
1429 				call, timer_master_queue, call->tc_pqlink.deadline);
1430 			/*
1431 			 * A side-effect of the following call is to update
1432 			 * the actual hardware deadline if required.
1433 			 */
1434 			(void) timer_queue_assign(deadline);
1435 		} else {
1436 			if (deadline < tlp->threshold.deadline) {
1437 				tlp->threshold.deadline = deadline;
1438 				tlp->threshold.call = call;
1439 			}
1440 		}
1441 		simple_unlock(&call->tc_lock);
1442 
1443 		/* Abort scan if we're taking too long. */
1444 		if (mach_absolute_time() > time_limit) {
1445 			tlp->threshold.deadline = TIMER_LONGTERM_SCAN_AGAIN;
1446 			tlp->scan_pauses++;
1447 			DBG("timer_longterm_scan() paused %llu, qlen: %llu\n",
1448 			    time_limit, tlp->queue.count);
1449 			break;
1450 		}
1451 	}
1452 
1453 	timer_queue_unlock(timer_master_queue);
1454 }
1455 
1456 void
1457 timer_longterm_callout(timer_call_param_t p0, __unused timer_call_param_t p1)
1458 {
1459 	timer_longterm_t        *tlp = (timer_longterm_t *) p0;
1460 
1461 	timer_longterm_update(tlp);
1462 }
1463 
1464 void
1465 timer_longterm_update_locked(timer_longterm_t *tlp)
1466 {
1467 	uint64_t        latency;
1468 
1469 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1470 	    DECR_TIMER_UPDATE | DBG_FUNC_START,
1471 	    VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
1472 	    tlp->threshold.deadline,
1473 	    tlp->threshold.preempted,
1474 	    tlp->queue.count, 0);
1475 
1476 	tlp->scan_time = mach_absolute_time();
1477 	if (tlp->threshold.preempted != TIMER_LONGTERM_NONE) {
1478 		tlp->threshold.preempts++;
1479 		tlp->threshold.deadline = tlp->threshold.preempted;
1480 		tlp->threshold.preempted = TIMER_LONGTERM_NONE;
1481 		/*
1482 		 * Note: in the unlikely event that a pre-empted timer has
1483 		 * itself been cancelled, we'll simply re-scan later at the
1484 		 * time of the preempted/cancelled timer.
1485 		 */
1486 	} else {
1487 		tlp->threshold.scans++;
1488 
1489 		/*
1490 		 * Maintain a moving average of our wakeup latency.
1491 		 * Clamp latency to 0 and ignore above threshold interval.
1492 		 */
1493 		if (tlp->scan_time > tlp->threshold.deadline_set) {
1494 			latency = tlp->scan_time - tlp->threshold.deadline_set;
1495 		} else {
1496 			latency = 0;
1497 		}
1498 		if (latency < tlp->threshold.interval) {
1499 			tlp->threshold.latency_min =
1500 			    MIN(tlp->threshold.latency_min, latency);
1501 			tlp->threshold.latency_max =
1502 			    MAX(tlp->threshold.latency_max, latency);
1503 			tlp->threshold.latency =
1504 			    (tlp->threshold.latency * 99 + latency) / 100;
1505 		}
1506 
1507 		timer_longterm_scan(tlp, tlp->scan_time);
1508 	}
1509 
1510 	tlp->threshold.deadline_set = tlp->threshold.deadline;
1511 	/* The next deadline timer to be set is adjusted */
1512 	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE &&
1513 	    tlp->threshold.deadline != TIMER_LONGTERM_SCAN_AGAIN) {
1514 		tlp->threshold.deadline_set -= tlp->threshold.margin;
1515 		tlp->threshold.deadline_set -= tlp->threshold.latency;
1516 	}
1517 
1518 	/* Throttle next scan time */
1519 	uint64_t scan_clamp = mach_absolute_time() + tlp->scan_interval;
1520 	if (tlp->threshold.deadline_set < scan_clamp) {
1521 		tlp->threshold.deadline_set = scan_clamp;
1522 	}
1523 
1524 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1525 	    DECR_TIMER_UPDATE | DBG_FUNC_END,
1526 	    VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
1527 	    tlp->threshold.deadline,
1528 	    tlp->threshold.scans,
1529 	    tlp->queue.count, 0);
1530 }
1531 
1532 void
1533 timer_longterm_update(timer_longterm_t *tlp)
1534 {
1535 	spl_t   s = splclock();
1536 
1537 	timer_queue_lock_spin(timer_longterm_queue);
1538 
1539 	if (cpu_number() != master_cpu) {
1540 		panic("timer_longterm_update() on non-boot cpu");
1541 	}
1542 
1543 	timer_longterm_update_locked(tlp);
1544 
1545 	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
1546 		timer_call_enter(
1547 			&tlp->threshold.timer,
1548 			tlp->threshold.deadline_set,
1549 			TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
1550 	}
1551 
1552 	timer_queue_unlock(timer_longterm_queue);
1553 	splx(s);
1554 }
1555 
1556 void
1557 timer_longterm_init(void)
1558 {
1559 	uint32_t                longterm;
1560 	timer_longterm_t        *tlp = &timer_longterm;
1561 
1562 	DBG("timer_longterm_init() tlp: %p, queue: %p\n", tlp, &tlp->queue);
1563 
1564 	/*
1565 	 * Set the longterm timer threshold. Defaults to TIMER_LONGTERM_THRESHOLD
1566 	 * or TIMER_LONGTERM_NONE (disabled) when serverperfmode is set;
1567 	 * either default can be overridden with the "longterm" boot-arg.
1568 	 */
1569 	tlp->threshold.interval = serverperfmode ? TIMER_LONGTERM_NONE
1570 	    : TIMER_LONGTERM_THRESHOLD;
1571 	if (PE_parse_boot_argn("longterm", &longterm, sizeof(longterm))) {
1572 		tlp->threshold.interval = (longterm == 0) ?
1573 		    TIMER_LONGTERM_NONE :
1574 		    longterm * NSEC_PER_MSEC;
1575 	}
1576 	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
1577 		printf("Longterm timer threshold: %llu ms\n",
1578 		    tlp->threshold.interval / NSEC_PER_MSEC);
1579 		kprintf("Longterm timer threshold: %llu ms\n",
1580 		    tlp->threshold.interval / NSEC_PER_MSEC);
1581 		nanoseconds_to_absolutetime(tlp->threshold.interval,
1582 		    &tlp->threshold.interval);
1583 		tlp->threshold.margin = tlp->threshold.interval / 10;
1584 		tlp->threshold.latency_min = EndOfAllTime;
1585 		tlp->threshold.latency_max = 0;
1586 	}
1587 
1588 	tlp->threshold.preempted = TIMER_LONGTERM_NONE;
1589 	tlp->threshold.deadline = TIMER_LONGTERM_NONE;
1590 
1591 	mpqueue_init(&tlp->queue, &timer_longterm_lck_grp, LCK_ATTR_NULL);
1592 
1593 	timer_call_setup(&tlp->threshold.timer,
1594 	    timer_longterm_callout, (timer_call_param_t) tlp);
1595 
1596 	timer_longterm_queue = &tlp->queue;
1597 }
1598 
1599 enum {
1600 	THRESHOLD, QCOUNT,
1601 	ENQUEUES, DEQUEUES, ESCALATES, SCANS, PREEMPTS,
1602 	LATENCY, LATENCY_MIN, LATENCY_MAX, LONG_TERM_SCAN_LIMIT,
1603 	LONG_TERM_SCAN_INTERVAL, LONG_TERM_SCAN_PAUSES,
1604 	SCAN_LIMIT, SCAN_INTERVAL, SCAN_PAUSES, SCAN_POSTPONES,
1605 };
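/*
 * timer_sysctl_get() reports longterm-queue statistics and the timer scan
 * tunables for the sysctl OIDs enumerated above; unknown OIDs return 0.
 * SCAN_LIMIT and SCAN_INTERVAL are kept in microseconds internally and
 * reported here in nanoseconds.
 */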
1606 uint64_t
1607 timer_sysctl_get(int oid)
1608 {
1609 	timer_longterm_t        *tlp = &timer_longterm;
1610 
1611 	switch (oid) {
1612 	case THRESHOLD:
1613 		return (tlp->threshold.interval == TIMER_LONGTERM_NONE) ?
1614 		       0 : tlp->threshold.interval / NSEC_PER_MSEC;
1615 	case QCOUNT:
1616 		return tlp->queue.count;
1617 	case ENQUEUES:
1618 		return tlp->enqueues;
1619 	case DEQUEUES:
1620 		return tlp->dequeues;
1621 	case ESCALATES:
1622 		return tlp->escalates;
1623 	case SCANS:
1624 		return tlp->threshold.scans;
1625 	case PREEMPTS:
1626 		return tlp->threshold.preempts;
1627 	case LATENCY:
1628 		return tlp->threshold.latency;
1629 	case LATENCY_MIN:
1630 		return tlp->threshold.latency_min;
1631 	case LATENCY_MAX:
1632 		return tlp->threshold.latency_max;
1633 	case LONG_TERM_SCAN_LIMIT:
1634 		return tlp->scan_limit;
1635 	case LONG_TERM_SCAN_INTERVAL:
1636 		return tlp->scan_interval;
1637 	case LONG_TERM_SCAN_PAUSES:
1638 		return tlp->scan_pauses;
1639 	case SCAN_LIMIT:
1640 		return timer_scan_limit_us * NSEC_PER_USEC;
1641 	case SCAN_INTERVAL:
1642 		return timer_scan_interval_us * NSEC_PER_USEC;
1643 	case SCAN_PAUSES:
1644 		return counter_load(&timer_scan_pauses_cnt);
1645 	case SCAN_POSTPONES:
1646 		return counter_load(&timer_scan_postpones_cnt);
1647 
1648 	default:
1649 		return 0;
1650 	}
1651 }
1652 
1653 /*
1654  * timer_master_scan() is the inverse of timer_longterm_scan()
1655  * since it un-escalates timers to the longterm queue.
1656  */
1657 static void
1658 timer_master_scan(timer_longterm_t      *tlp,
1659     uint64_t              now)
1660 {
1661 	timer_call_t    call;
1662 	uint64_t        threshold;
1663 	uint64_t        deadline;
1664 	mpqueue_head_t  *timer_master_queue;
1665 
1666 	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
1667 		threshold = now + tlp->threshold.interval;
1668 	} else {
1669 		threshold = TIMER_LONGTERM_NONE;
1670 	}
1671 
1672 	timer_master_queue = timer_queue_cpu(master_cpu);
1673 	timer_queue_lock_spin(timer_master_queue);
1674 
1675 	qe_foreach_element_safe(call, &timer_master_queue->head, tc_qlink) {
1676 		deadline = call->tc_pqlink.deadline;
1677 		if ((call->tc_flags & TIMER_CALL_LOCAL) != 0) {
1678 			continue;
1679 		}
1680 		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
1681 			/* case (2c) lock order inversion, dequeue only */
1682 			timer_call_entry_dequeue_async(call);
1683 			continue;
1684 		}
1685 		if (deadline > threshold) {
1686 			/* move from master to longterm */
1687 			timer_call_entry_dequeue(call);
1688 			timer_call_entry_enqueue_tail(call, timer_longterm_queue);
1689 			if (deadline < tlp->threshold.deadline) {
1690 				tlp->threshold.deadline = deadline;
1691 				tlp->threshold.call = call;
1692 			}
1693 		}
1694 		simple_unlock(&call->tc_lock);
1695 	}
1696 	timer_queue_unlock(timer_master_queue);
1697 }
1698 
1699 static void
1700 timer_sysctl_set_threshold(void* valp)
1701 {
1702 	uint64_t value =        (uint64_t)valp;
1703 	timer_longterm_t        *tlp = &timer_longterm;
1704 	spl_t                   s = splclock();
1705 	boolean_t               threshold_increase;
1706 
1707 	timer_queue_lock_spin(timer_longterm_queue);
1708 
1709 	timer_call_cancel(&tlp->threshold.timer);
1710 
1711 	/*
1712 	 * Set the new threshold and note whether it's increasing.
1713 	 */
1714 	if (value == 0) {
1715 		tlp->threshold.interval = TIMER_LONGTERM_NONE;
1716 		threshold_increase = TRUE;
1717 		timer_call_cancel(&tlp->threshold.timer);
1718 	} else {
1719 		uint64_t        old_interval = tlp->threshold.interval;
1720 		tlp->threshold.interval = value * NSEC_PER_MSEC;
1721 		nanoseconds_to_absolutetime(tlp->threshold.interval,
1722 		    &tlp->threshold.interval);
1723 		tlp->threshold.margin = tlp->threshold.interval / 10;
1724 		if (old_interval == TIMER_LONGTERM_NONE) {
1725 			threshold_increase = FALSE;
1726 		} else {
1727 			threshold_increase = (tlp->threshold.interval > old_interval);
1728 		}
1729 	}
1730 
1731 	if (threshold_increase /* or removal */) {
1732 		/* Escalate timers from the longterm queue */
1733 		timer_longterm_scan(tlp, mach_absolute_time());
1734 	} else { /* decrease or addition  */
1735 		/*
1736 		 * We scan the local/master queue for timers now longterm.
1737 		 * To be strictly correct, we should scan all processor queues
1738 		 * but timer migration results in most timers gravitating to the
1739 		 * master processor in any case.
1740 		 */
1741 		timer_master_scan(tlp, mach_absolute_time());
1742 	}
1743 
1744 	/* Set new timer accordingly */
1745 	tlp->threshold.deadline_set = tlp->threshold.deadline;
1746 	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
1747 		tlp->threshold.deadline_set -= tlp->threshold.margin;
1748 		tlp->threshold.deadline_set -= tlp->threshold.latency;
1749 		timer_call_enter(
1750 			&tlp->threshold.timer,
1751 			tlp->threshold.deadline_set,
1752 			TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
1753 	}
1754 
1755 	/* Reset stats */
1756 	tlp->enqueues = 0;
1757 	tlp->dequeues = 0;
1758 	tlp->escalates = 0;
1759 	tlp->scan_pauses = 0;
1760 	tlp->threshold.scans = 0;
1761 	tlp->threshold.preempts = 0;
1762 	tlp->threshold.latency = 0;
1763 	tlp->threshold.latency_min = EndOfAllTime;
1764 	tlp->threshold.latency_max = 0;
1765 
1766 	timer_queue_unlock(timer_longterm_queue);
1767 	splx(s);
1768 }
1769 
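/*
 * timer_sysctl_set() applies sysctl writes: THRESHOLD is forwarded to the
 * boot processor via timer_call_cpu(), the longterm scan tunables are
 * stored directly, and SCAN_LIMIT/SCAN_INTERVAL (supplied in nanoseconds)
 * are range-checked, then kept both in microseconds and in absolute time.
 *
 * Hypothetical usage sketch (not part of this file): a sysctl handler
 * might simply forward its value, e.g.
 *
 *	int error = timer_sysctl_set(SCAN_LIMIT, 400 * NSEC_PER_USEC);
 *	if (error != KERN_SUCCESS) {
 *		return EINVAL;
 *	}
 *
 * assuming 400 us lies within [MIN_TIMER_SCAN_LIMIT, MAX_TIMER_SCAN_LIMIT].
 */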
1770 int
1771 timer_sysctl_set(int oid, uint64_t value)
1772 {
1773 	switch (oid) {
1774 	case THRESHOLD:
1775 		timer_call_cpu(
1776 			master_cpu,
1777 			timer_sysctl_set_threshold,
1778 			(void *) value);
1779 		return KERN_SUCCESS;
1780 	case LONG_TERM_SCAN_LIMIT:
1781 		timer_longterm.scan_limit = value;
1782 		return KERN_SUCCESS;
1783 	case LONG_TERM_SCAN_INTERVAL:
1784 		timer_longterm.scan_interval = value;
1785 		return KERN_SUCCESS;
1786 	case SCAN_LIMIT:
1787 		if (value > MAX_TIMER_SCAN_LIMIT ||
1788 		    value < MIN_TIMER_SCAN_LIMIT) {
1789 			return KERN_INVALID_ARGUMENT;
1790 		}
1791 		timer_scan_limit_us = value / NSEC_PER_USEC;
1792 		nanoseconds_to_absolutetime(timer_scan_limit_us * NSEC_PER_USEC,
1793 		    &timer_scan_limit_abs);
1794 		return KERN_SUCCESS;
1795 	case SCAN_INTERVAL:
1796 		if (value > MAX_TIMER_SCAN_INTERVAL ||
1797 		    value < MIN_TIMER_SCAN_INTERVAL) {
1798 			return KERN_INVALID_ARGUMENT;
1799 		}
1800 		timer_scan_interval_us = value / NSEC_PER_USEC;
1801 		nanoseconds_to_absolutetime(timer_scan_interval_us * NSEC_PER_USEC,
1802 		    &timer_scan_interval_abs);
1803 		return KERN_SUCCESS;
1804 	default:
1805 		return KERN_INVALID_ARGUMENT;
1806 	}
1807 }
1808 
1809 
1810 /* Select timer coalescing window based on per-task quality-of-service hints */
1811 static boolean_t
1812 tcoal_qos_adjust(thread_t t, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited)
1813 {
1814 	uint32_t latency_qos;
1815 	boolean_t adjusted = FALSE;
1816 	task_t ctask = get_threadtask(t);
1817 
1818 	if (ctask) {
1819 		latency_qos = proc_get_effective_thread_policy(t, TASK_POLICY_LATENCY_QOS);
1820 
1821 		assert(latency_qos <= NUM_LATENCY_QOS_TIERS);
1822 
1823 		if (latency_qos) {
1824 			*tshift = tcoal_prio_params.latency_qos_scale[latency_qos - 1];
1825 			*tmax_abstime = tcoal_prio_params.latency_qos_abstime_max[latency_qos - 1];
1826 			*pratelimited = tcoal_prio_params.latency_tier_rate_limited[latency_qos - 1];
1827 			adjusted = TRUE;
1828 		}
1829 	}
1830 	return adjusted;
1831 }
1832 
1833 
1834 /* Adjust timer deadlines based on priority of the thread and the
1835  * urgency value provided at timeout establishment. With this mechanism,
1836  * timers are no longer necessarily sorted in order of soft deadline
1837  * on a given timer queue, i.e. they may be differentially skewed.
1838  * In the current scheme, this could lead to fewer pending timers
1839  * processed than is technically possible when the HW deadline arrives.
1840  */
1841 static void
1842 timer_compute_leeway(thread_t cthread, int32_t urgency, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited)
1843 {
1844 	int16_t tpri = cthread->sched_pri;
1845 	if ((urgency & TIMER_CALL_USER_MASK) != 0) {
1846 		bool tg_critical = false;
1847 #if CONFIG_THREAD_GROUPS
1848 		uint32_t tg_flags = thread_group_get_flags(thread_group_get(cthread));
1849 		tg_critical = tg_flags & (THREAD_GROUP_FLAGS_CRITICAL | THREAD_GROUP_FLAGS_STRICT_TIMERS);
1850 #endif /* CONFIG_THREAD_GROUPS */
1851 		bool timer_critical = (tpri >= BASEPRI_RTQUEUES) ||
1852 		    (urgency == TIMER_CALL_USER_CRITICAL) ||
1853 		    tg_critical;
1854 		if (timer_critical) {
1855 			*tshift = tcoal_prio_params.timer_coalesce_rt_shift;
1856 			*tmax_abstime = tcoal_prio_params.timer_coalesce_rt_abstime_max;
1857 			TCOAL_PRIO_STAT(rt_tcl);
1858 		} else if (proc_get_effective_thread_policy(cthread, TASK_POLICY_DARWIN_BG) ||
1859 		    (urgency == TIMER_CALL_USER_BACKGROUND)) {
1860 			/* Determine if timer should be subjected to a lower QoS */
1861 			if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
1862 				if (*tmax_abstime > tcoal_prio_params.timer_coalesce_bg_abstime_max) {
1863 					return;
1864 				} else {
1865 					*pratelimited = FALSE;
1866 				}
1867 			}
1868 			*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
1869 			*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
1870 			TCOAL_PRIO_STAT(bg_tcl);
1871 		} else if (tpri >= MINPRI_KERNEL) {
1872 			*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
1873 			*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
1874 			TCOAL_PRIO_STAT(kt_tcl);
1875 		} else if (cthread->sched_mode == TH_MODE_FIXED) {
1876 			*tshift = tcoal_prio_params.timer_coalesce_fp_shift;
1877 			*tmax_abstime = tcoal_prio_params.timer_coalesce_fp_abstime_max;
1878 			TCOAL_PRIO_STAT(fp_tcl);
1879 		} else if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
1880 			TCOAL_PRIO_STAT(qos_tcl);
1881 		} else if (cthread->sched_mode == TH_MODE_TIMESHARE) {
1882 			*tshift = tcoal_prio_params.timer_coalesce_ts_shift;
1883 			*tmax_abstime = tcoal_prio_params.timer_coalesce_ts_abstime_max;
1884 			TCOAL_PRIO_STAT(ts_tcl);
1885 		} else {
1886 			TCOAL_PRIO_STAT(nc_tcl);
1887 		}
1888 	} else if (urgency == TIMER_CALL_SYS_BACKGROUND) {
1889 		*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
1890 		*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
1891 		TCOAL_PRIO_STAT(bg_tcl);
1892 	} else {
1893 		*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
1894 		*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
1895 		TCOAL_PRIO_STAT(kt_tcl);
1896 	}
1897 }
1898 
1899 
1900 int timer_user_idle_level;
1901 
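/*
 * timer_call_slop() computes the coalescing leeway for a deadline: the time
 * remaining to the deadline is scaled down by the tier's shift (or scaled up
 * when the shift is negative) and clamped to the tier's absolute maximum,
 * then inflated by up to 100% according to timer_user_idle_level (0..128).
 *
 * Worked example (illustrative numbers only): with 8 ms to the deadline, a
 * shift of 3 and a 10 ms cap, the base slop is 1 ms; at an idle level of 64
 * the returned slop becomes 1 ms + (1 ms * 64 / 128) = 1.5 ms.
 */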
1902 uint64_t
1903 timer_call_slop(uint64_t deadline, uint64_t now, uint32_t flags, thread_t cthread, boolean_t *pratelimited)
1904 {
1905 	int32_t tcs_shift = 0;
1906 	uint64_t tcs_max_abstime = 0;
1907 	uint64_t adjval;
1908 	uint32_t urgency = (flags & TIMER_CALL_URGENCY_MASK);
1909 
1910 	if (mach_timer_coalescing_enabled &&
1911 	    (deadline > now) && (urgency != TIMER_CALL_SYS_CRITICAL)) {
1912 		timer_compute_leeway(cthread, urgency, &tcs_shift, &tcs_max_abstime, pratelimited);
1913 
1914 		if (tcs_shift >= 0) {
1915 			adjval =  MIN((deadline - now) >> tcs_shift, tcs_max_abstime);
1916 		} else {
1917 			adjval =  MIN((deadline - now) << (-tcs_shift), tcs_max_abstime);
1918 		}
1919 		/* Apply adjustments derived from "user idle level" heuristic */
1920 		adjval += (adjval * timer_user_idle_level) >> 7;
1921 		return adjval;
1922 	} else {
1923 		return 0;
1924 	}
1925 }
1926 
1927 int
1928 timer_get_user_idle_level(void)
1929 {
1930 	return timer_user_idle_level;
1931 }
1932 
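/*
 * timer_set_user_idle_level() accepts levels 0..128; each unit adds 1/128 of
 * extra slop in timer_call_slop(). Lowering the level triggers an immediate
 * ml_timer_evaluate() so already-coalesced deadlines can tighten back up.
 */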
1933 kern_return_t
1934 timer_set_user_idle_level(int ilevel)
1935 {
1936 	boolean_t do_reeval = FALSE;
1937 
1938 	if ((ilevel < 0) || (ilevel > 128)) {
1939 		return KERN_INVALID_ARGUMENT;
1940 	}
1941 
1942 	if (ilevel < timer_user_idle_level) {
1943 		do_reeval = TRUE;
1944 	}
1945 
1946 	timer_user_idle_level = ilevel;
1947 
1948 	if (do_reeval) {
1949 		ml_timer_evaluate();
1950 	}
1951 
1952 	return KERN_SUCCESS;
1953 }
1954 
1955 #pragma mark - running timers
1956 
1957 #define RUNNING_TIMER_FAKE_FLAGS (TIMER_CALL_SYS_CRITICAL | \
1958     TIMER_CALL_LOCAL)
1959 
1960 /*
1961  * timer_call_trace_* functions mimic the tracing behavior from the normal
1962  * timer_call subsystem, so tools continue to function.
1963  */
1964 
1965 static void
1966 timer_call_trace_enter_before(struct timer_call *call, uint64_t deadline,
1967     uint32_t flags, uint64_t now)
1968 {
1969 #pragma unused(call, deadline, flags, now)
1970 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_START,
1971 	    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_ADDRHIDE(call->tc_param1),
1972 	    deadline, flags, 0);
1973 #if CONFIG_DTRACE
1974 	uint64_t ttd = deadline - now;
1975 	DTRACE_TMR7(callout__create, timer_call_func_t, call->tc_func,
1976 	    timer_call_param_t, call->tc_param0, uint32_t, flags, 0,
1977 	    (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF), NULL);
1978 #endif /* CONFIG_DTRACE */
1979 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_END,
1980 	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline, 0, 0);
1981 }
1982 
1983 static void
1984 timer_call_trace_enter_after(struct timer_call *call, uint64_t deadline)
1985 {
1986 #pragma unused(call, deadline)
1987 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_END,
1988 	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline, 0, 0);
1989 }
1990 
1991 static void
1992 timer_call_trace_cancel(struct timer_call *call)
1993 {
1994 #pragma unused(call)
1995 	__unused uint64_t deadline = call->tc_pqlink.deadline;
1996 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CANCEL | DBG_FUNC_START,
1997 	    VM_KERNEL_UNSLIDE_OR_PERM(call), deadline, 0,
1998 	    call->tc_flags, 0);
1999 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CANCEL | DBG_FUNC_END,
2000 	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline - mach_absolute_time(),
2001 	    deadline - call->tc_entry_time, 0);
2002 #if CONFIG_DTRACE
2003 #if TIMER_TRACE
2004 	uint64_t ttd = deadline - call->tc_entry_time;
2005 #else
2006 	uint64_t ttd = UINT64_MAX;
2007 #endif /* TIMER_TRACE */
2008 	DTRACE_TMR6(callout__cancel, timer_call_func_t, call->tc_func,
2009 	    timer_call_param_t, call->tc_param0, uint32_t, call->tc_flags, 0,
2010 	    (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF));
2011 #endif /* CONFIG_DTRACE */
2012 }
2013 
2014 static void
2015 timer_call_trace_expire_entry(struct timer_call *call)
2016 {
2017 #pragma unused(call)
2018 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CALLOUT | DBG_FUNC_START,
2019 	    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(call->tc_func),
2020 	    VM_KERNEL_ADDRHIDE(call->tc_param0),
2021 	    VM_KERNEL_ADDRHIDE(call->tc_param1),
2022 	    0);
2023 #if CONFIG_DTRACE
2024 #if TIMER_TRACE
2025 	uint64_t ttd = call->tc_pqlink.deadline - call->tc_entry_time;
2026 #else /* TIMER_TRACE */
2027 	uint64_t ttd = UINT64_MAX;
2028 #endif /* TIMER_TRACE */
2029 	DTRACE_TMR7(callout__start, timer_call_func_t, call->tc_func,
2030 	    timer_call_param_t, call->tc_param0, unsigned, call->tc_flags,
2031 	    0, (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF), NULL);
2032 #endif /* CONFIG_DTRACE */
2033 }
2034 
2035 static void
2036 timer_call_trace_expire_return(struct timer_call *call)
2037 {
2038 #pragma unused(call)
2039 #if CONFIG_DTRACE
2040 	DTRACE_TMR4(callout__end, timer_call_func_t, call->tc_func,
2041 	    call->tc_param0, call->tc_param1, NULL);
2042 #endif /* CONFIG_DTRACE */
2043 	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CALLOUT | DBG_FUNC_END,
2044 	    VM_KERNEL_UNSLIDE_OR_PERM(call),
2045 	    VM_KERNEL_UNSLIDE(call->tc_func),
2046 	    VM_KERNEL_ADDRHIDE(call->tc_param0),
2047 	    VM_KERNEL_ADDRHIDE(call->tc_param1),
2048 	    0);
2049 }
2050 
2051 /*
2052  * Set a new deadline for a running timer on this processor.
2053  */
2054 void
2055 running_timer_setup(processor_t processor, enum running_timer timer,
2056     void *param, uint64_t deadline, uint64_t now)
2057 {
2058 	assert(timer < RUNNING_TIMER_MAX);
2059 	assert(ml_get_interrupts_enabled() == FALSE);
2060 
2061 	struct timer_call *call = &processor->running_timers[timer];
2062 
2063 	timer_call_trace_enter_before(call, deadline, RUNNING_TIMER_FAKE_FLAGS,
2064 	    now);
2065 
2066 	if (__improbable(deadline < now)) {
2067 		deadline = timer_call_past_deadline_timer_handle(deadline, now);
2068 	}
2069 
2070 	call->tc_pqlink.deadline = deadline;
2071 #if TIMER_TRACE
2072 	call->tc_entry_time = now;
2073 #endif /* TIMER_TRACE */
2074 	call->tc_param1 = param;
2075 
2076 	timer_call_trace_enter_after(call, deadline);
2077 }
2078 
2079 void
2080 running_timers_sync(void)
2081 {
2082 	timer_resync_deadlines();
2083 }
2084 
2085 void
2086 running_timer_enter(processor_t processor, unsigned int timer,
2087     void *param, uint64_t deadline, uint64_t now)
2088 {
2089 	running_timer_setup(processor, timer, param, deadline, now);
2090 	running_timers_sync();
2091 }
2092 
2093 /*
2094  * Call the callback for any running timers that fired for this processor.
2095  * Returns true if any timers were past their deadline.
2096  */
2097 bool
2098 running_timers_expire(processor_t processor, uint64_t now)
2099 {
2100 	bool expired = false;
2101 
2102 	if (!processor->running_timers_active) {
2103 		return expired;
2104 	}
2105 
2106 	for (int i = 0; i < RUNNING_TIMER_MAX; i++) {
2107 		struct timer_call *call = &processor->running_timers[i];
2108 
2109 		uint64_t deadline = call->tc_pqlink.deadline;
2110 		if (deadline > now) {
2111 			continue;
2112 		}
2113 
2114 		expired = true;
2115 		timer_call_trace_expire_entry(call);
2116 		call->tc_func(call->tc_param0, call->tc_param1);
2117 		timer_call_trace_expire_return(call);
2118 	}
2119 
2120 	return expired;
2121 }
2122 
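/*
 * running_timer_clear() marks the per-processor timer as idle (deadline of
 * EndOfAllTime) without resyncing hardware timer deadlines; callers that
 * need the hardware updated use running_timer_cancel() below.
 */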
2123 void
2124 running_timer_clear(processor_t processor, enum running_timer timer)
2125 {
2126 	struct timer_call *call = &processor->running_timers[timer];
2127 	uint64_t deadline = call->tc_pqlink.deadline;
2128 	if (deadline == EndOfAllTime) {
2129 		return;
2130 	}
2131 
2132 	call->tc_pqlink.deadline = EndOfAllTime;
2133 #if TIMER_TRACE
2134 	call->tc_entry_time = 0;
2135 #endif /* TIMER_TRACE */
2136 	timer_call_trace_cancel(call);
2137 }
2138 
2139 void
2140 running_timer_cancel(processor_t processor, unsigned int timer)
2141 {
2142 	running_timer_clear(processor, timer);
2143 	running_timers_sync();
2144 }
2145 
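/*
 * running_timers_deadline() returns the earliest armed deadline across this
 * processor's running timers, or EndOfAllTime if running timers are inactive
 * or none are armed.
 */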
2146 uint64_t
2147 running_timers_deadline(processor_t processor)
2148 {
2149 	if (!processor->running_timers_active) {
2150 		return EndOfAllTime;
2151 	}
2152 
2153 	uint64_t deadline = EndOfAllTime;
2154 	for (int i = 0; i < RUNNING_TIMER_MAX; i++) {
2155 		uint64_t candidate =
2156 		    processor->running_timers[i].tc_pqlink.deadline;
2157 		if (candidate != 0 && candidate < deadline) {
2158 			deadline = candidate;
2159 		}
2160 	}
2161 
2162 	return deadline;
2163 }
2164 
2165 void
2166 running_timers_activate(processor_t processor)
2167 {
2168 	processor->running_timers_active = true;
2169 	running_timers_sync();
2170 }
2171 
2172 void
2173 running_timers_deactivate(processor_t processor)
2174 {
2175 	assert(processor->running_timers_active == true);
2176 	processor->running_timers_active = false;
2177 	running_timers_sync();
2178 }
2179