1 /*
2 * Copyright (c) 1993-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Timer interrupt callout module.
30 */
31
32 #include <mach/mach_types.h>
33
34 #include <kern/clock.h>
35 #include <kern/counter.h>
36 #include <kern/smp.h>
37 #include <kern/processor.h>
38 #include <kern/timer_call.h>
39 #include <kern/timer_queue.h>
40 #include <kern/thread.h>
41 #include <kern/thread_group.h>
42 #include <kern/policy_internal.h>
43
44 #include <sys/kdebug.h>
45
46 #if CONFIG_DTRACE
47 #include <mach/sdt.h>
48 #endif
49
50
51 #if DEBUG
52 #define TIMER_ASSERT 1
53 #endif
54
55 //#define TIMER_ASSERT 1
56 //#define TIMER_DBG 1
57
58 #if TIMER_DBG
59 #define DBG(x...) kprintf("DBG: " x);
60 #else
61 #define DBG(x...)
62 #endif
63
64 #if TIMER_TRACE
65 #define TIMER_KDEBUG_TRACE KERNEL_DEBUG_CONSTANT_IST
66 #else
67 #define TIMER_KDEBUG_TRACE(x...)
68 #endif
69
70 LCK_GRP_DECLARE(timer_call_lck_grp, "timer_call");
71 LCK_GRP_DECLARE(timer_longterm_lck_grp, "timer_longterm");
72
73 /* Timer queue lock must be acquired with interrupts disabled (under splclock()) */
74 #define timer_queue_lock_spin(queue) lck_ticket_lock(&(queue)->lock_data, LCK_GRP_NULL)
75 #define timer_queue_unlock(queue) lck_ticket_unlock(&(queue)->lock_data)
76
77 /*
78 * The longterm timer object is a global structure holding all timers
79 * beyond the short-term, local timer queue threshold. The boot processor
80 * is responsible for moving each timer to its local timer queue
81 * if and when that timer becomes due within the threshold.
82 */
83
84 /* Sentinel for "no time set": */
85 #define TIMER_LONGTERM_NONE EndOfAllTime
/* The default threshold is the delta above which a timer is "long-term" */
87 #if defined(__x86_64__)
88 #define TIMER_LONGTERM_THRESHOLD (1ULL * NSEC_PER_SEC) /* 1 sec */
89 #else
90 #define TIMER_LONGTERM_THRESHOLD TIMER_LONGTERM_NONE /* disabled */
91 #endif
92
93 /*
94 * The scan_limit throttles processing of the longterm queue.
95 * If the scan time exceeds this limit, we terminate, unlock
96 * and defer for scan_interval. This prevents unbounded holding of
97 * timer queue locks with interrupts masked.
98 */
99 #define TIMER_LONGTERM_SCAN_LIMIT (100ULL * NSEC_PER_USEC) /* 100 us */
100 #define TIMER_LONGTERM_SCAN_INTERVAL (100ULL * NSEC_PER_USEC) /* 100 us */
101 /* Sentinel for "scan limit exceeded": */
102 #define TIMER_LONGTERM_SCAN_AGAIN 0
103
104 /*
105 * In a similar way to the longterm queue's scan limit, the following bounds the
106 * amount of time spent processing regular timers.
107 */
108 TUNABLE_WRITEABLE(uint64_t, timer_scan_limit_us, "timer_scan_limit_us", 400);
109 TUNABLE_WRITEABLE(uint64_t, timer_scan_interval_us, "timer_scan_interval_us", 40);
110 static uint64_t timer_scan_limit_abs = 0;
111 static uint64_t timer_scan_interval_abs = 0;
112
113 /*
114 * Count of times scanning the queue was aborted early (to avoid long
115 * scan times).
116 */
117 SCALABLE_COUNTER_DEFINE(timer_scan_pauses_cnt);
118
119 /*
120 * Count of times scanning the queue was aborted early resulting in a
121 * postponed hard deadline.
122 */
123 SCALABLE_COUNTER_DEFINE(timer_scan_postpones_cnt);
124
125 #define MAX_TIMER_SCAN_LIMIT (30000ULL * NSEC_PER_USEC) /* 30 ms */
126 #define MIN_TIMER_SCAN_LIMIT ( 50ULL * NSEC_PER_USEC) /* 50 us */
127 #define MAX_TIMER_SCAN_INTERVAL ( 2000ULL * NSEC_PER_USEC) /* 2 ms */
128 #define MIN_TIMER_SCAN_INTERVAL ( 20ULL * NSEC_PER_USEC) /* 20 us */
129
/*
 * Per-threshold bookkeeping for the longterm queue: tracks the soonest
 * longterm deadline, the timer used to manage escalation scans, and
 * scan/latency statistics.
 */
typedef struct {
	uint64_t          interval;     /* longterm timer interval */
	uint64_t          margin;       /* fudge factor (10% of interval) */
	uint64_t          deadline;     /* first/soonest longterm deadline */
	uint64_t          preempted;    /* sooner timer has pre-empted */
	timer_call_t      call;         /* first/soonest longterm timer call */
	uint64_t          deadline_set; /* next timer set */
	timer_call_data_t timer;        /* timer used by threshold management */
	/* Stats: */
	uint64_t          scans;        /* num threshold timer scans */
	uint64_t          preempts;     /* num threshold reductions */
	uint64_t          latency;      /* average threshold latency */
	uint64_t          latency_min;  /* minimum threshold latency */
	uint64_t          latency_max;  /* maximum threshold latency */
} threshold_t;
145
/*
 * Global longterm-queue state: the unordered list of longterm timers,
 * enqueue/dequeue/escalation statistics, the threshold-management state
 * above, and the scan throttling controls described earlier.
 */
typedef struct {
	mpqueue_head_t queue;         /* longterm timer list */
	uint64_t       enqueues;      /* num timers queued */
	uint64_t       dequeues;      /* num timers dequeued */
	uint64_t       escalates;     /* num timers becoming shortterm */
	uint64_t       scan_time;     /* last time the list was scanned */
	threshold_t    threshold;     /* longterm timer threshold */
	uint64_t       scan_limit;    /* maximum scan time */
	uint64_t       scan_interval; /* interval between LT "escalation" scans */
	uint64_t       scan_pauses;   /* num scans exceeding time limit */
} timer_longterm_t;

/* Single global instance; only scan throttling fields need static init. */
timer_longterm_t timer_longterm = {
	.scan_limit = TIMER_LONGTERM_SCAN_LIMIT,
	.scan_interval = TIMER_LONGTERM_SCAN_INTERVAL,
};
162
163 static mpqueue_head_t *timer_longterm_queue = NULL;
164
165 static void timer_longterm_init(void);
166 static void timer_longterm_callout(
167 timer_call_param_t p0,
168 timer_call_param_t p1);
169 extern void timer_longterm_scan(
170 timer_longterm_t *tlp,
171 uint64_t now);
172 static void timer_longterm_update(
173 timer_longterm_t *tlp);
174 static void timer_longterm_update_locked(
175 timer_longterm_t *tlp);
176 static mpqueue_head_t * timer_longterm_enqueue_unlocked(
177 timer_call_t call,
178 uint64_t now,
179 uint64_t deadline,
180 mpqueue_head_t ** old_queue,
181 uint64_t soft_deadline,
182 uint64_t ttd,
183 timer_call_param_t param1,
184 uint32_t callout_flags);
185 static void timer_longterm_dequeued_locked(
186 timer_call_t call);
187
/*
 * Statistics for timers whose (slop-adjusted) deadline had already passed
 * at arming time; maintained by timer_call_past_deadline_timer_handle().
 */
uint64_t past_deadline_timers;           /* count of late arms */
uint64_t past_deadline_deltas;           /* cumulative lateness (abstime) */
uint64_t past_deadline_longest;          /* largest lateness observed */
uint64_t past_deadline_shortest = ~0ULL; /* smallest lateness observed */
/* Nanoseconds added past 'now' when re-arming an already-past deadline. */
enum {PAST_DEADLINE_TIMER_ADJUSTMENT_NS = 10 * 1000};

/* PAST_DEADLINE_TIMER_ADJUSTMENT_NS converted to abstime at init. */
uint64_t past_deadline_timer_adjustment;

static boolean_t timer_call_enter_internal(timer_call_t call, timer_call_param_t param1, uint64_t deadline, uint64_t leeway, uint32_t flags, boolean_t ratelimited);
/* NOTE(review): presumably gates coalescing slop application elsewhere — confirm users. */
boolean_t mach_timer_coalescing_enabled = TRUE;
198
199 mpqueue_head_t *timer_call_enqueue_deadline_unlocked(
200 timer_call_t call,
201 mpqueue_head_t *queue,
202 uint64_t deadline,
203 uint64_t soft_deadline,
204 uint64_t ttd,
205 timer_call_param_t param1,
206 uint32_t flags);
207
208 mpqueue_head_t *timer_call_dequeue_unlocked(
209 timer_call_t call);
210
211 timer_coalescing_priority_params_t tcoal_prio_params;
212
213 #if TCOAL_PRIO_STATS
214 int32_t nc_tcl, rt_tcl, bg_tcl, kt_tcl, fp_tcl, ts_tcl, qos_tcl;
215 #define TCOAL_PRIO_STAT(x) (x++)
216 #else
217 #define TCOAL_PRIO_STAT(x)
218 #endif
219
220 static void
timer_call_init_abstime(void)221 timer_call_init_abstime(void)
222 {
223 int i;
224 uint64_t result;
225 timer_coalescing_priority_params_ns_t * tcoal_prio_params_init = timer_call_get_priority_params();
226 nanoseconds_to_absolutetime(PAST_DEADLINE_TIMER_ADJUSTMENT_NS, &past_deadline_timer_adjustment);
227 nanoseconds_to_absolutetime(tcoal_prio_params_init->idle_entry_timer_processing_hdeadline_threshold_ns, &result);
228 tcoal_prio_params.idle_entry_timer_processing_hdeadline_threshold_abstime = (uint32_t)result;
229 nanoseconds_to_absolutetime(tcoal_prio_params_init->interrupt_timer_coalescing_ilat_threshold_ns, &result);
230 tcoal_prio_params.interrupt_timer_coalescing_ilat_threshold_abstime = (uint32_t)result;
231 nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_resort_threshold_ns, &result);
232 tcoal_prio_params.timer_resort_threshold_abstime = (uint32_t)result;
233 tcoal_prio_params.timer_coalesce_rt_shift = tcoal_prio_params_init->timer_coalesce_rt_shift;
234 tcoal_prio_params.timer_coalesce_bg_shift = tcoal_prio_params_init->timer_coalesce_bg_shift;
235 tcoal_prio_params.timer_coalesce_kt_shift = tcoal_prio_params_init->timer_coalesce_kt_shift;
236 tcoal_prio_params.timer_coalesce_fp_shift = tcoal_prio_params_init->timer_coalesce_fp_shift;
237 tcoal_prio_params.timer_coalesce_ts_shift = tcoal_prio_params_init->timer_coalesce_ts_shift;
238
239 nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_rt_ns_max,
240 &tcoal_prio_params.timer_coalesce_rt_abstime_max);
241 nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_bg_ns_max,
242 &tcoal_prio_params.timer_coalesce_bg_abstime_max);
243 nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_kt_ns_max,
244 &tcoal_prio_params.timer_coalesce_kt_abstime_max);
245 nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_fp_ns_max,
246 &tcoal_prio_params.timer_coalesce_fp_abstime_max);
247 nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_ts_ns_max,
248 &tcoal_prio_params.timer_coalesce_ts_abstime_max);
249
250 for (i = 0; i < NUM_LATENCY_QOS_TIERS; i++) {
251 tcoal_prio_params.latency_qos_scale[i] = tcoal_prio_params_init->latency_qos_scale[i];
252 nanoseconds_to_absolutetime(tcoal_prio_params_init->latency_qos_ns_max[i],
253 &tcoal_prio_params.latency_qos_abstime_max[i]);
254 tcoal_prio_params.latency_tier_rate_limited[i] = tcoal_prio_params_init->latency_tier_rate_limited[i];
255 }
256
257 nanoseconds_to_absolutetime(timer_scan_limit_us * NSEC_PER_USEC, &timer_scan_limit_abs);
258 nanoseconds_to_absolutetime(timer_scan_interval_us * NSEC_PER_USEC, &timer_scan_interval_abs);
259 }
260
261
/*
 * timer_call_init()
 *
 * Module bootstrap: set up the longterm timer machinery, then
 * precompute all abstime conversion constants.
 */
void
timer_call_init(void)
{
	timer_longterm_init();
	timer_call_init_abstime();
}
268
269
270 void
timer_call_queue_init(mpqueue_head_t * queue)271 timer_call_queue_init(mpqueue_head_t *queue)
272 {
273 DBG("timer_call_queue_init(%p)\n", queue);
274 mpqueue_init(queue, &timer_call_lck_grp, LCK_ATTR_NULL);
275 }
276
277
278 void
timer_call_setup(timer_call_t call,timer_call_func_t func,timer_call_param_t param0)279 timer_call_setup(
280 timer_call_t call,
281 timer_call_func_t func,
282 timer_call_param_t param0)
283 {
284 DBG("timer_call_setup(%p,%p,%p)\n", call, func, param0);
285
286 *call = (struct timer_call) {
287 .tc_func = func,
288 .tc_param0 = param0,
289 .tc_async_dequeue = false,
290 };
291
292 simple_lock_init(&(call)->tc_lock, 0);
293 }
294
295 timer_call_t
timer_call_alloc(timer_call_func_t func,timer_call_param_t param0)296 timer_call_alloc(
297 timer_call_func_t func,
298 timer_call_param_t param0)
299 {
300 timer_call_t call;
301
302 call = kalloc_type(struct timer_call, Z_ZERO | Z_WAITOK | Z_NOFAIL);
303 timer_call_setup(call, func, param0);
304 return call;
305 }
306
307 void
timer_call_free(timer_call_t call)308 timer_call_free(
309 timer_call_t call)
310 {
311 kfree_type(struct timer_call, call);
312 }
313
314 static mpqueue_head_t*
mpqueue_for_timer_call(timer_call_t entry)315 mpqueue_for_timer_call(timer_call_t entry)
316 {
317 queue_t queue_entry_is_on = entry->tc_queue;
318 /* 'cast' the queue back to the orignal mpqueue */
319 return __container_of(queue_entry_is_on, struct mpqueue_head, head);
320 }
321
322
/*
 * Unlink 'entry' from the timer queue it is currently on.
 *
 * Preconditions: both the entry lock and the owning queue's lock are
 * held (asserted under TIMER_ASSERT), and the entry is queued.
 * Returns the queue the entry was removed from.
 */
static __inline__ mpqueue_head_t *
timer_call_entry_dequeue(
	timer_call_t            entry)
{
	mpqueue_head_t  *old_mpqueue = mpqueue_for_timer_call(entry);

	/* The entry was always on a queue */
	assert(old_mpqueue != NULL);

#if TIMER_ASSERT
	if (!hw_lock_held((hw_lock_t)&entry->tc_lock)) {
		panic("_call_entry_dequeue() "
		    "entry %p is not locked\n", entry);
	}

	/*
	 * XXX The queue lock is actually a mutex in spin mode
	 * but there's no way to test for it being held
	 * so we pretend it's a spinlock!
	 */
	if (!hw_lock_held((hw_lock_t)&old_mpqueue->lock_data)) {
		panic("_call_entry_dequeue() "
		    "queue %p is not locked\n", old_mpqueue);
	}
#endif /* TIMER_ASSERT */

	/* The longterm queue is unordered: it has no priority-queue linkage. */
	if (old_mpqueue != timer_longterm_queue) {
		priority_queue_remove(&old_mpqueue->mpq_pqhead,
		    &entry->tc_pqlink);
	}

	remqueue(&entry->tc_qlink);

	/* Mark the entry unqueued (locking case 1a). */
	entry->tc_queue = NULL;

	old_mpqueue->count--;

	return old_mpqueue;
}
362
/*
 * (Re)insert 'entry' into 'new_mpqueue', keyed by 'deadline'.
 *
 * Preconditions: the entry lock and new queue lock are held (asserted
 * under TIMER_ASSERT). Neither queue may be the longterm queue; the
 * longterm queue uses timer_call_entry_enqueue_tail() instead.
 *
 * Under TIMER_ASSERT a cross-queue move (old != new, both non-NULL)
 * panics; on non-assert builds the code below tolerates it via
 * re_queue_tail(). Also refreshes the queue's cached
 * earliest_soft_deadline. Returns the previous queue (NULL if the
 * entry was unqueued).
 */
static __inline__ mpqueue_head_t *
timer_call_entry_enqueue_deadline(
	timer_call_t            entry,
	mpqueue_head_t          *new_mpqueue,
	uint64_t                deadline)
{
	mpqueue_head_t  *old_mpqueue = mpqueue_for_timer_call(entry);

#if TIMER_ASSERT
	if (!hw_lock_held((hw_lock_t)&entry->tc_lock)) {
		panic("_call_entry_enqueue_deadline() "
		    "entry %p is not locked\n", entry);
	}

	/* XXX More lock pretense: */
	if (!hw_lock_held((hw_lock_t)&new_mpqueue->lock_data)) {
		panic("_call_entry_enqueue_deadline() "
		    "queue %p is not locked\n", new_mpqueue);
	}

	if (old_mpqueue != NULL && old_mpqueue != new_mpqueue) {
		panic("_call_entry_enqueue_deadline() "
		    "old_mpqueue %p != new_mpqueue", old_mpqueue);
	}
#endif /* TIMER_ASSERT */

	/* no longterm queue involved */
	assert(new_mpqueue != timer_longterm_queue);
	assert(old_mpqueue != timer_longterm_queue);

	if (old_mpqueue == new_mpqueue) {
		/* optimize the same-queue case to avoid a full re-insert */
		uint64_t old_deadline = entry->tc_pqlink.deadline;
		entry->tc_pqlink.deadline = deadline;

		/* Rebalance the priority queue in the direction the key moved. */
		if (old_deadline < deadline) {
			priority_queue_entry_increased(&new_mpqueue->mpq_pqhead,
			    &entry->tc_pqlink);
		} else {
			priority_queue_entry_decreased(&new_mpqueue->mpq_pqhead,
			    &entry->tc_pqlink);
		}
	} else {
		if (old_mpqueue != NULL) {
			priority_queue_remove(&old_mpqueue->mpq_pqhead,
			    &entry->tc_pqlink);

			re_queue_tail(&new_mpqueue->head, &entry->tc_qlink);
		} else {
			enqueue_tail(&new_mpqueue->head, &entry->tc_qlink);
		}

		entry->tc_queue = &new_mpqueue->head;
		entry->tc_pqlink.deadline = deadline;

		priority_queue_insert(&new_mpqueue->mpq_pqhead, &entry->tc_pqlink);
	}


	/* For efficiency, track the earliest soft deadline on the queue,
	 * so that fuzzy decisions can be made without lock acquisitions.
	 */

	timer_call_t thead = priority_queue_min(&new_mpqueue->mpq_pqhead, struct timer_call, tc_pqlink);

	/* Rate-limited timers are keyed by their hard deadline instead. */
	new_mpqueue->earliest_soft_deadline = thead->tc_flags & TIMER_CALL_RATELIMITED ? thead->tc_pqlink.deadline : thead->tc_soft_deadline;

	if (old_mpqueue) {
		old_mpqueue->count--;
	}
	new_mpqueue->count++;

	return old_mpqueue;
}
437
438 static __inline__ void
timer_call_entry_enqueue_tail(timer_call_t entry,mpqueue_head_t * queue)439 timer_call_entry_enqueue_tail(
440 timer_call_t entry,
441 mpqueue_head_t *queue)
442 {
443 /* entry is always dequeued before this call */
444 assert(entry->tc_queue == NULL);
445
446 /*
447 * this is only used for timer_longterm_queue, which is unordered
448 * and thus needs no priority queueing
449 */
450 assert(queue == timer_longterm_queue);
451
452 enqueue_tail(&queue->head, &entry->tc_qlink);
453
454 entry->tc_queue = &queue->head;
455
456 queue->count++;
457 return;
458 }
459
/*
 * Remove timer entry from its queue but don't change the queue pointer
 * and set the async_dequeue flag. This is locking case 2b.
 *
 * Called with the queue lock held but WITHOUT the entry lock (a
 * try-lock on the entry failed). The stale tc_queue pointer and the
 * async_dequeue flag are reconciled later by the entry-lock holder
 * (locking case 1c).
 */
static __inline__ void
timer_call_entry_dequeue_async(
	timer_call_t            entry)
{
	mpqueue_head_t  *old_mpqueue = mpqueue_for_timer_call(entry);
	if (old_mpqueue) {
		old_mpqueue->count--;

		/* Longterm queue is unordered: no priority-queue linkage. */
		if (old_mpqueue != timer_longterm_queue) {
			priority_queue_remove(&old_mpqueue->mpq_pqhead,
			    &entry->tc_pqlink);
		}

		remqueue(&entry->tc_qlink);
		entry->tc_async_dequeue = true;
	}
	return;
}
482
#if TIMER_ASSERT
unsigned timer_call_enqueue_deadline_unlocked_async1;   /* stat: case (1c) collisions */
unsigned timer_call_enqueue_deadline_unlocked_async2;   /* stat: cross-queue dequeues */
#endif
/*
 * Assumes call_entry and queues unlocked, interrupts disabled.
 *
 * Arms 'call' on 'queue' at 'deadline', taking the entry lock first
 * and then the queue lock(s) (canonical order, case 0). Handles the
 * call already being queued elsewhere (moving it) and a pending
 * async dequeue (case 1c). Returns the queue the call was previously
 * on, or NULL if it was unqueued.
 */
__inline__ mpqueue_head_t *
timer_call_enqueue_deadline_unlocked(
	timer_call_t                    call,
	mpqueue_head_t                  *queue,
	uint64_t                        deadline,
	uint64_t                        soft_deadline,
	uint64_t                        ttd,
	timer_call_param_t              param1,
	uint32_t                        callout_flags)
{
	DBG("timer_call_enqueue_deadline_unlocked(%p,%p,)\n", call, queue);

	simple_lock(&call->tc_lock, LCK_GRP_NULL);

	mpqueue_head_t *old_queue = mpqueue_for_timer_call(call);

	if (old_queue != NULL) {
		timer_queue_lock_spin(old_queue);
		if (call->tc_async_dequeue) {
			/* collision (1c): timer already dequeued, clear flag */
#if TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_async_dequeue,
			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
			    0x1c, 0);
			timer_call_enqueue_deadline_unlocked_async1++;
#endif
			call->tc_async_dequeue = false;
			call->tc_queue = NULL;
		} else if (old_queue != queue) {
			/* Moving queues: unlink from the old one while it is locked. */
			timer_call_entry_dequeue(call);
#if TIMER_ASSERT
			timer_call_enqueue_deadline_unlocked_async2++;
#endif
		}
		if (old_queue == timer_longterm_queue) {
			/* Keep longterm threshold bookkeeping in sync. */
			timer_longterm_dequeued_locked(call);
		}
		if (old_queue != queue) {
			timer_queue_unlock(old_queue);
			timer_queue_lock_spin(queue);
		}
	} else {
		timer_queue_lock_spin(queue);
	}

	/* Program callout parameters under the target queue lock (rule 3). */
	call->tc_soft_deadline = soft_deadline;
	call->tc_flags = callout_flags;
	call->tc_param1 = param1;
	call->tc_ttd = ttd;

	timer_call_entry_enqueue_deadline(call, queue, deadline);
	timer_queue_unlock(queue);
	simple_unlock(&call->tc_lock);

	return old_queue;
}
549
#if TIMER_ASSERT
unsigned timer_call_dequeue_unlocked_async1;    /* stat: case (1c) collisions */
unsigned timer_call_dequeue_unlocked_async2;
#endif
/*
 * Remove 'call' from whatever queue it is on, taking the entry lock
 * then the queue lock (canonical order; assumes neither lock is held
 * and interrupts are disabled). Clears a pending async dequeue
 * (case 1c) left by another thread. Returns the queue the call was
 * on, or NULL if it was not queued.
 */
mpqueue_head_t *
timer_call_dequeue_unlocked(
	timer_call_t            call)
{
	DBG("timer_call_dequeue_unlocked(%p)\n", call);

	simple_lock(&call->tc_lock, LCK_GRP_NULL);

	mpqueue_head_t *old_queue = mpqueue_for_timer_call(call);

#if TIMER_ASSERT
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    call->tc_async_dequeue,
	    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
	    0, 0);
#endif
	if (old_queue != NULL) {
		timer_queue_lock_spin(old_queue);
		if (call->tc_async_dequeue) {
			/* collision (1c): timer already dequeued, clear flag */
#if TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_async_dequeue,
			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
			    0x1c, 0);
			timer_call_dequeue_unlocked_async1++;
#endif
			call->tc_async_dequeue = false;
			call->tc_queue = NULL;
		} else {
			timer_call_entry_dequeue(call);
		}
		if (old_queue == timer_longterm_queue) {
			/* Keep longterm threshold bookkeeping in sync. */
			timer_longterm_dequeued_locked(call);
		}
		timer_queue_unlock(old_queue);
	}
	simple_unlock(&call->tc_lock);
	return old_queue;
}
598
599 uint64_t
timer_call_past_deadline_timer_handle(uint64_t deadline,uint64_t ctime)600 timer_call_past_deadline_timer_handle(uint64_t deadline, uint64_t ctime)
601 {
602 uint64_t delta = (ctime - deadline);
603
604 past_deadline_timers++;
605 past_deadline_deltas += delta;
606 if (delta > past_deadline_longest) {
607 past_deadline_longest = deadline;
608 }
609 if (delta < past_deadline_shortest) {
610 past_deadline_shortest = delta;
611 }
612
613 return ctime + past_deadline_timer_adjustment;
614 }
615
616 /*
617 * Timer call entry locking model
618 * ==============================
619 *
620 * Timer call entries are linked on per-cpu timer queues which are protected
621 * by the queue lock and the call entry lock. The locking protocol is:
622 *
623 * 0) The canonical locking order is timer call entry followed by queue.
624 *
625 * 1) With only the entry lock held, entry.queue is valid:
626 * 1a) NULL: the entry is not queued, or
627 * 1b) non-NULL: this queue must be locked before the entry is modified.
628 * After locking the queue, the call.async_dequeue flag must be checked:
629 * 1c) TRUE: the entry was removed from the queue by another thread
630 * and we must NULL the entry.queue and reset this flag, or
631 * 1d) FALSE: (ie. queued), the entry can be manipulated.
632 *
633 * 2) If a queue lock is obtained first, the queue is stable:
634 * 2a) If a try-lock of a queued entry succeeds, the call can be operated on
635 * and dequeued.
636 * 2b) If a try-lock fails, it indicates that another thread is attempting
637 * to change the entry and move it to a different position in this queue
638 * or to different queue. The entry can be dequeued but it should not be
639 * operated upon since it is being changed. Furthermore, we don't null
640 * the entry.queue pointer (protected by the entry lock we don't own).
641 * Instead, we set the async_dequeue flag -- see (1c).
642 * 2c) Same as 2b but occurring when a longterm timer is matured.
643 * 3) A callout's parameters (deadline, flags, parameters, soft deadline &c.)
644 * should be manipulated with the appropriate timer queue lock held,
645 * to prevent queue traversal observations from observing inconsistent
646 * updates to an in-flight callout.
647 */
648
649 /*
650 * In the debug case, we assert that the timer call locking protocol
651 * is being obeyed.
652 */
653
/*
 * timer_call_enter_internal()
 *
 * Common implementation for the timer_call_enter*() entry points.
 * Computes coalescing slop for the requested deadline/urgency (taking
 * caller leeway if larger), re-arms past deadlines slightly into the
 * future, then enqueues the call on either the longterm queue or a
 * per-CPU queue. Returns TRUE if the call was previously queued.
 *
 * Runs with interrupts disabled (splclock) throughout.
 */
static boolean_t
timer_call_enter_internal(
	timer_call_t            call,
	timer_call_param_t      param1,
	uint64_t                deadline,
	uint64_t                leeway,
	uint32_t                flags,
	boolean_t               ratelimited)
{
	mpqueue_head_t          *queue = NULL;
	mpqueue_head_t          *old_queue;
	spl_t                   s;
	uint64_t                slop;
	uint32_t                urgency;
	uint64_t                sdeadline, ttd;

	assert(call->tc_func != NULL);
	s = splclock();

	/* The soft deadline is the caller's request, pre-slop. */
	sdeadline = deadline;
	uint64_t ctime = mach_absolute_time();

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_ENTER | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    VM_KERNEL_ADDRHIDE(param1), deadline, flags, 0);

	urgency = (flags & TIMER_CALL_URGENCY_MASK);

	boolean_t slop_ratelimited = FALSE;
	slop = timer_call_slop(deadline, ctime, urgency, current_thread(), &slop_ratelimited);

	/* An explicit caller leeway only widens, never narrows, the slop. */
	if ((flags & TIMER_CALL_LEEWAY) != 0 && leeway > slop) {
		slop = leeway;
	}

	/* Saturate instead of wrapping when adding slop near UINT64_MAX. */
	if (UINT64_MAX - deadline <= slop) {
		deadline = UINT64_MAX;
	} else {
		deadline += slop;
	}

	if (__improbable(deadline < ctime)) {
		deadline = timer_call_past_deadline_timer_handle(deadline, ctime);
		sdeadline = deadline;
	}

	if (ratelimited || slop_ratelimited) {
		flags |= TIMER_CALL_RATELIMITED;
	} else {
		flags &= ~TIMER_CALL_RATELIMITED;
	}

	/* Time-to-deadline, measured against the soft deadline. */
	ttd = sdeadline - ctime;
#if CONFIG_DTRACE
	DTRACE_TMR7(callout__create, timer_call_func_t, call->tc_func,
	    timer_call_param_t, call->tc_param0, uint32_t, flags,
	    (deadline - sdeadline),
	    (ttd >> 32), (unsigned) (ttd & 0xFFFFFFFF), call);
#endif

	/* Program timer callout parameters under the appropriate per-CPU or
	 * longterm queue lock. The callout may have been previously enqueued
	 * and in-flight on this or another timer queue.
	 */
	if (!ratelimited && !slop_ratelimited) {
		/* Returns NULL if the deadline is below the longterm threshold. */
		queue = timer_longterm_enqueue_unlocked(call, ctime, deadline, &old_queue, sdeadline, ttd, param1, flags);
	}

	if (queue == NULL) {
		queue = timer_queue_assign(deadline);
		old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline, sdeadline, ttd, param1, flags);
	}

#if TIMER_TRACE
	call->tc_entry_time = ctime;
#endif

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_ENTER | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    (old_queue != NULL), deadline, queue->count, 0);

	splx(s);

	return old_queue != NULL;
}
741
742 /*
743 * timer_call_*()
744 * return boolean indicating whether the call was previously queued.
745 */
746 boolean_t
timer_call_enter(timer_call_t call,uint64_t deadline,uint32_t flags)747 timer_call_enter(
748 timer_call_t call,
749 uint64_t deadline,
750 uint32_t flags)
751 {
752 return timer_call_enter_internal(call, NULL, deadline, 0, flags, FALSE);
753 }
754
755 boolean_t
timer_call_enter1(timer_call_t call,timer_call_param_t param1,uint64_t deadline,uint32_t flags)756 timer_call_enter1(
757 timer_call_t call,
758 timer_call_param_t param1,
759 uint64_t deadline,
760 uint32_t flags)
761 {
762 return timer_call_enter_internal(call, param1, deadline, 0, flags, FALSE);
763 }
764
765 boolean_t
timer_call_enter_with_leeway(timer_call_t call,timer_call_param_t param1,uint64_t deadline,uint64_t leeway,uint32_t flags,boolean_t ratelimited)766 timer_call_enter_with_leeway(
767 timer_call_t call,
768 timer_call_param_t param1,
769 uint64_t deadline,
770 uint64_t leeway,
771 uint32_t flags,
772 boolean_t ratelimited)
773 {
774 return timer_call_enter_internal(call, param1, deadline, leeway, flags, ratelimited);
775 }
776
/*
 * timer_call_cancel()
 *
 * Remove an armed callout from whichever queue it is on. If the queue
 * head changes as a result, notify the hardware layer through
 * timer_queue_cancel() and refresh the queue's cached
 * earliest_soft_deadline. Returns TRUE if the call was previously
 * queued.
 */
boolean_t
timer_call_cancel(
	timer_call_t            call)
{
	mpqueue_head_t          *old_queue;
	spl_t                   s;

	s = splclock();

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_CANCEL | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    call->tc_pqlink.deadline, call->tc_soft_deadline, call->tc_flags, 0);

	old_queue = timer_call_dequeue_unlocked(call);

	if (old_queue != NULL) {
		timer_queue_lock_spin(old_queue);

		/* Find the queue's new earliest timer after the removal. */
		timer_call_t new_head = priority_queue_min(&old_queue->mpq_pqhead, struct timer_call, tc_pqlink);

		if (new_head) {
			timer_queue_cancel(old_queue, call->tc_pqlink.deadline, new_head->tc_pqlink.deadline);
			/* Rate-limited timers are keyed by their hard deadline. */
			old_queue->earliest_soft_deadline = new_head->tc_flags & TIMER_CALL_RATELIMITED ? new_head->tc_pqlink.deadline : new_head->tc_soft_deadline;
		} else {
			/* Queue is now empty. */
			timer_queue_cancel(old_queue, call->tc_pqlink.deadline, UINT64_MAX);
			old_queue->earliest_soft_deadline = UINT64_MAX;
		}

		timer_queue_unlock(old_queue);
	}
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_CANCEL | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    VM_KERNEL_UNSLIDE_OR_PERM(old_queue),
	    call->tc_pqlink.deadline - mach_absolute_time(),
	    call->tc_pqlink.deadline - call->tc_entry_time, 0);
	splx(s);

#if CONFIG_DTRACE
	DTRACE_TMR6(callout__cancel, timer_call_func_t, call->tc_func,
	    timer_call_param_t, call->tc_param0, uint32_t, call->tc_flags, 0,
	    (call->tc_ttd >> 32), (unsigned) (call->tc_ttd & 0xFFFFFFFF));
#endif /* CONFIG_DTRACE */

	return old_queue != NULL;
}
824
static uint32_t timer_queue_shutdown_lock_skips;        /* stat: case (2b) inversions */
static uint32_t timer_queue_shutdown_discarded;         /* stat: LOCAL timers dropped */

/*
 * timer_queue_shutdown()
 *
 * Drain a timer queue: migrate each timer to the queue chosen by
 * timer_queue_assign(), discarding TIMER_CALL_LOCAL timers. Entries
 * whose try-lock fails (case 2b) are dequeued asynchronously and the
 * scan restarts. Runs with interrupts disabled.
 */
void
timer_queue_shutdown(
	mpqueue_head_t          *queue)
{
	timer_call_t            call;
	mpqueue_head_t          *new_queue;
	spl_t                   s;


	DBG("timer_queue_shutdown(%p)\n", queue);

	s = splclock();

	while (TRUE) {
		timer_queue_lock_spin(queue);

		call = qe_queue_first(&queue->head, struct timer_call, tc_qlink);

		if (call == NULL) {
			/* Queue drained; leave the loop with the lock held. */
			break;
		}

		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
			/*
			 * case (2b) lock order inversion, dequeue and skip
			 * Don't change the call_entry queue back-pointer
			 * but set the async_dequeue field.
			 */
			timer_queue_shutdown_lock_skips++;
			timer_call_entry_dequeue_async(call);
#if TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_async_dequeue,
			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
			    0x2b, 0);
#endif
			timer_queue_unlock(queue);
			continue;
		}

		boolean_t call_local = ((call->tc_flags & TIMER_CALL_LOCAL) != 0);

		/* remove entry from old queue */
		timer_call_entry_dequeue(call);
		timer_queue_unlock(queue);

		if (call_local == FALSE) {
			/* and queue it on new, discarding LOCAL timers */
			new_queue = timer_queue_assign(call->tc_pqlink.deadline);
			timer_queue_lock_spin(new_queue);
			timer_call_entry_enqueue_deadline(
				call, new_queue, call->tc_pqlink.deadline);
			timer_queue_unlock(new_queue);
		} else {
			timer_queue_shutdown_discarded++;
		}

		/*
		 * NOTE(review): asserts no LOCAL timer is ever found here,
		 * yet the branch above tolerates (and counts) them on
		 * non-assert builds — confirm which behavior is intended.
		 */
		assert(call_local == FALSE);
		simple_unlock(&call->tc_lock);
	}

	timer_queue_unlock(queue);
	splx(s);
}
894
895
896 static uint32_t timer_queue_expire_lock_skips;
897 uint64_t
timer_queue_expire_with_options(mpqueue_head_t * queue,uint64_t deadline,boolean_t rescan)898 timer_queue_expire_with_options(
899 mpqueue_head_t *queue,
900 uint64_t deadline,
901 boolean_t rescan)
902 {
903 timer_call_t call = NULL;
904 uint32_t tc_iterations = 0;
905 DBG("timer_queue_expire(%p,)\n", queue);
906
907 /* 'rescan' means look at every timer in the list, instead of
908 * early-exiting when the head of the list expires in the future.
909 * when 'rescan' is true, iterate by linked list instead of priority queue.
910 *
911 * TODO: if we keep a deadline ordered and soft-deadline ordered
912 * priority queue, then it's no longer necessary to do that
913 */
914
915 uint64_t cur_deadline = deadline;
916
917 /* Force an early return if this time limit is hit. */
918 const uint64_t time_limit = deadline + timer_scan_limit_abs;
919
920 /* Next deadline if the time limit is hit */
921 uint64_t time_limit_deadline = 0;
922
923 timer_queue_lock_spin(queue);
924
925 while (!queue_empty(&queue->head)) {
926 if (++tc_iterations > 1) {
927 const uint64_t now = mach_absolute_time();
928
929 /*
930 * Abort the scan if it's taking too long to avoid long
931 * periods with interrupts disabled.
932 * Scanning will restart after a short pause
933 * (timer_scan_interval_abs) if there's a hard deadline
934 * pending.
935 */
936 if (rescan == FALSE && now > time_limit) {
937 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
938 DECR_TIMER_PAUSE | DBG_FUNC_NONE,
939 queue->count, tc_iterations - 1,
940 0, 0, 0);
941
942 counter_inc(&timer_scan_pauses_cnt);
943 time_limit_deadline = now + timer_scan_interval_abs;
944 break;
945 }
946
947 /*
948 * Upon processing one or more timer calls, refresh the
949 * deadline to account for time elapsed in the callout
950 */
951 cur_deadline = now;
952 }
953
954 if (call == NULL) {
955 if (rescan == FALSE) {
956 call = priority_queue_min(&queue->mpq_pqhead, struct timer_call, tc_pqlink);
957 } else {
958 call = qe_queue_first(&queue->head, struct timer_call, tc_qlink);
959 }
960 }
961
962 if (call->tc_soft_deadline <= cur_deadline) {
963 timer_call_func_t func;
964 timer_call_param_t param0, param1;
965
966 TCOAL_DEBUG(0xDDDD0000, queue->earliest_soft_deadline, call->tc_soft_deadline, 0, 0, 0);
967 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
968 DECR_TIMER_EXPIRE | DBG_FUNC_NONE,
969 VM_KERNEL_UNSLIDE_OR_PERM(call),
970 call->tc_soft_deadline,
971 call->tc_pqlink.deadline,
972 call->tc_entry_time, 0);
973
974 if ((call->tc_flags & TIMER_CALL_RATELIMITED) &&
975 (call->tc_pqlink.deadline > cur_deadline)) {
976 if (rescan == FALSE) {
977 break;
978 }
979 }
980
981 if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
982 /* case (2b) lock inversion, dequeue and skip */
983 timer_queue_expire_lock_skips++;
984 timer_call_entry_dequeue_async(call);
985 call = NULL;
986 continue;
987 }
988
989 timer_call_entry_dequeue(call);
990
991 func = call->tc_func;
992 param0 = call->tc_param0;
993 param1 = call->tc_param1;
994
995 simple_unlock(&call->tc_lock);
996 timer_queue_unlock(queue);
997
998 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
999 DECR_TIMER_CALLOUT | DBG_FUNC_START,
1000 VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
1001 VM_KERNEL_ADDRHIDE(param0),
1002 VM_KERNEL_ADDRHIDE(param1),
1003 0);
1004
1005 #if CONFIG_DTRACE
1006 DTRACE_TMR7(callout__start, timer_call_func_t, func,
1007 timer_call_param_t, param0, unsigned, call->tc_flags,
1008 0, (call->tc_ttd >> 32),
1009 (unsigned) (call->tc_ttd & 0xFFFFFFFF), call);
1010 #endif
1011 /* Maintain time-to-deadline in per-processor data
1012 * structure for thread wakeup deadline statistics.
1013 */
1014 uint64_t *ttdp = ¤t_processor()->timer_call_ttd;
1015 *ttdp = call->tc_ttd;
1016 (*func)(param0, param1);
1017 *ttdp = 0;
1018 #if CONFIG_DTRACE
1019 DTRACE_TMR4(callout__end, timer_call_func_t, func,
1020 param0, param1, call);
1021 #endif
1022
1023 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1024 DECR_TIMER_CALLOUT | DBG_FUNC_END,
1025 VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
1026 VM_KERNEL_ADDRHIDE(param0),
1027 VM_KERNEL_ADDRHIDE(param1),
1028 0);
1029 call = NULL;
1030 timer_queue_lock_spin(queue);
1031 } else {
1032 if (__probable(rescan == FALSE)) {
1033 break;
1034 } else {
1035 int64_t skew = call->tc_pqlink.deadline - call->tc_soft_deadline;
1036 assert(call->tc_pqlink.deadline >= call->tc_soft_deadline);
1037
1038 /* DRK: On a latency quality-of-service level change,
1039 * re-sort potentially rate-limited timers. The platform
1040 * layer determines which timers require
1041 * this. In the absence of the per-callout
1042 * synchronization requirement, a global resort could
1043 * be more efficient. The re-sort effectively
1044 * annuls all timer adjustments, i.e. the "soft
1045 * deadline" is the sort key.
1046 */
1047
1048 if (timer_resort_threshold(skew)) {
1049 if (__probable(simple_lock_try(&call->tc_lock, LCK_GRP_NULL))) {
1050 /* TODO: don't need to dequeue before enqueue */
1051 timer_call_entry_dequeue(call);
1052 timer_call_entry_enqueue_deadline(call, queue, call->tc_soft_deadline);
1053 simple_unlock(&call->tc_lock);
1054 call = NULL;
1055 }
1056 }
1057 if (call) {
1058 call = qe_queue_next(&queue->head, call, struct timer_call, tc_qlink);
1059
1060 if (call == NULL) {
1061 break;
1062 }
1063 }
1064 }
1065 }
1066 }
1067
1068 call = priority_queue_min(&queue->mpq_pqhead, struct timer_call, tc_pqlink);
1069
1070 if (call) {
1071 /*
1072 * Even if the time limit has been hit, it doesn't mean a hard
1073 * deadline will be missed - the next hard deadline may be in
1074 * future.
1075 */
1076 if (time_limit_deadline > call->tc_pqlink.deadline) {
1077 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1078 DECR_TIMER_POSTPONE | DBG_FUNC_NONE,
1079 VM_KERNEL_UNSLIDE_OR_PERM(call),
1080 call->tc_pqlink.deadline,
1081 time_limit_deadline,
1082 0, 0);
1083 counter_inc(&timer_scan_postpones_cnt);
1084 cur_deadline = time_limit_deadline;
1085 } else {
1086 cur_deadline = call->tc_pqlink.deadline;
1087 }
1088 queue->earliest_soft_deadline = (call->tc_flags & TIMER_CALL_RATELIMITED) ? call->tc_pqlink.deadline: call->tc_soft_deadline;
1089 } else {
1090 queue->earliest_soft_deadline = cur_deadline = UINT64_MAX;
1091 }
1092
1093 timer_queue_unlock(queue);
1094
1095 return cur_deadline;
1096 }
1097
1098 uint64_t
timer_queue_expire(mpqueue_head_t * queue,uint64_t deadline)1099 timer_queue_expire(
1100 mpqueue_head_t *queue,
1101 uint64_t deadline)
1102 {
1103 return timer_queue_expire_with_options(queue, deadline, FALSE);
1104 }
1105
/* Non-zero when running in server performance mode (migration disabled). */
extern int serverperfmode;
/* Number of timers dequeued (not migrated) due to lock contention. */
static uint32_t timer_queue_migrate_lock_skips;
/*
 * timer_queue_migrate() is called by timer_queue_migrate_cpu()
 * to move timer requests from the local processor (queue_from)
 * to a target processor's (queue_to).
 *
 * Returns the number of timers migrated, or a negative code when the
 * migration was aborted: -4 serverperfmode, -1 target queue empty,
 * -2 local queue empty, -3 local queue holds a TIMER_CALL_LOCAL timer;
 * 0 when migrating would bring an earlier deadline to the target.
 * Must be called with interrupts disabled (asserted below).
 */
int
timer_queue_migrate(mpqueue_head_t *queue_from, mpqueue_head_t *queue_to)
{
	timer_call_t    call;
	timer_call_t    head_to;
	int             timers_migrated = 0;

	DBG("timer_queue_migrate(%p,%p)\n", queue_from, queue_to);

	assert(!ml_get_interrupts_enabled());
	assert(queue_from != queue_to);

	if (serverperfmode) {
		/*
		 * if we're running a high end server
		 * avoid migrations... they add latency
		 * and don't save us power under typical
		 * server workloads
		 */
		return -4;
	}

	/*
	 * Take both local (from) and target (to) timer queue locks while
	 * moving the timers from the local queue to the target processor.
	 * We assume that the target is always the boot processor.
	 * But only move if all of the following is true:
	 *  - the target queue is non-empty
	 *  - the local queue is non-empty
	 *  - the local queue's first deadline is later than the target's
	 *  - the local queue contains no non-migrateable "local" call
	 *    so that we need not have the target resync.
	 */

	timer_queue_lock_spin(queue_to);

	head_to = priority_queue_min(&queue_to->mpq_pqhead, struct timer_call, tc_pqlink);

	if (head_to == NULL) {
		/* Target queue empty: no deadline to compare against. */
		timers_migrated = -1;
		goto abort1;
	}

	timer_queue_lock_spin(queue_from);

	call = priority_queue_min(&queue_from->mpq_pqhead, struct timer_call, tc_pqlink);

	if (call == NULL) {
		/* Local queue empty: nothing to migrate. */
		timers_migrated = -2;
		goto abort2;
	}

	if (call->tc_pqlink.deadline < head_to->tc_pqlink.deadline) {
		/* Migration would give the target an earlier deadline. */
		timers_migrated = 0;
		goto abort2;
	}

	/* perform scan for non-migratable timers */
	qe_foreach_element(call, &queue_from->head, tc_qlink) {
		if (call->tc_flags & TIMER_CALL_LOCAL) {
			timers_migrated = -3;
			goto abort2;
		}
	}

	/* migration loop itself -- both queues are locked */
	qe_foreach_element_safe(call, &queue_from->head, tc_qlink) {
		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
			/* case (2b) lock order inversion, dequeue only */
#ifdef TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
			    0,
			    0x2b, 0);
#endif
			timer_queue_migrate_lock_skips++;
			timer_call_entry_dequeue_async(call);
			continue;
		}
		timer_call_entry_dequeue(call);
		timer_call_entry_enqueue_deadline(
			call, queue_to, call->tc_pqlink.deadline);
		timers_migrated++;
		simple_unlock(&call->tc_lock);
	}
	/* Everything movable is gone from the local queue. */
	queue_from->earliest_soft_deadline = UINT64_MAX;
abort2:
	timer_queue_unlock(queue_from);
abort1:
	timer_queue_unlock(queue_to);

	return timers_migrated;
}
1208
1209 void
timer_queue_trace_cpu(int ncpu)1210 timer_queue_trace_cpu(int ncpu)
1211 {
1212 timer_call_nosync_cpu(
1213 ncpu,
1214 (void (*)(void *))timer_queue_trace,
1215 (void*) timer_queue_cpu(ncpu));
1216 }
1217
1218 void
timer_queue_trace(mpqueue_head_t * queue)1219 timer_queue_trace(
1220 mpqueue_head_t *queue)
1221 {
1222 timer_call_t call;
1223 spl_t s;
1224
1225 if (!kdebug_enable) {
1226 return;
1227 }
1228
1229 s = splclock();
1230 timer_queue_lock_spin(queue);
1231
1232 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1233 DECR_TIMER_QUEUE | DBG_FUNC_START,
1234 queue->count, mach_absolute_time(), 0, 0, 0);
1235
1236 qe_foreach_element(call, &queue->head, tc_qlink) {
1237 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1238 DECR_TIMER_QUEUE | DBG_FUNC_NONE,
1239 call->tc_soft_deadline,
1240 call->tc_pqlink.deadline,
1241 call->tc_entry_time,
1242 VM_KERNEL_UNSLIDE(call->tc_func),
1243 0);
1244 }
1245
1246 TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1247 DECR_TIMER_QUEUE | DBG_FUNC_END,
1248 queue->count, mach_absolute_time(), 0, 0, 0);
1249
1250 timer_queue_unlock(queue);
1251 splx(s);
1252 }
1253
1254 void
timer_longterm_dequeued_locked(timer_call_t call)1255 timer_longterm_dequeued_locked(timer_call_t call)
1256 {
1257 timer_longterm_t *tlp = &timer_longterm;
1258
1259 tlp->dequeues++;
1260 if (call == tlp->threshold.call) {
1261 tlp->threshold.call = NULL;
1262 }
1263 }
1264
/*
 * Place a timer call in the longterm list
 * and adjust the next timer callout deadline if the new timer is first.
 *
 * Returns the longterm queue on success, or NULL when the timer does not
 * qualify (local flag set, longterm disabled, or deadline within the
 * threshold).  On success, *old_queue is set to the queue the call was
 * dequeued from (if any) and the call's deadline/param1/ttd/soft
 * deadline/flags are updated under both the call and queue locks.
 */
mpqueue_head_t *
timer_longterm_enqueue_unlocked(timer_call_t    call,
    uint64_t                                    now,
    uint64_t                                    deadline,
    mpqueue_head_t                              **old_queue,
    uint64_t                                    soft_deadline,
    uint64_t                                    ttd,
    timer_call_param_t                          param1,
    uint32_t                                    callout_flags)
{
	timer_longterm_t        *tlp = &timer_longterm;
	boolean_t               update_required = FALSE;
	uint64_t                longterm_threshold;

	longterm_threshold = now + tlp->threshold.interval;

	/*
	 * Return NULL without doing anything if:
	 *  - this timer is local, or
	 *  - the longterm mechanism is disabled, or
	 *  - this deadline is too short.
	 */
	if ((callout_flags & TIMER_CALL_LOCAL) != 0 ||
	    (tlp->threshold.interval == TIMER_LONGTERM_NONE) ||
	    (deadline <= longterm_threshold)) {
		return NULL;
	}

	/*
	 * Remove timer from its current queue, if any.
	 */
	*old_queue = timer_call_dequeue_unlocked(call);

	/*
	 * Lock the longterm queue, queue timer and determine
	 * whether an update is necessary.
	 */
	assert(!ml_get_interrupts_enabled());
	simple_lock(&call->tc_lock, LCK_GRP_NULL);
	timer_queue_lock_spin(timer_longterm_queue);
	call->tc_pqlink.deadline = deadline;
	call->tc_param1 = param1;
	call->tc_ttd = ttd;
	call->tc_soft_deadline = soft_deadline;
	call->tc_flags = callout_flags;
	timer_call_entry_enqueue_tail(call, timer_longterm_queue);

	tlp->enqueues++;

	/*
	 * We'll need to update the currently set threshold timer
	 * if the new deadline is sooner and no sooner update is in flight.
	 */
	if (deadline < tlp->threshold.deadline &&
	    deadline < tlp->threshold.preempted) {
		tlp->threshold.preempted = deadline;
		tlp->threshold.call = call;
		update_required = TRUE;
	}
	/* Unlock in reverse acquisition order before the cross-cpu call. */
	timer_queue_unlock(timer_longterm_queue);
	simple_unlock(&call->tc_lock);

	if (update_required) {
		/*
		 * Note: this call expects that calling the master cpu
		 * alone does not involve locking the topo lock.
		 */
		timer_call_nosync_cpu(
			master_cpu,
			(void (*)(void *))timer_longterm_update,
			(void *)tlp);
	}

	return timer_longterm_queue;
}
1344
/*
 * Scan for timers below the longterm threshold.
 * Move these to the local timer queue (of the boot processor on which the
 * calling thread is running).
 * Both the local (boot) queue and the longterm queue are locked.
 * The scan is similar to the timer migrate sequence but is performed by
 * successively examining each timer on the longterm queue:
 *  - if within the short-term threshold
 *    - enter on the local queue (unless being deleted),
 *  - otherwise:
 *    - if sooner, deadline becomes the next threshold deadline.
 * The total scan time is limited to TIMER_LONGTERM_SCAN_LIMIT. Should this be
 * exceeded, we abort and reschedule again so that we don't shut others from
 * the timer queues. Longterm timers firing late is not critical.
 */
void
timer_longterm_scan(timer_longterm_t    *tlp,
    uint64_t                            time_start)
{
	timer_call_t    call;
	uint64_t        threshold = TIMER_LONGTERM_NONE;
	uint64_t        deadline;
	uint64_t        time_limit = time_start + tlp->scan_limit;
	mpqueue_head_t  *timer_master_queue;

	/* Must run on the boot cpu with interrupts disabled. */
	assert(!ml_get_interrupts_enabled());
	assert(cpu_number() == master_cpu);

	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
		threshold = time_start + tlp->threshold.interval;
	}

	/* Reset the tracked next-threshold deadline; rebuilt by this scan. */
	tlp->threshold.deadline = TIMER_LONGTERM_NONE;
	tlp->threshold.call = NULL;

	if (queue_empty(&timer_longterm_queue->head)) {
		return;
	}

	timer_master_queue = timer_queue_cpu(master_cpu);
	timer_queue_lock_spin(timer_master_queue);

	qe_foreach_element_safe(call, &timer_longterm_queue->head, tc_qlink) {
		deadline = call->tc_soft_deadline;
		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
			/* case (2c) lock order inversion, dequeue only */
#ifdef TIMER_ASSERT
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
			    0,
			    0x2c, 0);
#endif
			timer_call_entry_dequeue_async(call);
			continue;
		}
		if (deadline < threshold) {
			/*
			 * This timer needs moving (escalating)
			 * to the local (boot) processor's queue.
			 */
#ifdef TIMER_ASSERT
			if (deadline < time_start) {
				/* Timer was already overdue when the scan began. */
				TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
				    DECR_TIMER_OVERDUE | DBG_FUNC_NONE,
				    VM_KERNEL_UNSLIDE_OR_PERM(call),
				    deadline,
				    time_start,
				    threshold,
				    0);
			}
#endif
			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
			    DECR_TIMER_ESCALATE | DBG_FUNC_NONE,
			    VM_KERNEL_UNSLIDE_OR_PERM(call),
			    call->tc_pqlink.deadline,
			    call->tc_entry_time,
			    VM_KERNEL_UNSLIDE(call->tc_func),
			    0);
			tlp->escalates++;
			timer_call_entry_dequeue(call);
			timer_call_entry_enqueue_deadline(
				call, timer_master_queue, call->tc_pqlink.deadline);
			/*
			 * A side-effect of the following call is to update
			 * the actual hardware deadline if required.
			 */
			(void) timer_queue_assign(deadline);
		} else {
			/* Staying longterm: track the soonest such deadline. */
			if (deadline < tlp->threshold.deadline) {
				tlp->threshold.deadline = deadline;
				tlp->threshold.call = call;
			}
		}
		simple_unlock(&call->tc_lock);

		/* Abort scan if we're taking too long. */
		if (mach_absolute_time() > time_limit) {
			/* Mark the scan incomplete so it is rescheduled. */
			tlp->threshold.deadline = TIMER_LONGTERM_SCAN_AGAIN;
			tlp->scan_pauses++;
			DBG("timer_longterm_scan() paused %llu, qlen: %llu\n",
			    time_limit, tlp->queue.count);
			break;
		}
	}

	timer_queue_unlock(timer_master_queue);
}
1454
1455 void
timer_longterm_callout(timer_call_param_t p0,__unused timer_call_param_t p1)1456 timer_longterm_callout(timer_call_param_t p0, __unused timer_call_param_t p1)
1457 {
1458 timer_longterm_t *tlp = (timer_longterm_t *) p0;
1459
1460 timer_longterm_update(tlp);
1461 }
1462
/*
 * timer_longterm_update_locked:
 *
 * Core of the threshold-timer update: either adopt a preempted (sooner)
 * deadline, or rescan the longterm queue.  Maintains a moving average of
 * the threshold timer's wakeup latency and computes deadline_set — the
 * next time the threshold timer should actually fire — biased early by
 * margin plus latency and clamped to no sooner than scan_interval from
 * now.  Called with the longterm queue locked.
 */
void
timer_longterm_update_locked(timer_longterm_t *tlp)
{
	uint64_t        latency;

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_UPDATE | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
	    tlp->threshold.deadline,
	    tlp->threshold.preempted,
	    tlp->queue.count, 0);

	tlp->scan_time = mach_absolute_time();
	if (tlp->threshold.preempted != TIMER_LONGTERM_NONE) {
		/* A sooner deadline was posted: adopt it without rescanning. */
		tlp->threshold.preempts++;
		tlp->threshold.deadline = tlp->threshold.preempted;
		tlp->threshold.preempted = TIMER_LONGTERM_NONE;
		/*
		 * Note: in the unlikely event that a pre-empted timer has
		 * itself been cancelled, we'll simply re-scan later at the
		 * time of the preempted/cancelled timer.
		 */
	} else {
		tlp->threshold.scans++;

		/*
		 * Maintain a moving average of our wakeup latency.
		 * Clamp latency to 0 and ignore above threshold interval.
		 */
		if (tlp->scan_time > tlp->threshold.deadline_set) {
			latency = tlp->scan_time - tlp->threshold.deadline_set;
		} else {
			latency = 0;
		}
		if (latency < tlp->threshold.interval) {
			tlp->threshold.latency_min =
			    MIN(tlp->threshold.latency_min, latency);
			tlp->threshold.latency_max =
			    MAX(tlp->threshold.latency_max, latency);
			/* Exponential moving average: 99% old, 1% new sample. */
			tlp->threshold.latency =
			    (tlp->threshold.latency * 99 + latency) / 100;
		}

		timer_longterm_scan(tlp, tlp->scan_time);
	}

	tlp->threshold.deadline_set = tlp->threshold.deadline;
	/* The next deadline timer to be set is adjusted */
	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE &&
	    tlp->threshold.deadline != TIMER_LONGTERM_SCAN_AGAIN) {
		/* Fire early by the margin plus the measured wakeup latency. */
		tlp->threshold.deadline_set -= tlp->threshold.margin;
		tlp->threshold.deadline_set -= tlp->threshold.latency;
	}

	/* Throttle next scan time */
	uint64_t scan_clamp = mach_absolute_time() + tlp->scan_interval;
	if (tlp->threshold.deadline_set < scan_clamp) {
		tlp->threshold.deadline_set = scan_clamp;
	}

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_UPDATE | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
	    tlp->threshold.deadline,
	    tlp->threshold.scans,
	    tlp->queue.count, 0);
}
1530
1531 void
timer_longterm_update(timer_longterm_t * tlp)1532 timer_longterm_update(timer_longterm_t *tlp)
1533 {
1534 spl_t s = splclock();
1535
1536 timer_queue_lock_spin(timer_longterm_queue);
1537
1538 if (cpu_number() != master_cpu) {
1539 panic("timer_longterm_update_master() on non-boot cpu");
1540 }
1541
1542 timer_longterm_update_locked(tlp);
1543
1544 if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
1545 timer_call_enter(
1546 &tlp->threshold.timer,
1547 tlp->threshold.deadline_set,
1548 TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
1549 }
1550
1551 timer_queue_unlock(timer_longterm_queue);
1552 splx(s);
1553 }
1554
1555 void
timer_longterm_init(void)1556 timer_longterm_init(void)
1557 {
1558 uint32_t longterm;
1559 timer_longterm_t *tlp = &timer_longterm;
1560
1561 DBG("timer_longterm_init() tlp: %p, queue: %p\n", tlp, &tlp->queue);
1562
1563 /*
1564 * Set the longterm timer threshold. Defaults to TIMER_LONGTERM_THRESHOLD
1565 * or TIMER_LONGTERM_NONE (disabled) for server;
1566 * overridden longterm boot-arg
1567 */
1568 tlp->threshold.interval = serverperfmode ? TIMER_LONGTERM_NONE
1569 : TIMER_LONGTERM_THRESHOLD;
1570 if (PE_parse_boot_argn("longterm", &longterm, sizeof(longterm))) {
1571 tlp->threshold.interval = (longterm == 0) ?
1572 TIMER_LONGTERM_NONE :
1573 longterm * NSEC_PER_MSEC;
1574 }
1575 if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
1576 printf("Longterm timer threshold: %llu ms\n",
1577 tlp->threshold.interval / NSEC_PER_MSEC);
1578 kprintf("Longterm timer threshold: %llu ms\n",
1579 tlp->threshold.interval / NSEC_PER_MSEC);
1580 nanoseconds_to_absolutetime(tlp->threshold.interval,
1581 &tlp->threshold.interval);
1582 tlp->threshold.margin = tlp->threshold.interval / 10;
1583 tlp->threshold.latency_min = EndOfAllTime;
1584 tlp->threshold.latency_max = 0;
1585 }
1586
1587 tlp->threshold.preempted = TIMER_LONGTERM_NONE;
1588 tlp->threshold.deadline = TIMER_LONGTERM_NONE;
1589
1590 mpqueue_init(&tlp->queue, &timer_longterm_lck_grp, LCK_ATTR_NULL);
1591
1592 timer_call_setup(&tlp->threshold.timer,
1593 timer_longterm_callout, (timer_call_param_t) tlp);
1594
1595 timer_longterm_queue = &tlp->queue;
1596 }
1597
/*
 * OIDs accepted by timer_sysctl_get()/timer_sysctl_set().
 * Order is part of the interface; do not renumber.
 */
enum {
	THRESHOLD, QCOUNT,
	ENQUEUES, DEQUEUES, ESCALATES, SCANS, PREEMPTS,
	LATENCY, LATENCY_MIN, LATENCY_MAX, LONG_TERM_SCAN_LIMIT,
	LONG_TERM_SCAN_INTERVAL, LONG_TERM_SCAN_PAUSES,
	SCAN_LIMIT, SCAN_INTERVAL, SCAN_PAUSES, SCAN_POSTPONES,
};
1605 uint64_t
timer_sysctl_get(int oid)1606 timer_sysctl_get(int oid)
1607 {
1608 timer_longterm_t *tlp = &timer_longterm;
1609
1610 switch (oid) {
1611 case THRESHOLD:
1612 return (tlp->threshold.interval == TIMER_LONGTERM_NONE) ?
1613 0 : tlp->threshold.interval / NSEC_PER_MSEC;
1614 case QCOUNT:
1615 return tlp->queue.count;
1616 case ENQUEUES:
1617 return tlp->enqueues;
1618 case DEQUEUES:
1619 return tlp->dequeues;
1620 case ESCALATES:
1621 return tlp->escalates;
1622 case SCANS:
1623 return tlp->threshold.scans;
1624 case PREEMPTS:
1625 return tlp->threshold.preempts;
1626 case LATENCY:
1627 return tlp->threshold.latency;
1628 case LATENCY_MIN:
1629 return tlp->threshold.latency_min;
1630 case LATENCY_MAX:
1631 return tlp->threshold.latency_max;
1632 case LONG_TERM_SCAN_LIMIT:
1633 return tlp->scan_limit;
1634 case LONG_TERM_SCAN_INTERVAL:
1635 return tlp->scan_interval;
1636 case LONG_TERM_SCAN_PAUSES:
1637 return tlp->scan_pauses;
1638 case SCAN_LIMIT:
1639 return timer_scan_limit_us * NSEC_PER_USEC;
1640 case SCAN_INTERVAL:
1641 return timer_scan_interval_us * NSEC_PER_USEC;
1642 case SCAN_PAUSES:
1643 return counter_load(&timer_scan_pauses_cnt);
1644 case SCAN_POSTPONES:
1645 return counter_load(&timer_scan_postpones_cnt);
1646
1647 default:
1648 return 0;
1649 }
1650 }
1651
/*
 * timer_master_scan() is the inverse of timer_longterm_scan()
 * since it un-escalates timers to the longterm queue.
 *
 * Walks the master cpu's timer queue (under its lock) and moves every
 * non-local timer whose deadline lies beyond 'now + threshold.interval'
 * back to the longterm queue, tracking the soonest such deadline in
 * tlp->threshold.  Called from timer_sysctl_set_threshold() at splclock().
 */
static void
timer_master_scan(timer_longterm_t      *tlp,
    uint64_t                            now)
{
	timer_call_t    call;
	uint64_t        threshold;
	uint64_t        deadline;
	mpqueue_head_t  *timer_master_queue;

	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
		threshold = now + tlp->threshold.interval;
	} else {
		/* Longterm disabled: no deadline can exceed this threshold. */
		threshold = TIMER_LONGTERM_NONE;
	}

	timer_master_queue = timer_queue_cpu(master_cpu);
	timer_queue_lock_spin(timer_master_queue);

	qe_foreach_element_safe(call, &timer_master_queue->head, tc_qlink) {
		deadline = call->tc_pqlink.deadline;
		if ((call->tc_flags & TIMER_CALL_LOCAL) != 0) {
			/* Local timers must remain on this cpu's queue. */
			continue;
		}
		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
			/* case (2c) lock order inversion, dequeue only */
			timer_call_entry_dequeue_async(call);
			continue;
		}
		if (deadline > threshold) {
			/* move from master to longterm */
			timer_call_entry_dequeue(call);
			timer_call_entry_enqueue_tail(call, timer_longterm_queue);
			if (deadline < tlp->threshold.deadline) {
				tlp->threshold.deadline = deadline;
				tlp->threshold.call = call;
			}
		}
		simple_unlock(&call->tc_lock);
	}
	timer_queue_unlock(timer_master_queue);
}
1697
/*
 * timer_sysctl_set_threshold:
 *
 * Apply a new longterm threshold ('valp' in ms; 0 disables longterm
 * handling), re-sort timers between the master and longterm queues
 * accordingly, re-arm the threshold timer, and reset longterm statistics.
 * Dispatched to the boot cpu via timer_call_cpu() from timer_sysctl_set().
 */
static void
timer_sysctl_set_threshold(void* valp)
{
	uint64_t value = (uint64_t)valp;
	timer_longterm_t        *tlp = &timer_longterm;
	spl_t                   s = splclock();
	boolean_t               threshold_increase;

	timer_queue_lock_spin(timer_longterm_queue);

	timer_call_cancel(&tlp->threshold.timer);

	/*
	 * Set the new threshold and note whether it's increasing.
	 */
	if (value == 0) {
		/* 0 disables the longterm mechanism entirely. */
		tlp->threshold.interval = TIMER_LONGTERM_NONE;
		threshold_increase = TRUE;
		/* NOTE(review): the timer was already cancelled above; this
		 * second cancel looks redundant -- confirm before removing. */
		timer_call_cancel(&tlp->threshold.timer);
	} else {
		uint64_t old_interval = tlp->threshold.interval;
		/* Convert ms -> ns -> absolute time. */
		tlp->threshold.interval = value * NSEC_PER_MSEC;
		nanoseconds_to_absolutetime(tlp->threshold.interval,
		    &tlp->threshold.interval);
		tlp->threshold.margin = tlp->threshold.interval / 10;
		if (old_interval == TIMER_LONGTERM_NONE) {
			threshold_increase = FALSE;
		} else {
			threshold_increase = (tlp->threshold.interval > old_interval);
		}
	}

	if (threshold_increase /* or removal */) {
		/* Escalate timers from the longterm queue */
		timer_longterm_scan(tlp, mach_absolute_time());
	} else { /* decrease or addition */
		/*
		 * We scan the local/master queue for timers now longterm.
		 * To be strictly correct, we should scan all processor queues
		 * but timer migration results in most timers gravitating to the
		 * master processor in any case.
		 */
		timer_master_scan(tlp, mach_absolute_time());
	}

	/* Set new timer accordingly */
	tlp->threshold.deadline_set = tlp->threshold.deadline;
	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
		/* Fire early by the margin plus the measured wakeup latency. */
		tlp->threshold.deadline_set -= tlp->threshold.margin;
		tlp->threshold.deadline_set -= tlp->threshold.latency;
		timer_call_enter(
			&tlp->threshold.timer,
			tlp->threshold.deadline_set,
			TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
	}

	/* Reset stats */
	tlp->enqueues = 0;
	tlp->dequeues = 0;
	tlp->escalates = 0;
	tlp->scan_pauses = 0;
	tlp->threshold.scans = 0;
	tlp->threshold.preempts = 0;
	tlp->threshold.latency = 0;
	tlp->threshold.latency_min = EndOfAllTime;
	tlp->threshold.latency_max = 0;

	timer_queue_unlock(timer_longterm_queue);
	splx(s);
}
1768
1769 int
timer_sysctl_set(int oid,uint64_t value)1770 timer_sysctl_set(int oid, uint64_t value)
1771 {
1772 switch (oid) {
1773 case THRESHOLD:
1774 timer_call_cpu(
1775 master_cpu,
1776 timer_sysctl_set_threshold,
1777 (void *) value);
1778 return KERN_SUCCESS;
1779 case LONG_TERM_SCAN_LIMIT:
1780 timer_longterm.scan_limit = value;
1781 return KERN_SUCCESS;
1782 case LONG_TERM_SCAN_INTERVAL:
1783 timer_longterm.scan_interval = value;
1784 return KERN_SUCCESS;
1785 case SCAN_LIMIT:
1786 if (value > MAX_TIMER_SCAN_LIMIT ||
1787 value < MIN_TIMER_SCAN_LIMIT) {
1788 return KERN_INVALID_ARGUMENT;
1789 }
1790 timer_scan_limit_us = value / NSEC_PER_USEC;
1791 nanoseconds_to_absolutetime(timer_scan_limit_us * NSEC_PER_USEC,
1792 &timer_scan_limit_abs);
1793 return KERN_SUCCESS;
1794 case SCAN_INTERVAL:
1795 if (value > MAX_TIMER_SCAN_INTERVAL ||
1796 value < MIN_TIMER_SCAN_INTERVAL) {
1797 return KERN_INVALID_ARGUMENT;
1798 }
1799 timer_scan_interval_us = value / NSEC_PER_USEC;
1800 nanoseconds_to_absolutetime(timer_scan_interval_us * NSEC_PER_USEC,
1801 &timer_scan_interval_abs);
1802 return KERN_SUCCESS;
1803 default:
1804 return KERN_INVALID_ARGUMENT;
1805 }
1806 }
1807
1808
1809 /* Select timer coalescing window based on per-task quality-of-service hints */
1810 static boolean_t
tcoal_qos_adjust(thread_t t,int32_t * tshift,uint64_t * tmax_abstime,boolean_t * pratelimited)1811 tcoal_qos_adjust(thread_t t, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited)
1812 {
1813 uint32_t latency_qos;
1814 boolean_t adjusted = FALSE;
1815 task_t ctask = get_threadtask(t);
1816
1817 if (ctask) {
1818 latency_qos = proc_get_effective_thread_policy(t, TASK_POLICY_LATENCY_QOS);
1819
1820 assert(latency_qos <= NUM_LATENCY_QOS_TIERS);
1821
1822 if (latency_qos) {
1823 *tshift = tcoal_prio_params.latency_qos_scale[latency_qos - 1];
1824 *tmax_abstime = tcoal_prio_params.latency_qos_abstime_max[latency_qos - 1];
1825 *pratelimited = tcoal_prio_params.latency_tier_rate_limited[latency_qos - 1];
1826 adjusted = TRUE;
1827 }
1828 }
1829 return adjusted;
1830 }
1831
1832
/* Adjust timer deadlines based on priority of the thread and the
 * urgency value provided at timeout establishment. With this mechanism,
 * timers are no longer necessarily sorted in order of soft deadline
 * on a given timer queue, i.e. they may be differentially skewed.
 * In the current scheme, this could lead to fewer pending timers
 * processed than is technically possible when the HW deadline arrives.
 *
 * Outputs: *tshift (coalescing shift), *tmax_abstime (cap on the
 * coalescing window) and, via tcoal_qos_adjust(), *pratelimited.
 */
static void
timer_compute_leeway(thread_t cthread, int32_t urgency, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited)
{
	int16_t tpri = cthread->sched_pri;
	if ((urgency & TIMER_CALL_USER_MASK) != 0) {
		/* User timer: pick parameters from the most specific match. */
		bool tg_critical = false;
#if CONFIG_THREAD_GROUPS
		tg_critical = (thread_group_get_flags(thread_group_get(cthread))) & THREAD_GROUP_FLAGS_CRITICAL;
#endif /* CONFIG_THREAD_GROUPS */
		bool timer_critical = (tpri >= BASEPRI_RTQUEUES) ||
		    (urgency == TIMER_CALL_USER_CRITICAL) ||
		    tg_critical;
		if (timer_critical) {
			/* Realtime/critical: tightest coalescing window. */
			*tshift = tcoal_prio_params.timer_coalesce_rt_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_rt_abstime_max;
			TCOAL_PRIO_STAT(rt_tcl);
		} else if (proc_get_effective_thread_policy(cthread, TASK_POLICY_DARWIN_BG) ||
		    (urgency == TIMER_CALL_USER_BACKGROUND)) {
			/* Determine if timer should be subjected to a lower QoS */
			if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
				if (*tmax_abstime > tcoal_prio_params.timer_coalesce_bg_abstime_max) {
					/* QoS window already wider than BG: keep it. */
					return;
				} else {
					*pratelimited = FALSE;
				}
			}
			*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
			TCOAL_PRIO_STAT(bg_tcl);
		} else if (tpri >= MINPRI_KERNEL) {
			*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
			TCOAL_PRIO_STAT(kt_tcl);
		} else if (cthread->sched_mode == TH_MODE_FIXED) {
			*tshift = tcoal_prio_params.timer_coalesce_fp_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_fp_abstime_max;
			TCOAL_PRIO_STAT(fp_tcl);
		} else if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
			TCOAL_PRIO_STAT(qos_tcl);
		} else if (cthread->sched_mode == TH_MODE_TIMESHARE) {
			*tshift = tcoal_prio_params.timer_coalesce_ts_shift;
			*tmax_abstime = tcoal_prio_params.timer_coalesce_ts_abstime_max;
			TCOAL_PRIO_STAT(ts_tcl);
		} else {
			/* No match: outputs left as the caller initialized them. */
			TCOAL_PRIO_STAT(nc_tcl);
		}
	} else if (urgency == TIMER_CALL_SYS_BACKGROUND) {
		*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
		*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
		TCOAL_PRIO_STAT(bg_tcl);
	} else {
		/* Default for kernel timers. */
		*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
		*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
		TCOAL_PRIO_STAT(kt_tcl);
	}
}
1896
1897
1898 int timer_user_idle_level;
1899
1900 uint64_t
timer_call_slop(uint64_t deadline,uint64_t now,uint32_t flags,thread_t cthread,boolean_t * pratelimited)1901 timer_call_slop(uint64_t deadline, uint64_t now, uint32_t flags, thread_t cthread, boolean_t *pratelimited)
1902 {
1903 int32_t tcs_shift = 0;
1904 uint64_t tcs_max_abstime = 0;
1905 uint64_t adjval;
1906 uint32_t urgency = (flags & TIMER_CALL_URGENCY_MASK);
1907
1908 if (mach_timer_coalescing_enabled &&
1909 (deadline > now) && (urgency != TIMER_CALL_SYS_CRITICAL)) {
1910 timer_compute_leeway(cthread, urgency, &tcs_shift, &tcs_max_abstime, pratelimited);
1911
1912 if (tcs_shift >= 0) {
1913 adjval = MIN((deadline - now) >> tcs_shift, tcs_max_abstime);
1914 } else {
1915 adjval = MIN((deadline - now) << (-tcs_shift), tcs_max_abstime);
1916 }
1917 /* Apply adjustments derived from "user idle level" heuristic */
1918 adjval += (adjval * timer_user_idle_level) >> 7;
1919 return adjval;
1920 } else {
1921 return 0;
1922 }
1923 }
1924
1925 int
timer_get_user_idle_level(void)1926 timer_get_user_idle_level(void)
1927 {
1928 return timer_user_idle_level;
1929 }
1930
1931 kern_return_t
timer_set_user_idle_level(int ilevel)1932 timer_set_user_idle_level(int ilevel)
1933 {
1934 boolean_t do_reeval = FALSE;
1935
1936 if ((ilevel < 0) || (ilevel > 128)) {
1937 return KERN_INVALID_ARGUMENT;
1938 }
1939
1940 if (ilevel < timer_user_idle_level) {
1941 do_reeval = TRUE;
1942 }
1943
1944 timer_user_idle_level = ilevel;
1945
1946 if (do_reeval) {
1947 ml_timer_evaluate();
1948 }
1949
1950 return KERN_SUCCESS;
1951 }
1952
#pragma mark - running timers

/*
 * Flag combination reported in trace events for running timers, which take
 * no caller-supplied flags of their own (they are per-processor and not
 * enqueued on a timer queue).
 */
#define RUNNING_TIMER_FAKE_FLAGS (TIMER_CALL_SYS_CRITICAL | \
    TIMER_CALL_LOCAL)

/*
 * timer_call_trace_* functions mimic the tracing behavior from the normal
 * timer_call subsystem, so tools continue to function.
 */
1962
1963 static void
timer_call_trace_enter_before(struct timer_call * call,uint64_t deadline,uint32_t flags,uint64_t now)1964 timer_call_trace_enter_before(struct timer_call *call, uint64_t deadline,
1965 uint32_t flags, uint64_t now)
1966 {
1967 #pragma unused(call, deadline, flags, now)
1968 TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_START,
1969 VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_ADDRHIDE(call->tc_param1),
1970 deadline, flags, 0);
1971 #if CONFIG_DTRACE
1972 uint64_t ttd = deadline - now;
1973 DTRACE_TMR7(callout__create, timer_call_func_t, call->tc_func,
1974 timer_call_param_t, call->tc_param0, uint32_t, flags, 0,
1975 (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF), NULL);
1976 #endif /* CONFIG_DTRACE */
1977 TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_END,
1978 VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline, 0, 0);
1979 }
1980
/*
 * Emit the DECR_TIMER_ENTER end tracepoint, closing the interval opened
 * by timer_call_trace_enter_before() once the deadline has been set.
 */
static void
timer_call_trace_enter_after(struct timer_call *call, uint64_t deadline)
{
#pragma unused(call, deadline)
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline, 0, 0);
}
1988
/*
 * Emit cancellation trace events for a running timer, mirroring the
 * kdebug and DTrace events the normal timer_call cancel path produces.
 */
static void
timer_call_trace_cancel(struct timer_call *call)
{
#pragma unused(call)
	__unused uint64_t deadline = call->tc_pqlink.deadline;
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CANCEL | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), deadline, 0,
	    call->tc_flags, 0);
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CANCEL | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline - mach_absolute_time(),
	    deadline - call->tc_entry_time, 0);
#if CONFIG_DTRACE
	/* ttd is only meaningful when TIMER_TRACE records tc_entry_time. */
#if TIMER_TRACE
	uint64_t ttd = deadline - call->tc_entry_time;
#else
	uint64_t ttd = UINT64_MAX;
#endif /* TIMER_TRACE */
	DTRACE_TMR6(callout__cancel, timer_call_func_t, call->tc_func,
	    timer_call_param_t, call->tc_param0, uint32_t, call->tc_flags, 0,
	    (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF));
#endif /* CONFIG_DTRACE */
}
2011
/*
 * Emit the DECR_TIMER_CALLOUT begin tracepoint and the DTrace
 * callout-start probe just before a running timer's callback fires.
 */
static void
timer_call_trace_expire_entry(struct timer_call *call)
{
#pragma unused(call)
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CALLOUT | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(call->tc_func),
	    VM_KERNEL_ADDRHIDE(call->tc_param0),
	    VM_KERNEL_ADDRHIDE(call->tc_param1),
	    0);
#if CONFIG_DTRACE
	/* ttd is only meaningful when TIMER_TRACE records tc_entry_time. */
#if TIMER_TRACE
	uint64_t ttd = call->tc_pqlink.deadline - call->tc_entry_time;
#else /* TIMER_TRACE */
	uint64_t ttd = UINT64_MAX;
#endif /* TIMER_TRACE */
	DTRACE_TMR7(callout__start, timer_call_func_t, call->tc_func,
	    timer_call_param_t, call->tc_param0, unsigned, call->tc_flags,
	    0, (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF), NULL);
#endif /* CONFIG_DTRACE */
}
2032
/*
 * Emit the DECR_TIMER_CALLOUT end tracepoint and the DTrace callout-end
 * probe after a running timer's callback returns.
 */
static void
timer_call_trace_expire_return(struct timer_call *call)
{
#pragma unused(call)
#if CONFIG_DTRACE
	DTRACE_TMR4(callout__end, timer_call_func_t, call->tc_func,
	    call->tc_param0, call->tc_param1, NULL);
#endif /* CONFIG_DTRACE */
	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CALLOUT | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    VM_KERNEL_UNSLIDE(call->tc_func),
	    VM_KERNEL_ADDRHIDE(call->tc_param0),
	    VM_KERNEL_ADDRHIDE(call->tc_param1),
	    0);
}
2048
2049 /*
2050 * Set a new deadline for a running timer on this processor.
2051 */
2052 void
running_timer_setup(processor_t processor,enum running_timer timer,void * param,uint64_t deadline,uint64_t now)2053 running_timer_setup(processor_t processor, enum running_timer timer,
2054 void *param, uint64_t deadline, uint64_t now)
2055 {
2056 assert(timer < RUNNING_TIMER_MAX);
2057 assert(ml_get_interrupts_enabled() == FALSE);
2058
2059 struct timer_call *call = &processor->running_timers[timer];
2060
2061 timer_call_trace_enter_before(call, deadline, RUNNING_TIMER_FAKE_FLAGS,
2062 now);
2063
2064 if (__improbable(deadline < now)) {
2065 deadline = timer_call_past_deadline_timer_handle(deadline, now);
2066 }
2067
2068 call->tc_pqlink.deadline = deadline;
2069 #if TIMER_TRACE
2070 call->tc_entry_time = now;
2071 #endif /* TIMER_TRACE */
2072 call->tc_param1 = param;
2073
2074 timer_call_trace_enter_after(call, deadline);
2075 }
2076
/*
 * Push the current running-timer state out to the decrementer by
 * re-evaluating pending timer deadlines.
 */
void
running_timers_sync(void)
{
	timer_resync_deadlines();
}
2082
/*
 * Arm a running timer on the given processor and immediately sync
 * deadlines so the new timer takes effect in hardware.
 */
void
running_timer_enter(processor_t processor, unsigned int timer,
    void *param, uint64_t deadline, uint64_t now)
{
	running_timer_setup(processor, timer, param, deadline, now);
	running_timers_sync();
}
2090
2091 /*
2092 * Call the callback for any running timers that fired for this processor.
2093 * Returns true if any timers were past their deadline.
2094 */
2095 bool
running_timers_expire(processor_t processor,uint64_t now)2096 running_timers_expire(processor_t processor, uint64_t now)
2097 {
2098 bool expired = false;
2099
2100 if (!processor->running_timers_active) {
2101 return expired;
2102 }
2103
2104 for (int i = 0; i < RUNNING_TIMER_MAX; i++) {
2105 struct timer_call *call = &processor->running_timers[i];
2106
2107 uint64_t deadline = call->tc_pqlink.deadline;
2108 if (deadline > now) {
2109 continue;
2110 }
2111
2112 expired = true;
2113 timer_call_trace_expire_entry(call);
2114 call->tc_func(call->tc_param0, call->tc_param1);
2115 timer_call_trace_expire_return(call);
2116 }
2117
2118 return expired;
2119 }
2120
2121 void
running_timer_clear(processor_t processor,enum running_timer timer)2122 running_timer_clear(processor_t processor, enum running_timer timer)
2123 {
2124 struct timer_call *call = &processor->running_timers[timer];
2125 uint64_t deadline = call->tc_pqlink.deadline;
2126 if (deadline == EndOfAllTime) {
2127 return;
2128 }
2129
2130 call->tc_pqlink.deadline = EndOfAllTime;
2131 #if TIMER_TRACE
2132 call->tc_entry_time = 0;
2133 #endif /* TIMER_TRACE */
2134 timer_call_trace_cancel(call);
2135 }
2136
/*
 * Disarm a running timer and sync deadlines so the cancellation takes
 * effect in hardware.
 */
void
running_timer_cancel(processor_t processor, unsigned int timer)
{
	running_timer_clear(processor, timer);
	running_timers_sync();
}
2143
2144 uint64_t
running_timers_deadline(processor_t processor)2145 running_timers_deadline(processor_t processor)
2146 {
2147 if (!processor->running_timers_active) {
2148 return EndOfAllTime;
2149 }
2150
2151 uint64_t deadline = EndOfAllTime;
2152 for (int i = 0; i < RUNNING_TIMER_MAX; i++) {
2153 uint64_t candidate =
2154 processor->running_timers[i].tc_pqlink.deadline;
2155 if (candidate != 0 && candidate < deadline) {
2156 deadline = candidate;
2157 }
2158 }
2159
2160 return deadline;
2161 }
2162
/*
 * Enable this processor's running timers and sync deadlines so they are
 * considered when programming the hardware timer.
 */
void
running_timers_activate(processor_t processor)
{
	processor->running_timers_active = true;
	running_timers_sync();
}
2169
/*
 * Disable this processor's running timers (must currently be active) and
 * sync deadlines so they stop influencing the hardware timer.
 */
void
running_timers_deactivate(processor_t processor)
{
	assert(processor->running_timers_active == true);
	processor->running_timers_active = false;
	running_timers_sync();
}
2177