1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_FREE_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: kern/thread.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young, David Golub
61 * Date: 1986
62 *
63 * Thread management primitives implementation.
64 */
65 /*
66 * Copyright (c) 1993 The University of Utah and
67 * the Computer Systems Laboratory (CSL). All rights reserved.
68 *
69 * Permission to use, copy, modify and distribute this software and its
70 * documentation is hereby granted, provided that both the copyright
71 * notice and this permission notice appear in all copies of the
72 * software, derivative works or modified versions, and any portions
73 * thereof, and that both notices appear in supporting documentation.
74 *
75 * THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS
76 * IS" CONDITION. THE UNIVERSITY OF UTAH AND CSL DISCLAIM ANY LIABILITY OF
77 * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
78 *
79 * CSL requests users of this software to return to [email protected] any
80 * improvements that they make and grant CSL redistribution rights.
81 *
82 */
83
84 #include <mach/mach_types.h>
85 #include <mach/boolean.h>
86 #include <mach/policy.h>
87 #include <mach/thread_info.h>
88 #include <mach/thread_special_ports.h>
89 #include <mach/thread_act.h>
90 #include <mach/thread_status.h>
91 #include <mach/time_value.h>
92 #include <mach/vm_param.h>
93
94 #include <machine/thread.h>
95 #include <machine/pal_routines.h>
96 #include <machine/limits.h>
97
98 #include <kern/kern_types.h>
99 #include <kern/kalloc.h>
100 #include <kern/cpu_data.h>
101 #include <kern/extmod_statistics.h>
102 #include <kern/ipc_mig.h>
103 #include <kern/ipc_tt.h>
104 #include <kern/mach_param.h>
105 #include <kern/machine.h>
106 #include <kern/misc_protos.h>
107 #include <kern/processor.h>
108 #include <kern/queue.h>
109 #include <kern/restartable.h>
110 #include <kern/sched.h>
111 #include <kern/sched_prim.h>
112 #include <kern/syscall_subr.h>
113 #include <kern/task.h>
114 #include <kern/thread.h>
115 #include <kern/thread_group.h>
116 #include <kern/coalition.h>
117 #include <kern/host.h>
118 #include <kern/zalloc.h>
119 #include <kern/assert.h>
120 #include <kern/exc_resource.h>
121 #include <kern/exc_guard.h>
122 #include <kern/telemetry.h>
123 #include <kern/policy_internal.h>
124 #include <kern/turnstile.h>
125 #include <kern/sched_clutch.h>
126 #include <kern/recount.h>
127 #include <kern/smr.h>
128 #include <kern/ast.h>
129 #include <kern/compact_id.h>
130
131 #include <corpses/task_corpse.h>
132 #include <kern/kpc.h>
133
134 #if CONFIG_PERVASIVE_CPI
135 #include <kern/monotonic.h>
136 #include <machine/monotonic.h>
137 #endif /* CONFIG_PERVASIVE_CPI */
138
139 #include <ipc/ipc_kmsg.h>
140 #include <ipc/ipc_port.h>
141 #include <bank/bank_types.h>
142
143 #include <vm/vm_kern_xnu.h>
144 #include <vm/vm_pageout_xnu.h>
145
146 #include <sys/kdebug.h>
147 #include <sys/bsdtask_info.h>
148 #include <mach/sdt.h>
149 #include <san/kasan.h>
150 #include <san/kcov_stksz.h>
151
152 #include <stdatomic.h>
153
154 #if defined(HAS_APPLE_PAC)
155 #include <ptrauth.h>
156 #include <arm64/proc_reg.h>
157 #endif /* defined(HAS_APPLE_PAC) */
158
159 /*
160 * Exported interfaces
161 */
162 #include <mach/task_server.h>
163 #include <mach/thread_act_server.h>
164 #include <mach/mach_host_server.h>
165 #include <mach/host_priv_server.h>
166 #include <mach/mach_voucher_server.h>
167 #include <kern/policy_internal.h>
168
169 #if CONFIG_MACF
170 #include <security/mac_mach_internal.h>
171 #endif
172
173 #include <pthread/workqueue_trace.h>
174
175 #if CONFIG_EXCLAVES
176 #include <mach/exclaves.h>
177 #endif
178
179 LCK_GRP_DECLARE(thread_lck_grp, "thread");
180
181 static SECURITY_READ_ONLY_LATE(zone_t) thread_zone;
182 ZONE_DEFINE_ID(ZONE_ID_THREAD_RO, "threads_ro", struct thread_ro, ZC_READONLY);
183
184 static void thread_port_with_flavor_no_senders(ipc_port_t, mach_port_mscount_t);
185
186 IPC_KOBJECT_DEFINE(IKOT_THREAD_CONTROL);
187 IPC_KOBJECT_DEFINE(IKOT_THREAD_READ,
188 .iko_op_no_senders = thread_port_with_flavor_no_senders);
189 IPC_KOBJECT_DEFINE(IKOT_THREAD_INSPECT,
190 .iko_op_no_senders = thread_port_with_flavor_no_senders);
191
192 static struct mpsc_daemon_queue thread_stack_queue;
193 static struct mpsc_daemon_queue thread_terminate_queue;
194 static struct mpsc_daemon_queue thread_deallocate_queue;
195 static struct mpsc_daemon_queue thread_exception_queue;
196 static struct mpsc_daemon_queue thread_backtrace_queue;
197
198 decl_simple_lock_data(static, crashed_threads_lock);
199 static queue_head_t crashed_threads_queue;
200
/*
 * Work item queued on thread_exception_queue to deliver an
 * EXC_{RESOURCE,GUARD} exception for a corpse; freed by the daemon.
 */
struct thread_exception_elt {
	struct mpsc_queue_chain link;      /* linkage on thread_exception_queue */
	exception_type_t exception_type;   /* EXC_RESOURCE or EXC_GUARD */
	task_t exception_task;             /* task ref, consumed by the daemon */
	thread_t exception_thread;         /* thread ref, consumed by the daemon */
};
207
/*
 * Work item queued on thread_backtrace_queue to deliver a backtrace
 * exception; references are released by the daemon after delivery.
 */
struct thread_backtrace_elt {
	struct mpsc_queue_chain link;      /* linkage on thread_backtrace_queue */
	exception_type_t exception_type;   /* exception type to deliver */
	kcdata_object_t obj;               /* kcdata ref, released by the daemon */
	exception_port_t exc_ports[BT_EXC_PORTS_COUNT]; /* send rights */
};
214
/*
 * Template copied into every new thread by init_thread_from_template().
 * Fields not listed here start out zeroed.
 */
static SECURITY_READ_ONLY_LATE(struct thread) thread_template = {
#if MACH_ASSERT
	.thread_magic = THREAD_MAGIC,
#endif /* MACH_ASSERT */
	.wait_result = THREAD_WAITING,
	.options = THREAD_ABORTSAFE,
	.state = TH_WAIT | TH_UNINT,    /* born waiting, uninterruptible */
	.th_sched_bucket = TH_BUCKET_RUN,
	.base_pri = BASEPRI_DEFAULT,
	.realtime.deadline = UINT64_MAX,
	.last_made_runnable_time = THREAD_NOT_RUNNABLE,
	.last_basepri_change_time = THREAD_NOT_RUNNABLE,
#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	.pri_shift = INT8_MAX,
#endif
	/* timers are initialized in thread_bootstrap */
};
232
233 #define CTID_SIZE_BIT 20
234 #define CTID_MASK ((1u << CTID_SIZE_BIT) - 1)
235 #define CTID_MAX_THREAD_NUMBER (CTID_MASK - 1)
236 static_assert(CTID_MAX_THREAD_NUMBER <= COMPACT_ID_MAX);
237
238 #ifndef __LITTLE_ENDIAN__
239 #error "ctid relies on the ls bits of uint32_t to be populated"
240 #endif
241
242 __startup_data
243 static struct thread init_thread;
244 static SECURITY_READ_ONLY_LATE(uint32_t) ctid_nonce;
245 COMPACT_ID_TABLE_DEFINE(static, ctid_table);
246
247 __startup_func
248 static void
thread_zone_startup(void)249 thread_zone_startup(void)
250 {
251 size_t size = sizeof(struct thread);
252
253 #ifdef MACH_BSD
254 size += roundup(uthread_size, _Alignof(struct thread));
255 #endif
256 thread_zone = zone_create_ext("threads", size,
257 ZC_SEQUESTER | ZC_ZFREE_CLEARMEM, ZONE_ID_THREAD, NULL);
258 }
259 STARTUP(ZALLOC, STARTUP_RANK_FOURTH, thread_zone_startup);
260
261 static void thread_deallocate_enqueue(thread_t thread);
262 static void thread_deallocate_complete(thread_t thread);
263
264 static void ctid_table_remove(thread_t thread);
265 static void ctid_table_add(thread_t thread);
266 static void ctid_table_init(void);
267
268 #ifdef MACH_BSD
269 extern void proc_exit(void *);
270 extern mach_exception_data_type_t proc_encode_exit_exception_code(void *);
271 extern uint64_t get_dispatchqueue_offset_from_proc(void *);
272 extern uint64_t get_return_to_kernel_offset_from_proc(void *p);
273 extern uint64_t get_wq_quantum_offset_from_proc(void *);
274 extern int proc_selfpid(void);
275 extern void proc_name(int, char*, int);
276 extern char * proc_name_address(void *p);
277 exception_type_t get_exception_from_corpse_crashinfo(kcdata_descriptor_t corpse_info);
278 extern void kdebug_proc_name_args(struct proc *proc, long args[static 4]);
279 #endif /* MACH_BSD */
280
281 extern bool bsdthread_part_of_cooperative_workqueue(struct uthread *uth);
282 extern bool disable_exc_resource;
283 extern bool disable_exc_resource_during_audio;
284 extern int audio_active;
285 extern int debug_task;
286 int thread_max = CONFIG_THREAD_MAX; /* Max number of threads */
287 int task_threadmax = CONFIG_THREAD_MAX;
288
289 static uint64_t thread_unique_id = 100;
290
291 struct _thread_ledger_indices thread_ledgers = { .cpu_time = -1 };
292 static ledger_template_t thread_ledger_template = NULL;
293 static void init_thread_ledgers(void);
294
295 #if CONFIG_JETSAM
296 void jetsam_on_ledger_cpulimit_exceeded(void);
297 #endif
298
299 extern int task_thread_soft_limit;
300
301
302 /*
303 * Level (in terms of percentage of the limit) at which the CPU usage monitor triggers telemetry.
304 *
305 * (ie when any thread's CPU consumption exceeds 70% of the limit, start taking user
306 * stacktraces, aka micro-stackshots)
307 */
308 #define CPUMON_USTACKSHOTS_TRIGGER_DEFAULT_PCT 70
309
310 /* Percentage. Level at which we start gathering telemetry. */
311 static TUNABLE(uint8_t, cpumon_ustackshots_trigger_pct,
312 "cpumon_ustackshots_trigger_pct", CPUMON_USTACKSHOTS_TRIGGER_DEFAULT_PCT);
313 void __attribute__((noinline)) SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU(void);
314
315 #if DEVELOPMENT || DEBUG
316 TUNABLE_WRITEABLE(int, exc_resource_threads_enabled, "exc_resource_threads_enabled", 1);
317
318 void __attribute__((noinline)) SENDING_NOTIFICATION__TASK_HAS_TOO_MANY_THREADS(task_t, int);
319 #endif /* DEVELOPMENT || DEBUG */
320
321 /*
322 * The smallest interval over which we support limiting CPU consumption is 1ms
323 */
324 #define MINIMUM_CPULIMIT_INTERVAL_MS 1
325
326 os_refgrp_decl(static, thread_refgrp, "thread", NULL);
327
/*
 * Bitwise-copy the read-only thread template into a freshly allocated
 * thread structure.
 */
static inline void
init_thread_from_template(thread_t thread)
{
	/*
	 * In general, struct thread isn't trivially-copyable, since it may
	 * contain pointers to thread-specific state. This may be enforced at
	 * compile time on architectures that store authed + diversified
	 * pointers in machine_thread.
	 *
	 * In this specific case, where we're initializing a new thread from a
	 * thread_template, we know all diversified pointers are NULL; these are
	 * safe to bitwise copy.
	 */
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wnontrivial-memaccess"
	memcpy(thread, &thread_template, sizeof(*thread));
#pragma clang diagnostic pop
}
346
347 static void
thread_ro_create(task_t parent_task,thread_t th,thread_ro_t tro_tpl)348 thread_ro_create(task_t parent_task, thread_t th, thread_ro_t tro_tpl)
349 {
350 #if __x86_64__
351 th->t_task = parent_task;
352 #endif
353 tro_tpl->tro_owner = th;
354 tro_tpl->tro_task = parent_task;
355 th->t_tro = zalloc_ro(ZONE_ID_THREAD_RO, Z_WAITOK | Z_ZERO | Z_NOFAIL);
356 zalloc_ro_update_elem(ZONE_ID_THREAD_RO, th->t_tro, tro_tpl);
357 }
358
359 static void
thread_ro_destroy(thread_t th)360 thread_ro_destroy(thread_t th)
361 {
362 thread_ro_t tro = get_thread_ro(th);
363 #if MACH_BSD
364 struct ucred *cred = tro->tro_cred;
365 struct ucred *rcred = tro->tro_realcred;
366 #endif
367 zfree_ro(ZONE_ID_THREAD_RO, tro);
368 #if MACH_BSD
369 uthread_cred_free(cred);
370 uthread_cred_free(rcred);
371 #endif
372 }
373
/*
 * thread_bootstrap:
 *
 * Earliest thread setup: finishes the template and hand-initializes the
 * statically allocated bootstrap thread, which it returns.
 */
__startup_func
thread_t
thread_bootstrap(void)
{
	/*
	 * Fill in a template thread for fast initialization.
	 */
	timer_init(&thread_template.runnable_timer);

	init_thread_from_template(&init_thread);
	/* fiddle with init thread to skip asserts in set_sched_pri */
	init_thread.sched_pri = MAXPRI_KERNEL;

	/*
	 * We can't quite use ctid yet, on ARM thread_bootstrap() is called
	 * before we can call random or anything,
	 * so we just make it barely work and it will get fixed up
	 * when the first thread is actually made.
	 */
	*compact_id_resolve(&ctid_table, 0) = &init_thread;
	init_thread.ctid = CTID_MASK;

	return &init_thread;
}
398
/*
 * thread_machine_init_template:
 *
 * Let the machine layer fill in its portion of the thread template
 * before threads are created from it.
 */
void
thread_machine_init_template(void)
{
	machine_thread_template_init(&thread_template);
}
404
/*
 * thread_init:
 *
 * One-time initialization of the thread subsystem.
 */
void
thread_init(void)
{
	/* Set up any machine-dependent per-thread structures necessary. */
	machine_thread_init();

	/* Prepare the per-thread ledger template. */
	init_thread_ledgers();
}
416
417 boolean_t
thread_is_active(thread_t thread)418 thread_is_active(thread_t thread)
419 {
420 return thread->active;
421 }
422
/*
 * thread_corpse_continue:
 *
 * Continuation for corpse threads: terminates the current thread and
 * processes the termination AST directly instead of returning to
 * userspace.  Does not return.
 */
void
thread_corpse_continue(void)
{
	thread_t thread = current_thread();

	thread_terminate_internal(thread);

	/*
	 * Handle the thread termination directly
	 * here instead of returning to userspace.
	 */
	assert(thread->active == FALSE);
	thread_ast_clear(thread, AST_APC);
	thread_apc_ast(thread);

	/* thread_apc_ast() must not return for a terminating thread. */
	panic("thread_corpse_continue");
	/*NOTREACHED*/
}
441
/*
 * thread_terminate_continue:
 *
 * Continuation installed by thread_terminate_self(); a terminating
 * thread must never be resumed, so running this is fatal.
 */
__dead2
static void
thread_terminate_continue(void)
{
	panic("thread_terminate_continue");
	/*NOTREACHED*/
}
449
450 /*
451 * thread_terminate_self:
452 */
453 void
thread_terminate_self(void)454 thread_terminate_self(void)
455 {
456 thread_t thread = current_thread();
457 thread_ro_t tro = get_thread_ro(thread);
458 task_t task = tro->tro_task;
459 void *bsd_info = get_bsdtask_info(task);
460 int threadcnt;
461
462 pal_thread_terminate_self(thread);
463
464 DTRACE_PROC(lwp__exit);
465
466 thread_mtx_lock(thread);
467
468 ipc_thread_disable(thread);
469
470 thread_mtx_unlock(thread);
471
472 thread_sched_call(thread, NULL);
473
474 spl_t s = splsched();
475 thread_lock(thread);
476
477 thread_depress_abort_locked(thread);
478
479 /*
480 * Before we take the thread_lock right above,
481 * act_set_ast_reset_pcs() might not yet observe
482 * that the thread is inactive, and could have
483 * requested an IPI Ack.
484 *
485 * Once we unlock the thread, we know that
486 * act_set_ast_reset_pcs() can't fail to notice
487 * that thread->active is false,
488 * and won't set new ones.
489 */
490 thread_reset_pcs_ack_IPI(thread);
491
492 thread_unlock(thread);
493
494 splx(s);
495
496 #if CONFIG_TASKWATCH
497 thead_remove_taskwatch(thread);
498 #endif /* CONFIG_TASKWATCH */
499
500 work_interval_thread_terminate(thread);
501
502 thread_mtx_lock(thread);
503
504 thread_policy_reset(thread);
505
506 thread_mtx_unlock(thread);
507
508 assert(thread->th_work_interval == NULL);
509
510 bank_swap_thread_bank_ledger(thread, NULL);
511
512 if (kdebug_enable && bsd_hasthreadname(get_bsdthread_info(thread))) {
513 char threadname[MAXTHREADNAMESIZE];
514 bsd_getthreadname(get_bsdthread_info(thread), threadname);
515 kernel_debug_string_simple(TRACE_STRING_THREADNAME_PREV, threadname);
516 }
517
518 uthread_cleanup(get_bsdthread_info(thread), tro);
519
520 if (kdebug_enable && bsd_info && !task_is_exec_copy(task)) {
521 /* trace out pid before we sign off */
522 long dbg_arg1 = 0;
523 long dbg_arg2 = 0;
524
525 kdbg_trace_data(get_bsdtask_info(task), &dbg_arg1, &dbg_arg2);
526 #if CONFIG_PERVASIVE_CPI
527 if (kdebug_debugid_enabled(DBG_MT_INSTRS_CYCLES_THR_EXIT)) {
528 struct recount_usage usage = { 0 };
529 struct recount_usage perf_only = { 0 };
530 boolean_t intrs_end = ml_set_interrupts_enabled(FALSE);
531 recount_current_thread_usage_perf_only(&usage, &perf_only);
532 ml_set_interrupts_enabled(intrs_end);
533 KDBG_RELEASE(DBG_MT_INSTRS_CYCLES_THR_EXIT,
534 recount_usage_instructions(&usage),
535 recount_usage_cycles(&usage),
536 recount_usage_system_time_mach(&usage),
537 usage.ru_metrics[RCT_LVL_USER].rm_time_mach);
538 #if __AMP__
539 KDBG_RELEASE(DBG_MT_P_INSTRS_CYCLES_THR_EXIT,
540 recount_usage_instructions(&perf_only),
541 recount_usage_cycles(&perf_only),
542 recount_usage_system_time_mach(&perf_only),
543 perf_only.ru_metrics[RCT_LVL_USER].rm_time_mach);
544 #endif // __AMP__
545 }
546 #endif/* CONFIG_PERVASIVE_CPI */
547 KDBG_RELEASE(TRACE_DATA_THREAD_TERMINATE_PID, dbg_arg1, dbg_arg2);
548 }
549
550 /*
551 * After this subtraction, this thread should never access
552 * task->bsd_info unless it got 0 back from the os_atomic_dec. It
553 * could be racing with other threads to be the last thread in the
554 * process, and the last thread in the process will tear down the proc
555 * structure and zero-out task->bsd_info.
556 */
557 threadcnt = os_atomic_dec(&task->active_thread_count, relaxed);
558
559 #if CONFIG_COALITIONS
560 /*
561 * Leave the coalitions when last thread of task is exiting and the
562 * task is not a corpse.
563 */
564 if (threadcnt == 0 && !task->corpse_info) {
565 coalitions_remove_task(task);
566 }
567 #endif
568
569 /*
570 * If we are the last thread to terminate and the task is
571 * associated with a BSD process, perform BSD process exit.
572 */
573 if (threadcnt == 0 && bsd_info != NULL) {
574 mach_exception_data_type_t subcode = 0;
575 if (kdebug_enable) {
576 /* since we're the last thread in this process, trace out the command name too */
577 long args[4] = { 0 };
578 kdebug_proc_name_args(bsd_info, args);
579 #if CONFIG_PERVASIVE_CPI
580 if (kdebug_debugid_enabled(DBG_MT_INSTRS_CYCLES_PROC_EXIT)) {
581 struct recount_usage usage = { 0 };
582 struct recount_usage perf_only = { 0 };
583 recount_current_task_usage_perf_only(&usage, &perf_only);
584 KDBG_RELEASE(DBG_MT_INSTRS_CYCLES_PROC_EXIT,
585 recount_usage_instructions(&usage),
586 recount_usage_cycles(&usage),
587 recount_usage_system_time_mach(&usage),
588 usage.ru_metrics[RCT_LVL_USER].rm_time_mach);
589 #if __AMP__
590 KDBG_RELEASE(DBG_MT_P_INSTRS_CYCLES_PROC_EXIT,
591 recount_usage_instructions(&perf_only),
592 recount_usage_cycles(&perf_only),
593 recount_usage_system_time_mach(&perf_only),
594 perf_only.ru_metrics[RCT_LVL_USER].rm_time_mach);
595 #endif // __AMP__
596 }
597 #endif/* CONFIG_PERVASIVE_CPI */
598 KDBG_RELEASE(TRACE_STRING_PROC_EXIT, args[0], args[1], args[2], args[3]);
599 }
600
601 /* Get the exit reason before proc_exit */
602 subcode = proc_encode_exit_exception_code(bsd_info);
603 proc_exit(bsd_info);
604 bsd_info = NULL;
605 #if CONFIG_EXCLAVES
606 task_clear_conclave(task);
607 #endif
608 /*
609 * if there is crash info in task
610 * then do the deliver action since this is
611 * last thread for this task.
612 */
613 if (task->corpse_info) {
614 /* reset all except task name port */
615 ipc_task_reset(task);
616 /* enable all task ports (name port unchanged) */
617 ipc_task_enable(task);
618 exception_type_t etype = get_exception_from_corpse_crashinfo(task->corpse_info);
619 task_deliver_crash_notification(task, current_thread(), etype, subcode);
620 }
621 }
622
623 if (threadcnt == 0) {
624 task_lock(task);
625 if (task_is_a_corpse_fork(task)) {
626 thread_wakeup((event_t)&task->active_thread_count);
627 }
628 task_unlock(task);
629 }
630
631 #if CONFIG_EXCLAVES
632 exclaves_thread_terminate(thread);
633 #endif
634
635 if (thread->th_vm_faults_disabled) {
636 panic("Thread %p terminating with vm_faults disabled.", thread);
637 }
638
639 s = splsched();
640 thread_lock(thread);
641
642 /*
643 * Ensure that the depress timer is no longer enqueued,
644 * so the timer can be safely deallocated
645 *
646 * TODO: build timer_call_cancel_wait
647 */
648
649 assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) == 0);
650
651 uint32_t delay_us = 1;
652
653 while (thread->depress_timer_active > 0) {
654 thread_unlock(thread);
655 splx(s);
656
657 delay(delay_us++);
658
659 if (delay_us > USEC_PER_SEC) {
660 panic("depress timer failed to inactivate!"
661 "thread: %p depress_timer_active: %d",
662 thread, thread->depress_timer_active);
663 }
664
665 s = splsched();
666 thread_lock(thread);
667 }
668
669 /*
670 * Cancel wait timer, and wait for
671 * concurrent expirations.
672 */
673 if (thread->wait_timer_armed) {
674 thread->wait_timer_armed = false;
675
676 if (timer_call_cancel(thread->wait_timer)) {
677 thread->wait_timer_active--;
678 }
679 }
680
681 delay_us = 1;
682
683 while (thread->wait_timer_active > 0) {
684 thread_unlock(thread);
685 splx(s);
686
687 delay(delay_us++);
688
689 if (delay_us > USEC_PER_SEC) {
690 panic("wait timer failed to inactivate!"
691 "thread: %p, wait_timer_active: %d, "
692 "wait_timer_armed: %d",
693 thread, thread->wait_timer_active,
694 thread->wait_timer_armed);
695 }
696
697 s = splsched();
698 thread_lock(thread);
699 }
700
701 /*
702 * If there is a reserved stack, release it.
703 */
704 if (thread->reserved_stack != 0) {
705 stack_free_reserved(thread);
706 thread->reserved_stack = 0;
707 }
708
709 /*
710 * Mark thread as terminating, and block.
711 */
712 thread->state |= TH_TERMINATE;
713 thread_mark_wait_locked(thread, THREAD_UNINT);
714
715 #if CONFIG_EXCLAVES
716 assert(thread->th_exclaves_ipc_ctx.ipcb == NULL);
717 assert(thread->th_exclaves_ipc_ctx.scid == 0);
718 assert(thread->th_exclaves_intstate == 0);
719 assert(thread->th_exclaves_state == 0);
720 #endif
721 assert(thread->th_work_interval_flags == TH_WORK_INTERVAL_FLAGS_NONE);
722 assert(thread->kern_promotion_schedpri == 0);
723 if (thread->rwlock_count > 0) {
724 panic("rwlock_count is %d for thread %p, possibly it still holds a rwlock", thread->rwlock_count, thread);
725 }
726 assert(thread->priority_floor_count == 0);
727 assert(thread->handoff_thread == THREAD_NULL);
728 assert(thread->th_work_interval == NULL);
729 assert(thread->t_rr_state.trr_value == 0);
730
731 assert3u(0, ==, thread->sched_flags &
732 (TH_SFLAG_WAITQ_PROMOTED |
733 TH_SFLAG_RW_PROMOTED |
734 TH_SFLAG_EXEC_PROMOTED |
735 TH_SFLAG_FLOOR_PROMOTED |
736 TH_SFLAG_PROMOTED |
737 TH_SFLAG_DEPRESS));
738
739 thread_unlock(thread);
740 /* splsched */
741
742 thread_block((thread_continue_t)thread_terminate_continue);
743 /*NOTREACHED*/
744 }
745
746 static bool
thread_ref_release(thread_t thread)747 thread_ref_release(thread_t thread)
748 {
749 if (thread == THREAD_NULL) {
750 return false;
751 }
752
753 assert_thread_magic(thread);
754
755 return os_ref_release_raw(&thread->ref_count, &thread_refgrp) == 0;
756 }
757
758 /* Drop a thread refcount safely without triggering a zfree */
759 void
thread_deallocate_safe(thread_t thread)760 thread_deallocate_safe(thread_t thread)
761 {
762 if (__improbable(thread_ref_release(thread))) {
763 /* enqueue the thread for thread deallocate deamon to call thread_deallocate_complete */
764 thread_deallocate_enqueue(thread);
765 }
766 }
767
768 void
thread_deallocate(thread_t thread)769 thread_deallocate(thread_t thread)
770 {
771 if (__improbable(thread_ref_release(thread))) {
772 thread_deallocate_complete(thread);
773 }
774 }
775
/*
 * thread_deallocate_complete:
 *
 * Final teardown of a thread whose refcount has reached zero: release
 * every resource still attached to it and return the structure to the
 * zone.  Runs either inline (thread_deallocate) or from the deallocate
 * daemon (thread_deallocate_safe).
 */
void
thread_deallocate_complete(
	thread_t thread)
{
	task_t task;

	assert_thread_magic(thread);

	/* Must only run once all references are gone. */
	assert(os_ref_get_count_raw(&thread->ref_count) == 0);

	if (!(thread->state & TH_TERMINATE2)) {
		panic("thread_deallocate: thread not properly terminated");
	}

	thread_assert_runq_null(thread);
	assert(!(thread->state & TH_WAKING));

#if CONFIG_CPU_COUNTERS
	kpc_thread_destroy(thread);
#endif /* CONFIG_CPU_COUNTERS */

	ipc_thread_terminate(thread);

	proc_thread_qos_deallocate(thread);

	task = get_threadtask(thread);

#ifdef MACH_BSD
	uthread_destroy(get_bsdthread_info(thread));
#endif /* MACH_BSD */

	/* Release ledger references. */
	if (thread->t_ledger) {
		ledger_dereference(thread->t_ledger);
	}
	if (thread->t_threadledger) {
		ledger_dereference(thread->t_threadledger);
	}

	assert(thread->turnstile != TURNSTILE_NULL);
	if (thread->turnstile) {
		turnstile_deallocate(thread->turnstile);
	}
	turnstile_compact_id_put(thread->ctsid);

	if (IPC_VOUCHER_NULL != thread->ith_voucher) {
		ipc_voucher_release(thread->ith_voucher);
	}

	kfree_data(thread->thread_io_stats, sizeof(struct io_stat_info));
#if CONFIG_PREADOPT_TG
	/* Drop any pre-adopted thread group references. */
	if (thread->old_preadopt_thread_group) {
		thread_group_release(thread->old_preadopt_thread_group);
	}

	if (thread->preadopt_thread_group) {
		thread_group_release(thread->preadopt_thread_group);
	}
#endif /* CONFIG_PREADOPT_TG */

	if (thread->kernel_stack != 0) {
		stack_free(thread);
	}

	recount_thread_deinit(&thread->th_recount);

	lck_mtx_destroy(&thread->mutex, &thread_lck_grp);
	machine_thread_destroy(thread);

	/* Drop the thread's reference on its task. */
	task_deallocate_grp(task, TASK_GRP_INTERNAL);

#if MACH_ASSERT
	/* Poison the magic so stale pointers trip the next assert. */
	assert_thread_magic(thread);
	thread->thread_magic = 0;
#endif /* MACH_ASSERT */

	/* Remove from the global terminated-threads bookkeeping. */
	lck_mtx_lock(&tasks_threads_lock);
	assert(terminated_threads_count > 0);
	queue_remove(&terminated_threads, thread, thread_t, threads);
	terminated_threads_count--;
	lck_mtx_unlock(&tasks_threads_lock);

	timer_call_free(thread->depress_timer);
	timer_call_free(thread->wait_timer);

	ctid_table_remove(thread);

	thread_ro_destroy(thread);
	zfree(thread_zone, thread);
}
865
866 /*
867 * thread_inspect_deallocate:
868 *
869 * Drop a thread inspection reference.
870 */
871 void
thread_inspect_deallocate(thread_inspect_t thread_inspect)872 thread_inspect_deallocate(
873 thread_inspect_t thread_inspect)
874 {
875 return thread_deallocate((thread_t)thread_inspect);
876 }
877
878 /*
879 * thread_read_deallocate:
880 *
881 * Drop a reference on thread read port.
882 */
883 void
thread_read_deallocate(thread_read_t thread_read)884 thread_read_deallocate(
885 thread_read_t thread_read)
886 {
887 return thread_deallocate((thread_t)thread_read);
888 }
889
890
891 /*
892 * thread_exception_queue_invoke:
893 *
894 * Deliver EXC_{RESOURCE,GUARD} exception
895 */
896 static void
thread_exception_queue_invoke(mpsc_queue_chain_t elm,__assert_only mpsc_daemon_queue_t dq)897 thread_exception_queue_invoke(mpsc_queue_chain_t elm,
898 __assert_only mpsc_daemon_queue_t dq)
899 {
900 struct thread_exception_elt *elt;
901 task_t task;
902 thread_t thread;
903 exception_type_t etype;
904
905 assert(dq == &thread_exception_queue);
906 elt = mpsc_queue_element(elm, struct thread_exception_elt, link);
907
908 etype = elt->exception_type;
909 task = elt->exception_task;
910 thread = elt->exception_thread;
911 assert_thread_magic(thread);
912
913 kfree_type(struct thread_exception_elt, elt);
914
915 /* wait for all the threads in the task to terminate */
916 task_lock(task);
917 task_wait_till_threads_terminate_locked(task);
918 task_unlock(task);
919
920 /* Consumes the task ref returned by task_generate_corpse_internal */
921 task_deallocate(task);
922 /* Consumes the thread ref returned by task_generate_corpse_internal */
923 thread_deallocate(thread);
924
925 /* Deliver the notification, also clears the corpse. */
926 task_deliver_crash_notification(task, thread, etype, 0);
927 }
928
929 static void
thread_backtrace_queue_invoke(mpsc_queue_chain_t elm,__assert_only mpsc_daemon_queue_t dq)930 thread_backtrace_queue_invoke(mpsc_queue_chain_t elm,
931 __assert_only mpsc_daemon_queue_t dq)
932 {
933 struct thread_backtrace_elt *elt;
934 kcdata_object_t obj;
935 exception_port_t exc_ports[BT_EXC_PORTS_COUNT]; /* send rights */
936 exception_type_t etype;
937
938 assert(dq == &thread_backtrace_queue);
939 elt = mpsc_queue_element(elm, struct thread_backtrace_elt, link);
940
941 obj = elt->obj;
942 memcpy(exc_ports, elt->exc_ports, sizeof(ipc_port_t) * BT_EXC_PORTS_COUNT);
943 etype = elt->exception_type;
944
945 kfree_type(struct thread_backtrace_elt, elt);
946
947 /* Deliver to backtrace exception ports */
948 exception_deliver_backtrace(obj, exc_ports, etype);
949
950 /*
951 * Release port right and kcdata object refs given by
952 * task_enqueue_exception_with_corpse()
953 */
954
955 for (unsigned int i = 0; i < BT_EXC_PORTS_COUNT; i++) {
956 ipc_port_release_send(exc_ports[i]);
957 }
958
959 kcdata_object_release(obj);
960 }
961
962 /*
963 * thread_exception_enqueue:
964 *
965 * Enqueue a corpse port to be delivered an EXC_{RESOURCE,GUARD}.
966 */
967 void
thread_exception_enqueue(task_t task,thread_t thread,exception_type_t etype)968 thread_exception_enqueue(
969 task_t task,
970 thread_t thread,
971 exception_type_t etype)
972 {
973 assert(EXC_RESOURCE == etype || EXC_GUARD == etype);
974 struct thread_exception_elt *elt = kalloc_type(struct thread_exception_elt, Z_WAITOK | Z_NOFAIL);
975 elt->exception_type = etype;
976 elt->exception_task = task;
977 elt->exception_thread = thread;
978
979 mpsc_daemon_enqueue(&thread_exception_queue, &elt->link,
980 MPSC_QUEUE_DISABLE_PREEMPTION);
981 }
982
983 void
thread_backtrace_enqueue(kcdata_object_t obj,exception_port_t ports[static BT_EXC_PORTS_COUNT],exception_type_t etype)984 thread_backtrace_enqueue(
985 kcdata_object_t obj,
986 exception_port_t ports[static BT_EXC_PORTS_COUNT],
987 exception_type_t etype)
988 {
989 struct thread_backtrace_elt *elt = kalloc_type(struct thread_backtrace_elt, Z_WAITOK | Z_NOFAIL);
990 elt->obj = obj;
991 elt->exception_type = etype;
992
993 memcpy(elt->exc_ports, ports, sizeof(ipc_port_t) * BT_EXC_PORTS_COUNT);
994
995 mpsc_daemon_enqueue(&thread_backtrace_queue, &elt->link,
996 MPSC_QUEUE_DISABLE_PREEMPTION);
997 }
998
999 /*
1000 * thread_copy_resource_info
1001 *
1002 * Copy the resource info counters from source
1003 * thread to destination thread.
1004 */
1005 void
thread_copy_resource_info(thread_t dst_thread,thread_t src_thread)1006 thread_copy_resource_info(
1007 thread_t dst_thread,
1008 thread_t src_thread)
1009 {
1010 dst_thread->c_switch = src_thread->c_switch;
1011 dst_thread->p_switch = src_thread->p_switch;
1012 dst_thread->ps_switch = src_thread->ps_switch;
1013 dst_thread->sched_time_save = src_thread->sched_time_save;
1014 dst_thread->runnable_timer = src_thread->runnable_timer;
1015 dst_thread->vtimer_user_save = src_thread->vtimer_user_save;
1016 dst_thread->vtimer_prof_save = src_thread->vtimer_prof_save;
1017 dst_thread->vtimer_rlim_save = src_thread->vtimer_rlim_save;
1018 dst_thread->vtimer_qos_save = src_thread->vtimer_qos_save;
1019 dst_thread->syscalls_unix = src_thread->syscalls_unix;
1020 dst_thread->syscalls_mach = src_thread->syscalls_mach;
1021 ledger_rollup(dst_thread->t_threadledger, src_thread->t_threadledger);
1022 recount_thread_copy(&dst_thread->th_recount, &src_thread->th_recount);
1023 *dst_thread->thread_io_stats = *src_thread->thread_io_stats;
1024 }
1025
1026 static void
thread_terminate_queue_invoke(mpsc_queue_chain_t e,__assert_only mpsc_daemon_queue_t dq)1027 thread_terminate_queue_invoke(mpsc_queue_chain_t e,
1028 __assert_only mpsc_daemon_queue_t dq)
1029 {
1030 thread_t thread = mpsc_queue_element(e, struct thread, mpsc_links);
1031 task_t task = get_threadtask(thread);
1032
1033 assert(dq == &thread_terminate_queue);
1034
1035 task_lock(task);
1036
1037 /*
1038 * if marked for crash reporting, skip reaping.
1039 * The corpse delivery thread will clear bit and enqueue
1040 * for reaping when done
1041 *
1042 * Note: the inspection field is set under the task lock
1043 *
1044 * FIXME[mad]: why enqueue for termination before `inspection` is false ?
1045 */
1046 if (__improbable(thread->inspection)) {
1047 simple_lock(&crashed_threads_lock, &thread_lck_grp);
1048 task_unlock(task);
1049
1050 enqueue_tail(&crashed_threads_queue, &thread->runq_links);
1051 simple_unlock(&crashed_threads_lock);
1052 return;
1053 }
1054
1055 recount_task_rollup_thread(&task->tk_recount, &thread->th_recount);
1056
1057 task->total_runnable_time += timer_grab(&thread->runnable_timer);
1058 task->c_switch += thread->c_switch;
1059 task->p_switch += thread->p_switch;
1060 task->ps_switch += thread->ps_switch;
1061
1062 task->syscalls_unix += thread->syscalls_unix;
1063 task->syscalls_mach += thread->syscalls_mach;
1064
1065 task->task_timer_wakeups_bin_1 += thread->thread_timer_wakeups_bin_1;
1066 task->task_timer_wakeups_bin_2 += thread->thread_timer_wakeups_bin_2;
1067 task->task_gpu_ns += ml_gpu_stat(thread);
1068 task->decompressions += thread->decompressions;
1069
1070 thread_update_qos_cpu_time(thread);
1071
1072 queue_remove(&task->threads, thread, thread_t, task_threads);
1073 task->thread_count--;
1074
1075 /*
1076 * If the task is being halted, and there is only one thread
1077 * left in the task after this one, then wakeup that thread.
1078 */
1079 if (task->thread_count == 1 && task->halting) {
1080 thread_wakeup((event_t)&task->halting);
1081 }
1082
1083 task_unlock(task);
1084
1085 lck_mtx_lock(&tasks_threads_lock);
1086 queue_remove(&threads, thread, thread_t, threads);
1087 threads_count--;
1088 queue_enter(&terminated_threads, thread, thread_t, threads);
1089 terminated_threads_count++;
1090 lck_mtx_unlock(&tasks_threads_lock);
1091
1092 #if MACH_BSD
1093 /*
1094 * The thread no longer counts against the task's thread count,
1095 * we can now wake up any pending joiner.
1096 *
1097 * Note that the inheritor will be set to `thread` which is
1098 * incorrect once it is on the termination queue, however
1099 * the termination queue runs at MINPRI_KERNEL which is higher
1100 * than any user thread, so this isn't a priority inversion.
1101 */
1102 if (thread_get_tag(thread) & THREAD_TAG_USER_JOIN) {
1103 struct uthread *uth = get_bsdthread_info(thread);
1104 mach_port_name_t kport = uthread_joiner_port(uth);
1105
1106 /*
1107 * Clear the port low two bits to tell pthread that thread is gone.
1108 */
1109 #ifndef NO_PORT_GEN
1110 kport &= ~MACH_PORT_MAKE(0, IE_BITS_GEN_MASK + IE_BITS_GEN_ONE);
1111 #else
1112 kport |= MACH_PORT_MAKE(0, ~(IE_BITS_GEN_MASK + IE_BITS_GEN_ONE));
1113 #endif
1114 (void)copyoutmap_atomic32(task->map, kport,
1115 uthread_joiner_address(uth));
1116 uthread_joiner_wake(task, uth);
1117 }
1118 #endif
1119
1120 thread_deallocate(thread);
1121 }
1122
1123 static void
thread_deallocate_queue_invoke(mpsc_queue_chain_t e,__assert_only mpsc_daemon_queue_t dq)1124 thread_deallocate_queue_invoke(mpsc_queue_chain_t e,
1125 __assert_only mpsc_daemon_queue_t dq)
1126 {
1127 thread_t thread = mpsc_queue_element(e, struct thread, mpsc_links);
1128
1129 assert(dq == &thread_deallocate_queue);
1130
1131 thread_deallocate_complete(thread);
1132 }
1133
1134 /*
1135 * thread_terminate_enqueue:
1136 *
1137 * Enqueue a terminating thread for final disposition.
1138 *
1139 * Called at splsched.
1140 */
1141 void
thread_terminate_enqueue(thread_t thread)1142 thread_terminate_enqueue(
1143 thread_t thread)
1144 {
1145 KDBG_RELEASE(TRACE_DATA_THREAD_TERMINATE, thread->thread_id);
1146
1147 mpsc_daemon_enqueue(&thread_terminate_queue, &thread->mpsc_links,
1148 MPSC_QUEUE_DISABLE_PREEMPTION);
1149 }
1150
1151 /*
1152 * thread_deallocate_enqueue:
1153 *
1154 * Enqueue a thread for final deallocation.
1155 */
1156 static void
thread_deallocate_enqueue(thread_t thread)1157 thread_deallocate_enqueue(
1158 thread_t thread)
1159 {
1160 mpsc_daemon_enqueue(&thread_deallocate_queue, &thread->mpsc_links,
1161 MPSC_QUEUE_DISABLE_PREEMPTION);
1162 }
1163
1164 /*
1165 * thread_terminate_crashed_threads:
1166 * walk the list of crashed threads and put back set of threads
1167 * who are no longer being inspected.
1168 */
1169 void
thread_terminate_crashed_threads(void)1170 thread_terminate_crashed_threads(void)
1171 {
1172 thread_t th_remove;
1173
1174 simple_lock(&crashed_threads_lock, &thread_lck_grp);
1175 /*
1176 * loop through the crashed threads queue
1177 * to put any threads that are not being inspected anymore
1178 */
1179
1180 qe_foreach_element_safe(th_remove, &crashed_threads_queue, runq_links) {
1181 /* make sure current_thread is never in crashed queue */
1182 assert(th_remove != current_thread());
1183
1184 if (th_remove->inspection == FALSE) {
1185 remqueue(&th_remove->runq_links);
1186 mpsc_daemon_enqueue(&thread_terminate_queue, &th_remove->mpsc_links,
1187 MPSC_QUEUE_NONE);
1188 }
1189 }
1190
1191 simple_unlock(&crashed_threads_lock);
1192 }
1193
1194 /*
1195 * thread_stack_queue_invoke:
1196 *
1197 * Perform stack allocation as required due to
1198 * invoke failures.
1199 */
1200 static void
thread_stack_queue_invoke(mpsc_queue_chain_t elm,__assert_only mpsc_daemon_queue_t dq)1201 thread_stack_queue_invoke(mpsc_queue_chain_t elm,
1202 __assert_only mpsc_daemon_queue_t dq)
1203 {
1204 thread_t thread = mpsc_queue_element(elm, struct thread, mpsc_links);
1205
1206 assert(dq == &thread_stack_queue);
1207
1208 /* allocate stack with interrupts enabled so that we can call into VM */
1209 stack_alloc(thread);
1210
1211 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_WAIT) | DBG_FUNC_END, thread_tid(thread), 0, 0, 0, 0);
1212
1213 spl_t s = splsched();
1214 thread_lock(thread);
1215 thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
1216 thread_unlock(thread);
1217 splx(s);
1218 }
1219
1220 /*
1221 * thread_stack_enqueue:
1222 *
1223 * Enqueue a thread for stack allocation.
1224 *
1225 * Called at splsched.
1226 */
1227 void
thread_stack_enqueue(thread_t thread)1228 thread_stack_enqueue(
1229 thread_t thread)
1230 {
1231 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_WAIT) | DBG_FUNC_START, thread_tid(thread), 0, 0, 0, 0);
1232 assert_thread_magic(thread);
1233
1234 mpsc_daemon_enqueue(&thread_stack_queue, &thread->mpsc_links,
1235 MPSC_QUEUE_DISABLE_PREEMPTION);
1236 }
1237
1238 void
thread_daemon_init(void)1239 thread_daemon_init(void)
1240 {
1241 kern_return_t result;
1242
1243 thread_deallocate_daemon_init();
1244
1245 thread_deallocate_daemon_register_queue(&thread_terminate_queue,
1246 thread_terminate_queue_invoke);
1247
1248 thread_deallocate_daemon_register_queue(&thread_deallocate_queue,
1249 thread_deallocate_queue_invoke);
1250
1251 ipc_object_deallocate_register_queue();
1252
1253 simple_lock_init(&crashed_threads_lock, 0);
1254 queue_init(&crashed_threads_queue);
1255
1256 result = mpsc_daemon_queue_init_with_thread(&thread_stack_queue,
1257 thread_stack_queue_invoke, BASEPRI_PREEMPT_HIGH,
1258 "daemon.thread-stack", MPSC_DAEMON_INIT_NONE);
1259 if (result != KERN_SUCCESS) {
1260 panic("thread_daemon_init: thread_stack_daemon");
1261 }
1262
1263 result = mpsc_daemon_queue_init_with_thread(&thread_exception_queue,
1264 thread_exception_queue_invoke, MINPRI_KERNEL,
1265 "daemon.thread-exception", MPSC_DAEMON_INIT_NONE);
1266
1267 if (result != KERN_SUCCESS) {
1268 panic("thread_daemon_init: thread_exception_daemon");
1269 }
1270
1271 result = mpsc_daemon_queue_init_with_thread(&thread_backtrace_queue,
1272 thread_backtrace_queue_invoke, MINPRI_KERNEL,
1273 "daemon.thread-backtrace", MPSC_DAEMON_INIT_NONE);
1274
1275 if (result != KERN_SUCCESS) {
1276 panic("thread_daemon_init: thread_backtrace_daemon");
1277 }
1278 }
1279
1280 __options_decl(thread_create_internal_options_t, uint32_t, {
1281 TH_OPTION_NONE = 0x00,
1282 TH_OPTION_NOSUSP = 0x02,
1283 TH_OPTION_WORKQ = 0x04,
1284 TH_OPTION_MAINTHREAD = 0x08,
1285 });
1286
1287 void
main_thread_set_immovable_pinned(thread_t thread)1288 main_thread_set_immovable_pinned(thread_t thread)
1289 {
1290 ipc_main_thread_set_immovable_pinned(thread);
1291 }
1292
1293 /*
1294 * Create a new thread.
1295 * Doesn't start the thread running.
1296 *
1297 * Task and tasks_threads_lock are returned locked on success.
1298 */
1299 static kern_return_t
thread_create_internal(task_t parent_task,integer_t priority,thread_continue_t continuation,void * parameter,thread_create_internal_options_t options,thread_t * out_thread)1300 thread_create_internal(
1301 task_t parent_task,
1302 integer_t priority,
1303 thread_continue_t continuation,
1304 void *parameter,
1305 thread_create_internal_options_t options,
1306 thread_t *out_thread)
1307 {
1308 thread_t new_thread;
1309 ipc_thread_init_options_t init_options = IPC_THREAD_INIT_NONE;
1310 struct thread_ro tro_tpl = { };
1311 bool first_thread = false;
1312 kern_return_t kr = KERN_FAILURE;
1313
1314 /*
1315 * Allocate a thread and initialize static fields
1316 */
1317 new_thread = zalloc_flags(thread_zone, Z_WAITOK | Z_NOFAIL);
1318
1319 if (__improbable(current_thread() == &init_thread)) {
1320 /*
1321 * The first thread ever is a global, but because we want to be
1322 * able to zone_id_require() threads, we have to stop using the
1323 * global piece of memory we used to boostrap the kernel and
1324 * jump to a proper thread from a zone.
1325 *
1326 * This is why that one thread will inherit its original
1327 * state differently.
1328 *
1329 * Also remember this thread in `vm_pageout_scan_thread`
1330 * as this is what the first thread ever becomes.
1331 *
1332 * Also pre-warm the depress timer since the VM pageout scan
1333 * daemon might need to use it.
1334 */
1335 assert(vm_pageout_scan_thread == THREAD_NULL);
1336 vm_pageout_scan_thread = new_thread;
1337
1338 first_thread = true;
1339 #pragma clang diagnostic push
1340 #pragma clang diagnostic ignored "-Wnontrivial-memaccess"
1341 /* work around 74481146 */
1342 memcpy(new_thread, &init_thread, sizeof(*new_thread));
1343 #pragma clang diagnostic pop
1344
1345 /*
1346 * Make the ctid table functional
1347 */
1348 ctid_table_init();
1349 new_thread->ctid = 0;
1350 } else {
1351 init_thread_from_template(new_thread);
1352 }
1353
1354 if (options & TH_OPTION_MAINTHREAD) {
1355 init_options |= IPC_THREAD_INIT_MAINTHREAD;
1356 }
1357
1358 os_ref_init_count_raw(&new_thread->ref_count, &thread_refgrp, 2);
1359 machine_thread_create(new_thread, parent_task, first_thread);
1360
1361 machine_thread_process_signature(new_thread, parent_task);
1362
1363 #ifdef MACH_BSD
1364 uthread_init(parent_task, get_bsdthread_info(new_thread),
1365 &tro_tpl, (options & TH_OPTION_WORKQ) != 0);
1366 if (!task_is_a_corpse(parent_task)) {
1367 /*
1368 * uthread_init will set tro_cred (with a +1)
1369 * and tro_proc for live tasks.
1370 */
1371 assert(tro_tpl.tro_cred && tro_tpl.tro_proc);
1372 }
1373 #endif /* MACH_BSD */
1374
1375 thread_lock_init(new_thread);
1376 wake_lock_init(new_thread);
1377
1378 lck_mtx_init(&new_thread->mutex, &thread_lck_grp, LCK_ATTR_NULL);
1379
1380 ipc_thread_init(parent_task, new_thread, &tro_tpl, init_options);
1381
1382 thread_ro_create(parent_task, new_thread, &tro_tpl);
1383
1384 new_thread->continuation = continuation;
1385 new_thread->parameter = parameter;
1386 new_thread->inheritor_flags = TURNSTILE_UPDATE_FLAGS_NONE;
1387 new_thread->requested_policy = default_thread_requested_policy;
1388 new_thread->__runq.runq = PROCESSOR_NULL;
1389 priority_queue_init(&new_thread->sched_inheritor_queue);
1390 priority_queue_init(&new_thread->base_inheritor_queue);
1391 #if CONFIG_SCHED_CLUTCH
1392 priority_queue_entry_init(&new_thread->th_clutch_runq_link);
1393 priority_queue_entry_init(&new_thread->th_clutch_pri_link);
1394 #endif /* CONFIG_SCHED_CLUTCH */
1395
1396 #if CONFIG_SCHED_EDGE
1397 new_thread->th_bound_cluster_enqueued = false;
1398 for (cluster_shared_rsrc_type_t shared_rsrc_type = CLUSTER_SHARED_RSRC_TYPE_MIN; shared_rsrc_type < CLUSTER_SHARED_RSRC_TYPE_COUNT; shared_rsrc_type++) {
1399 new_thread->th_shared_rsrc_enqueued[shared_rsrc_type] = false;
1400 new_thread->th_shared_rsrc_heavy_user[shared_rsrc_type] = false;
1401 new_thread->th_shared_rsrc_heavy_perf_control[shared_rsrc_type] = false;
1402 }
1403 #endif /* CONFIG_SCHED_EDGE */
1404 new_thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE;
1405
1406 /* Allocate I/O Statistics structure */
1407 new_thread->thread_io_stats = kalloc_data(sizeof(struct io_stat_info),
1408 Z_WAITOK | Z_ZERO | Z_NOFAIL);
1409
1410 #if KASAN_CLASSIC
1411 kasan_init_thread(&new_thread->kasan_data);
1412 #endif /* KASAN_CLASSIC */
1413
1414 #if CONFIG_KCOV
1415 kcov_init_thread(&new_thread->kcov_data);
1416 #endif
1417
1418 #if CONFIG_IOSCHED
1419 /* Clear out the I/O Scheduling info for AppleFSCompression */
1420 new_thread->decmp_upl = NULL;
1421 #endif /* CONFIG_IOSCHED */
1422
1423 new_thread->thread_region_page_shift = 0;
1424
1425 #if DEVELOPMENT || DEBUG
1426 task_lock(parent_task);
1427 uint16_t thread_limit = parent_task->task_thread_limit;
1428 if (exc_resource_threads_enabled &&
1429 thread_limit > 0 &&
1430 parent_task->thread_count >= thread_limit &&
1431 !parent_task->task_has_crossed_thread_limit &&
1432 !(task_is_a_corpse(parent_task))) {
1433 int thread_count = parent_task->thread_count;
1434 parent_task->task_has_crossed_thread_limit = TRUE;
1435 task_unlock(parent_task);
1436 SENDING_NOTIFICATION__TASK_HAS_TOO_MANY_THREADS(parent_task, thread_count);
1437 } else {
1438 task_unlock(parent_task);
1439 }
1440 #endif
1441
1442 lck_mtx_lock(&tasks_threads_lock);
1443 task_lock(parent_task);
1444
1445 /*
1446 * Fail thread creation if parent task is being torn down or has too many threads
1447 * If the caller asked for TH_OPTION_NOSUSP, also fail if the parent task is suspended
1448 */
1449 if (parent_task->active == 0 || parent_task->halting ||
1450 (parent_task->suspend_count > 0 && (options & TH_OPTION_NOSUSP) != 0) ||
1451 (parent_task->thread_count >= task_threadmax && parent_task != kernel_task)) {
1452 task_unlock(parent_task);
1453 lck_mtx_unlock(&tasks_threads_lock);
1454
1455 ipc_thread_disable(new_thread);
1456 ipc_thread_terminate(new_thread);
1457 kfree_data(new_thread->thread_io_stats,
1458 sizeof(struct io_stat_info));
1459 lck_mtx_destroy(&new_thread->mutex, &thread_lck_grp);
1460 kr = KERN_FAILURE;
1461 goto out_thread_cleanup;
1462 }
1463
1464 /* Protected by the tasks_threads_lock */
1465 new_thread->thread_id = ++thread_unique_id;
1466
1467 ctid_table_add(new_thread);
1468
1469 /* New threads inherit any default state on the task */
1470 machine_thread_inherit_taskwide(new_thread, parent_task);
1471
1472 task_reference_grp(parent_task, TASK_GRP_INTERNAL);
1473
1474 if (parent_task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_PERTHR_LIMIT) {
1475 /*
1476 * This task has a per-thread CPU limit; make sure this new thread
1477 * gets its limit set too, before it gets out of the kernel.
1478 */
1479 act_set_astledger(new_thread);
1480 }
1481
1482 /* Instantiate a thread ledger. Do not fail thread creation if ledger creation fails. */
1483 if ((new_thread->t_threadledger = ledger_instantiate(thread_ledger_template,
1484 LEDGER_CREATE_INACTIVE_ENTRIES)) != LEDGER_NULL) {
1485 ledger_entry_setactive(new_thread->t_threadledger, thread_ledgers.cpu_time);
1486 }
1487
1488 new_thread->t_bankledger = LEDGER_NULL;
1489 new_thread->t_deduct_bank_ledger_time = 0;
1490 new_thread->t_deduct_bank_ledger_energy = 0;
1491
1492 new_thread->t_ledger = parent_task->ledger;
1493 if (new_thread->t_ledger) {
1494 ledger_reference(new_thread->t_ledger);
1495 }
1496
1497 recount_thread_init(&new_thread->th_recount);
1498
1499 /* Cache the task's map */
1500 new_thread->map = parent_task->map;
1501
1502 new_thread->depress_timer = timer_call_alloc(thread_depress_expire, new_thread);
1503 new_thread->wait_timer = timer_call_alloc(thread_timer_expire, new_thread);
1504
1505 #if CONFIG_CPU_COUNTERS
1506 kpc_thread_create(new_thread);
1507 #endif /* CONFIG_CPU_COUNTERS */
1508
1509 /* Set the thread's scheduling parameters */
1510 new_thread->sched_mode = SCHED(initial_thread_sched_mode)(parent_task);
1511 new_thread->max_priority = parent_task->max_priority;
1512 new_thread->task_priority = parent_task->priority;
1513
1514 #if CONFIG_THREAD_GROUPS
1515 thread_group_init_thread(new_thread, parent_task);
1516 #endif /* CONFIG_THREAD_GROUPS */
1517
1518 int new_priority = (priority < 0) ? parent_task->priority: priority;
1519 new_priority = (priority < 0)? parent_task->priority: priority;
1520 if (new_priority > new_thread->max_priority) {
1521 new_priority = new_thread->max_priority;
1522 }
1523 #if !defined(XNU_TARGET_OS_OSX)
1524 if (new_priority < MAXPRI_THROTTLE) {
1525 new_priority = MAXPRI_THROTTLE;
1526 }
1527 #endif /* !defined(XNU_TARGET_OS_OSX) */
1528
1529 new_thread->importance = new_priority - new_thread->task_priority;
1530
1531 sched_set_thread_base_priority(new_thread, new_priority);
1532
1533 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
1534 new_thread->sched_stamp = sched_tick;
1535 #if CONFIG_SCHED_CLUTCH
1536 new_thread->pri_shift = sched_clutch_thread_pri_shift(new_thread, new_thread->th_sched_bucket);
1537 #else /* CONFIG_SCHED_CLUTCH */
1538 new_thread->pri_shift = sched_pri_shifts[new_thread->th_sched_bucket];
1539 #endif /* CONFIG_SCHED_CLUTCH */
1540 #endif /* defined(CONFIG_SCHED_TIMESHARE_CORE) */
1541
1542 if (parent_task->max_priority <= MAXPRI_THROTTLE) {
1543 sched_thread_mode_demote(new_thread, TH_SFLAG_THROTTLED);
1544 }
1545
1546 thread_policy_create(new_thread);
1547
1548 /* Chain the thread onto the task's list */
1549 queue_enter(&parent_task->threads, new_thread, thread_t, task_threads);
1550 parent_task->thread_count++;
1551
1552 /* So terminating threads don't need to take the task lock to decrement */
1553 os_atomic_inc(&parent_task->active_thread_count, relaxed);
1554
1555 queue_enter(&threads, new_thread, thread_t, threads);
1556 threads_count++;
1557
1558 new_thread->active = TRUE;
1559 if (task_is_a_corpse_fork(parent_task)) {
1560 /* Set the inspection bit if the task is a corpse fork */
1561 new_thread->inspection = TRUE;
1562 } else {
1563 new_thread->inspection = FALSE;
1564 }
1565 new_thread->corpse_dup = FALSE;
1566 new_thread->turnstile = turnstile_alloc();
1567 new_thread->ctsid = turnstile_compact_id_get();
1568
1569
1570 *out_thread = new_thread;
1571
1572 if (kdebug_enable) {
1573 long args[4] = {};
1574
1575 kdbg_trace_data(get_bsdtask_info(parent_task), &args[1], &args[3]);
1576
1577 /*
1578 * Starting with 26604425, exec'ing creates a new task/thread.
1579 *
1580 * NEWTHREAD in the current process has two possible meanings:
1581 *
1582 * 1) Create a new thread for this process.
1583 * 2) Create a new thread for the future process this will become in an
1584 * exec.
1585 *
1586 * To disambiguate these, arg3 will be set to TRUE for case #2.
1587 *
1588 * The value we need to find (TPF_EXEC_COPY) is stable in the case of a
1589 * task exec'ing. The read of t_procflags does not take the proc_lock.
1590 */
1591 args[2] = task_is_exec_copy(parent_task) ? 1 : 0;
1592
1593 KDBG_RELEASE(TRACE_DATA_NEWTHREAD, (uintptr_t)thread_tid(new_thread),
1594 args[1], args[2], args[3]);
1595
1596 kdebug_proc_name_args(get_bsdtask_info(parent_task), args);
1597 KDBG_RELEASE(TRACE_STRING_NEWTHREAD, args[0], args[1], args[2],
1598 args[3]);
1599 }
1600
1601 DTRACE_PROC1(lwp__create, thread_t, *out_thread);
1602
1603 kr = KERN_SUCCESS;
1604 goto done;
1605
1606 out_thread_cleanup:
1607 #ifdef MACH_BSD
1608 {
1609 struct uthread *ut = get_bsdthread_info(new_thread);
1610
1611 uthread_cleanup(ut, &tro_tpl);
1612 uthread_destroy(ut);
1613 }
1614 #endif /* MACH_BSD */
1615
1616 machine_thread_destroy(new_thread);
1617
1618 thread_ro_destroy(new_thread);
1619 zfree(thread_zone, new_thread);
1620
1621 done:
1622 return kr;
1623 }
1624
1625 static kern_return_t
thread_create_with_options_internal(task_t task,thread_t * new_thread,boolean_t from_user,thread_create_internal_options_t options,thread_continue_t continuation)1626 thread_create_with_options_internal(
1627 task_t task,
1628 thread_t *new_thread,
1629 boolean_t from_user,
1630 thread_create_internal_options_t options,
1631 thread_continue_t continuation)
1632 {
1633 kern_return_t result;
1634 thread_t thread;
1635
1636 if (task == TASK_NULL || task == kernel_task) {
1637 return KERN_INVALID_ARGUMENT;
1638 }
1639
1640 #if CONFIG_MACF
1641 if (from_user && current_task() != task &&
1642 mac_proc_check_remote_thread_create(task, -1, NULL, 0) != 0) {
1643 return KERN_DENIED;
1644 }
1645 #endif
1646
1647 result = thread_create_internal(task, -1, continuation, NULL, options, &thread);
1648 if (result != KERN_SUCCESS) {
1649 return result;
1650 }
1651
1652 thread->user_stop_count = 1;
1653 thread_hold(thread);
1654 if (task->suspend_count > 0) {
1655 thread_hold(thread);
1656 }
1657
1658 if (from_user) {
1659 extmod_statistics_incr_thread_create(task);
1660 }
1661
1662 task_unlock(task);
1663 lck_mtx_unlock(&tasks_threads_lock);
1664
1665 *new_thread = thread;
1666
1667 return KERN_SUCCESS;
1668 }
1669
1670 kern_return_t
thread_create_immovable(task_t task,thread_t * new_thread)1671 thread_create_immovable(
1672 task_t task,
1673 thread_t *new_thread)
1674 {
1675 return thread_create_with_options_internal(task, new_thread, FALSE,
1676 TH_OPTION_NONE, (thread_continue_t)thread_bootstrap_return);
1677 }
1678
1679 kern_return_t
thread_create_from_user(task_t task,thread_t * new_thread)1680 thread_create_from_user(
1681 task_t task,
1682 thread_t *new_thread)
1683 {
1684 /* All thread ports are created immovable by default */
1685 return thread_create_with_options_internal(task, new_thread, TRUE, TH_OPTION_NONE,
1686 (thread_continue_t)thread_bootstrap_return);
1687 }
1688
1689 kern_return_t
thread_create_with_continuation(task_t task,thread_t * new_thread,thread_continue_t continuation)1690 thread_create_with_continuation(
1691 task_t task,
1692 thread_t *new_thread,
1693 thread_continue_t continuation)
1694 {
1695 return thread_create_with_options_internal(task, new_thread, FALSE, TH_OPTION_NONE, continuation);
1696 }
1697
1698 /*
1699 * Create a thread that is already started, but is waiting on an event
1700 */
1701 static kern_return_t
thread_create_waiting_internal(task_t task,thread_continue_t continuation,event_t event,block_hint_t block_hint,thread_create_internal_options_t options,thread_t * new_thread)1702 thread_create_waiting_internal(
1703 task_t task,
1704 thread_continue_t continuation,
1705 event_t event,
1706 block_hint_t block_hint,
1707 thread_create_internal_options_t options,
1708 thread_t *new_thread)
1709 {
1710 kern_return_t result;
1711 thread_t thread;
1712 wait_interrupt_t wait_interrupt = THREAD_INTERRUPTIBLE;
1713
1714 if (task == TASK_NULL || task == kernel_task) {
1715 return KERN_INVALID_ARGUMENT;
1716 }
1717
1718 result = thread_create_internal(task, -1, continuation, NULL,
1719 options, &thread);
1720 if (result != KERN_SUCCESS) {
1721 return result;
1722 }
1723
1724 /* note no user_stop_count or thread_hold here */
1725
1726 if (task->suspend_count > 0) {
1727 thread_hold(thread);
1728 }
1729
1730 thread_mtx_lock(thread);
1731 thread_set_pending_block_hint(thread, block_hint);
1732 if (options & TH_OPTION_WORKQ) {
1733 thread->static_param = true;
1734 event = workq_thread_init_and_wq_lock(task, thread);
1735 } else if (options & TH_OPTION_MAINTHREAD) {
1736 wait_interrupt = THREAD_UNINT;
1737 }
1738 thread_start_in_assert_wait(thread,
1739 assert_wait_queue(event), CAST_EVENT64_T(event),
1740 wait_interrupt);
1741 thread_mtx_unlock(thread);
1742
1743 task_unlock(task);
1744 lck_mtx_unlock(&tasks_threads_lock);
1745
1746 *new_thread = thread;
1747
1748 return KERN_SUCCESS;
1749 }
1750
1751 kern_return_t
main_thread_create_waiting(task_t task,thread_continue_t continuation,event_t event,thread_t * new_thread)1752 main_thread_create_waiting(
1753 task_t task,
1754 thread_continue_t continuation,
1755 event_t event,
1756 thread_t *new_thread)
1757 {
1758 return thread_create_waiting_internal(task, continuation, event,
1759 kThreadWaitNone, TH_OPTION_MAINTHREAD, new_thread);
1760 }
1761
1762
1763 static kern_return_t
thread_create_running_internal2(task_t task,int flavor,thread_state_t new_state,mach_msg_type_number_t new_state_count,thread_t * new_thread,boolean_t from_user)1764 thread_create_running_internal2(
1765 task_t task,
1766 int flavor,
1767 thread_state_t new_state,
1768 mach_msg_type_number_t new_state_count,
1769 thread_t *new_thread,
1770 boolean_t from_user)
1771 {
1772 kern_return_t result;
1773 thread_t thread;
1774
1775 if (task == TASK_NULL || task == kernel_task) {
1776 return KERN_INVALID_ARGUMENT;
1777 }
1778
1779 #if CONFIG_MACF
1780 if (from_user && current_task() != task &&
1781 mac_proc_check_remote_thread_create(task, flavor, new_state, new_state_count) != 0) {
1782 return KERN_DENIED;
1783 }
1784 #endif
1785
1786 result = thread_create_internal(task, -1,
1787 (thread_continue_t)thread_bootstrap_return, NULL,
1788 TH_OPTION_NONE, &thread);
1789 if (result != KERN_SUCCESS) {
1790 return result;
1791 }
1792
1793 if (task->suspend_count > 0) {
1794 thread_hold(thread);
1795 }
1796
1797 if (from_user) {
1798 result = machine_thread_state_convert_from_user(thread, flavor,
1799 new_state, new_state_count, NULL, 0, TSSF_FLAGS_NONE);
1800 }
1801 if (result == KERN_SUCCESS) {
1802 result = machine_thread_set_state(thread, flavor, new_state,
1803 new_state_count);
1804 }
1805 if (result != KERN_SUCCESS) {
1806 task_unlock(task);
1807 lck_mtx_unlock(&tasks_threads_lock);
1808
1809 thread_terminate(thread);
1810 thread_deallocate(thread);
1811 return result;
1812 }
1813
1814 thread_mtx_lock(thread);
1815 thread_start(thread);
1816 thread_mtx_unlock(thread);
1817
1818 if (from_user) {
1819 extmod_statistics_incr_thread_create(task);
1820 }
1821
1822 task_unlock(task);
1823 lck_mtx_unlock(&tasks_threads_lock);
1824
1825 *new_thread = thread;
1826
1827 return result;
1828 }
1829
1830 /* Prototype, see justification above */
1831 kern_return_t
1832 thread_create_running(
1833 task_t task,
1834 int flavor,
1835 thread_state_t new_state,
1836 mach_msg_type_number_t new_state_count,
1837 thread_t *new_thread);
1838
1839 kern_return_t
thread_create_running(task_t task,int flavor,thread_state_t new_state,mach_msg_type_number_t new_state_count,thread_t * new_thread)1840 thread_create_running(
1841 task_t task,
1842 int flavor,
1843 thread_state_t new_state,
1844 mach_msg_type_number_t new_state_count,
1845 thread_t *new_thread)
1846 {
1847 return thread_create_running_internal2(
1848 task, flavor, new_state, new_state_count,
1849 new_thread, FALSE);
1850 }
1851
1852 kern_return_t
thread_create_running_from_user(task_t task,int flavor,thread_state_t new_state,mach_msg_type_number_t new_state_count,thread_t * new_thread)1853 thread_create_running_from_user(
1854 task_t task,
1855 int flavor,
1856 thread_state_t new_state,
1857 mach_msg_type_number_t new_state_count,
1858 thread_t *new_thread)
1859 {
1860 return thread_create_running_internal2(
1861 task, flavor, new_state, new_state_count,
1862 new_thread, TRUE);
1863 }
1864
1865 kern_return_t
thread_create_workq_waiting(task_t task,thread_continue_t continuation,thread_t * new_thread,bool is_permanently_bound)1866 thread_create_workq_waiting(
1867 task_t task,
1868 thread_continue_t continuation,
1869 thread_t *new_thread,
1870 bool is_permanently_bound)
1871 {
1872 /*
1873 * Create thread, but don't pin control port just yet, in case someone calls
1874 * task_threads() and deallocates pinned port before kernel copyout happens,
1875 * which will result in pinned port guard exception. Instead, pin and copyout
1876 * atomically during workq_setup_and_run().
1877 */
1878 int options = TH_OPTION_WORKQ;
1879
1880 /*
1881 * Until we add a support for delayed thread creation for permanently
1882 * bound workqueue threads, we do not pass TH_OPTION_NOSUSP for their
1883 * creation.
1884 */
1885 if (!is_permanently_bound) {
1886 options |= TH_OPTION_NOSUSP;
1887 }
1888
1889 return thread_create_waiting_internal(task, continuation, NULL,
1890 is_permanently_bound ? kThreadWaitParkedBoundWorkQueue : kThreadWaitParkedWorkQueue,
1891 options, new_thread);
1892 }
1893
1894 /*
1895 * kernel_thread_create:
1896 *
1897 * Create a thread in the kernel task
1898 * to execute in kernel context.
1899 */
1900 kern_return_t
kernel_thread_create(thread_continue_t continuation,void * parameter,integer_t priority,thread_t * new_thread)1901 kernel_thread_create(
1902 thread_continue_t continuation,
1903 void *parameter,
1904 integer_t priority,
1905 thread_t *new_thread)
1906 {
1907 kern_return_t result;
1908 thread_t thread;
1909 task_t task = kernel_task;
1910
1911 result = thread_create_internal(task, priority, continuation, parameter,
1912 TH_OPTION_NONE, &thread);
1913 if (result != KERN_SUCCESS) {
1914 return result;
1915 }
1916
1917 task_unlock(task);
1918 lck_mtx_unlock(&tasks_threads_lock);
1919
1920 stack_alloc(thread);
1921 assert(thread->kernel_stack != 0);
1922 #if !defined(XNU_TARGET_OS_OSX)
1923 if (priority > BASEPRI_KERNEL)
1924 #endif
1925 thread->reserved_stack = thread->kernel_stack;
1926
1927 if (debug_task & 1) {
1928 kprintf("kernel_thread_create: thread = %p continuation = %p\n", thread, continuation);
1929 }
1930 *new_thread = thread;
1931
1932 return result;
1933 }
1934
1935 kern_return_t
kernel_thread_start_priority(thread_continue_t continuation,void * parameter,integer_t priority,thread_t * new_thread)1936 kernel_thread_start_priority(
1937 thread_continue_t continuation,
1938 void *parameter,
1939 integer_t priority,
1940 thread_t *new_thread)
1941 {
1942 kern_return_t result;
1943 thread_t thread;
1944
1945 result = kernel_thread_create(continuation, parameter, priority, &thread);
1946 if (result != KERN_SUCCESS) {
1947 return result;
1948 }
1949
1950 *new_thread = thread;
1951
1952 thread_mtx_lock(thread);
1953 thread_start(thread);
1954 thread_mtx_unlock(thread);
1955
1956 return result;
1957 }
1958
1959 kern_return_t
kernel_thread_start(thread_continue_t continuation,void * parameter,thread_t * new_thread)1960 kernel_thread_start(
1961 thread_continue_t continuation,
1962 void *parameter,
1963 thread_t *new_thread)
1964 {
1965 return kernel_thread_start_priority(continuation, parameter, -1, new_thread);
1966 }
1967
/* Separated into helper function so it can be used by THREAD_BASIC_INFO and THREAD_EXTENDED_INFO */
/* it is assumed that the thread is locked by the caller */
/*
 * Fill in a thread_basic_info structure (times, cpu usage, policy,
 * flags, run state, suspend count) from the locked thread.
 */
static void
retrieve_thread_basic_info(thread_t thread, thread_basic_info_t basic_info)
{
	int     state, flags;

	/* fill in info */

	/* user/system times only; runnable time is not part of this flavor */
	thread_read_times(thread, &basic_info->user_time,
	    &basic_info->system_time, NULL);

	/*
	 * Update lazy-evaluated scheduler info because someone wants it.
	 */
	if (SCHED(can_update_priority)(thread)) {
		SCHED(update_priority)(thread);
	}

	basic_info->sleep_time = 0;

	/*
	 * To calculate cpu_usage, first correct for timer rate,
	 * then for 5/8 ageing. The correction factor [3/5] is
	 * (1/(5/8) - 1).
	 */
	basic_info->cpu_usage = 0;
#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	if (sched_tick_interval) {
		basic_info->cpu_usage = (integer_t)(((uint64_t)thread->cpu_usage
		    * TH_USAGE_SCALE) / sched_tick_interval);
		basic_info->cpu_usage = (basic_info->cpu_usage * 3) / 5;
	}
#endif

	/* clamp to the maximum reportable usage */
	if (basic_info->cpu_usage > TH_USAGE_SCALE) {
		basic_info->cpu_usage = TH_USAGE_SCALE;
	}

	basic_info->policy = ((thread->sched_mode == TH_MODE_TIMESHARE)?
	    POLICY_TIMESHARE: POLICY_RR);

	flags = 0;
	if (thread->options & TH_OPT_IDLE_THREAD) {
		flags |= TH_FLAGS_IDLE;
	}

	if (thread->options & TH_OPT_GLOBAL_FORCED_IDLE) {
		flags |= TH_FLAGS_GLOBAL_FORCED_IDLE;
	}

	/* no kernel stack currently attached to the thread */
	if (!thread->kernel_stack) {
		flags |= TH_FLAGS_SWAPPED;
	}

	/*
	 * Collapse the thread state bits into a single run_state value;
	 * the test order establishes precedence (TERMINATE wins, then RUN, ...).
	 */
	state = 0;
	if (thread->state & TH_TERMINATE) {
		state = TH_STATE_HALTED;
	} else if (thread->state & TH_RUN) {
		state = TH_STATE_RUNNING;
	} else if (thread->state & TH_UNINT) {
		state = TH_STATE_UNINTERRUPTIBLE;
	} else if (thread->state & TH_SUSP) {
		state = TH_STATE_STOPPED;
	} else if (thread->state & TH_WAIT) {
		state = TH_STATE_WAITING;
	}

	basic_info->run_state = state;
	basic_info->flags = flags;

	basic_info->suspend_count = thread->user_stop_count;

	return;
}
2043
/*
 * thread_info_internal:
 *
 * Dispatch on the requested info flavor and copy thread information
 * into the caller-supplied buffer. thread_info_count is IN/OUT: it is
 * validated against the flavor's required count on entry and set to
 * the count actually produced on success.
 *
 * Each flavor that reads scheduler state does so under
 * splsched() + thread_lock().
 */
kern_return_t
thread_info_internal(
	thread_t                thread,
	thread_flavor_t         flavor,
	thread_info_t           thread_info_out,    /* ptr to OUT array */
	mach_msg_type_number_t  *thread_info_count) /*IN/OUT*/
{
	spl_t   s;

	if (thread == THREAD_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (flavor == THREAD_BASIC_INFO) {
		if (*thread_info_count < THREAD_BASIC_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}

		s = splsched();
		thread_lock(thread);

		retrieve_thread_basic_info(thread, (thread_basic_info_t) thread_info_out);

		thread_unlock(thread);
		splx(s);

		*thread_info_count = THREAD_BASIC_INFO_COUNT;

		return KERN_SUCCESS;
	} else if (flavor == THREAD_IDENTIFIER_INFO) {
		thread_identifier_info_t identifier_info;

		if (*thread_info_count < THREAD_IDENTIFIER_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}

		identifier_info = __IGNORE_WCASTALIGN((thread_identifier_info_t)thread_info_out);

		s = splsched();
		thread_lock(thread);

		identifier_info->thread_id = thread->thread_id;
		identifier_info->thread_handle = thread->machine.cthread_self;
		identifier_info->dispatch_qaddr = thread_dispatchqaddr(thread);

		thread_unlock(thread);
		splx(s);
		return KERN_SUCCESS;
	} else if (flavor == THREAD_SCHED_TIMESHARE_INFO) {
		policy_timeshare_info_t ts_info;

		if (*thread_info_count < POLICY_TIMESHARE_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}

		ts_info = (policy_timeshare_info_t)thread_info_out;

		s = splsched();
		thread_lock(thread);

		/* only meaningful for a timeshare thread */
		if (thread->sched_mode != TH_MODE_TIMESHARE) {
			thread_unlock(thread);
			splx(s);
			return KERN_INVALID_POLICY;
		}

		ts_info->depressed = (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != 0;
		if (ts_info->depressed) {
			ts_info->base_priority = DEPRESSPRI;
			ts_info->depress_priority = thread->base_pri;
		} else {
			ts_info->base_priority = thread->base_pri;
			ts_info->depress_priority = -1;
		}

		ts_info->cur_priority = thread->sched_pri;
		ts_info->max_priority = thread->max_priority;

		thread_unlock(thread);
		splx(s);

		*thread_info_count = POLICY_TIMESHARE_INFO_COUNT;

		return KERN_SUCCESS;
	} else if (flavor == THREAD_SCHED_FIFO_INFO) {
		/* FIFO scheduling info is not supported by this kernel */
		if (*thread_info_count < POLICY_FIFO_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}

		return KERN_INVALID_POLICY;
	} else if (flavor == THREAD_SCHED_RR_INFO) {
		policy_rr_info_t            rr_info;
		uint32_t quantum_time;
		uint64_t quantum_ns;

		if (*thread_info_count < POLICY_RR_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}

		rr_info = (policy_rr_info_t) thread_info_out;

		s = splsched();
		thread_lock(thread);

		/* RR info is only valid for a non-timeshare thread */
		if (thread->sched_mode == TH_MODE_TIMESHARE) {
			thread_unlock(thread);
			splx(s);

			return KERN_INVALID_POLICY;
		}

		rr_info->depressed = (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != 0;
		if (rr_info->depressed) {
			rr_info->base_priority = DEPRESSPRI;
			rr_info->depress_priority = thread->base_pri;
		} else {
			rr_info->base_priority = thread->base_pri;
			rr_info->depress_priority = -1;
		}

		/* report the default quantum, converted from mach time to ms */
		quantum_time = SCHED(initial_quantum_size)(THREAD_NULL);
		absolutetime_to_nanoseconds(quantum_time, &quantum_ns);

		rr_info->max_priority = thread->max_priority;
		rr_info->quantum = (uint32_t)(quantum_ns / 1000 / 1000);

		thread_unlock(thread);
		splx(s);

		*thread_info_count = POLICY_RR_INFO_COUNT;

		return KERN_SUCCESS;
	} else if (flavor == THREAD_EXTENDED_INFO) {
		thread_basic_info_data_t        basic_info;
		thread_extended_info_t          extended_info = __IGNORE_WCASTALIGN((thread_extended_info_t)thread_info_out);

		if (*thread_info_count < THREAD_EXTENDED_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}

		s = splsched();
		thread_lock(thread);

		/* NOTE: This mimics fill_taskthreadinfo(), which is the function used by proc_pidinfo() for
		 * the PROC_PIDTHREADINFO flavor (which can't be used on corpses)
		 */
		retrieve_thread_basic_info(thread, &basic_info);
		/* convert the basic-info times (sec/usec) into nanoseconds */
		extended_info->pth_user_time = (((uint64_t)basic_info.user_time.seconds * NSEC_PER_SEC) + ((uint64_t)basic_info.user_time.microseconds * NSEC_PER_USEC));
		extended_info->pth_system_time = (((uint64_t)basic_info.system_time.seconds * NSEC_PER_SEC) + ((uint64_t)basic_info.system_time.microseconds * NSEC_PER_USEC));

		extended_info->pth_cpu_usage = basic_info.cpu_usage;
		extended_info->pth_policy = basic_info.policy;
		extended_info->pth_run_state = basic_info.run_state;
		extended_info->pth_flags = basic_info.flags;
		extended_info->pth_sleep_time = basic_info.sleep_time;
		extended_info->pth_curpri = thread->sched_pri;
		extended_info->pth_priority = thread->base_pri;
		extended_info->pth_maxpriority = thread->max_priority;

		bsd_getthreadname(get_bsdthread_info(thread), extended_info->pth_name);

		thread_unlock(thread);
		splx(s);

		*thread_info_count = THREAD_EXTENDED_INFO_COUNT;

		return KERN_SUCCESS;
	} else if (flavor == THREAD_DEBUG_INFO_INTERNAL) {
#if DEVELOPMENT || DEBUG
		thread_debug_info_internal_t dbg_info;
		if (*thread_info_count < THREAD_DEBUG_INFO_INTERNAL_COUNT) {
			return KERN_NOT_SUPPORTED;
		}

		if (thread_info_out == NULL) {
			return KERN_INVALID_ARGUMENT;
		}

		dbg_info = __IGNORE_WCASTALIGN((thread_debug_info_internal_t)thread_info_out);
		dbg_info->page_creation_count = thread->t_page_creation_count;

		*thread_info_count = THREAD_DEBUG_INFO_INTERNAL_COUNT;
		return KERN_SUCCESS;
#endif /* DEVELOPMENT || DEBUG */
		/* flavor only available on DEVELOPMENT/DEBUG kernels */
		return KERN_NOT_SUPPORTED;
	}

	return KERN_INVALID_ARGUMENT;
}
2233
2234 static void
_convert_mach_to_time_value(uint64_t time_mach,time_value_t * time)2235 _convert_mach_to_time_value(uint64_t time_mach, time_value_t *time)
2236 {
2237 clock_sec_t secs;
2238 clock_usec_t usecs;
2239 absolutetime_to_microtime(time_mach, &secs, &usecs);
2240 time->seconds = (typeof(time->seconds))secs;
2241 time->microseconds = usecs;
2242 }
2243
2244 void
thread_read_times(thread_t thread,time_value_t * user_time,time_value_t * system_time,time_value_t * runnable_time)2245 thread_read_times(
2246 thread_t thread,
2247 time_value_t *user_time,
2248 time_value_t *system_time,
2249 time_value_t *runnable_time)
2250 {
2251 if (user_time && system_time) {
2252 struct recount_times_mach times = recount_thread_times(thread);
2253 _convert_mach_to_time_value(times.rtm_user, user_time);
2254 _convert_mach_to_time_value(times.rtm_system, system_time);
2255 }
2256
2257 if (runnable_time) {
2258 uint64_t runnable_time_mach = timer_grab(&thread->runnable_timer);
2259 _convert_mach_to_time_value(runnable_time_mach, runnable_time);
2260 }
2261 }
2262
2263 uint64_t
thread_get_runtime_self(void)2264 thread_get_runtime_self(void)
2265 {
2266 /*
2267 * Must be guaranteed to stay on the same CPU and not be updated by the
2268 * scheduler.
2269 */
2270 boolean_t interrupt_state = ml_set_interrupts_enabled(FALSE);
2271 uint64_t time_mach = recount_current_thread_time_mach();
2272 ml_set_interrupts_enabled(interrupt_state);
2273 return time_mach;
2274 }
2275
2276 /*
2277 * thread_wire_internal:
2278 *
2279 * Specify that the target thread must always be able
2280 * to run and to allocate memory.
2281 */
2282 kern_return_t
thread_wire_internal(host_priv_t host_priv,thread_t thread,boolean_t wired,boolean_t * prev_state)2283 thread_wire_internal(
2284 host_priv_t host_priv,
2285 thread_t thread,
2286 boolean_t wired,
2287 boolean_t *prev_state)
2288 {
2289 if (host_priv == NULL || thread != current_thread()) {
2290 return KERN_INVALID_ARGUMENT;
2291 }
2292
2293 if (prev_state) {
2294 *prev_state = (thread->options & TH_OPT_VMPRIV) != 0;
2295 }
2296
2297 if (wired) {
2298 if (!(thread->options & TH_OPT_VMPRIV)) {
2299 vm_page_free_reserve(1); /* XXX */
2300 }
2301 thread->options |= TH_OPT_VMPRIV;
2302 } else {
2303 if (thread->options & TH_OPT_VMPRIV) {
2304 vm_page_free_reserve(-1); /* XXX */
2305 }
2306 thread->options &= ~TH_OPT_VMPRIV;
2307 }
2308
2309 return KERN_SUCCESS;
2310 }
2311
2312
2313 /*
2314 * thread_wire:
2315 *
2316 * User-api wrapper for thread_wire_internal()
2317 */
2318 kern_return_t
thread_wire(host_priv_t host_priv __unused,thread_t thread __unused,boolean_t wired __unused)2319 thread_wire(
2320 host_priv_t host_priv __unused,
2321 thread_t thread __unused,
2322 boolean_t wired __unused)
2323 {
2324 return KERN_NOT_SUPPORTED;
2325 }
2326
2327 boolean_t
is_external_pageout_thread(void)2328 is_external_pageout_thread(void)
2329 {
2330 return current_thread() == pgo_iothread_external_state.pgo_iothread;
2331 }
2332
2333 boolean_t
is_vm_privileged(void)2334 is_vm_privileged(void)
2335 {
2336 return current_thread()->options & TH_OPT_VMPRIV ? TRUE : FALSE;
2337 }
2338
2339 boolean_t
set_vm_privilege(boolean_t privileged)2340 set_vm_privilege(boolean_t privileged)
2341 {
2342 boolean_t was_vmpriv;
2343
2344 if (current_thread()->options & TH_OPT_VMPRIV) {
2345 was_vmpriv = TRUE;
2346 } else {
2347 was_vmpriv = FALSE;
2348 }
2349
2350 if (privileged != FALSE) {
2351 current_thread()->options |= TH_OPT_VMPRIV;
2352 } else {
2353 current_thread()->options &= ~TH_OPT_VMPRIV;
2354 }
2355
2356 return was_vmpriv;
2357 }
2358
2359 void
thread_floor_boost_set_promotion_locked(thread_t thread)2360 thread_floor_boost_set_promotion_locked(thread_t thread)
2361 {
2362 assert(thread->priority_floor_count > 0);
2363
2364 if (!(thread->sched_flags & TH_SFLAG_FLOOR_PROMOTED)) {
2365 sched_thread_promote_reason(thread, TH_SFLAG_FLOOR_PROMOTED, 0);
2366 }
2367 }
2368
/*! @function thread_priority_floor_start
 *  @abstract boost the current thread priority to floor.
 *  @discussion Increase the priority of the current thread to at least MINPRI_FLOOR.
 *  The boost will be mantained until a corresponding thread_priority_floor_end()
 *  is called. Every call of thread_priority_floor_start() needs to have a corresponding
 *  call to thread_priority_floor_end() from the same thread.
 *  No thread can return to userspace before calling thread_priority_floor_end().
 *
 *  NOTE: avoid to use this function. Try to use gate_t or sleep_with_inheritor()
 *  instead.
 *  @result a token to be given to the corresponding thread_priority_floor_end()
 */
thread_pri_floor_t
thread_priority_floor_start(void)
{
	thread_pri_floor_t ret;
	thread_t thread = current_thread();
	__assert_only uint16_t prev_priority_floor_count;

	/* counter is a uint16_t; guard against wraparound */
	assert(thread->priority_floor_count < UINT16_MAX);
	prev_priority_floor_count = thread->priority_floor_count++;
#if MACH_ASSERT
	/*
	 * Set the ast to check that the
	 * priority_floor_count is going to be set to zero when
	 * going back to userspace.
	 * Set it only once when we increment it for the first time.
	 */
	if (prev_priority_floor_count == 0) {
		act_set_debug_assert();
	}
#endif

	/* the token records which thread took the floor, for the paired end call */
	ret.thread = thread;
	return ret;
}
2405
/*! @function thread_priority_floor_end
 *  @abstract ends the floor boost.
 *  @param token the token obtained from thread_priority_floor_start()
 *  @discussion ends the priority floor boost started with thread_priority_floor_start()
 */
void
thread_priority_floor_end(thread_pri_floor_t *token)
{
	thread_t thread = current_thread();

	assert(thread->priority_floor_count > 0);
	assertf(token->thread == thread, "thread_priority_floor_end called from a different thread from thread_priority_floor_start %p %p", thread, token->thread);

	/*
	 * Drop one floor reference; only the last reference (count hitting
	 * zero) undoes the promotion. The flag is re-checked under the
	 * thread lock because it may change between the unlocked test
	 * and taking the lock.
	 */
	if ((thread->priority_floor_count-- == 1) && (thread->sched_flags & TH_SFLAG_FLOOR_PROMOTED)) {
		spl_t s = splsched();
		thread_lock(thread);

		if (thread->sched_flags & TH_SFLAG_FLOOR_PROMOTED) {
			sched_thread_unpromote_reason(thread, TH_SFLAG_FLOOR_PROMOTED, 0);
		}

		thread_unlock(thread);
		splx(s);
	}

	/* invalidate the token so it cannot be reused */
	token->thread = NULL;
}
2433
2434 /*
2435 * XXX assuming current thread only, for now...
2436 */
2437 void
thread_guard_violation(thread_t thread,mach_exception_data_type_t code,mach_exception_data_type_t subcode,boolean_t fatal)2438 thread_guard_violation(thread_t thread,
2439 mach_exception_data_type_t code, mach_exception_data_type_t subcode, boolean_t fatal)
2440 {
2441 assert(thread == current_thread());
2442
2443 /* Don't set up the AST for kernel threads; this check is needed to ensure
2444 * that the guard_exc_* fields in the thread structure are set only by the
2445 * current thread and therefore, don't require a lock.
2446 */
2447 if (get_threadtask(thread) == kernel_task) {
2448 return;
2449 }
2450
2451 assert(EXC_GUARD_DECODE_GUARD_TYPE(code));
2452
2453 /*
2454 * Use the saved state area of the thread structure
2455 * to store all info required to handle the AST when
2456 * returning to userspace. It's possible that there is
2457 * already a pending guard exception. If it's non-fatal,
2458 * it can only be over-written by a fatal exception code.
2459 */
2460 if (thread->guard_exc_info.code && (thread->guard_exc_fatal || !fatal)) {
2461 return;
2462 }
2463
2464 thread->guard_exc_info.code = code;
2465 thread->guard_exc_info.subcode = subcode;
2466 thread->guard_exc_fatal = fatal ? 1 : 0;
2467
2468 spl_t s = splsched();
2469 thread_ast_set(thread, AST_GUARD);
2470 ast_propagate(thread);
2471 splx(s);
2472 }
2473
2474 #if CONFIG_DEBUG_SYSCALL_REJECTION
2475 extern void rejected_syscall_guard_ast(thread_t __unused t, mach_exception_data_type_t code, mach_exception_data_type_t subcode);
2476 #endif /* CONFIG_DEBUG_SYSCALL_REJECTION */
2477
2478 /*
2479 * guard_ast:
2480 *
2481 * Handle AST_GUARD for a thread. This routine looks at the
2482 * state saved in the thread structure to determine the cause
2483 * of this exception. Based on this value, it invokes the
2484 * appropriate routine which determines other exception related
2485 * info and raises the exception.
2486 */
2487 void
guard_ast(thread_t t)2488 guard_ast(thread_t t)
2489 {
2490 const mach_exception_data_type_t
2491 code = t->guard_exc_info.code,
2492 subcode = t->guard_exc_info.subcode;
2493
2494 t->guard_exc_info.code = 0;
2495 t->guard_exc_info.subcode = 0;
2496 t->guard_exc_fatal = 0;
2497
2498 switch (EXC_GUARD_DECODE_GUARD_TYPE(code)) {
2499 case GUARD_TYPE_NONE:
2500 /* lingering AST_GUARD on the processor? */
2501 break;
2502 case GUARD_TYPE_MACH_PORT:
2503 mach_port_guard_ast(t, code, subcode);
2504 break;
2505 case GUARD_TYPE_FD:
2506 fd_guard_ast(t, code, subcode);
2507 break;
2508 case GUARD_TYPE_VN:
2509 vn_guard_ast(t, code, subcode);
2510 break;
2511 case GUARD_TYPE_VIRT_MEMORY:
2512 virt_memory_guard_ast(t, code, subcode);
2513 break;
2514 #if CONFIG_DEBUG_SYSCALL_REJECTION
2515 case GUARD_TYPE_REJECTED_SC:
2516 rejected_syscall_guard_ast(t, code, subcode);
2517 break;
2518 #endif /* CONFIG_DEBUG_SYSCALL_REJECTION */
2519 default:
2520 panic("guard_exc_info %llx %llx", code, subcode);
2521 }
2522 }
2523
/*
 * Ledger callback for the per-thread cpu_time entry. Invoked when the
 * thread's CPU usage crosses the ledger warning level or the limit.
 */
static void
thread_cputime_callback(int warning, __unused const void *arg0, __unused const void *arg1)
{
	if (warning == LEDGER_WARNING_ROSE_ABOVE) {
#if CONFIG_TELEMETRY
		/*
		 * This thread is in danger of violating the CPU usage monitor. Enable telemetry
		 * on the entire task so there are micro-stackshots available if and when
		 * EXC_RESOURCE is triggered. We could have chosen to enable micro-stackshots
		 * for this thread only; but now that this task is suspect, knowing what all of
		 * its threads are up to will be useful.
		 */
		telemetry_task_ctl(current_task(), TF_CPUMON_WARNING, 1);
#endif
		return;
	}

#if CONFIG_TELEMETRY
	/*
	 * If the balance has dipped below the warning level (LEDGER_WARNING_DIPPED_BELOW) or
	 * exceeded the limit, turn telemetry off for the task.
	 */
	telemetry_task_ctl(current_task(), TF_CPUMON_WARNING, 0);
#endif

	/* warning == 0 means the limit itself was exceeded: send the notification */
	if (warning == 0) {
		SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU();
	}
}
2553
/*
 * The current thread has exceeded its CPU usage monitor limit.
 * Suspend the monitor, log details, send a RESOURCE_NOTIFY violation
 * (and, where configured, an EXC_RESOURCE exception), and terminate
 * the task if the limit was marked fatal.
 *
 * Marked noinline so the function name appears in backtraces.
 */
void __attribute__((noinline))
SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU(void)
{
	int          pid                = 0;
	task_t       task               = current_task();
	thread_t     thread             = current_thread();
	uint64_t     tid                = thread->thread_id;
	const char   *procname          = "unknown";
	time_value_t thread_total_time  = {0, 0};
	time_value_t thread_system_time;
	time_value_t thread_user_time;
	int          action;
	uint8_t      percentage;
	uint32_t     usage_percent = 0;
	uint32_t     interval_sec;
	uint64_t     interval_ns;
	uint64_t     balance_ns;
	boolean_t    fatal = FALSE;
	boolean_t    send_exc_resource = TRUE; /* in addition to RESOURCE_NOTIFY */
	kern_return_t   kr;

#ifdef EXC_RESOURCE_MONITORS
	mach_exception_data_type_t      code[EXCEPTION_CODE_MAX];
#endif /* EXC_RESOURCE_MONITORS */
	struct ledger_entry_info        lei;

	assert(thread->t_threadledger != LEDGER_NULL);

	/*
	 * Extract the fatal bit and suspend the monitor (which clears the bit).
	 */
	task_lock(task);
	if (task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_FATAL_CPUMON) {
		fatal = TRUE;
		send_exc_resource = TRUE;
	}
	/* Only one thread can be here at a time.  Whichever makes it through
	 *  first will successfully suspend the monitor and proceed to send the
	 *  notification.  Other threads will get an error trying to suspend the
	 *  monitor and give up on sending the notification.  In the first release,
	 *  the monitor won't be resumed for a number of seconds, but we may
	 *  eventually need to handle low-latency resume.
	 */
	kr = task_suspend_cpumon(task);
	task_unlock(task);
	if (kr == KERN_INVALID_ARGUMENT) {
		return;
	}

#ifdef MACH_BSD
	pid = proc_selfpid();
	void *bsd_info = get_bsdtask_info(task);
	if (bsd_info != NULL) {
		procname = proc_name_address(bsd_info);
	}
#endif

	/* gather the configured limit and the thread's actual usage for logging */
	thread_get_cpulimit(&action, &percentage, &interval_ns);

	interval_sec = (uint32_t)(interval_ns / NSEC_PER_SEC);

	thread_read_times(thread, &thread_user_time, &thread_system_time, NULL);
	time_value_add(&thread_total_time, &thread_user_time);
	time_value_add(&thread_total_time, &thread_system_time);
	ledger_get_entry_info(thread->t_threadledger, thread_ledgers.cpu_time, &lei);

	/* credit/debit/balance/limit are in absolute time units;
	 *  the refill info is in nanoseconds. */
	absolutetime_to_nanoseconds(lei.lei_balance, &balance_ns);
	if (lei.lei_last_refill > 0) {
		usage_percent = (uint32_t)((balance_ns * 100ULL) / lei.lei_last_refill);
	}

	/* TODO: show task total runtime (via TASK_ABSOLUTETIME_INFO)? */
	printf("process %s[%d] thread %llu caught burning CPU! It used more than %d%% CPU over %u seconds\n",
	    procname, pid, tid, percentage, interval_sec);
	printf("  (actual recent usage: %d%% over ~%llu seconds)\n",
	    usage_percent, (lei.lei_last_refill + NSEC_PER_SEC / 2) / NSEC_PER_SEC);
	printf("  Thread lifetime cpu usage %d.%06ds, (%d.%06d user, %d.%06d sys)\n",
	    thread_total_time.seconds, thread_total_time.microseconds,
	    thread_user_time.seconds, thread_user_time.microseconds,
	    thread_system_time.seconds, thread_system_time.microseconds);
	printf("  Ledger balance: %lld; mabs credit: %lld; mabs debit: %lld\n",
	    lei.lei_balance, lei.lei_credit, lei.lei_debit);
	printf("  mabs limit: %llu; mabs period: %llu ns; last refill: %llu ns%s.\n",
	    lei.lei_limit, lei.lei_refill_period, lei.lei_last_refill,
	    (fatal ? " [fatal violation]" : ""));

	/*
	 *  For now, send RESOURCE_NOTIFY in parallel with EXC_RESOURCE.  Once
	 *  we have logging parity, we will stop sending EXC_RESOURCE (24508922).
	 */

	/* RESOURCE_NOTIFY MIG specifies nanoseconds of CPU time */
	lei.lei_balance = balance_ns;
	absolutetime_to_nanoseconds(lei.lei_limit, &lei.lei_limit);
	trace_resource_violation(RMON_CPUUSAGE_VIOLATED, &lei);
	kr = send_resource_violation(send_cpu_usage_violation, task, &lei,
	    fatal ? kRNFatalLimitFlag : 0);
	if (kr) {
		printf("send_resource_violation(CPU usage, ...): error %#x\n", kr);
	}

#ifdef EXC_RESOURCE_MONITORS
	/* EXC_RESOURCE can be suppressed by boot-args or active audio playback */
	if (send_exc_resource) {
		if (disable_exc_resource) {
			printf("process %s[%d] thread %llu caught burning CPU! "
			    "EXC_RESOURCE%s supressed by a boot-arg\n",
			    procname, pid, tid, fatal ? " (and termination)" : "");
			return;
		}

		if (disable_exc_resource_during_audio && audio_active) {
			printf("process %s[%d] thread %llu caught burning CPU! "
			    "EXC_RESOURCE & termination supressed due to audio playback\n",
			    procname, pid, tid);
			return;
		}
	}


	if (send_exc_resource) {
		/* encode type/flavor/interval/percentages into the exception codes */
		code[0] = code[1] = 0;
		EXC_RESOURCE_ENCODE_TYPE(code[0], RESOURCE_TYPE_CPU);
		if (fatal) {
			EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_CPU_MONITOR_FATAL);
		} else {
			EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_CPU_MONITOR);
		}
		EXC_RESOURCE_CPUMONITOR_ENCODE_INTERVAL(code[0], interval_sec);
		EXC_RESOURCE_CPUMONITOR_ENCODE_PERCENTAGE(code[0], percentage);
		EXC_RESOURCE_CPUMONITOR_ENCODE_PERCENTAGE(code[1], usage_percent);
		exception_triage(EXC_RESOURCE, code, EXCEPTION_CODE_MAX);
	}
#endif /* EXC_RESOURCE_MONITORS */

	if (fatal) {
#if CONFIG_JETSAM
		jetsam_on_ledger_cpulimit_exceeded();
#else
		task_terminate_internal(task);
#endif
	}
}
2698
2699 bool os_variant_has_internal_diagnostics(const char *subsystem);
2700
2701 #if DEVELOPMENT || DEBUG
2702
/*
 * A task has crossed its thread-count high watermark. After a series
 * of suppression checks (launchd, boot-args, internal diagnostics,
 * audio playback, corpse forking), deliver EXC_RESOURCE via a corpse.
 *
 * Marked noinline so the function name appears in backtraces.
 */
void __attribute__((noinline))
SENDING_NOTIFICATION__TASK_HAS_TOO_MANY_THREADS(task_t task, int thread_count)
{
	mach_exception_data_type_t code[EXCEPTION_CODE_MAX] = {0};
	int pid = task_pid(task);
	char procname[MAXCOMLEN + 1] = "unknown";

	if (pid == 1) {
		/*
		 * Cannot suspend launchd
		 */
		return;
	}

	proc_name(pid, procname, sizeof(procname));

	/*
	 * Skip all checks for testing when exc_resource_threads_enabled is overriden
	 */
	if (exc_resource_threads_enabled == 2) {
		goto skip_checks;
	}

	if (disable_exc_resource) {
		printf("process %s[%d] crossed thread count high watermark (%d), EXC_RESOURCE "
		    "supressed by a boot-arg.\n", procname, pid, thread_count);
		return;
	}

	if (!os_variant_has_internal_diagnostics("com.apple.xnu")) {
		printf("process %s[%d] crossed thread count high watermark (%d), EXC_RESOURCE "
		    "supressed, internal diagnostics disabled.\n", procname, pid, thread_count);
		return;
	}

	if (disable_exc_resource_during_audio && audio_active) {
		printf("process %s[%d] crossed thread count high watermark (%d), EXC_RESOURCE "
		    "supressed due to audio playback.\n", procname, pid, thread_count);
		return;
	}

	if (!exc_via_corpse_forking) {
		printf("process %s[%d] crossed thread count high watermark (%d), EXC_RESOURCE "
		    "supressed due to corpse forking being disabled.\n", procname, pid,
		    thread_count);
		return;
	}

skip_checks:
	printf("process %s[%d] crossed thread count high watermark (%d), sending "
	    "EXC_RESOURCE\n", procname, pid, thread_count);

	/* encode the resource type, flavor, and offending count into code[0] */
	EXC_RESOURCE_ENCODE_TYPE(code[0], RESOURCE_TYPE_THREADS);
	EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_THREADS_HIGH_WATERMARK);
	EXC_RESOURCE_THREADS_ENCODE_THREADS(code[0], thread_count);

	task_enqueue_exception_with_corpse(task, EXC_RESOURCE, code, EXCEPTION_CODE_MAX, NULL, FALSE);
}
2761 #endif /* DEVELOPMENT || DEBUG */
2762
/*
 * Account a completed I/O of `size` bytes against both the thread's
 * and the owning task's I/O statistics, classified by the DKIO_*
 * bits in io_flags (read/metadata/paging and I/O tier). Writes are
 * additionally credited to the task's physical_writes ledger.
 */
void
thread_update_io_stats(thread_t thread, int size, int io_flags)
{
	task_t task = get_threadtask(thread);
	int io_tier;

	/* nothing to account if either stats buffer is absent */
	if (thread->thread_io_stats == NULL || task->task_io_stats == NULL) {
		return;
	}

	if (io_flags & DKIO_READ) {
		UPDATE_IO_STATS(thread->thread_io_stats->disk_reads, size);
		UPDATE_IO_STATS_ATOMIC(task->task_io_stats->disk_reads, size);
	}

	if (io_flags & DKIO_META) {
		UPDATE_IO_STATS(thread->thread_io_stats->metadata, size);
		UPDATE_IO_STATS_ATOMIC(task->task_io_stats->metadata, size);
	}

	if (io_flags & DKIO_PAGING) {
		UPDATE_IO_STATS(thread->thread_io_stats->paging, size);
		UPDATE_IO_STATS_ATOMIC(task->task_io_stats->paging, size);
	}

	/* extract the I/O tier from the flags and account per-tier */
	io_tier = ((io_flags & DKIO_TIER_MASK) >> DKIO_TIER_SHIFT);
	assert(io_tier < IO_NUM_PRIORITIES);

	UPDATE_IO_STATS(thread->thread_io_stats->io_priority[io_tier], size);
	UPDATE_IO_STATS_ATOMIC(task->task_io_stats->io_priority[io_tier], size);

	/* Update Total I/O Counts */
	UPDATE_IO_STATS(thread->thread_io_stats->total_io, size);
	UPDATE_IO_STATS_ATOMIC(task->task_io_stats->total_io, size);

	/* anything that is not a read is treated as a write for the ledger */
	if (!(io_flags & DKIO_READ)) {
		DTRACE_IO3(physical_writes, struct task *, task, uint32_t, size, int, io_flags);
		ledger_credit(task->ledger, task_ledgers.physical_writes, size);
	}
}
2803
2804 static void
init_thread_ledgers(void)2805 init_thread_ledgers(void)
2806 {
2807 ledger_template_t t;
2808 int idx;
2809
2810 assert(thread_ledger_template == NULL);
2811
2812 if ((t = ledger_template_create("Per-thread ledger")) == NULL) {
2813 panic("couldn't create thread ledger template");
2814 }
2815
2816 if ((idx = ledger_entry_add(t, "cpu_time", "sched", "ns")) < 0) {
2817 panic("couldn't create cpu_time entry for thread ledger template");
2818 }
2819
2820 if (ledger_set_callback(t, idx, thread_cputime_callback, NULL, NULL) < 0) {
2821 panic("couldn't set thread ledger callback for cpu_time entry");
2822 }
2823
2824 thread_ledgers.cpu_time = idx;
2825
2826 ledger_template_complete(t);
2827 thread_ledger_template = t;
2828 }
2829
2830 /*
2831 * Returns the amount of (abs) CPU time that remains before the limit would be
2832 * hit or the amount of time left in the current interval, whichever is smaller.
2833 * This value changes as CPU time is consumed and the ledgers refilled.
2834 * Used to limit the quantum of a thread.
2835 */
2836 uint64_t
thread_cpulimit_remaining(uint64_t now)2837 thread_cpulimit_remaining(uint64_t now)
2838 {
2839 thread_t thread = current_thread();
2840
2841 if ((thread->options &
2842 (TH_OPT_PROC_CPULIMIT | TH_OPT_PRVT_CPULIMIT)) == 0) {
2843 return UINT64_MAX;
2844 }
2845
2846 /* Amount of time left in the current interval. */
2847 const uint64_t interval_remaining =
2848 ledger_get_interval_remaining(thread->t_threadledger, thread_ledgers.cpu_time, now);
2849
2850 /* Amount that can be spent until the limit is hit. */
2851 const uint64_t remaining =
2852 ledger_get_remaining(thread->t_threadledger, thread_ledgers.cpu_time);
2853
2854 return MIN(interval_remaining, remaining);
2855 }
2856
2857 /*
2858 * Returns true if a new interval should be started.
2859 */
2860 bool
thread_cpulimit_interval_has_expired(uint64_t now)2861 thread_cpulimit_interval_has_expired(uint64_t now)
2862 {
2863 thread_t thread = current_thread();
2864
2865 if ((thread->options &
2866 (TH_OPT_PROC_CPULIMIT | TH_OPT_PRVT_CPULIMIT)) == 0) {
2867 return false;
2868 }
2869
2870 return ledger_get_interval_remaining(thread->t_threadledger,
2871 thread_ledgers.cpu_time, now) == 0;
2872 }
2873
2874 /*
2875 * Balances the ledger and sets the last refill time to `now`.
2876 */
2877 void
thread_cpulimit_restart(uint64_t now)2878 thread_cpulimit_restart(uint64_t now)
2879 {
2880 thread_t thread = current_thread();
2881
2882 assert3u(thread->options & (TH_OPT_PROC_CPULIMIT | TH_OPT_PRVT_CPULIMIT), !=, 0);
2883
2884 ledger_restart(thread->t_threadledger, thread_ledgers.cpu_time, now);
2885 }
2886
2887 /*
2888 * Returns currently applied CPU usage limit, or 0/0 if none is applied.
2889 */
2890 int
thread_get_cpulimit(int * action,uint8_t * percentage,uint64_t * interval_ns)2891 thread_get_cpulimit(int *action, uint8_t *percentage, uint64_t *interval_ns)
2892 {
2893 int64_t abstime = 0;
2894 uint64_t limittime = 0;
2895 thread_t thread = current_thread();
2896
2897 *percentage = 0;
2898 *interval_ns = 0;
2899 *action = 0;
2900
2901 if (thread->t_threadledger == LEDGER_NULL) {
2902 /*
2903 * This thread has no per-thread ledger, so it can't possibly
2904 * have a CPU limit applied.
2905 */
2906 return KERN_SUCCESS;
2907 }
2908
2909 ledger_get_period(thread->t_threadledger, thread_ledgers.cpu_time, interval_ns);
2910 ledger_get_limit(thread->t_threadledger, thread_ledgers.cpu_time, &abstime);
2911
2912 if ((abstime == LEDGER_LIMIT_INFINITY) || (*interval_ns == 0)) {
2913 /*
2914 * This thread's CPU time ledger has no period or limit; so it
2915 * doesn't have a CPU limit applied.
2916 */
2917 return KERN_SUCCESS;
2918 }
2919
2920 /*
2921 * This calculation is the converse to the one in thread_set_cpulimit().
2922 */
2923 absolutetime_to_nanoseconds(abstime, &limittime);
2924 *percentage = (uint8_t)((limittime * 100ULL) / *interval_ns);
2925 assert(*percentage <= 100);
2926
2927 if (thread->options & TH_OPT_PROC_CPULIMIT) {
2928 assert((thread->options & TH_OPT_PRVT_CPULIMIT) == 0);
2929
2930 *action = THREAD_CPULIMIT_BLOCK;
2931 } else if (thread->options & TH_OPT_PRVT_CPULIMIT) {
2932 assert((thread->options & TH_OPT_PROC_CPULIMIT) == 0);
2933
2934 *action = THREAD_CPULIMIT_EXCEPTION;
2935 } else {
2936 *action = THREAD_CPULIMIT_DISABLE;
2937 }
2938
2939 return KERN_SUCCESS;
2940 }
2941
2942 /*
2943 * Set CPU usage limit on a thread.
2944 */
2945 int
thread_set_cpulimit(int action,uint8_t percentage,uint64_t interval_ns)2946 thread_set_cpulimit(int action, uint8_t percentage, uint64_t interval_ns)
2947 {
2948 thread_t thread = current_thread();
2949 ledger_t l;
2950 uint64_t limittime = 0;
2951 uint64_t abstime = 0;
2952
2953 assert(percentage <= 100);
2954 assert(percentage > 0 || action == THREAD_CPULIMIT_DISABLE);
2955
2956 /*
2957 * Disallow any change to the CPU limit if the TH_OPT_FORCED_LEDGER
2958 * flag is set.
2959 */
2960 if ((thread->options & TH_OPT_FORCED_LEDGER) != 0) {
2961 return KERN_FAILURE;
2962 }
2963
2964 if (action == THREAD_CPULIMIT_DISABLE) {
2965 /*
2966 * Remove CPU limit, if any exists.
2967 */
2968 if (thread->t_threadledger != LEDGER_NULL) {
2969 l = thread->t_threadledger;
2970 ledger_set_limit(l, thread_ledgers.cpu_time, LEDGER_LIMIT_INFINITY, 0);
2971 ledger_set_action(l, thread_ledgers.cpu_time, LEDGER_ACTION_IGNORE);
2972 thread->options &= ~(TH_OPT_PROC_CPULIMIT | TH_OPT_PRVT_CPULIMIT);
2973 }
2974
2975 return 0;
2976 }
2977
2978 if (interval_ns < MINIMUM_CPULIMIT_INTERVAL_MS * NSEC_PER_MSEC) {
2979 return KERN_INVALID_ARGUMENT;
2980 }
2981
2982 l = thread->t_threadledger;
2983 if (l == LEDGER_NULL) {
2984 /*
2985 * This thread doesn't yet have a per-thread ledger; so create one with the CPU time entry active.
2986 */
2987 if ((l = ledger_instantiate(thread_ledger_template, LEDGER_CREATE_INACTIVE_ENTRIES)) == LEDGER_NULL) {
2988 return KERN_RESOURCE_SHORTAGE;
2989 }
2990
2991 /*
2992 * We are the first to create this thread's ledger, so only activate our entry.
2993 */
2994 ledger_entry_setactive(l, thread_ledgers.cpu_time);
2995 thread->t_threadledger = l;
2996 }
2997
2998 /*
2999 * The limit is specified as a percentage of CPU over an interval in nanoseconds.
3000 * Calculate the amount of CPU time that the thread needs to consume in order to hit the limit.
3001 */
3002 limittime = (interval_ns * percentage) / 100;
3003 nanoseconds_to_absolutetime(limittime, &abstime);
3004 ledger_set_limit(l, thread_ledgers.cpu_time, abstime, cpumon_ustackshots_trigger_pct);
3005 /*
3006 * Refill the thread's allotted CPU time every interval_ns nanoseconds.
3007 */
3008 ledger_set_period(l, thread_ledgers.cpu_time, interval_ns);
3009
3010 if (action == THREAD_CPULIMIT_EXCEPTION) {
3011 /*
3012 * We don't support programming the CPU usage monitor on a task if any of its
3013 * threads have a per-thread blocking CPU limit configured.
3014 */
3015 if (thread->options & TH_OPT_PRVT_CPULIMIT) {
3016 panic("CPU usage monitor activated, but blocking thread limit exists");
3017 }
3018
3019 /*
3020 * Make a note that this thread's CPU limit is being used for the task-wide CPU
3021 * usage monitor. We don't have to arm the callback which will trigger the
3022 * exception, because that was done for us in ledger_instantiate (because the
3023 * ledger template used has a default callback).
3024 */
3025 thread->options |= TH_OPT_PROC_CPULIMIT;
3026 } else {
3027 /*
3028 * We deliberately override any CPU limit imposed by a task-wide limit (eg
3029 * CPU usage monitor).
3030 */
3031 thread->options &= ~TH_OPT_PROC_CPULIMIT;
3032
3033 thread->options |= TH_OPT_PRVT_CPULIMIT;
3034 /* The per-thread ledger template by default has a callback for CPU time */
3035 ledger_disable_callback(l, thread_ledgers.cpu_time);
3036 ledger_set_action(l, thread_ledgers.cpu_time, LEDGER_ACTION_BLOCK);
3037 }
3038
3039 return 0;
3040 }
3041
3042 void
thread_sched_call(thread_t thread,sched_call_t call)3043 thread_sched_call(
3044 thread_t thread,
3045 sched_call_t call)
3046 {
3047 assert((thread->state & TH_WAIT_REPORT) == 0);
3048 thread->sched_call = call;
3049 }
3050
3051 uint64_t
thread_tid(thread_t thread)3052 thread_tid(
3053 thread_t thread)
3054 {
3055 return thread != THREAD_NULL? thread->thread_id: 0;
3056 }
3057
3058 uint64_t
uthread_tid(struct uthread * uth)3059 uthread_tid(
3060 struct uthread *uth)
3061 {
3062 if (uth) {
3063 return thread_tid(get_machthread(uth));
3064 }
3065 return 0;
3066 }
3067
3068 uint16_t
thread_set_tag(thread_t th,uint16_t tag)3069 thread_set_tag(thread_t th, uint16_t tag)
3070 {
3071 return thread_set_tag_internal(th, tag);
3072 }
3073
3074 uint16_t
thread_get_tag(thread_t th)3075 thread_get_tag(thread_t th)
3076 {
3077 return thread_get_tag_internal(th);
3078 }
3079
3080 uint64_t
thread_last_run_time(thread_t th)3081 thread_last_run_time(thread_t th)
3082 {
3083 return th->last_run_time;
3084 }
3085
3086 /*
3087 * Shared resource contention management
3088 *
3089 * The scheduler attempts to load balance the shared resource intensive
3090 * workloads across clusters to ensure that the resource is not heavily
3091 * contended. The kernel relies on external agents (userspace or
3092 * performance controller) to identify shared resource heavy threads.
3093 * The load balancing is achieved based on the scheduler configuration
3094 * enabled on the platform.
3095 */
3096
3097
3098 #if CONFIG_SCHED_EDGE
3099
3100 /*
3101 * On the Edge scheduler, the load balancing is achieved by looking
3102 * at cluster level shared resource loads and migrating resource heavy
3103 * threads dynamically to under utilized cluster. Therefore, when a
3104 * thread is indicated as a resource heavy thread, the policy set
3105 * routine simply adds a flag to the thread which is looked at by
3106 * the scheduler on thread migration decisions.
3107 */
3108
3109 boolean_t
thread_shared_rsrc_policy_get(thread_t thread,cluster_shared_rsrc_type_t type)3110 thread_shared_rsrc_policy_get(thread_t thread, cluster_shared_rsrc_type_t type)
3111 {
3112 return thread->th_shared_rsrc_heavy_user[type] || thread->th_shared_rsrc_heavy_perf_control[type];
3113 }
3114
/*
 * Operation codes logged with the MACH_SCHED_EDGE_RSRC_HEAVY_THREAD
 * tracepoint (see thread_shared_rsrc_policy_set()/_clear() below) to
 * distinguish marking a thread heavy from clearing the marking.
 */
__options_decl(sched_edge_rsrc_heavy_thread_state, uint32_t, {
	SCHED_EDGE_RSRC_HEAVY_THREAD_SET = 1,
	SCHED_EDGE_RSRC_HEAVY_THREAD_CLR = 2,
});
3119
3120 kern_return_t
thread_shared_rsrc_policy_set(thread_t thread,__unused uint32_t index,cluster_shared_rsrc_type_t type,shared_rsrc_policy_agent_t agent)3121 thread_shared_rsrc_policy_set(thread_t thread, __unused uint32_t index, cluster_shared_rsrc_type_t type, shared_rsrc_policy_agent_t agent)
3122 {
3123 spl_t s = splsched();
3124 thread_lock(thread);
3125
3126 bool user = (agent == SHARED_RSRC_POLICY_AGENT_DISPATCH) || (agent == SHARED_RSRC_POLICY_AGENT_SYSCTL);
3127 bool *thread_flags = (user) ? thread->th_shared_rsrc_heavy_user : thread->th_shared_rsrc_heavy_perf_control;
3128 if (thread_flags[type]) {
3129 thread_unlock(thread);
3130 splx(s);
3131 return KERN_FAILURE;
3132 }
3133
3134 thread_flags[type] = true;
3135 thread_unlock(thread);
3136 splx(s);
3137
3138 KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_RSRC_HEAVY_THREAD) | DBG_FUNC_NONE, SCHED_EDGE_RSRC_HEAVY_THREAD_SET, thread_tid(thread), type, agent);
3139 if (thread == current_thread()) {
3140 if (agent == SHARED_RSRC_POLICY_AGENT_PERFCTL_QUANTUM) {
3141 ast_on(AST_PREEMPT);
3142 } else {
3143 assert(agent != SHARED_RSRC_POLICY_AGENT_PERFCTL_CSW);
3144 thread_block(THREAD_CONTINUE_NULL);
3145 }
3146 }
3147 return KERN_SUCCESS;
3148 }
3149
/*
 * Clear a thread's heavy-user marking for a shared cluster resource.
 *
 * Returns KERN_FAILURE if the thread was not marked heavy for this
 * resource type by the same agent class, KERN_SUCCESS otherwise.  When the
 * target is the calling thread, forces a scheduler re-evaluation (either a
 * preemption AST or an immediate block) so the cleared policy takes effect.
 */
kern_return_t
thread_shared_rsrc_policy_clear(thread_t thread, cluster_shared_rsrc_type_t type, shared_rsrc_policy_agent_t agent)
{
	spl_t s = splsched();
	thread_lock(thread);

	/* Mirror of thread_shared_rsrc_policy_set(): per-agent-class flags. */
	bool user = (agent == SHARED_RSRC_POLICY_AGENT_DISPATCH) || (agent == SHARED_RSRC_POLICY_AGENT_SYSCTL);
	bool *thread_flags = (user) ? thread->th_shared_rsrc_heavy_user : thread->th_shared_rsrc_heavy_perf_control;
	if (!thread_flags[type]) {
		/* Nothing to clear for this resource type. */
		thread_unlock(thread);
		splx(s);
		return KERN_FAILURE;
	}

	thread_flags[type] = false;
	thread_unlock(thread);
	splx(s);

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_RSRC_HEAVY_THREAD) | DBG_FUNC_NONE, SCHED_EDGE_RSRC_HEAVY_THREAD_CLR, thread_tid(thread), type, agent);
	if (thread == current_thread()) {
		if (agent == SHARED_RSRC_POLICY_AGENT_PERFCTL_QUANTUM) {
			/* Quantum-based agent: defer to the preemption AST. */
			ast_on(AST_PREEMPT);
		} else {
			/* Must not block in the context-switch path of perfctl. */
			assert(agent != SHARED_RSRC_POLICY_AGENT_PERFCTL_CSW);
			thread_block(THREAD_CONTINUE_NULL);
		}
	}
	return KERN_SUCCESS;
}
3179
3180 #else /* CONFIG_SCHED_EDGE */
3181
3182 /*
3183 * On non-Edge schedulers, the shared resource contention
3184 * is managed by simply binding threads to specific clusters
3185 * based on the worker index passed by the agents marking
3186 * this thread as resource heavy threads. The thread binding
3187 * approach does not provide any rebalancing opportunities;
3188 * it can also suffer from scheduling delays if the cluster
3189 * where the thread is bound is contended.
3190 */
3191
3192 boolean_t
thread_shared_rsrc_policy_get(__unused thread_t thread,__unused cluster_shared_rsrc_type_t type)3193 thread_shared_rsrc_policy_get(__unused thread_t thread, __unused cluster_shared_rsrc_type_t type)
3194 {
3195 return false;
3196 }
3197
3198 kern_return_t
thread_shared_rsrc_policy_set(thread_t thread,uint32_t index,__unused cluster_shared_rsrc_type_t type,__unused shared_rsrc_policy_agent_t agent)3199 thread_shared_rsrc_policy_set(thread_t thread, uint32_t index, __unused cluster_shared_rsrc_type_t type, __unused shared_rsrc_policy_agent_t agent)
3200 {
3201 return thread_bind_cluster_id(thread, index, THREAD_BIND_SOFT | THREAD_BIND_ELIGIBLE_ONLY);
3202 }
3203
3204 kern_return_t
thread_shared_rsrc_policy_clear(thread_t thread,__unused cluster_shared_rsrc_type_t type,__unused shared_rsrc_policy_agent_t agent)3205 thread_shared_rsrc_policy_clear(thread_t thread, __unused cluster_shared_rsrc_type_t type, __unused shared_rsrc_policy_agent_t agent)
3206 {
3207 return thread_bind_cluster_id(thread, 0, THREAD_UNBIND);
3208 }
3209
3210 #endif /* CONFIG_SCHED_EDGE */
3211
3212 uint64_t
thread_dispatchqaddr(thread_t thread)3213 thread_dispatchqaddr(
3214 thread_t thread)
3215 {
3216 uint64_t dispatchqueue_addr;
3217 uint64_t thread_handle;
3218 task_t task;
3219
3220 if (thread == THREAD_NULL) {
3221 return 0;
3222 }
3223
3224 thread_handle = thread->machine.cthread_self;
3225 if (thread_handle == 0) {
3226 return 0;
3227 }
3228
3229 task = get_threadtask(thread);
3230 void *bsd_info = get_bsdtask_info(task);
3231 if (thread->inspection == TRUE) {
3232 dispatchqueue_addr = thread_handle + get_task_dispatchqueue_offset(task);
3233 } else if (bsd_info) {
3234 dispatchqueue_addr = thread_handle + get_dispatchqueue_offset_from_proc(bsd_info);
3235 } else {
3236 dispatchqueue_addr = 0;
3237 }
3238
3239 return dispatchqueue_addr;
3240 }
3241
3242
/*
 * Compute the user-space address of a thread's workqueue quantum expiry
 * word (cthread TSD base + per-process offset), or 0 when it cannot be
 * determined.
 */
uint64_t
thread_wqquantum_addr(thread_t thread)
{
	uint64_t thread_handle;
	task_t task;

	if (thread == THREAD_NULL) {
		return 0;
	}

	/* cthread_self is the user TSD base; 0 means not yet set up. */
	thread_handle = thread->machine.cthread_self;
	if (thread_handle == 0) {
		return 0;
	}
	task = get_threadtask(thread);

	/*
	 * NOTE(review): unlike thread_dispatchqaddr(), the bsd_info returned by
	 * get_bsdtask_info() is not NULL-checked before being passed to
	 * get_wq_quantum_offset_from_proc() — presumably that accessor
	 * tolerates NULL; confirm.
	 */
	uint64_t wq_quantum_expiry_offset = get_wq_quantum_offset_from_proc(get_bsdtask_info(task));
	if (wq_quantum_expiry_offset == 0) {
		/* Process has not published a workq quantum offset. */
		return 0;
	}

	return wq_quantum_expiry_offset + thread_handle;
}
3266
3267 uint64_t
thread_rettokern_addr(thread_t thread)3268 thread_rettokern_addr(
3269 thread_t thread)
3270 {
3271 uint64_t rettokern_addr;
3272 uint64_t rettokern_offset;
3273 uint64_t thread_handle;
3274 task_t task;
3275 void *bsd_info;
3276
3277 if (thread == THREAD_NULL) {
3278 return 0;
3279 }
3280
3281 thread_handle = thread->machine.cthread_self;
3282 if (thread_handle == 0) {
3283 return 0;
3284 }
3285 task = get_threadtask(thread);
3286 bsd_info = get_bsdtask_info(task);
3287
3288 if (bsd_info) {
3289 rettokern_offset = get_return_to_kernel_offset_from_proc(bsd_info);
3290
3291 /* Return 0 if return to kernel offset is not initialized. */
3292 if (rettokern_offset == 0) {
3293 rettokern_addr = 0;
3294 } else {
3295 rettokern_addr = thread_handle + rettokern_offset;
3296 }
3297 } else {
3298 rettokern_addr = 0;
3299 }
3300
3301 return rettokern_addr;
3302 }
3303
3304 /*
3305 * Export routines to other components for things that are done as macros
3306 * within the osfmk component.
3307 */
3308
3309 void
thread_mtx_lock(thread_t thread)3310 thread_mtx_lock(thread_t thread)
3311 {
3312 lck_mtx_lock(&thread->mutex);
3313 }
3314
3315 void
thread_mtx_unlock(thread_t thread)3316 thread_mtx_unlock(thread_t thread)
3317 {
3318 lck_mtx_unlock(&thread->mutex);
3319 }
3320
3321 void
thread_reference(thread_t thread)3322 thread_reference(
3323 thread_t thread)
3324 {
3325 if (thread != THREAD_NULL) {
3326 zone_id_require(ZONE_ID_THREAD, sizeof(struct thread), thread);
3327 os_ref_retain_raw(&thread->ref_count, &thread_refgrp);
3328 }
3329 }
3330
3331 void
thread_require(thread_t thread)3332 thread_require(thread_t thread)
3333 {
3334 zone_id_require(ZONE_ID_THREAD, sizeof(struct thread), thread);
3335 }
3336
3337 #undef thread_should_halt
3338
3339 boolean_t
thread_should_halt(thread_t th)3340 thread_should_halt(
3341 thread_t th)
3342 {
3343 return thread_should_halt_fast(th);
3344 }
3345
/*
 * thread_set_voucher_name - reset the voucher port name bound to this thread
 *
 * Conditions: nothing locked
 *
 * Returns KERN_INVALID_RIGHT for MACH_PORT_DEAD, KERN_INVALID_ARGUMENT if
 * a valid name cannot be converted to a voucher, KERN_SUCCESS otherwise.
 */

kern_return_t
thread_set_voucher_name(mach_port_name_t voucher_name)
{
	thread_t thread = current_thread();
	ipc_voucher_t new_voucher = IPC_VOUCHER_NULL;
	ipc_voucher_t voucher;
	ledger_t bankledger = NULL;
	struct thread_group *banktg = NULL;
	uint32_t persona_id = 0;

	if (MACH_PORT_DEAD == voucher_name) {
		return KERN_INVALID_RIGHT;
	}

	/*
	 * Aggressively convert the name to a voucher reference up front.
	 * An invalid (e.g. MACH_PORT_NULL) name leaves new_voucher NULL,
	 * which unbinds any existing voucher below.
	 */
	if (MACH_PORT_VALID(voucher_name)) {
		new_voucher = convert_port_name_to_voucher(voucher_name);
		if (IPC_VOUCHER_NULL == new_voucher) {
			return KERN_INVALID_ARGUMENT;
		}
	}
	/* Extract the bank ledger / thread group / persona the voucher carries. */
	bank_get_bank_ledger_thread_group_and_persona(new_voucher, &bankledger, &banktg, &persona_id);

	/* Swap the thread's voucher binding under the thread mutex. */
	thread_mtx_lock(thread);
	voucher = thread->ith_voucher;
	thread->ith_voucher_name = voucher_name;
	thread->ith_voucher = new_voucher;
	thread_mtx_unlock(thread);

	bank_swap_thread_bank_ledger(thread, bankledger);
#if CONFIG_THREAD_GROUPS
	thread_group_set_bank(thread, banktg);
#endif /* CONFIG_THREAD_GROUPS */

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_IPC, MACH_THREAD_SET_VOUCHER) | DBG_FUNC_NONE,
	    (uintptr_t)thread_tid(thread),
	    (uintptr_t)voucher_name,
	    VM_KERNEL_ADDRPERM((uintptr_t)new_voucher),
	    persona_id, 0);

	/* Drop the reference the thread held on the previous voucher, if any. */
	if (IPC_VOUCHER_NULL != voucher) {
		ipc_voucher_release(voucher);
	}

	return KERN_SUCCESS;
}
3401
3402 /*
3403 * thread_get_mach_voucher - return a voucher reference for the specified thread voucher
3404 *
3405 * Conditions: nothing locked
3406 *
3407 * NOTE: At the moment, there is no distinction between the current and effective
3408 * vouchers because we only set them at the thread level currently.
3409 */
3410 kern_return_t
thread_get_mach_voucher(thread_act_t thread,mach_voucher_selector_t __unused which,ipc_voucher_t * voucherp)3411 thread_get_mach_voucher(
3412 thread_act_t thread,
3413 mach_voucher_selector_t __unused which,
3414 ipc_voucher_t *voucherp)
3415 {
3416 ipc_voucher_t voucher;
3417
3418 if (THREAD_NULL == thread) {
3419 return KERN_INVALID_ARGUMENT;
3420 }
3421
3422 thread_mtx_lock(thread);
3423 voucher = thread->ith_voucher;
3424
3425 if (IPC_VOUCHER_NULL != voucher) {
3426 ipc_voucher_reference(voucher);
3427 thread_mtx_unlock(thread);
3428 *voucherp = voucher;
3429 return KERN_SUCCESS;
3430 }
3431
3432 thread_mtx_unlock(thread);
3433
3434 *voucherp = IPC_VOUCHER_NULL;
3435 return KERN_SUCCESS;
3436 }
3437
/*
 * thread_set_mach_voucher - set a voucher reference for the specified thread voucher
 *
 * Conditions: callers holds a reference on the voucher.
 *             nothing locked.
 *
 * We grab another reference to the voucher and bind it to the thread.
 * The old voucher reference associated with the thread is
 * discarded.
 */
kern_return_t
thread_set_mach_voucher(
	thread_t                thread,
	ipc_voucher_t           voucher)
{
	ipc_voucher_t old_voucher;
	ledger_t bankledger = NULL;
	struct thread_group *banktg = NULL;
	uint32_t persona_id = 0;

	if (THREAD_NULL == thread) {
		return KERN_INVALID_ARGUMENT;
	}

	/* Extract the bank ledger / thread group / persona the voucher carries. */
	bank_get_bank_ledger_thread_group_and_persona(voucher, &bankledger, &banktg, &persona_id);

	thread_mtx_lock(thread);
	/*
	 * Once the thread is started, we will look at `ith_voucher` without
	 * holding any lock.
	 *
	 * Setting the voucher hence can only be done by current_thread() or
	 * before it started. "started" flips under the thread mutex and must be
	 * tested under it too.
	 */
	if (thread != current_thread() && thread->started) {
		thread_mtx_unlock(thread);
		return KERN_INVALID_ARGUMENT;
	}

	/* Bind a new reference; the name binding is cleared. */
	ipc_voucher_reference(voucher);
	old_voucher = thread->ith_voucher;
	thread->ith_voucher = voucher;
	thread->ith_voucher_name = MACH_PORT_NULL;
	thread_mtx_unlock(thread);

	bank_swap_thread_bank_ledger(thread, bankledger);
#if CONFIG_THREAD_GROUPS
	thread_group_set_bank(thread, banktg);
#endif /* CONFIG_THREAD_GROUPS */

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_IPC, MACH_THREAD_SET_VOUCHER) | DBG_FUNC_NONE,
	    (uintptr_t)thread_tid(thread),
	    (uintptr_t)MACH_PORT_NULL,
	    VM_KERNEL_ADDRPERM((uintptr_t)voucher),
	    persona_id, 0);

	/* Drop the thread's reference on the previous voucher (NULL-safe). */
	ipc_voucher_release(old_voucher);

	return KERN_SUCCESS;
}
3500
/*
 * thread_swap_mach_voucher - swap a voucher reference for the specified thread voucher
 *
 * Conditions: callers holds a reference on the new and presumed old voucher(s).
 *             nothing locked.
 *
 * This function is no longer supported.
 */
kern_return_t
thread_swap_mach_voucher(
	__unused thread_t               thread,
	__unused ipc_voucher_t          new_voucher,
	ipc_voucher_t                   *in_out_old_voucher)
{
	/*
	 * Currently this function is only called from a MIG generated
	 * routine which doesn't release the reference on the voucher
	 * addressed by in_out_old_voucher. To avoid leaking this reference,
	 * a call to release it has been added here.
	 */
	ipc_voucher_release(*in_out_old_voucher);
	OS_ANALYZER_SUPPRESS("81787115") return KERN_NOT_SUPPORTED;
}
3524
3525 /*
3526 * thread_get_current_voucher_origin_pid - get the pid of the originator of the current voucher.
3527 */
3528 kern_return_t
thread_get_current_voucher_origin_pid(int32_t * pid)3529 thread_get_current_voucher_origin_pid(
3530 int32_t *pid)
3531 {
3532 return thread_get_voucher_origin_pid(current_thread(), pid);
3533 }
3534
/*
 * thread_get_voucher_origin_pid - get the pid of the originator of the given thread's voucher.
 */
3538 kern_return_t
thread_get_voucher_origin_pid(thread_t thread,int32_t * pid)3539 thread_get_voucher_origin_pid(thread_t thread, int32_t *pid)
3540 {
3541 uint32_t buf_size = sizeof(*pid);
3542 return mach_voucher_attr_command(thread->ith_voucher,
3543 MACH_VOUCHER_ATTR_KEY_BANK,
3544 BANK_ORIGINATOR_PID,
3545 NULL,
3546 0,
3547 (mach_voucher_attr_content_t)pid,
3548 &buf_size);
3549 }
3550
/*
 * thread_get_voucher_origin_proximate_pid - get the origin and proximate pids of the given thread's voucher.
 */
3554 kern_return_t
thread_get_voucher_origin_proximate_pid(thread_t thread,int32_t * origin_pid,int32_t * proximate_pid)3555 thread_get_voucher_origin_proximate_pid(thread_t thread, int32_t *origin_pid, int32_t *proximate_pid)
3556 {
3557 int32_t origin_proximate_pids[2] = { };
3558 uint32_t buf_size = sizeof(origin_proximate_pids);
3559 kern_return_t kr = mach_voucher_attr_command(thread->ith_voucher,
3560 MACH_VOUCHER_ATTR_KEY_BANK,
3561 BANK_ORIGINATOR_PROXIMATE_PID,
3562 NULL,
3563 0,
3564 (mach_voucher_attr_content_t)origin_proximate_pids,
3565 &buf_size);
3566 if (kr == KERN_SUCCESS) {
3567 *origin_pid = origin_proximate_pids[0];
3568 *proximate_pid = origin_proximate_pids[1];
3569 }
3570 return kr;
3571 }
3572
3573 #if CONFIG_THREAD_GROUPS
3574 /*
3575 * Returns the current thread's voucher-carried thread group
3576 *
3577 * Reference is borrowed from this being the current voucher, so it does NOT
3578 * return a reference to the group.
3579 */
3580 struct thread_group *
thread_get_current_voucher_thread_group(thread_t thread)3581 thread_get_current_voucher_thread_group(thread_t thread)
3582 {
3583 assert(thread == current_thread());
3584
3585 if (thread->ith_voucher == NULL) {
3586 return NULL;
3587 }
3588
3589 ledger_t bankledger = NULL;
3590 struct thread_group *banktg = NULL;
3591
3592 bank_get_bank_ledger_thread_group_and_persona(thread->ith_voucher, &bankledger, &banktg, NULL);
3593
3594 return banktg;
3595 }
3596
3597 #endif /* CONFIG_THREAD_GROUPS */
3598
3599 #if CONFIG_COALITIONS
3600
3601 uint64_t
thread_get_current_voucher_resource_coalition_id(thread_t thread)3602 thread_get_current_voucher_resource_coalition_id(thread_t thread)
3603 {
3604 uint64_t id = 0;
3605 assert(thread == current_thread());
3606 if (thread->ith_voucher != NULL) {
3607 id = bank_get_bank_ledger_resource_coalition_id(thread->ith_voucher);
3608 }
3609 return id;
3610 }
3611
3612 #endif /* CONFIG_COALITIONS */
3613
3614 extern struct workqueue *
3615 proc_get_wqptr(void *proc);
3616
3617 static bool
task_supports_cooperative_workqueue(task_t task)3618 task_supports_cooperative_workqueue(task_t task)
3619 {
3620 void *bsd_info = get_bsdtask_info(task);
3621
3622 assert(task == current_task());
3623 if (bsd_info == NULL) {
3624 return false;
3625 }
3626
3627 uint64_t wq_quantum_expiry_offset = get_wq_quantum_offset_from_proc(bsd_info);
3628 /* userspace may not yet have called workq_open yet */
3629 struct workqueue *wq = proc_get_wqptr(bsd_info);
3630
3631 return (wq != NULL) && (wq_quantum_expiry_offset != 0);
3632 }
3633
3634 /* Not safe to call from scheduler paths - should only be called on self */
3635 bool
thread_supports_cooperative_workqueue(thread_t thread)3636 thread_supports_cooperative_workqueue(thread_t thread)
3637 {
3638 struct uthread *uth = get_bsdthread_info(thread);
3639 task_t task = get_threadtask(thread);
3640
3641 assert(thread == current_thread());
3642
3643 return task_supports_cooperative_workqueue(task) &&
3644 bsdthread_part_of_cooperative_workqueue(uth);
3645 }
3646
3647 static inline bool
thread_has_armed_workqueue_quantum(thread_t thread)3648 thread_has_armed_workqueue_quantum(thread_t thread)
3649 {
3650 return thread->workq_quantum_deadline != 0;
3651 }
3652
3653 /*
3654 * The workq quantum is a lazy timer that is evaluated at 2 specific times in
3655 * the scheduler:
3656 *
3657 * - context switch time
3658 * - scheduler quantum expiry time.
3659 *
3660 * We're currently expressing the workq quantum with a 0.5 scale factor of the
3661 * scheduler quantum. It is possible that if the workq quantum is rearmed
3662 * shortly after the scheduler quantum begins, we could have a large delay
3663 * between when the workq quantum next expires and when it actually is noticed.
3664 *
3665 * A potential future improvement for the wq quantum expiry logic is to compare
3666 * it to the next actual scheduler quantum deadline and expire it if it is
3667 * within a certain leeway.
3668 */
3669 static inline uint64_t
thread_workq_quantum_size(thread_t thread)3670 thread_workq_quantum_size(thread_t thread)
3671 {
3672 return (uint64_t) (SCHED(initial_quantum_size)(thread) / 2);
3673 }
3674
/*
 * Always called by thread on itself - either at AST boundary after processing
 * an existing quantum expiry, or when a new quantum is armed before the thread
 * goes out to userspace to handle a thread request.
 *
 * Arms the lazy workq quantum: deadline = current accumulated runtime +
 * half a scheduler quantum (see thread_workq_quantum_size()).
 */
void
thread_arm_workqueue_quantum(thread_t thread)
{
	/*
	 * If the task is not opted into wq quantum notification, or if the thread
	 * is not part of the cooperative workqueue, don't even bother with tracking
	 * the quantum or calculating expiry
	 */
	if (!thread_supports_cooperative_workqueue(thread)) {
		assert(thread->workq_quantum_deadline == 0);
		return;
	}

	assert(current_thread() == thread);
	assert(thread_get_tag(thread) & THREAD_TAG_WORKQUEUE);

	uint64_t current_runtime = thread_get_runtime_self();
	uint64_t deadline = thread_workq_quantum_size(thread) + current_runtime;

	/*
	 * The update of a workqueue quantum should always be followed by the update
	 * of the AST - see explanation in kern/thread.h for synchronization of this
	 * field
	 */
	thread->workq_quantum_deadline = deadline;

	/* We're arming a new quantum, clear any previous expiry notification */
	act_clear_astkevent(thread, AST_KEVENT_WORKQ_QUANTUM_EXPIRED);

	WQ_TRACE(TRACE_wq_quantum_arm, current_runtime, deadline, 0, 0);

	WORKQ_QUANTUM_HISTORY_WRITE_ENTRY(thread, thread->workq_quantum_deadline, true);
}
3713
/*
 * Called by a thread on itself when it is about to park.
 * Disarms the lazy workq quantum (deadline = 0) and clears any pending
 * expiry notification.
 */
void
thread_disarm_workqueue_quantum(thread_t thread)
{
	/* The update of a workqueue quantum should always be followed by the update
	 * of the AST - see explanation in kern/thread.h for synchronization of this
	 * field */
	thread->workq_quantum_deadline = 0;
	act_clear_astkevent(thread, AST_KEVENT_WORKQ_QUANTUM_EXPIRED);

	WQ_TRACE(TRACE_wq_quantum_disarm, 0, 0, 0, 0);

	WORKQ_QUANTUM_HISTORY_WRITE_ENTRY(thread, thread->workq_quantum_deadline, false);
}
3728
3729 /* This is called at context switch time on a thread that may not be self,
3730 * and at AST time
3731 */
3732 bool
thread_has_expired_workqueue_quantum(thread_t thread,bool should_trace)3733 thread_has_expired_workqueue_quantum(thread_t thread, bool should_trace)
3734 {
3735 if (!thread_has_armed_workqueue_quantum(thread)) {
3736 return false;
3737 }
3738 /* We do not do a thread_get_runtime_self() here since this function is
3739 * called from context switch time or during scheduler quantum expiry and
3740 * therefore, we may not be evaluating it on the current thread/self.
3741 *
3742 * In addition, the timers on the thread have just been updated recently so
3743 * we don't need to update them again.
3744 */
3745 uint64_t runtime = recount_thread_time_mach(thread);
3746 bool expired = runtime > thread->workq_quantum_deadline;
3747
3748 if (expired && should_trace) {
3749 WQ_TRACE(TRACE_wq_quantum_expired, runtime, thread->workq_quantum_deadline, 0, 0);
3750 }
3751
3752 return expired;
3753 }
3754
3755 /*
3756 * Called on a thread that is being context switched out or during quantum
3757 * expiry on self. Only called from scheduler paths.
3758 */
3759 void
thread_evaluate_workqueue_quantum_expiry(thread_t thread)3760 thread_evaluate_workqueue_quantum_expiry(thread_t thread)
3761 {
3762 if (thread_has_expired_workqueue_quantum(thread, true)) {
3763 act_set_astkevent(thread, AST_KEVENT_WORKQ_QUANTUM_EXPIRED);
3764 }
3765 }
3766
3767 boolean_t
thread_has_thread_name(thread_t th)3768 thread_has_thread_name(thread_t th)
3769 {
3770 if (th) {
3771 return bsd_hasthreadname(get_bsdthread_info(th));
3772 }
3773
3774 /*
3775 * This is an odd case; clients may set the thread name based on the lack of
3776 * a name, but in this context there is no uthread to attach the name to.
3777 */
3778 return FALSE;
3779 }
3780
3781 void
thread_set_thread_name(thread_t th,const char * name)3782 thread_set_thread_name(thread_t th, const char* name)
3783 {
3784 if (th && name) {
3785 bsd_setthreadname(get_bsdthread_info(th), thread_tid(th), name);
3786 }
3787 }
3788
3789 void
thread_get_thread_name(thread_t th,char * name)3790 thread_get_thread_name(thread_t th, char* name)
3791 {
3792 if (!name) {
3793 return;
3794 }
3795 if (th) {
3796 bsd_getthreadname(get_bsdthread_info(th), name);
3797 } else {
3798 name[0] = '\0';
3799 }
3800 }
3801
/*
 * Return the processor run queue the thread is enqueued on, or
 * PROCESSOR_NULL.  Caller holds the thread lock.  The acquire fence after
 * the load pairs with the release fence in thread_clear_runq() below.
 */
processor_t
thread_get_runq(thread_t thread)
{
	thread_lock_assert(thread, LCK_ASSERT_OWNED);
	processor_t runq = thread->__runq.runq;
	os_atomic_thread_fence(acquire);
	return runq;
}
3810
3811 processor_t
thread_get_runq_locked(thread_t thread)3812 thread_get_runq_locked(thread_t thread)
3813 {
3814 thread_lock_assert(thread, LCK_ASSERT_OWNED);
3815 processor_t runq = thread->__runq.runq;
3816 if (runq != PROCESSOR_NULL) {
3817 pset_assert_locked(runq->processor_set);
3818 }
3819 return runq;
3820 }
3821
3822 void
thread_set_runq_locked(thread_t thread,processor_t new_runq)3823 thread_set_runq_locked(thread_t thread, processor_t new_runq)
3824 {
3825 thread_lock_assert(thread, LCK_ASSERT_OWNED);
3826 pset_assert_locked(new_runq->processor_set);
3827 thread_assert_runq_null(thread);
3828 thread->__runq.runq = new_runq;
3829 }
3830
/*
 * Clear the thread's run queue bookkeeping.  The release fence before the
 * store pairs with the acquire fence in thread_get_runq() above.  The
 * owning pset lock is checked by thread_assert_runq_nonnull().
 */
void
thread_clear_runq(thread_t thread)
{
	thread_assert_runq_nonnull(thread);
	os_atomic_thread_fence(release);
	thread->__runq.runq = PROCESSOR_NULL;
}
3838
3839 void
thread_clear_runq_locked(thread_t thread)3840 thread_clear_runq_locked(thread_t thread)
3841 {
3842 thread_lock_assert(thread, LCK_ASSERT_OWNED);
3843 thread_assert_runq_nonnull(thread);
3844 thread->__runq.runq = PROCESSOR_NULL;
3845 }
3846
3847 void
thread_assert_runq_null(__assert_only thread_t thread)3848 thread_assert_runq_null(__assert_only thread_t thread)
3849 {
3850 assert(thread->__runq.runq == PROCESSOR_NULL);
3851 }
3852
3853 void
thread_assert_runq_nonnull(thread_t thread)3854 thread_assert_runq_nonnull(thread_t thread)
3855 {
3856 pset_assert_locked(thread->__runq.runq->processor_set);
3857 assert(thread->__runq.runq != PROCESSOR_NULL);
3858 }
3859
3860 void
thread_set_honor_qlimit(thread_t thread)3861 thread_set_honor_qlimit(thread_t thread)
3862 {
3863 thread->options |= TH_OPT_HONOR_QLIMIT;
3864 }
3865
3866 void
thread_clear_honor_qlimit(thread_t thread)3867 thread_clear_honor_qlimit(thread_t thread)
3868 {
3869 thread->options &= (~TH_OPT_HONOR_QLIMIT);
3870 }
3871
3872 /*
3873 * thread_enable_send_importance - set/clear the SEND_IMPORTANCE thread option bit.
3874 */
3875 void
thread_enable_send_importance(thread_t thread,boolean_t enable)3876 thread_enable_send_importance(thread_t thread, boolean_t enable)
3877 {
3878 if (enable == TRUE) {
3879 thread->options |= TH_OPT_SEND_IMPORTANCE;
3880 } else {
3881 thread->options &= ~TH_OPT_SEND_IMPORTANCE;
3882 }
3883 }
3884
3885 kern_return_t
thread_get_ipc_propagate_attr(thread_t thread,struct thread_attr_for_ipc_propagation * attr)3886 thread_get_ipc_propagate_attr(thread_t thread, struct thread_attr_for_ipc_propagation *attr)
3887 {
3888 int iotier;
3889 int qos;
3890
3891 if (thread == NULL || attr == NULL) {
3892 return KERN_INVALID_ARGUMENT;
3893 }
3894
3895 iotier = proc_get_effective_thread_policy(thread, TASK_POLICY_IO);
3896 qos = proc_get_effective_thread_policy(thread, TASK_POLICY_QOS);
3897
3898 if (!qos) {
3899 qos = thread_user_promotion_qos_for_pri(thread->base_pri);
3900 }
3901
3902 attr->tafip_iotier = iotier;
3903 attr->tafip_qos = qos;
3904
3905 return KERN_SUCCESS;
3906 }
3907
3908 /*
3909 * thread_set_allocation_name - .
3910 */
3911
3912 kern_allocation_name_t
thread_set_allocation_name(kern_allocation_name_t new_name)3913 thread_set_allocation_name(kern_allocation_name_t new_name)
3914 {
3915 kern_allocation_name_t ret;
3916 thread_kernel_state_t kstate = thread_get_kernel_state(current_thread());
3917 ret = kstate->allocation_name;
3918 // fifo
3919 if (!new_name || !kstate->allocation_name) {
3920 kstate->allocation_name = new_name;
3921 }
3922 return ret;
3923 }
3924
3925 void *
thread_iokit_tls_get(uint32_t index)3926 thread_iokit_tls_get(uint32_t index)
3927 {
3928 assert(index < THREAD_SAVE_IOKIT_TLS_COUNT);
3929 return current_thread()->saved.iokit.tls[index];
3930 }
3931
3932 void
thread_iokit_tls_set(uint32_t index,void * data)3933 thread_iokit_tls_set(uint32_t index, void * data)
3934 {
3935 assert(index < THREAD_SAVE_IOKIT_TLS_COUNT);
3936 current_thread()->saved.iokit.tls[index] = data;
3937 }
3938
3939 uint64_t
thread_get_last_wait_duration(thread_t thread)3940 thread_get_last_wait_duration(thread_t thread)
3941 {
3942 return thread->last_made_runnable_time - thread->last_run_time;
3943 }
3944
3945 integer_t
thread_kern_get_pri(thread_t thr)3946 thread_kern_get_pri(thread_t thr)
3947 {
3948 return thr->base_pri;
3949 }
3950
3951 void
thread_kern_set_pri(thread_t thr,integer_t pri)3952 thread_kern_set_pri(thread_t thr, integer_t pri)
3953 {
3954 sched_set_kernel_thread_priority(thr, pri);
3955 }
3956
3957 integer_t
thread_kern_get_kernel_maxpri(void)3958 thread_kern_get_kernel_maxpri(void)
3959 {
3960 return MAXPRI_KERNEL;
3961 }
/*
 * thread_port_with_flavor_no_senders
 *
 * Called whenever the Mach port system detects no-senders on
 * the thread inspect or read port. These ports are allocated lazily and
 * should be deallocated here when there are no senders remaining.
 */
static void
thread_port_with_flavor_no_senders(
	ipc_port_t port,
	mach_port_mscount_t mscount __unused)
{
	thread_ro_t tro;
	thread_t thread;
	mach_thread_flavor_t flavor;
	ipc_kobject_type_t kotype;

	ip_mq_lock(port);
	/* Send rights reappeared since the notification fired: nothing to do. */
	if (port->ip_srights > 0) {
		ip_mq_unlock(port);
		return;
	}
	kotype = ip_kotype(port);
	assert((IKOT_THREAD_READ == kotype) || (IKOT_THREAD_INSPECT == kotype));
	thread = ipc_kobject_get_locked(port, kotype);
	if (thread != THREAD_NULL) {
		/* Keep the thread alive across the unlocked window below. */
		thread_reference(thread);
	}
	ip_mq_unlock(port);

	if (thread == THREAD_NULL) {
		/* The thread is exiting or disabled; it will eventually deallocate the port */
		return;
	}

	if (kotype == IKOT_THREAD_READ) {
		flavor = THREAD_FLAVOR_READ;
	} else {
		flavor = THREAD_FLAVOR_INSPECT;
	}

	/* Lock ordering: thread mutex first, then the port lock. */
	thread_mtx_lock(thread);
	ip_mq_lock(port);

	/*
	 * If the port is no longer active, then ipc_thread_terminate() ran
	 * and destroyed the kobject already. Just deallocate the thread
	 * ref we took and go away.
	 *
	 * It is also possible that several nsrequests are in flight,
	 * only one shall NULL-out the port entry, and this is the one
	 * that gets to dealloc the port.
	 *
	 * Check for a stale no-senders notification. A call to any function
	 * that vends out send rights to this port could resurrect it between
	 * this notification being generated and actually being handled here.
	 */
	tro = get_thread_ro(thread);
	if (!ip_active(port) ||
	    tro->tro_ports[flavor] != port ||
	    port->ip_srights > 0) {
		ip_mq_unlock(port);
		thread_mtx_unlock(thread);
		thread_deallocate(thread);
		return;
	}

	assert(tro->tro_ports[flavor] == port);
	/* Clear the read-only port slot before destroying the port itself. */
	zalloc_ro_clear_field(ZONE_ID_THREAD_RO, tro, tro_ports[flavor]);
	thread_mtx_unlock(thread);

	ipc_kobject_dealloc_port_and_unlock(port, 0, kotype);

	thread_deallocate(thread);
}
4037
4038 /*
4039 * The 'thread_region_page_shift' is used by footprint
4040 * to specify the page size that it will use to
4041 * accomplish its accounting work on the task being
4042 * inspected. Since footprint uses a thread for each
4043 * task that it works on, we need to keep the page_shift
4044 * on a per-thread basis.
4045 */
4046
4047 int
thread_self_region_page_shift(void)4048 thread_self_region_page_shift(void)
4049 {
4050 /*
4051 * Return the page shift that this thread
4052 * would like to use for its accounting work.
4053 */
4054 return current_thread()->thread_region_page_shift;
4055 }
4056
4057 void
thread_self_region_page_shift_set(int pgshift)4058 thread_self_region_page_shift_set(
4059 int pgshift)
4060 {
4061 /*
4062 * Set the page shift that this thread
4063 * would like to use for its accounting work
4064 * when dealing with a task.
4065 */
4066 current_thread()->thread_region_page_shift = pgshift;
4067 }
4068
__startup_func
static void
ctid_table_init(void)
{
	/*
	 * Pretend the early boot setup didn't exist,
	 * and pick a mangling nonce.
	 *
	 * Slot 0 is reset to THREAD_NULL so whatever was installed during
	 * early boot does not leak into the running table; the nonce is
	 * masked to CTID_MASK so mangled ids stay within range.
	 */
	*compact_id_resolve(&ctid_table, 0) = THREAD_NULL;
	ctid_nonce = (uint32_t)early_random() & CTID_MASK;
}
4080
4081
4082 /*
4083 * This maps the [0, CTID_MAX_THREAD_NUMBER] range
4084 * to [1, CTID_MAX_THREAD_NUMBER + 1 == CTID_MASK]
4085 * so that in mangled form, '0' is an invalid CTID.
4086 */
4087 static ctid_t
ctid_mangle(compact_id_t cid)4088 ctid_mangle(compact_id_t cid)
4089 {
4090 return (cid == ctid_nonce ? CTID_MASK : cid) ^ ctid_nonce;
4091 }
4092
4093 static compact_id_t
ctid_unmangle(ctid_t ctid)4094 ctid_unmangle(ctid_t ctid)
4095 {
4096 ctid ^= ctid_nonce;
4097 return ctid == CTID_MASK ? ctid_nonce : ctid;
4098 }
4099
4100 void
ctid_table_add(thread_t thread)4101 ctid_table_add(thread_t thread)
4102 {
4103 compact_id_t cid;
4104
4105 cid = compact_id_get(&ctid_table, CTID_MAX_THREAD_NUMBER, thread);
4106 thread->ctid = ctid_mangle(cid);
4107 }
4108
void
ctid_table_remove(thread_t thread)
{
	/*
	 * Return the thread's compact id to the table and clear the
	 * (mangled) ctid stored on the thread.
	 */
	__assert_only thread_t value;

	value = compact_id_put(&ctid_table, ctid_unmangle(thread->ctid));
	/* The released slot must have mapped back to this very thread. */
	assert3p(value, ==, thread);
	thread->ctid = 0;
}
4118
4119 thread_t
ctid_get_thread_unsafe(ctid_t ctid)4120 ctid_get_thread_unsafe(ctid_t ctid)
4121 {
4122 if (ctid) {
4123 return *compact_id_resolve(&ctid_table, ctid_unmangle(ctid));
4124 }
4125 return THREAD_NULL;
4126 }
4127
4128 thread_t
ctid_get_thread(ctid_t ctid)4129 ctid_get_thread(ctid_t ctid)
4130 {
4131 thread_t thread = THREAD_NULL;
4132
4133 if (ctid) {
4134 thread = *compact_id_resolve(&ctid_table, ctid_unmangle(ctid));
4135 assert(thread && thread->ctid == ctid);
4136 }
4137 return thread;
4138 }
4139
4140 ctid_t
thread_get_ctid(thread_t thread)4141 thread_get_ctid(thread_t thread)
4142 {
4143 return thread->ctid;
4144 }
4145
4146 /*
4147 * Adjust code signature dependent thread state.
4148 *
4149 * Called to allow code signature dependent adjustments to the thread
4150 * state. Note that this is usually called twice for the main thread:
4151 * Once at thread creation by thread_create, when the signature is
4152 * potentially not attached yet (which is usually the case for the
4153 * first/main thread of a task), and once after the task's signature
4154 * has actually been attached.
4155 *
4156 */
4157 kern_return_t
thread_process_signature(thread_t thread,task_t task)4158 thread_process_signature(thread_t thread, task_t task)
4159 {
4160 return machine_thread_process_signature(thread, task);
4161 }
4162
4163 #if CONFIG_SPTM
4164
4165 void
thread_associate_txm_thread_stack(uintptr_t thread_stack)4166 thread_associate_txm_thread_stack(uintptr_t thread_stack)
4167 {
4168 thread_t self = current_thread();
4169
4170 if (self->txm_thread_stack != 0) {
4171 panic("attempted multiple TXM thread associations: %lu | %lu",
4172 self->txm_thread_stack, thread_stack);
4173 }
4174
4175 self->txm_thread_stack = thread_stack;
4176 }
4177
4178 void
thread_disassociate_txm_thread_stack(uintptr_t thread_stack)4179 thread_disassociate_txm_thread_stack(uintptr_t thread_stack)
4180 {
4181 thread_t self = current_thread();
4182
4183 if (self->txm_thread_stack == 0) {
4184 panic("attempted to disassociate non-existent TXM thread");
4185 } else if (self->txm_thread_stack != thread_stack) {
4186 panic("invalid disassociation for TXM thread: %lu | %lu",
4187 self->txm_thread_stack, thread_stack);
4188 }
4189
4190 self->txm_thread_stack = 0;
4191 }
4192
4193 uintptr_t
thread_get_txm_thread_stack(void)4194 thread_get_txm_thread_stack(void)
4195 {
4196 return current_thread()->txm_thread_stack;
4197 }
4198
4199 #endif
4200
4201 #if CONFIG_DTRACE
4202 uint32_t
dtrace_get_thread_predcache(thread_t thread)4203 dtrace_get_thread_predcache(thread_t thread)
4204 {
4205 if (thread != THREAD_NULL) {
4206 return thread->t_dtrace_predcache;
4207 } else {
4208 return 0;
4209 }
4210 }
4211
4212 int64_t
dtrace_get_thread_vtime(thread_t thread)4213 dtrace_get_thread_vtime(thread_t thread)
4214 {
4215 if (thread != THREAD_NULL) {
4216 return thread->t_dtrace_vtime;
4217 } else {
4218 return 0;
4219 }
4220 }
4221
4222 int
dtrace_get_thread_last_cpu_id(thread_t thread)4223 dtrace_get_thread_last_cpu_id(thread_t thread)
4224 {
4225 if ((thread != THREAD_NULL) && (thread->last_processor != PROCESSOR_NULL)) {
4226 return thread->last_processor->cpu_id;
4227 } else {
4228 return -1;
4229 }
4230 }
4231
4232 int64_t
dtrace_get_thread_tracing(thread_t thread)4233 dtrace_get_thread_tracing(thread_t thread)
4234 {
4235 if (thread != THREAD_NULL) {
4236 return thread->t_dtrace_tracing;
4237 } else {
4238 return 0;
4239 }
4240 }
4241
4242 uint16_t
dtrace_get_thread_inprobe(thread_t thread)4243 dtrace_get_thread_inprobe(thread_t thread)
4244 {
4245 if (thread != THREAD_NULL) {
4246 return thread->t_dtrace_inprobe;
4247 } else {
4248 return 0;
4249 }
4250 }
4251
4252 vm_offset_t
thread_get_kernel_stack(thread_t thread)4253 thread_get_kernel_stack(thread_t thread)
4254 {
4255 if (thread != THREAD_NULL) {
4256 return thread->kernel_stack;
4257 } else {
4258 return 0;
4259 }
4260 }
4261
4262 #if KASAN
4263 struct kasan_thread_data *
kasan_get_thread_data(thread_t thread)4264 kasan_get_thread_data(thread_t thread)
4265 {
4266 return &thread->kasan_data;
4267 }
4268 #endif
4269
4270 #if CONFIG_KCOV
4271 kcov_thread_data_t *
kcov_get_thread_data(thread_t thread)4272 kcov_get_thread_data(thread_t thread)
4273 {
4274 return &thread->kcov_data;
4275 }
4276 #endif
4277
4278 #if CONFIG_STKSZ
/*
 * Returns base of a thread's kernel stack.
 *
 * The coverage sanitizer instruments every function, including those that
 * participate in stack handoff between threads. There is a window in which
 * the CPU still holds old values but the stack has been handed over to
 * another thread already. In this window kernel_stack is 0 but the CPU still
 * uses the original stack (until a context switch occurs). The original
 * kernel_stack value is preserved in ksancov_stack during this window.
 */
4287 vm_offset_t
kcov_stksz_get_thread_stkbase(thread_t thread)4288 kcov_stksz_get_thread_stkbase(thread_t thread)
4289 {
4290 if (thread != THREAD_NULL) {
4291 kcov_thread_data_t *data = kcov_get_thread_data(thread);
4292 if (data->ktd_stksz.kst_stack) {
4293 return data->ktd_stksz.kst_stack;
4294 } else {
4295 return thread->kernel_stack;
4296 }
4297 } else {
4298 return 0;
4299 }
4300 }
4301
4302 vm_offset_t
kcov_stksz_get_thread_stksize(thread_t thread)4303 kcov_stksz_get_thread_stksize(thread_t thread)
4304 {
4305 if (thread != THREAD_NULL) {
4306 return kernel_stack_size;
4307 } else {
4308 return 0;
4309 }
4310 }
4311
4312 void
kcov_stksz_set_thread_stack(thread_t thread,vm_offset_t stack)4313 kcov_stksz_set_thread_stack(thread_t thread, vm_offset_t stack)
4314 {
4315 kcov_thread_data_t *data = kcov_get_thread_data(thread);
4316 data->ktd_stksz.kst_stack = stack;
4317 }
4318 #endif /* CONFIG_STKSZ */
4319
int64_t
dtrace_calc_thread_recent_vtime(thread_t thread)
{
	if (thread == THREAD_NULL) {
		return 0;
	}

	/*
	 * NOTE(review): `thread` is only NULL-checked; the usage below is
	 * always taken from the *current* thread. Presumably callers only
	 * pass current_thread() here — verify against call sites.
	 */
	struct recount_usage usage = { 0 };
	recount_current_thread_usage(&usage);
	return (int64_t)(recount_usage_time_mach(&usage));
}
4331
4332 void
dtrace_set_thread_predcache(thread_t thread,uint32_t predcache)4333 dtrace_set_thread_predcache(thread_t thread, uint32_t predcache)
4334 {
4335 if (thread != THREAD_NULL) {
4336 thread->t_dtrace_predcache = predcache;
4337 }
4338 }
4339
4340 void
dtrace_set_thread_vtime(thread_t thread,int64_t vtime)4341 dtrace_set_thread_vtime(thread_t thread, int64_t vtime)
4342 {
4343 if (thread != THREAD_NULL) {
4344 thread->t_dtrace_vtime = vtime;
4345 }
4346 }
4347
4348 void
dtrace_set_thread_tracing(thread_t thread,int64_t accum)4349 dtrace_set_thread_tracing(thread_t thread, int64_t accum)
4350 {
4351 if (thread != THREAD_NULL) {
4352 thread->t_dtrace_tracing = accum;
4353 }
4354 }
4355
4356 void
dtrace_set_thread_inprobe(thread_t thread,uint16_t inprobe)4357 dtrace_set_thread_inprobe(thread_t thread, uint16_t inprobe)
4358 {
4359 if (thread != THREAD_NULL) {
4360 thread->t_dtrace_inprobe = inprobe;
4361 }
4362 }
4363
void
dtrace_thread_bootstrap(void)
{
	/*
	 * Fire the DTrace lifecycle probes for a newly running thread.
	 * For the first thread of a task this also covers exec-success
	 * and process start; probe-firing order is part of the contract.
	 */
	task_t task = current_task();

	if (task->thread_count == 1) {
		thread_t thread = current_thread();
		if (thread->t_dtrace_flags & TH_DTRACE_EXECSUCCESS) {
			/* Flag is set by dtrace_thread_didexec(); consume it once. */
			thread->t_dtrace_flags &= ~TH_DTRACE_EXECSUCCESS;
			DTRACE_PROC(exec__success);
			KDBG(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXEC),
			    task_pid(task));
		}
		DTRACE_PROC(start);
	}
	DTRACE_PROC(lwp__start);
}
4381
4382 void
dtrace_thread_didexec(thread_t thread)4383 dtrace_thread_didexec(thread_t thread)
4384 {
4385 thread->t_dtrace_flags |= TH_DTRACE_EXECSUCCESS;
4386 }
4387 #endif /* CONFIG_DTRACE */
4388