xref: /xnu-8019.80.24/bsd/kern/sys_generic.c (revision a325d9c4a84054e40bbe985afedcb50ab80993ea)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30  * Copyright (c) 1982, 1986, 1989, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  * (c) UNIX System Laboratories, Inc.
33  * All or some portions of this file are derived from material licensed
34  * to the University of California by American Telephone and Telegraph
35  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36  * the permission of UNIX System Laboratories, Inc.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. All advertising materials mentioning features or use of this software
47  *    must display the following acknowledgement:
48  *	This product includes software developed by the University of
49  *	California, Berkeley and its contributors.
50  * 4. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
67  */
68 /*
69  * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
70  * support for mandatory and extensible security protections.  This notice
71  * is included in support of clause 2.2 (b) of the Apple Public License,
72  * Version 2.0.
73  */
74 
75 #include <sys/param.h>
76 #include <sys/systm.h>
77 #include <sys/filedesc.h>
78 #include <sys/ioctl.h>
79 #include <sys/file_internal.h>
80 #include <sys/proc_internal.h>
81 #include <sys/socketvar.h>
82 #include <sys/uio_internal.h>
83 #include <sys/kernel.h>
84 #include <sys/guarded.h>
85 #include <sys/stat.h>
86 #include <sys/malloc.h>
87 #include <sys/sysproto.h>
88 
89 #include <sys/mount_internal.h>
90 #include <sys/protosw.h>
91 #include <sys/ev.h>
92 #include <sys/user.h>
93 #include <sys/kdebug.h>
94 #include <sys/poll.h>
95 #include <sys/event.h>
96 #include <sys/eventvar.h>
97 #include <sys/proc.h>
98 #include <sys/kauth.h>
99 
100 #include <machine/smp.h>
101 #include <mach/mach_types.h>
102 #include <kern/kern_types.h>
103 #include <kern/assert.h>
104 #include <kern/kalloc.h>
105 #include <kern/thread.h>
106 #include <kern/clock.h>
107 #include <kern/ledger.h>
108 #include <kern/task.h>
109 #include <kern/telemetry.h>
110 #include <kern/waitq.h>
111 #include <kern/sched_hygiene.h>
112 #include <kern/sched_prim.h>
113 #include <kern/mpsc_queue.h>
114 #include <kern/debug.h>
115 
116 #include <sys/mbuf.h>
117 #include <sys/domain.h>
118 #include <sys/socket.h>
119 #include <sys/socketvar.h>
120 #include <sys/errno.h>
121 #include <sys/syscall.h>
122 #include <sys/pipe.h>
123 
124 #include <security/audit/audit.h>
125 
126 #include <net/if.h>
127 #include <net/route.h>
128 
129 #include <netinet/in.h>
130 #include <netinet/in_systm.h>
131 #include <netinet/ip.h>
132 #include <netinet/in_pcb.h>
133 #include <netinet/ip_var.h>
134 #include <netinet/ip6.h>
135 #include <netinet/tcp.h>
136 #include <netinet/tcp_fsm.h>
137 #include <netinet/tcp_seq.h>
138 #include <netinet/tcp_timer.h>
139 #include <netinet/tcp_var.h>
140 #include <netinet/tcpip.h>
141 #include <netinet/tcp_debug.h>
142 /* for wait queue based select */
143 #include <kern/waitq.h>
144 #include <sys/vnode_internal.h>
145 /* for remote time api*/
146 #include <kern/remote_time.h>
147 #include <os/log.h>
148 #include <sys/log_data.h>
149 
150 #if CONFIG_MACF
151 #include <security/mac_framework.h>
152 #endif
153 
154 #ifdef CONFIG_KDP_INTERACTIVE_DEBUGGING
155 #include <mach_debug/mach_debug_types.h>
156 #endif
157 
158 /* for entitlement check */
159 #include <IOKit/IOBSD.h>
160 /*
161  * If you need accounting for KM_SELECT consider using
162  * KALLOC_HEAP_DEFINE to define a view.
163  */
164 #define KM_SELECT       KHEAP_DEFAULT
165 
166 /* XXX should be in a header file somewhere */
167 extern kern_return_t IOBSDGetPlatformUUID(__darwin_uuid_t uuid, mach_timespec_t timeoutp);
168 
169 int rd_uio(struct proc *p, int fdes, uio_t uio, int is_preadv, user_ssize_t *retval);
170 int wr_uio(struct proc *p, int fdes, uio_t uio, int is_pwritev, user_ssize_t *retval);
171 int do_uiowrite(struct proc *p, struct fileproc *fp, uio_t uio, int flags, user_ssize_t *retval);
172 
173 __private_extern__ int  dofileread(vfs_context_t ctx, struct fileproc *fp,
174     user_addr_t bufp, user_size_t nbyte,
175     off_t offset, int flags, user_ssize_t *retval);
176 __private_extern__ int  dofilewrite(vfs_context_t ctx, struct fileproc *fp,
177     user_addr_t bufp, user_size_t nbyte,
178     off_t offset, int flags, user_ssize_t *retval);
179 static int preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_vnode);
180 
181 /* Conflict wait queue for when selects collide (opaque type) */
182 struct waitq select_conflict_queue;
183 
/*
 * Init routine called from bsd_init.c
 */
void select_waitq_init(void);
void
select_waitq_init(void)
{
	/* FIFO wakeup order for threads parked on the shared conflict queue */
	waitq_init(&select_conflict_queue, SYNC_POLICY_FIFO);
}
193 
194 #define f_flag fp_glob->fg_flag
195 #define f_type fp_glob->fg_ops->fo_type
196 #define f_cred fp_glob->fg_cred
197 #define f_ops fp_glob->fg_ops
198 
199 /*
200  * Validate if the file can be used for random access (pread, pwrite, etc).
201  *
202  * Conditions:
203  *		proc_fdlock is held
204  *
205  * Returns:    0                       Success
206  *             ESPIPE
207  *             ENXIO
208  */
209 static int
valid_for_random_access(struct fileproc * fp)210 valid_for_random_access(struct fileproc *fp)
211 {
212 	if (__improbable(fp->f_type != DTYPE_VNODE)) {
213 		return ESPIPE;
214 	}
215 
216 	vnode_t vp = (struct vnode *)fp_get_data(fp);
217 	if (__improbable(vnode_isfifo(vp))) {
218 		return ESPIPE;
219 	}
220 
221 	if (__improbable(vp->v_flag & VISTTY)) {
222 		return ENXIO;
223 	}
224 
225 	return 0;
226 }
227 
228 /*
229  * Read system call.
230  *
231  * Returns:	0			Success
232  *	preparefileread:EBADF
233  *	preparefileread:ESPIPE
234  *	preparefileread:ENXIO
235  *	preparefileread:EBADF
236  *	dofileread:???
237  */
238 int
read(struct proc * p,struct read_args * uap,user_ssize_t * retval)239 read(struct proc *p, struct read_args *uap, user_ssize_t *retval)
240 {
241 	__pthread_testcancel(1);
242 	return read_nocancel(p, (struct read_nocancel_args *)uap, retval);
243 }
244 
245 int
read_nocancel(struct proc * p,struct read_nocancel_args * uap,user_ssize_t * retval)246 read_nocancel(struct proc *p, struct read_nocancel_args *uap, user_ssize_t *retval)
247 {
248 	struct fileproc *fp;
249 	int error;
250 	int fd = uap->fd;
251 	struct vfs_context context;
252 
253 	if ((error = preparefileread(p, &fp, fd, 0))) {
254 		return error;
255 	}
256 
257 	context = *(vfs_context_current());
258 	context.vc_ucred = fp->fp_glob->fg_cred;
259 
260 	error = dofileread(&context, fp, uap->cbuf, uap->nbyte,
261 	    (off_t)-1, 0, retval);
262 
263 	fp_drop(p, fd, fp, 0);
264 
265 	return error;
266 }
267 
268 /*
269  * Pread system call
270  *
271  * Returns:	0			Success
272  *	preparefileread:EBADF
273  *	preparefileread:ESPIPE
274  *	preparefileread:ENXIO
275  *	preparefileread:EBADF
276  *	dofileread:???
277  */
278 int
pread(struct proc * p,struct pread_args * uap,user_ssize_t * retval)279 pread(struct proc *p, struct pread_args *uap, user_ssize_t *retval)
280 {
281 	__pthread_testcancel(1);
282 	return pread_nocancel(p, (struct pread_nocancel_args *)uap, retval);
283 }
284 
285 int
pread_nocancel(struct proc * p,struct pread_nocancel_args * uap,user_ssize_t * retval)286 pread_nocancel(struct proc *p, struct pread_nocancel_args *uap, user_ssize_t *retval)
287 {
288 	struct fileproc *fp = NULL;     /* fp set by preparefileread() */
289 	int fd = uap->fd;
290 	int error;
291 	struct vfs_context context;
292 
293 	if ((error = preparefileread(p, &fp, fd, 1))) {
294 		goto out;
295 	}
296 
297 	context = *(vfs_context_current());
298 	context.vc_ucred = fp->fp_glob->fg_cred;
299 
300 	error = dofileread(&context, fp, uap->buf, uap->nbyte,
301 	    uap->offset, FOF_OFFSET, retval);
302 
303 	fp_drop(p, fd, fp, 0);
304 
305 	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE),
306 	    uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
307 
308 out:
309 	return error;
310 }
311 
312 /*
313  * Code common for read and pread
314  */
315 
/*
 * Common fd validation for read(2)/pread(2)/readv(2)/preadv(2).
 *
 * Looks up fd and takes an iocount reference on the fileproc, which
 * is returned in *fp_ret with the reference still held on success;
 * the caller must fp_drop() it.  Requires the file to be open for
 * reading, and, when check_for_pread is set, to support random access.
 *
 * Returns:	0			Success
 *		EBADF
 *		ESPIPE
 *		ENXIO
 *	fp_lookup:EBADF
 *  valid_for_random_access:ESPIPE
 *  valid_for_random_access:ENXIO
 */
static int
preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pread)
{
	int     error;
	struct fileproc *fp;

	AUDIT_ARG(fd, fd);

	proc_fdlock_spin(p);

	/* "locked" lookup (last arg 1): fdlock stays held on success */
	error = fp_lookup(p, fd, &fp, 1);

	if (error) {
		proc_fdunlock(p);
		return error;
	}
	if ((fp->f_flag & FREAD) == 0) {
		error = EBADF;
		goto out;
	}
	if (check_for_pread) {
		/* valid_for_random_access requires the fdlock we hold */
		if ((error = valid_for_random_access(fp))) {
			goto out;
		}
	}

	*fp_ret = fp;

	proc_fdunlock(p);
	return 0;

out:
	/* undo the iocount taken by fp_lookup (locked variant) */
	fp_drop(p, fd, fp, 1);
	proc_fdunlock(p);
	return error;
}
361 
362 
/*
 * Common I/O path for read(2) and pread(2): read nbyte bytes from fp
 * into user buffer bufp.  offset is -1 for "current offset" reads;
 * flags carries FOF_OFFSET for positioned reads.  *retval is set to
 * the number of bytes actually transferred.
 *
 * Returns:	0			Success
 *		EINVAL
 *	fo_read:???
 */
__private_extern__ int
dofileread(vfs_context_t ctx, struct fileproc *fp,
    user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
    user_ssize_t *retval)
{
	uio_t auio;
	user_ssize_t bytecnt;
	int error = 0;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* single-segment transfers are capped at INT_MAX bytes */
	if (nbyte > INT_MAX) {
		return EINVAL;
	}

	/* build a one-iovec uio on the stack, address width per process */
	if (vfs_context_is64bit(ctx)) {
		auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
	} else {
		auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
	}
	if (uio_addiov(auio, bufp, nbyte) != 0) {
		*retval = 0;
		return EINVAL;
	}

	bytecnt = nbyte;

	if ((error = fo_read(fp, auio, flags, ctx))) {
		/* a partial transfer cut short by a signal/restart is success */
		if (uio_resid(auio) != bytecnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK)) {
			error = 0;
		}
	}
	/* bytes actually moved = requested minus what is left in the uio */
	bytecnt -= uio_resid(auio);

	*retval = bytecnt;

	return error;
}
408 
409 /*
410  * Vector read.
411  *
412  * Returns:    0                       Success
413  *             EINVAL
414  *             ENOMEM
415  *     preparefileread:EBADF
416  *     preparefileread:ESPIPE
417  *     preparefileread:ENXIO
418  *     preparefileread:EBADF
419  *     copyin:EFAULT
420  *     rd_uio:???
421  */
422 static int
readv_preadv_uio(struct proc * p,int fdes,user_addr_t user_iovp,int iovcnt,off_t offset,int is_preadv,user_ssize_t * retval)423 readv_preadv_uio(struct proc *p, int fdes,
424     user_addr_t user_iovp, int iovcnt, off_t offset, int is_preadv,
425     user_ssize_t *retval)
426 {
427 	uio_t auio = NULL;
428 	int error;
429 	struct user_iovec *iovp;
430 
431 	/* Verify range before calling uio_create() */
432 	if (iovcnt <= 0 || iovcnt > UIO_MAXIOV) {
433 		return EINVAL;
434 	}
435 
436 	/* allocate a uio large enough to hold the number of iovecs passed */
437 	auio = uio_create(iovcnt, offset,
438 	    (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
439 	    UIO_READ);
440 
441 	/* get location of iovecs within the uio.  then copyin the iovecs from
442 	 * user space.
443 	 */
444 	iovp = uio_iovsaddr(auio);
445 	if (iovp == NULL) {
446 		error = ENOMEM;
447 		goto ExitThisRoutine;
448 	}
449 	error = copyin_user_iovec_array(user_iovp,
450 	    IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
451 	    iovcnt, iovp);
452 	if (error) {
453 		goto ExitThisRoutine;
454 	}
455 
456 	/* finalize uio_t for use and do the IO
457 	 */
458 	error = uio_calculateresid(auio);
459 	if (error) {
460 		goto ExitThisRoutine;
461 	}
462 	error = rd_uio(p, fdes, auio, is_preadv, retval);
463 
464 ExitThisRoutine:
465 	if (auio != NULL) {
466 		uio_free(auio);
467 	}
468 	return error;
469 }
470 
471 /*
472  * Scatter read system call.
473  */
474 int
readv(struct proc * p,struct readv_args * uap,user_ssize_t * retval)475 readv(struct proc *p, struct readv_args *uap, user_ssize_t *retval)
476 {
477 	__pthread_testcancel(1);
478 	return readv_nocancel(p, (struct readv_nocancel_args *)uap, retval);
479 }
480 
481 int
readv_nocancel(struct proc * p,struct readv_nocancel_args * uap,user_ssize_t * retval)482 readv_nocancel(struct proc *p, struct readv_nocancel_args *uap, user_ssize_t *retval)
483 {
484 	return readv_preadv_uio(p, uap->fd, uap->iovp, uap->iovcnt, 0, 0, retval);
485 }
486 
487 /*
488  * Preadv system call
489  */
490 int
sys_preadv(struct proc * p,struct preadv_args * uap,user_ssize_t * retval)491 sys_preadv(struct proc *p, struct preadv_args *uap, user_ssize_t *retval)
492 {
493 	__pthread_testcancel(1);
494 	return sys_preadv_nocancel(p, (struct preadv_nocancel_args *)uap, retval);
495 }
496 
497 int
sys_preadv_nocancel(struct proc * p,struct preadv_nocancel_args * uap,user_ssize_t * retval)498 sys_preadv_nocancel(struct proc *p, struct preadv_nocancel_args *uap, user_ssize_t *retval)
499 {
500 	return readv_preadv_uio(p, uap->fd, uap->iovp, uap->iovcnt, uap->offset, 1, retval);
501 }
502 
503 /*
504  * Write system call
505  *
506  * Returns:	0			Success
507  *		EBADF
508  *	fp_lookup:EBADF
509  *	dofilewrite:???
510  */
511 int
write(struct proc * p,struct write_args * uap,user_ssize_t * retval)512 write(struct proc *p, struct write_args *uap, user_ssize_t *retval)
513 {
514 	__pthread_testcancel(1);
515 	return write_nocancel(p, (struct write_nocancel_args *)uap, retval);
516 }
517 
/*
 * write_nocancel: write(2) without acting as a cancellation point.
 * Writes at the file's current offset with the credential the file
 * was opened with; a write-guarded descriptor raises a guard
 * exception instead of writing.
 */
int
write_nocancel(struct proc *p, struct write_nocancel_args *uap, user_ssize_t *retval)
{
	struct fileproc *fp;
	int error;
	int fd = uap->fd;

	AUDIT_ARG(fd, fd);

	error = fp_lookup(p, fd, &fp, 0);
	if (error) {
		return error;
	}
	if ((fp->f_flag & FWRITE) == 0) {
		error = EBADF;
	} else if (fp_isguarded(fp, GUARD_WRITE)) {
		/* fp_guard_exception must be called with the fdlock held */
		proc_fdlock(p);
		error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
		proc_fdunlock(p);
	} else {
		struct vfs_context context = *(vfs_context_current());
		context.vc_ucred = fp->fp_glob->fg_cred;

		/* offset -1 / flags 0: write at the current file offset */
		error = dofilewrite(&context, fp, uap->cbuf, uap->nbyte,
		    (off_t)-1, 0, retval);
	}
	fp_drop(p, fd, fp, 0);
	return error;
}
547 
548 /*
549  * pwrite system call
550  *
551  * Returns:	0			Success
552  *		EBADF
553  *		ESPIPE
554  *		ENXIO
555  *		EINVAL
556  *	fp_lookup:EBADF
557  *	dofilewrite:???
558  */
559 int
pwrite(struct proc * p,struct pwrite_args * uap,user_ssize_t * retval)560 pwrite(struct proc *p, struct pwrite_args *uap, user_ssize_t *retval)
561 {
562 	__pthread_testcancel(1);
563 	return pwrite_nocancel(p, (struct pwrite_nocancel_args *)uap, retval);
564 }
565 
/*
 * pwrite_nocancel: pwrite(2) without acting as a cancellation point.
 * Writes nbyte bytes at uap->offset without moving the file offset.
 * The descriptor must be a vnode (not a fifo or tty) and the offset
 * must not be -1.
 */
int
pwrite_nocancel(struct proc *p, struct pwrite_nocancel_args *uap, user_ssize_t *retval)
{
	struct fileproc *fp;
	int error;
	int fd = uap->fd;
	vnode_t vp  = (vnode_t)0;

	AUDIT_ARG(fd, fd);

	/* non-vnode descriptors (sockets, pipes, ...) fail with ESPIPE */
	error = fp_get_ftype(p, fd, DTYPE_VNODE, ESPIPE, &fp);
	if (error) {
		return error;
	}

	if ((fp->f_flag & FWRITE) == 0) {
		error = EBADF;
	} else if (fp_isguarded(fp, GUARD_WRITE)) {
		/* fp_guard_exception must be called with the fdlock held */
		proc_fdlock(p);
		error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
		proc_fdunlock(p);
	} else {
		struct vfs_context context = *vfs_context_current();
		context.vc_ucred = fp->fp_glob->fg_cred;

		vp = (vnode_t)fp_get_data(fp);
		if (vnode_isfifo(vp)) {
			error = ESPIPE;
			goto errout;
		}
		if ((vp->v_flag & VISTTY)) {
			error = ENXIO;
			goto errout;
		}
		if (uap->offset == (off_t)-1) {
			error = EINVAL;
			goto errout;
		}

		error = dofilewrite(&context, fp, uap->buf, uap->nbyte,
		    uap->offset, FOF_OFFSET, retval);
	}
errout:
	fp_drop(p, fd, fp, 0);

	/* trace fires on every exit path past the descriptor lookup */
	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE),
	    uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);

	return error;
}
616 
/*
 * Common I/O path for write(2)/pwrite(2): write nbyte bytes from user
 * buffer bufp through fp.  offset is -1 for "current offset" writes;
 * flags carries FOF_OFFSET for positioned writes.  *retval is set to
 * the number of bytes actually written.
 *
 * Returns:	0			Success
 *		EINVAL
 *	<fo_write>:EPIPE
 *	<fo_write>:???			[indirect through struct fileops]
 */
__private_extern__ int
dofilewrite(vfs_context_t ctx, struct fileproc *fp,
    user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
    user_ssize_t *retval)
{
	uio_t auio;
	int error = 0;
	user_ssize_t bytecnt;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* single-segment transfers are capped at INT_MAX bytes */
	if (nbyte > INT_MAX) {
		*retval = 0;
		return EINVAL;
	}

	/* build a one-iovec uio on the stack, address width per process */
	if (vfs_context_is64bit(ctx)) {
		auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
		    &uio_buf[0], sizeof(uio_buf));
	} else {
		auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
		    &uio_buf[0], sizeof(uio_buf));
	}
	if (uio_addiov(auio, bufp, nbyte) != 0) {
		*retval = 0;
		return EINVAL;
	}

	bytecnt = nbyte;
	if ((error = fo_write(fp, auio, flags, ctx))) {
		/* a partial transfer cut short by a signal/restart is success */
		if (uio_resid(auio) != bytecnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK)) {
			error = 0;
		}
		/* The socket layer handles SIGPIPE */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
		    (fp->fp_glob->fg_lflags & FG_NOSIGPIPE) == 0) {
			/* XXX Raise the signal on the thread? */
			psignal(vfs_context_proc(ctx), SIGPIPE);
		}
	}
	bytecnt -= uio_resid(auio);
	if (bytecnt) {
		/* remember the file has been written to (fsync bookkeeping) */
		os_atomic_or(&fp->fp_glob->fg_flag, FWASWRITTEN, relaxed);
	}
	*retval = bytecnt;

	return error;
}
671 
/*
 * Common fd validation for the vectored write paths.
 *
 * Looks up fd and takes an iocount reference on the fileproc, which
 * is returned in *fp_ret with the reference still held on success;
 * the caller must fp_drop() it.  Requires the file to be open for
 * writing and not write-guarded, and, when check_for_pwrite is set,
 * to support random access.
 *
 * Returns:	0			Success
 *		EBADF
 *		ESPIPE
 *		ENXIO
 *	fp_lookup:EBADF
 *	fp_guard_exception:???
 *  valid_for_random_access:ESPIPE
 *  valid_for_random_access:ENXIO
 */
static int
preparefilewrite(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pwrite)
{
	int error;
	struct fileproc *fp;

	AUDIT_ARG(fd, fd);

	proc_fdlock_spin(p);

	/* "locked" lookup (last arg 1): fdlock stays held on success */
	error = fp_lookup(p, fd, &fp, 1);

	if (error) {
		proc_fdunlock(p);
		return error;
	}
	if ((fp->f_flag & FWRITE) == 0) {
		error = EBADF;
		goto ExitThisRoutine;
	}
	if (fp_isguarded(fp, GUARD_WRITE)) {
		/* called with the fdlock held, as fp_guard_exception requires */
		error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
		goto ExitThisRoutine;
	}
	if (check_for_pwrite) {
		/* valid_for_random_access requires the fdlock we hold */
		if ((error = valid_for_random_access(fp))) {
			goto ExitThisRoutine;
		}
	}

	*fp_ret = fp;

	proc_fdunlock(p);
	return 0;

ExitThisRoutine:
	/* undo the iocount taken by fp_lookup (locked variant) */
	fp_drop(p, fd, fp, 1);
	proc_fdunlock(p);
	return error;
}
722 
/*
 * Common implementation for writev(2)/pwritev(2): copies in the iovec
 * array, builds a uio and hands it to wr_uio().  Rejects a negative
 * offset up front (pwritev only; writev passes 0).
 *
 * Returns:	0			Success
 *		EINVAL
 *		ENOMEM
 *	copyin_user_iovec_array:EFAULT
 *	wr_uio:???
 */
static int
writev_prwritev_uio(struct proc *p, int fd,
    user_addr_t user_iovp, int iovcnt, off_t offset, int is_pwritev,
    user_ssize_t *retval)
{
	uio_t auio = NULL;
	int error;
	struct user_iovec *iovp;

	/* Verify range before calling uio_create() */
	if (iovcnt <= 0 || iovcnt > UIO_MAXIOV || offset < 0) {
		return EINVAL;
	}

	/* allocate a uio large enough to hold the number of iovecs passed */
	auio = uio_create(iovcnt, offset,
	    (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
	    UIO_WRITE);

	/* get location of iovecs within the uio.  then copyin the iovecs from
	 * user space.
	 */
	iovp = uio_iovsaddr(auio);
	if (iovp == NULL) {
		error = ENOMEM;
		goto ExitThisRoutine;
	}
	error = copyin_user_iovec_array(user_iovp,
	    IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
	    iovcnt, iovp);
	if (error) {
		goto ExitThisRoutine;
	}

	/* finalize uio_t for use and do the IO
	 */
	error = uio_calculateresid(auio);
	if (error) {
		goto ExitThisRoutine;
	}

	error = wr_uio(p, fd, auio, is_pwritev, retval);

ExitThisRoutine:
	if (auio != NULL) {
		uio_free(auio);
	}
	return error;
}
772 
773 /*
774  * Gather write system call
775  */
776 int
writev(struct proc * p,struct writev_args * uap,user_ssize_t * retval)777 writev(struct proc *p, struct writev_args *uap, user_ssize_t *retval)
778 {
779 	__pthread_testcancel(1);
780 	return writev_nocancel(p, (struct writev_nocancel_args *)uap, retval);
781 }
782 
783 int
writev_nocancel(struct proc * p,struct writev_nocancel_args * uap,user_ssize_t * retval)784 writev_nocancel(struct proc *p, struct writev_nocancel_args *uap, user_ssize_t *retval)
785 {
786 	return writev_prwritev_uio(p, uap->fd, uap->iovp, uap->iovcnt, 0, 0, retval);
787 }
788 
789 /*
790  * Pwritev system call
791  */
792 int
sys_pwritev(struct proc * p,struct pwritev_args * uap,user_ssize_t * retval)793 sys_pwritev(struct proc *p, struct pwritev_args *uap, user_ssize_t *retval)
794 {
795 	__pthread_testcancel(1);
796 	return sys_pwritev_nocancel(p, (struct pwritev_nocancel_args *)uap, retval);
797 }
798 
799 int
sys_pwritev_nocancel(struct proc * p,struct pwritev_nocancel_args * uap,user_ssize_t * retval)800 sys_pwritev_nocancel(struct proc *p, struct pwritev_nocancel_args *uap, user_ssize_t *retval)
801 {
802 	return writev_prwritev_uio(p, uap->fd, uap->iovp, uap->iovcnt, uap->offset, 1, retval);
803 }
804 
/*
 * Common back end for writev(2)/pwritev(2): validate fd, then perform
 * the vectored write described by uio via do_uiowrite().
 *
 * Returns:	0			Success
 *	preparefilewrite:EBADF
 *	preparefilewrite:ESPIPE
 *	preparefilewrite:ENXIO
 *	preparefilewrite:???
 *	fo_write:???
 */
int
wr_uio(struct proc *p, int fd, uio_t uio, int is_pwritev, user_ssize_t *retval)
{
	struct fileproc *fp;
	int error;
	int flags;

	/* validates fd and takes an iocount reference on fp */
	if ((error = preparefilewrite(p, &fp, fd, is_pwritev))) {
		return error;
	}

	/* positioned (pwritev) writes do not move the file offset */
	flags = is_pwritev ? FOF_OFFSET : 0;
	error = do_uiowrite(p, fp, uio, flags, retval);

	fp_drop(p, fd, fp, 0);

	return error;
}
831 
/*
 * Perform the write described by uio through fp and set *retval to
 * the number of bytes written.  Marks the fileglob FWASWRITTEN on
 * any partial or full transfer.
 */
int
do_uiowrite(struct proc *p, struct fileproc *fp, uio_t uio, int flags, user_ssize_t *retval)
{
	int error;
	user_ssize_t count;
	struct vfs_context context = *vfs_context_current();

	/* remember the requested residual to compute bytes moved later */
	count = uio_resid(uio);

	context.vc_ucred = fp->f_cred;
	error = fo_write(fp, uio, flags, &context);
	if (error) {
		/* a partial transfer cut short by a signal/restart is success */
		if (uio_resid(uio) != count && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK)) {
			error = 0;
		}
		/* The socket layer handles SIGPIPE */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
		    (fp->fp_glob->fg_lflags & FG_NOSIGPIPE) == 0) {
			psignal(p, SIGPIPE);
		}
	}
	count -= uio_resid(uio);
	if (count) {
		/* remember the file has been written to (fsync bookkeeping) */
		os_atomic_or(&fp->fp_glob->fg_flag, FWASWRITTEN, relaxed);
	}
	*retval = count;

	return error;
}
862 
/*
 * Common back end for readv(2)/preadv(2): validate fdes, perform the
 * vectored read described by uio and set *retval to the byte count.
 *
 * Returns:	0			Success
 *	preparefileread:EBADF
 *	preparefileread:ESPIPE
 *	preparefileread:ENXIO
 *	fo_read:???
 */
int
rd_uio(struct proc *p, int fdes, uio_t uio, int is_preadv, user_ssize_t *retval)
{
	struct fileproc *fp;
	int error;
	user_ssize_t count;
	struct vfs_context context = *vfs_context_current();

	/* validates fdes and takes an iocount reference on fp */
	if ((error = preparefileread(p, &fp, fdes, is_preadv))) {
		return error;
	}

	/* remember the requested residual to compute bytes moved later */
	count = uio_resid(uio);

	context.vc_ucred = fp->f_cred;

	/* positioned (preadv) reads do not move the file offset */
	int flags = is_preadv ? FOF_OFFSET : 0;
	error = fo_read(fp, uio, flags, &context);

	if (error) {
		/* a partial transfer cut short by a signal/restart is success */
		if (uio_resid(uio) != count && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK)) {
			error = 0;
		}
	}
	*retval = count - uio_resid(uio);

	fp_drop(p, fdes, fp, 0);

	return error;
}
901 
/*
 * Ioctl system call
 *
 * Copies the ioctl argument in or out according to the direction bits
 * encoded in the command word, handles the generic FIO*/TIOC* commands
 * inline, and dispatches everything else to the descriptor's fo_ioctl.
 *
 * Returns:	0			Success
 *		EBADF
 *		ENOTTY
 *		ENOMEM
 *		ESRCH
 *	copyin:EFAULT
 *	copyout:EFAULT
 *	fp_lookup:EBADF			Bad file descriptor
 *	fo_ioctl:???
 */
int
ioctl(struct proc *p, struct ioctl_args *uap, __unused int32_t *retval)
{
	struct fileproc *fp = NULL;
	int error = 0;
	u_int size = 0;
	caddr_t datap = NULL, memp = NULL;
	boolean_t is64bit = FALSE;
	int tmp = 0;
#define STK_PARAMS      128
	char stkbuf[STK_PARAMS] = {};
	int fd = uap->fd;
	u_long com = uap->com;
	struct vfs_context context = *vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(addr, uap->data);

	is64bit = proc_is64bit(p);
#if CONFIG_AUDIT
	if (is64bit) {
		AUDIT_ARG(value64, com);
	} else {
		AUDIT_ARG(cmd, CAST_DOWN_EXPLICIT(int, com));
	}
#endif /* CONFIG_AUDIT */

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		return ENOTTY;
	}
	if (size > sizeof(stkbuf)) {
		/* argument too large for the on-stack buffer: heap-allocate */
		memp = (caddr_t)kalloc_data(size, Z_WAITOK);
		if (memp == 0) {
			return ENOMEM;
		}
		datap = memp;
	} else {
		datap = &stkbuf[0];
	}
	if (com & IOC_IN) {
		if (size) {
			error = copyin(uap->data, datap, size);
			if (error) {
				goto out_nofp;
			}
		} else {
			/* XXX - IOC_IN and no size?  we should probably return an error here!! */
			/* zero-size IOC_IN: pass the data word itself by value */
			if (is64bit) {
				*(user_addr_t *)datap = uap->data;
			} else {
				*(uint32_t *)datap = (uint32_t)uap->data;
			}
		}
	} else if ((com & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(datap, size);
	} else if (com & IOC_VOID) {
		/* XXX - this is odd since IOC_VOID means no parameters */
		if (is64bit) {
			*(user_addr_t *)datap = uap->data;
		} else {
			*(uint32_t *)datap = (uint32_t)uap->data;
		}
	}

	proc_fdlock(p);
	/* "locked" lookup (last arg 1): fdlock stays held on success */
	error = fp_lookup(p, fd, &fp, 1);
	if (error) {
		proc_fdunlock(p);
		goto out_nofp;
	}

	AUDIT_ARG(file, p, fp);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		goto out;
	}

	context.vc_ucred = fp->fp_glob->fg_cred;

#if CONFIG_MACF
	error = mac_file_check_ioctl(context.vc_ucred, fp->fp_glob, com);
	if (error) {
		goto out;
	}
#endif

	/* generic commands handled here; everything else goes to fo_ioctl */
	switch (com) {
	case FIONCLEX:
		fp->fp_flags &= ~FP_CLOEXEC;
		break;

	case FIOCLEX:
		fp->fp_flags |= FP_CLOEXEC;
		break;

	case FIONBIO:
		// FIXME (rdar://54898652)
		//
		// this code is broken if fnctl(F_SETFL), ioctl() are
		// called concurrently for the same fileglob.
		if ((tmp = *(int *)datap)) {
			os_atomic_or(&fp->f_flag, FNONBLOCK, relaxed);
		} else {
			os_atomic_andnot(&fp->f_flag, FNONBLOCK, relaxed);
		}
		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, &context);
		break;

	case FIOASYNC:
		// FIXME (rdar://54898652)
		//
		// this code is broken if fnctl(F_SETFL), ioctl() are
		// called concurrently for the same fileglob.
		if ((tmp = *(int *)datap)) {
			os_atomic_or(&fp->f_flag, FASYNC, relaxed);
		} else {
			os_atomic_andnot(&fp->f_flag, FASYNC, relaxed);
		}
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, &context);
		break;

	case FIOSETOWN:
		tmp = *(int *)datap;
		if (fp->f_type == DTYPE_SOCKET) {
			((struct socket *)fp_get_data(fp))->so_pgid = tmp;
			break;
		}
		if (fp->f_type == DTYPE_PIPE) {
			error = fo_ioctl(fp, TIOCSPGRP, (caddr_t)&tmp, &context);
			break;
		}
		if (tmp <= 0) {
			tmp = -tmp;
		} else {
			struct proc *p1 = proc_find(tmp);
			if (p1 == 0) {
				error = ESRCH;
				break;
			}
			/* translate a pid into its process group for the tty layer */
			tmp = p1->p_pgrpid;
			proc_rele(p1);
		}
		error = fo_ioctl(fp, TIOCSPGRP, (caddr_t)&tmp, &context);
		break;

	case FIOGETOWN:
		if (fp->f_type == DTYPE_SOCKET) {
			*(int *)datap = ((struct socket *)fp_get_data(fp))->so_pgid;
			break;
		}
		error = fo_ioctl(fp, TIOCGPGRP, datap, &context);
		/* negate: TIOCGPGRP reports a pgrp, FIOGETOWN a signed owner */
		*(int *)datap = -*(int *)datap;
		break;

	default:
		error = fo_ioctl(fp, com, datap, &context);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com & IOC_OUT) && size) {
			error = copyout(datap, uap->data, (u_int)size);
		}
		break;
	}
out:
	fp_drop(p, fd, fp, 1);
	proc_fdunlock(p);

out_nofp:
	if (memp) {
		kfree_data(memp, size);
	}
	return error;
}
1100 
/* Legacy wait channel for select(); retained for historical selrecord() users. */
int     selwait;
/* selprocess() pass identifiers: the first pass links each fd's waitq into the
 * thread's waitq set; the second pass (after a wakeup) unlinks and rescans. */
#define SEL_FIRSTPASS 1
#define SEL_SECONDPASS 2
extern int selcontinue(int error);
extern int selprocess(int error, int sel_pass);
static int selscan(struct proc *p, struct _select * sel, struct _select_data * seldata,
    int nfd, int32_t *retval, int sel_pass, struct waitq_set *wqset);
static int selcount(struct proc *p, u_int32_t *ibits, int nfd, int *count);
static int seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup);
static int seldrop(struct proc *p, u_int32_t *ibits, int nfd, int lim);
static int select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeout, int32_t *retval);
1112 
1113 /*
1114  * This is used for the special device nodes that do not implement
1115  * a proper kevent filter (see filt_specattach).
1116  *
1117  * In order to enable kevents on those, the spec_filtops will pretend
1118  * to call select, and try to sniff the selrecord(), if it observes one,
1119  * the knote is attached, which pairs with selwakeup() or selthreadclear().
1120  *
1121  * The last issue remaining, is that we need to serialize filt_specdetach()
1122  * with this, but it really can't know the "selinfo" or any locking domain.
1123  * To make up for this, We protect knote list operations with a global lock,
1124  * which give us a safe shared locking domain.
1125  *
1126  * Note: It is a little distasteful, but we really have very few of those.
1127  *       The big problem here is that sharing a lock domain without
1128  *       any kind of shared knowledge is a little complicated.
1129  *
1130  *       1. filters can really implement their own kqueue integration
1131  *          to side step this,
1132  *
1133  *       2. There's an opportunity to pick a private lock in selspec_attach()
1134  *          because both the selinfo and the knote are locked at that time.
1135  *          The cleanup story is however a little complicated.
1136  */
/* Global spin lock serializing kn_hook / si_note list operations for the
 * spec-device select/kevent bridge (rationale in the block comment above). */
static LCK_GRP_DECLARE(selspec_grp, "spec_filtops");
static LCK_SPIN_DECLARE(selspec_lock, &selspec_grp);
1139 
1140 /*
1141  * The "primitive" lock is held.
1142  * The knote lock is held.
1143  */
1144 void
selspec_attach(struct knote * kn,struct selinfo * si)1145 selspec_attach(struct knote *kn, struct selinfo *si)
1146 {
1147 	struct selinfo *cur = os_atomic_load(&kn->kn_hook, relaxed);
1148 
1149 	if (cur == NULL) {
1150 		si->si_flags |= SI_SELSPEC;
1151 		lck_spin_lock(&selspec_lock);
1152 		kn->kn_hook = si;
1153 		KNOTE_ATTACH(&si->si_note, kn);
1154 		lck_spin_unlock(&selspec_lock);
1155 	} else {
1156 		/*
1157 		 * selspec_attach() can be called from e.g. filt_spectouch()
1158 		 * which might be called before any event was dequeued.
1159 		 *
1160 		 * It is hence not impossible for the knote already be hooked.
1161 		 *
1162 		 * Note that selwakeup_internal() could possibly
1163 		 * already have cleared this pointer. This is a race
1164 		 * that filt_specprocess will debounce.
1165 		 */
1166 		assert(si->si_flags & SI_SELSPEC);
1167 		assert(cur == si);
1168 	}
1169 }
1170 
1171 /*
1172  * The "primitive" lock is _not_ held.
1173  * The knote lock is held.
1174  */
1175 void
selspec_detach(struct knote * kn)1176 selspec_detach(struct knote *kn)
1177 {
1178 	/*
1179 	 * kn_hook always becomes non NULL under the knote lock.
1180 	 * Seeing "NULL" can't be a false positive.
1181 	 */
1182 	if (kn->kn_hook == NULL) {
1183 		return;
1184 	}
1185 
1186 	lck_spin_lock(&selspec_lock);
1187 	if (kn->kn_hook) {
1188 		struct selinfo *sip = kn->kn_hook;
1189 
1190 		kn->kn_hook = NULL;
1191 		KNOTE_DETACH(&sip->si_note, kn);
1192 	}
1193 	lck_spin_unlock(&selspec_lock);
1194 }
1195 
1196 /*
1197  * Select system call.
1198  *
1199  * Returns:	0			Success
1200  *		EINVAL			Invalid argument
1201  *		EAGAIN			Nonconformant error if allocation fails
1202  */
int
select(struct proc *p, struct select_args *uap, int32_t *retval)
{
	/* Cancellation point: check for pending pthread cancellation first,
	 * then forward to the non-cancellable implementation. */
	__pthread_testcancel(1);
	return select_nocancel(p, (struct select_nocancel_args *)uap, retval);
}
1209 
1210 int
select_nocancel(struct proc * p,struct select_nocancel_args * uap,int32_t * retval)1211 select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retval)
1212 {
1213 	uint64_t timeout = 0;
1214 
1215 	if (uap->tv) {
1216 		int err;
1217 		struct timeval atv;
1218 		if (IS_64BIT_PROCESS(p)) {
1219 			struct user64_timeval atv64;
1220 			err = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
1221 			/* Loses resolution - assume timeout < 68 years */
1222 			atv.tv_sec = (__darwin_time_t)atv64.tv_sec;
1223 			atv.tv_usec = atv64.tv_usec;
1224 		} else {
1225 			struct user32_timeval atv32;
1226 			err = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
1227 			atv.tv_sec = atv32.tv_sec;
1228 			atv.tv_usec = atv32.tv_usec;
1229 		}
1230 		if (err) {
1231 			return err;
1232 		}
1233 
1234 		if (itimerfix(&atv)) {
1235 			err = EINVAL;
1236 			return err;
1237 		}
1238 
1239 		clock_absolutetime_interval_to_deadline(tvtoabstime(&atv), &timeout);
1240 	}
1241 
1242 	return select_internal(p, uap, timeout, retval);
1243 }
1244 
int
pselect(struct proc *p, struct pselect_args *uap, int32_t *retval)
{
	/* Cancellation point: check for pending pthread cancellation first,
	 * then forward to the non-cancellable implementation. */
	__pthread_testcancel(1);
	return pselect_nocancel(p, (struct pselect_nocancel_args *)uap, retval);
}
1251 
/*
 * Non-cancellable pselect(): like select_nocancel() but takes a timespec
 * and an optional signal mask. The mask is installed for the duration of
 * the wait and restored afterwards (either here for the direct-return
 * path, or in selprocess() for the continuation path).
 */
int
pselect_nocancel(struct proc *p, struct pselect_nocancel_args *uap, int32_t *retval)
{
	int err;
	struct uthread *ut;
	uint64_t timeout = 0;

	if (uap->ts) {
		struct timespec ts;

		/* Unpack the timespec per the caller's ABI (32- vs 64-bit). */
		if (IS_64BIT_PROCESS(p)) {
			struct user64_timespec ts64;
			err = copyin(uap->ts, (caddr_t)&ts64, sizeof(ts64));
			ts.tv_sec = (__darwin_time_t)ts64.tv_sec;
			ts.tv_nsec = (long)ts64.tv_nsec;
		} else {
			struct user32_timespec ts32;
			err = copyin(uap->ts, (caddr_t)&ts32, sizeof(ts32));
			ts.tv_sec = ts32.tv_sec;
			ts.tv_nsec = ts32.tv_nsec;
		}
		if (err) {
			return err;
		}

		if (!timespec_is_valid(&ts)) {
			return EINVAL;
		}
		clock_absolutetime_interval_to_deadline(tstoabstime(&ts), &timeout);
	}

	ut = current_uthread();

	if (uap->mask != USER_ADDR_NULL) {
		/* save current mask, then copyin and set new mask */
		sigset_t newset;
		err = copyin(uap->mask, &newset, sizeof(sigset_t));
		if (err) {
			return err;
		}
		/* UT_SAS_OLDMASK tells the signal-delivery and continuation paths
		 * that uu_oldmask must be restored. */
		ut->uu_oldmask = ut->uu_sigmask;
		ut->uu_flag |= UT_SAS_OLDMASK;
		/* Never allow blocking of unblockable signals (SIGKILL/SIGSTOP). */
		ut->uu_sigmask = (newset & ~sigcantmask);
	}

	err = select_internal(p, (struct select_nocancel_args *)uap, timeout, retval);

	if (err != EINTR && ut->uu_flag & UT_SAS_OLDMASK) {
		/*
		 * Restore old mask (direct return case). NOTE: EINTR can also be returned
		 * if the thread is cancelled. In that case, we don't reset the signal
		 * mask to its original value (which usually happens in the signal
		 * delivery path). This behavior is permitted by POSIX.
		 */
		ut->uu_sigmask = ut->uu_oldmask;
		ut->uu_oldmask = 0;
		ut->uu_flag &= ~UT_SAS_OLDMASK;
	}

	return err;
}
1313 
1314 void
select_cleanup_uthread(struct _select * sel)1315 select_cleanup_uthread(struct _select *sel)
1316 {
1317 	kfree_data(sel->ibits, 2 * sel->nbytes);
1318 	sel->ibits = sel->obits = NULL;
1319 	sel->nbytes = 0;
1320 }
1321 
1322 static int
select_grow_uthread_cache(struct _select * sel,uint32_t nbytes)1323 select_grow_uthread_cache(struct _select *sel, uint32_t nbytes)
1324 {
1325 	uint32_t *buf;
1326 
1327 	buf = kalloc_data(2 * nbytes, Z_WAITOK | Z_ZERO);
1328 	if (buf) {
1329 		select_cleanup_uthread(sel);
1330 		sel->ibits = buf;
1331 		sel->obits = buf + nbytes / sizeof(uint32_t);
1332 		sel->nbytes = nbytes;
1333 		return true;
1334 	}
1335 	return false;
1336 }
1337 
/* Zero both halves (ibits and obits) of the cached select bit vectors. */
static void
select_bzero_uthread_cache(struct _select *sel)
{
	bzero(sel->ibits, sel->nbytes * 2);
}
1343 
1344 /*
1345  * Generic implementation of {,p}select. Care: we type-pun uap across the two
1346  * syscalls, which differ slightly. The first 4 arguments (nfds and the fd sets)
1347  * are identical. The 5th (timeout) argument points to different types, so we
1348  * unpack in the syscall-specific code, but the generic code still does a null
1349  * check on this argument to determine if a timeout was specified.
1350  */
static int
select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeout, int32_t *retval)
{
	int error = 0;
	u_int ni, nw;
	thread_t th_act;
	struct uthread  *uth;
	struct _select *sel;
	struct _select_data *seldata;
	int count = 0;
	size_t sz = 0;

	th_act = current_thread();
	uth = get_bsdthread_info(th_act);
	sel = &uth->uu_select;
	seldata = &uth->uu_save.uus_select_data;
	*retval = 0;

	/*
	 * Stash the arguments in the uthread save area: selprocess() may run
	 * as a tsleep1() continuation after this stack frame is gone.
	 */
	seldata->args = uap;
	seldata->retval = retval;
	seldata->wqp = NULL;
	seldata->count = 0;

	if (uap->nd < 0) {
		return EINVAL;
	}

	if (uap->nd > p->p_fd.fd_nfiles) {
		uap->nd = p->p_fd.fd_nfiles; /* forgiving; slightly wrong */
	}
	nw = howmany(uap->nd, NFDBITS);
	ni = nw * sizeof(fd_mask);

	/*
	 * if the previously allocated space for the bits is smaller than
	 * what is requested or no space has yet been allocated for this
	 * thread, allocate enough space now.
	 *
	 * Note: If this process fails, select() will return EAGAIN; this
	 * is the same thing poll() returns in a no-memory situation, but
	 * it is not a POSIX compliant error code for select().
	 */
	if (sel->nbytes < (3 * ni)) {
		/* 3 * ni: one fd_set-sized region each for in/ou/ex. */
		if (!select_grow_uthread_cache(sel, 3 * ni)) {
			return EAGAIN;
		}
	} else {
		select_bzero_uthread_cache(sel);
	}

	/*
	 * get the bits from the user address space
	 */
#define getbits(name, x) \
	do { \
	        if (uap->name && (error = copyin(uap->name, \
	                (caddr_t)&sel->ibits[(x) * nw], ni))) \
	                goto continuation; \
	} while (0)

	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef  getbits

	seldata->abstime = timeout;

	/* Count how many fds are actually set (takes refs on their fileprocs). */
	if ((error = selcount(p, sel->ibits, uap->nd, &count))) {
		goto continuation;
	}

	/*
	 * We need an array of waitq pointers. This is due to the new way
	 * in which waitqs are linked to sets. When a thread selects on a
	 * file descriptor, a waitq (embedded in a selinfo structure) is
	 * added to the thread's local waitq set. There is no longer any
	 * way to directly iterate over all members of a given waitq set.
	 * The process of linking a waitq into a set may allocate a link
	 * table object. Because we can't iterate over all the waitqs to
	 * which our thread waitq set belongs, we need a way of removing
	 * this link object!
	 *
	 * Thus we need a buffer which will hold one waitq pointer
	 * per FD being selected. During the tear-down phase we can use
	 * these pointers to dis-associate the underlying selinfo's waitq
	 * from our thread's waitq set.
	 *
	 * Because we also need to allocate a waitq set for this thread,
	 * we use a bare buffer pointer to hold all the memory. Note that
	 * this memory is cached in the thread pointer and not reaped until
	 * the thread exits. This is generally OK because threads that
	 * call select tend to keep calling select repeatedly.
	 */
	sz = ALIGN(sizeof(struct waitq_set)) + (count * sizeof(uint64_t));
	if (sz > uth->uu_wqstate_sz) {
		/* (re)allocate a buffer to hold waitq pointers */
		if (uth->uu_wqset) {
			if (waitq_set_is_valid(uth->uu_wqset)) {
				waitq_set_deinit(uth->uu_wqset);
			}
			kheap_free(KM_SELECT, uth->uu_wqset, uth->uu_wqstate_sz);
		} else if (uth->uu_wqstate_sz && !uth->uu_wqset) {
			panic("select: thread structure corrupt! "
			    "uu_wqstate_sz:%ld, wqstate_buf == NULL",
			    uth->uu_wqstate_sz);
		}
		uth->uu_wqstate_sz = sz;
		uth->uu_wqset = kheap_alloc(KM_SELECT, sz, Z_WAITOK);
		if (!uth->uu_wqset) {
			panic("can't allocate %ld bytes for wqstate buffer",
			    uth->uu_wqstate_sz);
		}
		waitq_set_init(uth->uu_wqset, SYNC_POLICY_FIFO);
	}

	if (!waitq_set_is_valid(uth->uu_wqset)) {
		waitq_set_init(uth->uu_wqset, SYNC_POLICY_FIFO);
	}

	/* the last chunk of our buffer is an array of waitq pointers */
	seldata->wqp = (uint64_t *)((char *)(uth->uu_wqset) + ALIGN(sizeof(struct waitq_set)));
	bzero(seldata->wqp, sz - ALIGN(sizeof(struct waitq_set)));

	seldata->count = count;

continuation:

	if (error) {
		/*
		 * We have already cleaned up any state we established,
		 * either locally or as a result of selcount().  We don't
		 * need to wait_subqueue_unlink_all(), since we haven't set
		 * anything at this point.
		 */
		return error;
	}

	return selprocess(0, SEL_FIRSTPASS);
}
1490 
/* tsleep1() continuation: re-enter the select state machine on wakeup. */
int
selcontinue(int error)
{
	return selprocess(error, SEL_SECONDPASS);
}
1496 
1497 
1498 /*
1499  * selprocess
1500  *
1501  * Parameters:	error			The error code from our caller
1502  *		sel_pass		The pass we are on
1503  */
int
selprocess(int error, int sel_pass)
{
	u_int ni, nw;
	thread_t th_act;
	struct uthread  *uth;
	struct proc *p;
	struct select_nocancel_args *uap;
	int *retval;
	struct _select *sel;
	struct _select_data *seldata;
	int unwind = 1;
	int prepost = 0;
	int somewakeup = 0;
	int doretry = 0;
	wait_result_t wait_result;

	/* All select state lives in the uthread: this may be running as a
	 * continuation with the original syscall stack frame gone. */
	p = current_proc();
	th_act = current_thread();
	uth = get_bsdthread_info(th_act);
	sel = &uth->uu_select;
	seldata = &uth->uu_save.uus_select_data;
	uap = seldata->args;
	retval = seldata->retval;

	/* Nothing was linked on the first pass if it errored out early, and
	 * nothing is ever linked for a pure-timeout select. */
	if ((error != 0) && (sel_pass == SEL_FIRSTPASS)) {
		unwind = 0;
	}
	if (seldata->count == 0) {
		unwind = 0;
	}
retry:
	if (error != 0) {
		goto done;
	}

	OSBitOrAtomic(P_SELECT, &p->p_flag);

	/* skip scans if the select is just for timeouts */
	if (seldata->count) {
		error = selscan(p, sel, seldata, uap->nd, retval, sel_pass, uth->uu_wqset);
		if (error || *retval) {
			goto done;
		}
		if (prepost || somewakeup) {
			/*
			 * We were woken up, but the scan found nothing ready:
			 * another consumer may have drained the event first.
			 * Rescan from the first pass if time permits.
			 */
			prepost = 0;
			somewakeup = 0;
			doretry = 1;
		}
	}

	if (uap->tv) {
		uint64_t        now;

		clock_get_uptime(&now);
		if (now >= seldata->abstime) {
			goto done;
		}
	}

	if (doretry) {
		/* cleanup obits and try again */
		doretry = 0;
		sel_pass = SEL_FIRSTPASS;
		goto retry;
	}

	/*
	 * To effect a poll, the timeout argument should be
	 * non-nil, pointing to a zero-valued timeval structure.
	 */
	if (uap->tv && seldata->abstime == 0) {
		goto done;
	}

	/* No spurious wakeups due to collisions; no need to check for them. */
	if ((sel_pass == SEL_SECONDPASS) || ((p->p_flag & P_SELECT) == 0)) {
		sel_pass = SEL_FIRSTPASS;
		goto retry;
	}

	OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);

	/* if the select is just for timeout skip check */
	if (seldata->count && (sel_pass == SEL_SECONDPASS)) {
		panic("selprocess: 2nd pass assertwaiting");
	}

	/* waitq_set has waitqueue as first element */
	wait_result = waitq_assert_wait64_leeway((struct waitq *)uth->uu_wqset,
	    NO_EVENT64, THREAD_ABORTSAFE,
	    TIMEOUT_URGENCY_USER_NORMAL,
	    seldata->abstime,
	    TIMEOUT_NO_LEEWAY);
	if (wait_result != THREAD_AWAKENED) {
		/* there are no preposted events */
		error = tsleep1(NULL, PSOCK | PCATCH,
		    "select", 0, selcontinue);
	} else {
		prepost = 1;
		error = 0;
	}

	if (error == 0) {
		sel_pass = SEL_SECONDPASS;
		if (!prepost) {
			somewakeup = 1;
		}
		goto retry;
	}
done:
	if (unwind) {
		/* Drop the per-fd refs taken by selcount() and tear down the
		 * thread's waitq set. */
		seldrop(p, sel->ibits, uap->nd, seldata->count);
		waitq_set_deinit(uth->uu_wqset);
		/*
		 * zero out the waitq pointer array to avoid use-after free
		 * errors in the selcount error path (seldrop_locked) if/when
		 * the thread re-calls select().
		 */
		bzero((void *)uth->uu_wqset, uth->uu_wqstate_sz);
	}
	OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
	/* select is not restarted after signals... */
	if (error == ERESTART) {
		error = EINTR;
	}
	if (error == EWOULDBLOCK) {
		/* Timeout expiry is reported as success with no fds ready. */
		error = 0;
	}
	nw = howmany(uap->nd, NFDBITS);
	ni = nw * sizeof(fd_mask);

#define putbits(name, x) \
	do { \
	        if (uap->name && (error2 = \
	                copyout((caddr_t)&sel->obits[(x) * nw], uap->name, ni))) \
	                error = error2; \
	} while (0)

	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}

	if (error != EINTR && sel_pass == SEL_SECONDPASS && uth->uu_flag & UT_SAS_OLDMASK) {
		/* restore signal mask - continuation case */
		uth->uu_sigmask = uth->uu_oldmask;
		uth->uu_oldmask = 0;
		uth->uu_flag &= ~UT_SAS_OLDMASK;
	}

	return error;
}
1666 
1667 
1668 /**
1669  * remove the fileproc's underlying waitq from the supplied waitq set;
1670  * clear FP_INSELECT when appropriate
1671  *
1672  * Parameters:
1673  *		fp	File proc that is potentially currently in select
1674  *		wqset	Waitq set to which the fileproc may belong
1675  *			(usually this is the thread's private waitq set)
1676  * Conditions:
1677  *		proc_fdlock is held
1678  */
static void
selunlinkfp(struct fileproc *fp, uint64_t wqp_id, struct waitq_set *wqset)
{
	int valid_set = waitq_set_is_valid(wqset);
	int valid_q = !!wqp_id;  /* 0 means no waitq was linked for this fd */

	/*
	 * This could be called (from selcount error path) before we setup
	 * the thread's wqset. Check the wqset passed in, and only unlink if
	 * the set is valid.
	 */

	/* unlink the underlying waitq from the input set (thread waitq set) */
	if (valid_q && valid_set) {
		waitq_unlink_by_prepost_id(wqp_id, wqset);
	}

	/*
	 * We can always remove the conflict queue from our thread's set: this
	 * will not affect other threads that potentially need to be awoken on
	 * the conflict queue during a fileproc_drain - those sets will still
	 * be linked with the global conflict queue, and the last waiter
	 * on the fp clears the CONFLICT marker.
	 */
	if (valid_set && (fp->fp_flags & FP_SELCONFLICT)) {
		waitq_unlink(&select_conflict_queue, wqset);
	}

	/* Clear FP_INSELECT only if this thread's wqset is the one recorded on
	 * the fp; another thread may concurrently be in select on the same fp. */
	if (valid_set && (fp->fp_flags & FP_INSELECT)) {
		if (fp->fp_guard_attrs) {
			if (fp->fp_guard->fpg_wset == wqset) {
				fp->fp_guard->fpg_wset = NULL;
				fp->fp_flags &= ~FP_INSELECT;
			}
		} else {
			if (fp->fp_wset == wqset) {
				fp->fp_wset = NULL;
				fp->fp_flags &= ~FP_INSELECT;
			}
		}
	}
}
1721 
1722 /**
1723  * connect a fileproc to the given wqset, potentially bridging to a waitq
1724  * pointed to indirectly by wq_data
1725  *
1726  * Parameters:
1727  *		fp	File proc potentially currently in select
1728  *		wq_data	Pointer to a pointer to a waitq (could be NULL)
1729  *		wqset	Waitq set to which the fileproc should now belong
1730  *			(usually this is the thread's private waitq set)
1731  *
1732  * Conditions:
1733  *		proc_fdlock is held
1734  */
static uint64_t
sellinkfp(struct fileproc *fp, void **wq_data, struct waitq_set *wqset)
{
	struct waitq *f_wq = NULL;

	if ((fp->fp_flags & FP_INSELECT) == 0) {
		/* First selector on this fp: record our wqset on it. */
		if (fp->fp_guard_attrs) {
			fp->fp_guard->fpg_wset = wqset;
		} else {
			fp->fp_wset = wqset;
		}
		fp->fp_flags |= FP_INSELECT;
	} else {
		/* Another thread is already selecting on this fp: fall back to
		 * the global conflict queue so both can be woken. */
		fp->fp_flags |= FP_SELCONFLICT;
		waitq_link(&select_conflict_queue, wqset, WAITQ_SHOULD_LOCK, NULL);
	}

	/*
	 * The wq_data parameter has potentially been set by selrecord called
	 * from a subsystems fo_select() function. If the subsystem does not
	 * call selrecord, then wq_data will be NULL
	 *
	 * Use memcpy to get the value into a proper pointer because
	 * wq_data most likely points to a stack variable that could be
	 * unaligned on 32-bit systems.
	 */
	if (wq_data) {
		memcpy(&f_wq, wq_data, sizeof(f_wq));
		if (!waitq_is_valid(f_wq)) {
			f_wq = NULL;
		}
	}

	/* handles NULL f_wq */
	return waitq_get_prepost_id(f_wq);
}
1771 
1772 
1773 /*
1774  * selscan
1775  *
1776  * Parameters:	p			Process performing the select
1777  *		sel			The per-thread select context structure
1778  *		nfd			The number of file descriptors to scan
1779  *		retval			The per thread system call return area
1780  *		sel_pass		Which pass this is; allowed values are
1781  *						SEL_FIRSTPASS and SEL_SECONDPASS
1782  *		wqset			The per thread wait queue set
1783  *
1784  * Returns:	0			Success
1785  *		EIO			Invalid p->p_fd field XXX Obsolete?
1786  *		EBADF			One of the files in the bit vector is
1787  *						invalid.
1788  */
1789 static int
selscan(struct proc * p,struct _select * sel,struct _select_data * seldata,int nfd,int32_t * retval,int sel_pass,struct waitq_set * wqset)1790 selscan(struct proc *p, struct _select *sel, struct _select_data * seldata,
1791     int nfd, int32_t *retval, int sel_pass, struct waitq_set *wqset)
1792 {
1793 	int msk, i, j, fd;
1794 	u_int32_t bits;
1795 	struct fileproc *fp;
1796 	int n = 0;              /* count of bits */
1797 	int nc = 0;             /* bit vector offset (nc'th bit) */
1798 	static int flag[3] = { FREAD, FWRITE, 0 };
1799 	u_int32_t *iptr, *optr;
1800 	u_int nw;
1801 	u_int32_t *ibits, *obits;
1802 	waitq_ref_t reserved_link, *rl_ptr = NULL;
1803 	int count;
1804 	struct vfs_context context = *vfs_context_current();
1805 
1806 	ibits = sel->ibits;
1807 	obits = sel->obits;
1808 
1809 	nw = howmany(nfd, NFDBITS);
1810 
1811 	count = seldata->count;
1812 
1813 	nc = 0;
1814 	if (!count) {
1815 		*retval = 0;
1816 		return 0;
1817 	}
1818 
1819 	if (sel_pass == SEL_FIRSTPASS) {
1820 		/*
1821 		 * Make sure the waitq-set is all clean:
1822 		 *
1823 		 * select loops until it finds at least one event, however it
1824 		 * doesn't mean that the event that woke up select is still
1825 		 * fired by the time the second pass runs, and then
1826 		 * select_internal will loop back to a first pass.
1827 		 */
1828 		waitq_set_reset_anon_prepost(wqset);
1829 	}
1830 
1831 	proc_fdlock(p);
1832 	for (msk = 0; msk < 3; msk++) {
1833 		iptr = (u_int32_t *)&ibits[msk * nw];
1834 		optr = (u_int32_t *)&obits[msk * nw];
1835 
1836 		for (i = 0; i < nfd; i += NFDBITS) {
1837 			bits = iptr[i / NFDBITS];
1838 
1839 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1840 				bits &= ~(1U << j);
1841 
1842 				fp = fp_get_noref_locked(p, fd);
1843 				if (fp == NULL) {
1844 					/*
1845 					 * If we abort because of a bad
1846 					 * fd, let the caller unwind...
1847 					 */
1848 					proc_fdunlock(p);
1849 					return EBADF;
1850 				}
1851 				if (sel_pass == SEL_SECONDPASS) {
1852 					reserved_link = WAITQ_REF_NULL;
1853 					rl_ptr = NULL;
1854 					selunlinkfp(fp, seldata->wqp[nc], wqset);
1855 				} else {
1856 					reserved_link = waitq_link_reserve();
1857 					rl_ptr = &reserved_link;
1858 					waitq_set_lazy_init_link(wqset);
1859 				}
1860 
1861 				context.vc_ucred = fp->f_cred;
1862 
1863 				/*
1864 				 * stash this value b/c fo_select may replace
1865 				 * reserved_link with a pointer to a waitq object
1866 				 */
1867 				waitq_ref_t rsvd = reserved_link;
1868 
1869 				/* The select; set the bit, if true */
1870 				if (fp->f_ops && fp->f_type
1871 				    && fo_select(fp, flag[msk], rl_ptr, &context)) {
1872 					optr[fd / NFDBITS] |= (1U << (fd % NFDBITS));
1873 					n++;
1874 				}
1875 				if (sel_pass == SEL_FIRSTPASS) {
1876 					/*
1877 					 * If the fp's supporting selinfo structure was linked
1878 					 * to this thread's waitq set, then 'reserved_link'
1879 					 * will have been updated by selrecord to be a pointer
1880 					 * to the selinfo's waitq.
1881 					 */
1882 					if (reserved_link.wqr_value == rsvd.wqr_value) {
1883 						waitq_link_release(reserved_link);
1884 						rl_ptr = NULL; /* fo_select never called selrecord() */
1885 					}
1886 
1887 					/*
1888 					 * Hook up the thread's waitq set either to
1889 					 * the fileproc structure, or to the global
1890 					 * conflict queue: but only on the first
1891 					 * select pass.
1892 					 */
1893 					seldata->wqp[nc] = sellinkfp(fp, (void **)rl_ptr, wqset);
1894 				}
1895 				nc++;
1896 			}
1897 		}
1898 	}
1899 	proc_fdunlock(p);
1900 
1901 	*retval = n;
1902 	return 0;
1903 }
1904 
1905 static int poll_callback(struct kevent_qos_s *, kevent_ctx_t);
1906 
int
poll(struct proc *p, struct poll_args *uap, int32_t *retval)
{
	/* Cancellation point: check for pending pthread cancellation first,
	 * then forward to the non-cancellable implementation. */
	__pthread_testcancel(1);
	return poll_nocancel(p, (struct poll_nocancel_args *)uap, retval);
}
1913 
1914 
/*
 * Non-cancellable poll(): implemented on top of kqueue. Each pollfd entry
 * is translated into one or more one-shot EV_POLL kevents, a single
 * kqueue_scan() waits for them (honoring uap->timeout in milliseconds),
 * and poll_callback() converts fired kevents back into revents bits.
 */
int
poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
{
	struct pollfd *fds = NULL;
	struct kqueue *kq = NULL;
	int error = 0;
	u_int nfds = uap->nfds;
	u_int rfds = 0;  /* count of entries with non-zero revents */
	rlim_t nofile = proc_limitgetcur(p, RLIMIT_NOFILE);
	size_t ni = nfds * sizeof(struct pollfd);

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if (nfds > OPEN_MAX ||
	    (nfds > nofile && (proc_suser(p) || nfds > FD_SETSIZE))) {
		return EINVAL;
	}

	kq = kqueue_alloc(p);
	if (kq == NULL) {
		return EAGAIN;
	}

	if (nfds) {
		fds = (struct pollfd *)kalloc_data(ni, Z_WAITOK);
		if (NULL == fds) {
			error = EAGAIN;
			goto out;
		}

		error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd));
		if (error) {
			goto out;
		}
	}

	/* JMM - all this P_SELECT stuff is bogus */
	OSBitOrAtomic(P_SELECT, &p->p_flag);
	for (u_int i = 0; i < nfds; i++) {
		short events = fds[i].events;
		__assert_only int rc;

		/* per spec, ignore fd values below zero */
		if (fds[i].fd < 0) {
			fds[i].revents = 0;
			continue;
		}

		/* convert the poll event into a kqueue kevent */
		struct kevent_qos_s kev = {
			.ident = fds[i].fd,
			.flags = EV_ADD | EV_ONESHOT | EV_POLL,
			/* udata carries the pollfd pointer so poll_callback()
			 * can find the entry to update */
			.udata = CAST_USER_ADDR_T(&fds[i])
		};

		/* Handle input events */
		if (events & (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND | POLLHUP)) {
			kev.filter = EVFILT_READ;
			if (events & (POLLPRI | POLLRDBAND)) {
				kev.flags |= EV_OOBAND;
			}
			rc = kevent_register(kq, &kev, NULL);
			assert((rc & FILTER_REGISTER_WAIT) == 0);
		}

		/* Handle output events */
		if ((kev.flags & EV_ERROR) == 0 &&
		    (events & (POLLOUT | POLLWRNORM | POLLWRBAND))) {
			kev.filter = EVFILT_WRITE;
			rc = kevent_register(kq, &kev, NULL);
			assert((rc & FILTER_REGISTER_WAIT) == 0);
		}

		/* Handle BSD extension vnode events */
		if ((kev.flags & EV_ERROR) == 0 &&
		    (events & (POLLEXTEND | POLLATTRIB | POLLNLINK | POLLWRITE))) {
			kev.filter = EVFILT_VNODE;
			kev.fflags = 0;
			if (events & POLLEXTEND) {
				kev.fflags |= NOTE_EXTEND;
			}
			if (events & POLLATTRIB) {
				kev.fflags |= NOTE_ATTRIB;
			}
			if (events & POLLNLINK) {
				kev.fflags |= NOTE_LINK;
			}
			if (events & POLLWRITE) {
				kev.fflags |= NOTE_WRITE;
			}
			rc = kevent_register(kq, &kev, NULL);
			assert((rc & FILTER_REGISTER_WAIT) == 0);
		}

		/* Registration failure is reported per-entry, not as an error. */
		if (kev.flags & EV_ERROR) {
			fds[i].revents = POLLNVAL;
			rfds++;
		} else {
			fds[i].revents = 0;
		}
	}

	/*
	 * Did we have any trouble registering?
	 * If user space passed 0 FDs, then respect any timeout value passed.
	 * This is an extremely inefficient sleep. If user space passed one or
	 * more FDs, and we had trouble registering _all_ of them, then bail
	 * out. If a subset of the provided FDs failed to register, then we
	 * will still call the kqueue_scan function.
	 */
	if (nfds && (rfds == nfds)) {
		goto done;
	}

	/* scan for, and possibly wait for, the kevents to trigger */
	kevent_ctx_t kectx = kevent_get_context(current_thread());
	*kectx = (struct kevent_ctx_s){
		.kec_process_noutputs = rfds,
		.kec_process_flags    = KEVENT_FLAG_POLL,
		.kec_deadline         = 0, /* wait forever */
	};

	/*
	 * If any events have trouble registering, an event has fired and we
	 * shouldn't wait for events in kqueue_scan.
	 */
	if (rfds) {
		kectx->kec_process_flags |= KEVENT_FLAG_IMMEDIATE;
	} else if (uap->timeout != -1) {
		/* timeout is in milliseconds; -1 means wait forever */
		clock_interval_to_deadline(uap->timeout, NSEC_PER_MSEC,
		    &kectx->kec_deadline);
	}

	error = kqueue_scan(kq, kectx->kec_process_flags, kectx, poll_callback);
	rfds = kectx->kec_process_noutputs;

done:
	OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
	/* poll is not restarted after signals... */
	if (error == ERESTART) {
		error = EINTR;
	}
	if (error == 0) {
		error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd));
		*retval = rfds;
	}

out:
	kfree_data(fds, ni);

	kqueue_dealloc(kq);
	return error;
}
2073 
2074 static int
poll_callback(struct kevent_qos_s * kevp,kevent_ctx_t kectx)2075 poll_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
2076 {
2077 	struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata);
2078 	short prev_revents = fds->revents;
2079 	short mask = 0;
2080 
2081 	/* convert the results back into revents */
2082 	if (kevp->flags & EV_EOF) {
2083 		fds->revents |= POLLHUP;
2084 	}
2085 	if (kevp->flags & EV_ERROR) {
2086 		fds->revents |= POLLERR;
2087 	}
2088 
2089 	switch (kevp->filter) {
2090 	case EVFILT_READ:
2091 		if (fds->revents & POLLHUP) {
2092 			mask = (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND);
2093 		} else {
2094 			mask = (POLLIN | POLLRDNORM);
2095 			if (kevp->flags & EV_OOBAND) {
2096 				mask |= (POLLPRI | POLLRDBAND);
2097 			}
2098 		}
2099 		fds->revents |= (fds->events & mask);
2100 		break;
2101 
2102 	case EVFILT_WRITE:
2103 		if (!(fds->revents & POLLHUP)) {
2104 			fds->revents |= (fds->events & (POLLOUT | POLLWRNORM | POLLWRBAND));
2105 		}
2106 		break;
2107 
2108 	case EVFILT_VNODE:
2109 		if (kevp->fflags & NOTE_EXTEND) {
2110 			fds->revents |= (fds->events & POLLEXTEND);
2111 		}
2112 		if (kevp->fflags & NOTE_ATTRIB) {
2113 			fds->revents |= (fds->events & POLLATTRIB);
2114 		}
2115 		if (kevp->fflags & NOTE_LINK) {
2116 			fds->revents |= (fds->events & POLLNLINK);
2117 		}
2118 		if (kevp->fflags & NOTE_WRITE) {
2119 			fds->revents |= (fds->events & POLLWRITE);
2120 		}
2121 		break;
2122 	}
2123 
2124 	if (fds->revents != 0 && prev_revents == 0) {
2125 		kectx->kec_process_noutputs++;
2126 	}
2127 
2128 	return 0;
2129 }
2130 
int
seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p)
{
	/*
	 * Stub select/poll routine for devices that are always ready:
	 * unconditionally report the device as selectable.
	 */
	return 1;
}
2136 
2137 /*
2138  * selcount
2139  *
2140  * Count the number of bits set in the input bit vector, and establish an
2141  * outstanding fp->fp_iocount for each of the descriptors which will be in
2142  * use in the select operation.
2143  *
2144  * Parameters:	p			The process doing the select
2145  *		ibits			The input bit vector
2146  *		nfd			The number of fd's in the vector
2147  *		countp			Pointer to where to store the bit count
2148  *
2149  * Returns:	0			Success
2150  *		EIO			Bad per process open file table
2151  *		EBADF			One of the bits in the input bit vector
2152  *						references an invalid fd
2153  *
2154  * Implicit:	*countp (modified)	Count of fd's
2155  *
2156  * Notes:	This function is the first pass under the proc_fdlock() that
2157  *		permits us to recognize invalid descriptors in the bit vector;
2158  *		the may, however, not remain valid through the drop and
2159  *		later reacquisition of the proc_fdlock().
2160  */
static int
selcount(struct proc *p, u_int32_t *ibits, int nfd, int *countp)
{
	int msk, i, j, fd;
	u_int32_t bits;
	struct fileproc *fp;
	int n = 0;              /* descriptors successfully retained so far */
	u_int32_t *iptr;
	u_int nw;               /* 32-bit words per fd_set */
	int error = 0;
	int need_wakeup = 0;

	nw = howmany(nfd, NFDBITS);

	proc_fdlock(p);
	/* Walk the read, write, and except sets in turn. */
	for (msk = 0; msk < 3; msk++) {
		iptr = (u_int32_t *)&ibits[msk * nw];
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = iptr[i / NFDBITS];
			/* Visit each set bit; ffs() is 1-based, hence --j. */
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1U << j);

				fp = fp_get_noref_locked(p, fd);
				if (fp == NULL) {
					/* Invalid descriptor: report none counted. */
					*countp = 0;
					error = EBADF;
					goto bad;
				}
				/* Hold an I/O reference for the duration of the select. */
				os_ref_retain_locked(&fp->fp_iocount);
				n++;
			}
		}
	}
	proc_fdunlock(p);

	*countp = n;
	return 0;

bad:
	/* Nothing retained yet: nothing to undo. */
	if (n == 0) {
		goto out;
	}
	/* Ignore error return; it's already EBADF */
	(void)seldrop_locked(p, ibits, nfd, n, &need_wakeup);

out:
	proc_fdunlock(p);
	/* Wake any thread draining iocounts, outside the fd lock. */
	if (need_wakeup) {
		wakeup(&p->p_fd.fd_fpdrainwait);
	}
	return error;
}
2213 
2214 
2215 /*
2216  * seldrop_locked
2217  *
2218  * Drop outstanding wait queue references set up during selscan(); drop the
2219  * outstanding per fileproc fp_iocount picked up during the selcount().
2220  *
2221  * Parameters:	p			Process performing the select
2222  *		ibits			Input bit bector of fd's
2223  *		nfd			Number of fd's
2224  *		lim			Limit to number of vector entries to
2225  *						consider, or -1 for "all"
2226  *		inselect		True if
2227  *		need_wakeup		Pointer to flag to set to do a wakeup
2228  *					if f_iocont on any descriptor goes to 0
2229  *
2230  * Returns:	0			Success
2231  *		EBADF			One or more fds in the bit vector
2232  *						were invalid, but the rest
2233  *						were successfully dropped
2234  *
2235  * Notes:	An fd make become bad while the proc_fdlock() is not held,
2236  *		if a multithreaded application closes the fd out from under
2237  *		the in progress select.  In this case, we still have to
2238  *		clean up after the set up on the remaining fds.
2239  */
static int
seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup)
{
	int msk, i, j, nc, fd;
	u_int32_t bits;
	struct fileproc *fp;
	u_int32_t *iptr;
	u_int nw;               /* 32-bit words per fd_set */
	int error = 0;
	uthread_t uth = current_uthread();
	struct _select_data *seldata;

	*need_wakeup = 0;

	nw = howmany(nfd, NFDBITS);
	seldata = &uth->uu_save.uus_select_data;

	nc = 0;
	/* Walk the sets in the same order selcount()/selscan() did. */
	for (msk = 0; msk < 3; msk++) {
		iptr = (u_int32_t *)&ibits[msk * nw];
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = iptr[i / NFDBITS];
			/* Visit each set bit; ffs() is 1-based, hence --j. */
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1U << j);
				/*
				 * If we've already dropped as many as were
				 * counted/scanned, then we are done.
				 */
				if (nc >= lim) {
					goto done;
				}

				/*
				 * We took an I/O reference in selcount,
				 * so the fp can't possibly be NULL.
				 */
				fp = fp_get_noref_locked_with_iocount(p, fd);
				selunlinkfp(fp,
				    seldata->wqp ? seldata->wqp[nc] : 0,
				    uth->uu_wqset);

				nc++;

				/* Drop the iocount taken in selcount(). */
				const os_ref_count_t refc = os_ref_release_locked(&fp->fp_iocount);
				if (0 == refc) {
					panic("fp_iocount overdecrement!");
				}

				if (1 == refc) {
					/*
					 * The last iocount is responsible for clearing
					 * selconfict flag - even if we didn't set it -
					 * and is also responsible for waking up anyone
					 * waiting on iocounts to drain.
					 */
					if (fp->fp_flags & FP_SELCONFLICT) {
						fp->fp_flags &= ~FP_SELCONFLICT;
					}
					if (p->p_fd.fd_fpdrainwait) {
						p->p_fd.fd_fpdrainwait = 0;
						*need_wakeup = 1;
					}
				}
			}
		}
	}
done:
	return error;
}
2309 
2310 
2311 static int
seldrop(struct proc * p,u_int32_t * ibits,int nfd,int lim)2312 seldrop(struct proc *p, u_int32_t *ibits, int nfd, int lim)
2313 {
2314 	int error;
2315 	int need_wakeup = 0;
2316 
2317 	proc_fdlock(p);
2318 	error = seldrop_locked(p, ibits, nfd, lim, &need_wakeup);
2319 	proc_fdunlock(p);
2320 	if (need_wakeup) {
2321 		wakeup(&p->p_fd.fd_fpdrainwait);
2322 	}
2323 	return error;
2324 }
2325 
2326 /*
2327  * Record a select request.
2328  */
void
selrecord(__unused struct proc *selector, struct selinfo *sip, void *s_data)
{
	/*
	 * Link the current thread's select set to the waitq embedded in
	 * this selinfo, so a later selwakeup() can find and wake it.
	 */
	struct uthread * ut = current_uthread();

	/* need to look at collisions */

	/* do not record if this is second pass of select */
	if (!s_data) {
		return;
	}

	/* Lazily initialize the selinfo's waitq on first use. */
	if ((sip->si_flags & SI_INITED) == 0) {
		waitq_init(&sip->si_waitq, SYNC_POLICY_FIFO);
		sip->si_flags |= SI_INITED;
	}

	if (ut->uu_wqset == SELSPEC_RECORD_MARKER) {
		/* kevent-driven path: invoke the record hook instead of linking. */
		((selspec_record_hook_t)s_data)(sip);
	} else {
		/* on input, s_data points to the 64-bit ID of a reserved link object */
		waitq_ref_t *reserved_link = (waitq_ref_t *)s_data;

		sip->si_flags |= SI_RECORDED;

		/* note: this checks for pre-existing linkage */
		waitq_link(&sip->si_waitq, ut->uu_wqset,
		    WAITQ_SHOULD_LOCK, reserved_link);

		/*
		 * Always consume the reserved link.
		 * We can always call waitq_link_release() safely because if
		 * waitq_link is successful, it consumes the link and resets the
		 * value to 0, in which case our call to release becomes a no-op.
		 * If waitq_link fails, then the following release call will actually
		 * release the reserved link object.
		 */
		waitq_link_release(*reserved_link);
		*reserved_link = WAITQ_REF_NULL;

		/*
		 * Use the s_data pointer as an output parameter as well
		 * This avoids changing the prototype for this function which is
		 * used by many kexts. We need to surface the waitq object
		 * associated with the selinfo we just added to the thread's select
		 * set. New waitq sets do not have back-pointers to set members, so
		 * the only way to clear out set linkage objects is to go from the
		 * waitq to the set.
		 */
		*(void **)s_data = &sip->si_waitq;
	}
}
2381 
static void
selwakeup_internal(struct selinfo *sip, long hint, wait_result_t wr)
{
	/* Never touched by selrecord(): nothing to wake or tear down. */
	if ((sip->si_flags & SI_INITED) == 0) {
		return;
	}

	/* Wake every thread whose select set is linked to this waitq. */
	if (sip->si_flags & SI_RECORDED) {
		waitq_wakeup64_all(&sip->si_waitq, NO_EVENT64,
		    wr, WAITQ_ALL_PRIORITIES);
		sip->si_flags &= ~SI_RECORDED;
	}

	if (sip->si_flags & SI_SELSPEC) {
		/*
		 * The "primitive" lock is held.
		 * The knote lock is not held.
		 *
		 * All knotes will transition their kn_hook to NULL.
		 */
		lck_spin_lock(&selspec_lock);
		KNOTE(&sip->si_note, hint);
		klist_init(&sip->si_note);
		lck_spin_unlock(&selspec_lock);
		sip->si_flags &= ~SI_SELSPEC;
	}

	if (hint == NOTE_REVOKE) {
		/*
		 * Higher level logic may have a handle on this waitq's
		 * prepost ID, but that's OK because the waitq_deinit
		 * will remove/invalidate the prepost object
		 * (as well as mark the waitq invalid).
		 *
		 * This de-couples us from any callers that may have
		 * a handle to this waitq via the prepost ID.
		 */
		waitq_deinit(&sip->si_waitq);
		sip->si_flags &= ~SI_INITED;
	} else {
		/*
		 * selinfo users might never call selthreadclear()
		 * (for example pipes didn't use to).
		 *
		 * Fortunately, the waitq will always be unhooked
		 * from the select sets cleanly, and when `waitq_unlink`
		 * removes the waitq from the last set it is in,
		 * it clears the prepost, which avoids a leak.
		 *
		 * This is why it is "OK" to have selinfos for which
		 * waitq_deinit() is never called.
		 */
	}
}
2436 
2437 
void
selwakeup(struct selinfo *sip)
{
	/* Plain wakeup: rouse selecting threads without revoking knotes. */
	selwakeup_internal(sip, 0, THREAD_AWAKENED);
}
2443 
void
selthreadclear(struct selinfo *sip)
{
	/*
	 * Tear-down wakeup: waiters are kicked with THREAD_RESTART, and
	 * NOTE_REVOKE makes selwakeup_internal deinit the embedded waitq.
	 */
	selwakeup_internal(sip, NOTE_REVOKE, THREAD_RESTART);
}
2449 
2450 
2451 /*
2452  * gethostuuid
2453  *
2454  * Description:	Get the host UUID from IOKit and return it to user space.
2455  *
2456  * Parameters:	uuid_buf		Pointer to buffer to receive UUID
2457  *		timeout			Timespec for timout
2458  *
2459  * Returns:	0			Success
2460  *		EWOULDBLOCK		Timeout is too short
2461  *		copyout:EFAULT		Bad user buffer
2462  *		mac_system_check_info:EPERM		Client not allowed to perform this operation
2463  *
2464  * Notes:	A timeout seems redundant, since if it's tolerable to not
2465  *		have a system UUID in hand, then why ask for one?
2466  */
2467 int
gethostuuid(struct proc * p,struct gethostuuid_args * uap,__unused int32_t * retval)2468 gethostuuid(struct proc *p, struct gethostuuid_args *uap, __unused int32_t *retval)
2469 {
2470 	kern_return_t kret;
2471 	int error;
2472 	mach_timespec_t mach_ts;        /* for IOKit call */
2473 	__darwin_uuid_t uuid_kern = {}; /* for IOKit call */
2474 
2475 	/* Check entitlement */
2476 	if (!IOCurrentTaskHasEntitlement("com.apple.private.getprivatesysid")) {
2477 #if !defined(XNU_TARGET_OS_OSX)
2478 #if CONFIG_MACF
2479 		if ((error = mac_system_check_info(kauth_cred_get(), "hw.uuid")) != 0) {
2480 			/* EPERM invokes userspace upcall if present */
2481 			return error;
2482 		}
2483 #endif
2484 #endif
2485 	}
2486 
2487 	/* Convert the 32/64 bit timespec into a mach_timespec_t */
2488 	if (proc_is64bit(p)) {
2489 		struct user64_timespec ts;
2490 		error = copyin(uap->timeoutp, &ts, sizeof(ts));
2491 		if (error) {
2492 			return error;
2493 		}
2494 		mach_ts.tv_sec = (unsigned int)ts.tv_sec;
2495 		mach_ts.tv_nsec = (clock_res_t)ts.tv_nsec;
2496 	} else {
2497 		struct user32_timespec ts;
2498 		error = copyin(uap->timeoutp, &ts, sizeof(ts));
2499 		if (error) {
2500 			return error;
2501 		}
2502 		mach_ts.tv_sec = ts.tv_sec;
2503 		mach_ts.tv_nsec = ts.tv_nsec;
2504 	}
2505 
2506 	/* Call IOKit with the stack buffer to get the UUID */
2507 	kret = IOBSDGetPlatformUUID(uuid_kern, mach_ts);
2508 
2509 	/*
2510 	 * If we get it, copy out the data to the user buffer; note that a
2511 	 * uuid_t is an array of characters, so this is size invariant for
2512 	 * 32 vs. 64 bit.
2513 	 */
2514 	if (kret == KERN_SUCCESS) {
2515 		error = copyout(uuid_kern, uap->uuid_buf, sizeof(uuid_kern));
2516 	} else {
2517 		error = EWOULDBLOCK;
2518 	}
2519 
2520 	return error;
2521 }
2522 
2523 /*
2524  * ledger
2525  *
2526  * Description:	Omnibus system call for ledger operations
2527  */
2528 int
ledger(struct proc * p,struct ledger_args * args,__unused int32_t * retval)2529 ledger(struct proc *p, struct ledger_args *args, __unused int32_t *retval)
2530 {
2531 #if !CONFIG_MACF
2532 #pragma unused(p)
2533 #endif
2534 	int rval, pid, len, error;
2535 #ifdef LEDGER_DEBUG
2536 	struct ledger_limit_args lla;
2537 #endif
2538 	task_t task;
2539 	proc_t proc;
2540 
2541 	/* Finish copying in the necessary args before taking the proc lock */
2542 	error = 0;
2543 	len = 0;
2544 	if (args->cmd == LEDGER_ENTRY_INFO) {
2545 		error = copyin(args->arg3, (char *)&len, sizeof(len));
2546 	} else if (args->cmd == LEDGER_TEMPLATE_INFO) {
2547 		error = copyin(args->arg2, (char *)&len, sizeof(len));
2548 	} else if (args->cmd == LEDGER_LIMIT)
2549 #ifdef LEDGER_DEBUG
2550 	{ error = copyin(args->arg2, (char *)&lla, sizeof(lla));}
2551 #else
2552 	{ return EINVAL; }
2553 #endif
2554 	else if ((args->cmd < 0) || (args->cmd > LEDGER_MAX_CMD)) {
2555 		return EINVAL;
2556 	}
2557 
2558 	if (error) {
2559 		return error;
2560 	}
2561 	if (len < 0) {
2562 		return EINVAL;
2563 	}
2564 
2565 	rval = 0;
2566 	if (args->cmd != LEDGER_TEMPLATE_INFO) {
2567 		pid = (int)args->arg1;
2568 		proc = proc_find(pid);
2569 		if (proc == NULL) {
2570 			return ESRCH;
2571 		}
2572 
2573 #if CONFIG_MACF
2574 		error = mac_proc_check_ledger(p, proc, args->cmd);
2575 		if (error) {
2576 			proc_rele(proc);
2577 			return error;
2578 		}
2579 #endif
2580 
2581 		task = proc->task;
2582 	}
2583 
2584 	switch (args->cmd) {
2585 #ifdef LEDGER_DEBUG
2586 	case LEDGER_LIMIT: {
2587 		if (!kauth_cred_issuser(kauth_cred_get())) {
2588 			rval = EPERM;
2589 		}
2590 		rval = ledger_limit(task, &lla);
2591 		proc_rele(proc);
2592 		break;
2593 	}
2594 #endif
2595 	case LEDGER_INFO: {
2596 		struct ledger_info info = {};
2597 
2598 		rval = ledger_info(task, &info);
2599 		proc_rele(proc);
2600 		if (rval == 0) {
2601 			rval = copyout(&info, args->arg2,
2602 			    sizeof(info));
2603 		}
2604 		break;
2605 	}
2606 
2607 	case LEDGER_ENTRY_INFO: {
2608 		void *buf;
2609 		int sz;
2610 
2611 #if CONFIG_MEMORYSTATUS
2612 		task_ledger_settle_dirty_time(task);
2613 #endif /* CONFIG_MEMORYSTATUS */
2614 
2615 		rval = ledger_get_task_entry_info_multiple(task, &buf, &len);
2616 		proc_rele(proc);
2617 		if ((rval == 0) && (len >= 0)) {
2618 			sz = len * sizeof(struct ledger_entry_info);
2619 			rval = copyout(buf, args->arg2, sz);
2620 			kfree_data(buf, sz);
2621 		}
2622 		if (rval == 0) {
2623 			rval = copyout(&len, args->arg3, sizeof(len));
2624 		}
2625 		break;
2626 	}
2627 
2628 	case LEDGER_TEMPLATE_INFO: {
2629 		void *buf;
2630 		int sz;
2631 
2632 		rval = ledger_template_info(&buf, &len);
2633 		if ((rval == 0) && (len >= 0)) {
2634 			sz = len * sizeof(struct ledger_template_info);
2635 			rval = copyout(buf, args->arg1, sz);
2636 			kfree_data(buf, sz);
2637 		}
2638 		if (rval == 0) {
2639 			rval = copyout(&len, args->arg2, sizeof(len));
2640 		}
2641 		break;
2642 	}
2643 
2644 	default:
2645 		panic("ledger syscall logic error -- command type %d", args->cmd);
2646 		proc_rele(proc);
2647 		rval = EINVAL;
2648 	}
2649 
2650 	return rval;
2651 }
2652 
2653 int
telemetry(__unused struct proc * p,struct telemetry_args * args,__unused int32_t * retval)2654 telemetry(__unused struct proc *p, struct telemetry_args *args, __unused int32_t *retval)
2655 {
2656 	int error = 0;
2657 
2658 	switch (args->cmd) {
2659 #if CONFIG_TELEMETRY
2660 	case TELEMETRY_CMD_TIMER_EVENT:
2661 		error = telemetry_timer_event(args->deadline, args->interval, args->leeway);
2662 		break;
2663 	case TELEMETRY_CMD_PMI_SETUP:
2664 		error = telemetry_pmi_setup((enum telemetry_pmi)args->deadline, args->interval);
2665 		break;
2666 #endif /* CONFIG_TELEMETRY */
2667 	case TELEMETRY_CMD_VOUCHER_NAME:
2668 		if (thread_set_voucher_name((mach_port_name_t)args->deadline)) {
2669 			error = EINVAL;
2670 		}
2671 		break;
2672 
2673 	default:
2674 		error = EINVAL;
2675 		break;
2676 	}
2677 
2678 	return error;
2679 }
2680 
2681 /*
2682  * Logging
2683  *
2684  * Description: syscall to access kernel logging from userspace
2685  *
2686  * Args:
2687  *	tag - used for syncing with userspace on the version.
2688  *	flags - flags used by the syscall.
2689  *	buffer - userspace address of string to copy.
2690  *	size - size of buffer.
2691  */
2692 int
log_data(__unused struct proc * p,struct log_data_args * args,int * retval)2693 log_data(__unused struct proc *p, struct log_data_args *args, int *retval)
2694 {
2695 	unsigned int tag = args->tag;
2696 	unsigned int flags = args->flags;
2697 	user_addr_t buffer = args->buffer;
2698 	unsigned int size = args->size;
2699 	int ret = 0;
2700 	*retval = 0;
2701 
2702 	/* Only DEXTs are suppose to use this syscall. */
2703 	if (!task_is_driver(current_task())) {
2704 		return EPERM;
2705 	}
2706 
2707 	/*
2708 	 * Tag synchronize the syscall version with userspace.
2709 	 * Tag == 0 => flags == OS_LOG_TYPE
2710 	 */
2711 	if (tag != 0) {
2712 		return EINVAL;
2713 	}
2714 
2715 	/*
2716 	 * OS_LOG_TYPE are defined in libkern/os/log.h
2717 	 * In userspace they are defined in libtrace/os/log.h
2718 	 */
2719 	if (flags != OS_LOG_TYPE_DEFAULT &&
2720 	    flags != OS_LOG_TYPE_INFO &&
2721 	    flags != OS_LOG_TYPE_DEBUG &&
2722 	    flags != OS_LOG_TYPE_ERROR &&
2723 	    flags != OS_LOG_TYPE_FAULT) {
2724 		return EINVAL;
2725 	}
2726 
2727 	if (size == 0) {
2728 		return EINVAL;
2729 	}
2730 
2731 	/* truncate to OS_LOG_DATA_MAX_SIZE */
2732 	if (size > OS_LOG_DATA_MAX_SIZE) {
2733 		printf("%s: WARNING msg is going to be truncated from %u to %u\n",
2734 		    __func__, size, OS_LOG_DATA_MAX_SIZE);
2735 		size = OS_LOG_DATA_MAX_SIZE;
2736 	}
2737 
2738 	char *log_msg = (char *)kalloc_data(size, Z_WAITOK);
2739 	if (!log_msg) {
2740 		return ENOMEM;
2741 	}
2742 
2743 	if (copyin(buffer, log_msg, size) != 0) {
2744 		ret = EFAULT;
2745 		goto out;
2746 	}
2747 	log_msg[size - 1] = '\0';
2748 
2749 	/*
2750 	 * This will log to dmesg and logd.
2751 	 * The call will fail if the current
2752 	 * process is not a driverKit process.
2753 	 */
2754 	os_log_driverKit(&ret, OS_LOG_DEFAULT, (os_log_type_t)flags, "%s", log_msg);
2755 
2756 out:
2757 	if (log_msg != NULL) {
2758 		kfree_data(log_msg, size);
2759 	}
2760 
2761 	return ret;
2762 }
2763 
2764 #if DEVELOPMENT || DEBUG
2765 static int
2766 sysctl_waitq_set_nelem SYSCTL_HANDLER_ARGS
2767 {
2768 #pragma unused(oidp, arg1, arg2)
2769 	int nelem;
2770 
2771 	/* Read only  */
2772 	if (req->newptr != USER_ADDR_NULL) {
2773 		return EPERM;
2774 	}
2775 
2776 	nelem = sysctl_helper_waitq_set_nelem();
2777 
2778 	return SYSCTL_OUT(req, &nelem, sizeof(nelem));
2779 }
2780 
2781 SYSCTL_PROC(_kern, OID_AUTO, n_ltable_entries, CTLFLAG_RD | CTLFLAG_LOCKED,
2782     0, 0, sysctl_waitq_set_nelem, "I", "ltable elementis currently used");
2783 
2784 
2785 static int
2786 sysctl_mpsc_test_pingpong SYSCTL_HANDLER_ARGS
2787 {
2788 #pragma unused(oidp, arg1, arg2)
2789 	uint64_t value = 0;
2790 	int error;
2791 
2792 	error = SYSCTL_IN(req, &value, sizeof(value));
2793 	if (error) {
2794 		return error;
2795 	}
2796 
2797 	if (error == 0 && req->newptr) {
2798 		error = mpsc_test_pingpong(value, &value);
2799 		if (error == 0) {
2800 			error = SYSCTL_OUT(req, &value, sizeof(value));
2801 		}
2802 	}
2803 
2804 	return error;
2805 }
2806 SYSCTL_PROC(_kern, OID_AUTO, mpsc_test_pingpong, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
2807     0, 0, sysctl_mpsc_test_pingpong, "Q", "MPSC tests: pingpong");
2808 
2809 #endif /* DEVELOPMENT || DEBUG */
2810 
/* Remote Time API: sysctl namespace for cross-SoC time conversion. */
SYSCTL_NODE(_machdep, OID_AUTO, remotetime, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "Remote time api");
2813 
2814 #if DEVELOPMENT || DEBUG
2815 #if CONFIG_MACH_BRIDGE_SEND_TIME
2816 extern _Atomic uint32_t bt_init_flag;
2817 extern uint32_t mach_bridge_timer_enable(uint32_t, int);
2818 
2819 SYSCTL_INT(_machdep_remotetime, OID_AUTO, bridge_timer_init_flag,
2820     CTLFLAG_RD | CTLFLAG_LOCKED, &bt_init_flag, 0, "");
2821 
2822 static int sysctl_mach_bridge_timer_enable SYSCTL_HANDLER_ARGS
2823 {
2824 #pragma unused(oidp, arg1, arg2)
2825 	uint32_t value = 0;
2826 	int error = 0;
2827 	/* User is querying buffer size */
2828 	if (req->oldptr == USER_ADDR_NULL && req->newptr == USER_ADDR_NULL) {
2829 		req->oldidx = sizeof(value);
2830 		return 0;
2831 	}
2832 	if (os_atomic_load(&bt_init_flag, acquire)) {
2833 		if (req->newptr) {
2834 			int new_value = 0;
2835 			error = SYSCTL_IN(req, &new_value, sizeof(new_value));
2836 			if (error) {
2837 				return error;
2838 			}
2839 			if (new_value == 0 || new_value == 1) {
2840 				value = mach_bridge_timer_enable(new_value, 1);
2841 			} else {
2842 				return EPERM;
2843 			}
2844 		} else {
2845 			value = mach_bridge_timer_enable(0, 0);
2846 		}
2847 	}
2848 	error = SYSCTL_OUT(req, &value, sizeof(value));
2849 	return error;
2850 }
2851 
2852 SYSCTL_PROC(_machdep_remotetime, OID_AUTO, bridge_timer_enable,
2853     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
2854     0, 0, sysctl_mach_bridge_timer_enable, "I", "");
2855 
2856 #endif /* CONFIG_MACH_BRIDGE_SEND_TIME */
2857 
2858 static int sysctl_mach_bridge_remote_time SYSCTL_HANDLER_ARGS
2859 {
2860 #pragma unused(oidp, arg1, arg2)
2861 	uint64_t ltime = 0, rtime = 0;
2862 	if (req->oldptr == USER_ADDR_NULL) {
2863 		req->oldidx = sizeof(rtime);
2864 		return 0;
2865 	}
2866 	if (req->newptr) {
2867 		int error = SYSCTL_IN(req, &ltime, sizeof(ltime));
2868 		if (error) {
2869 			return error;
2870 		}
2871 	}
2872 	rtime = mach_bridge_remote_time(ltime);
2873 	return SYSCTL_OUT(req, &rtime, sizeof(rtime));
2874 }
2875 SYSCTL_PROC(_machdep_remotetime, OID_AUTO, mach_bridge_remote_time,
2876     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
2877     0, 0, sysctl_mach_bridge_remote_time, "Q", "");
2878 
2879 #endif /* DEVELOPMENT || DEBUG */
2880 
2881 #if CONFIG_MACH_BRIDGE_RECV_TIME
2882 extern struct bt_params bt_params_get_latest(void);
2883 
2884 static int sysctl_mach_bridge_conversion_params SYSCTL_HANDLER_ARGS
2885 {
2886 #pragma unused(oidp, arg1, arg2)
2887 	struct bt_params params = {};
2888 	if (req->oldptr == USER_ADDR_NULL) {
2889 		req->oldidx = sizeof(struct bt_params);
2890 		return 0;
2891 	}
2892 	if (req->newptr) {
2893 		return EPERM;
2894 	}
2895 	params = bt_params_get_latest();
2896 	return SYSCTL_OUT(req, &params, MIN(sizeof(params), req->oldlen));
2897 }
2898 
2899 SYSCTL_PROC(_machdep_remotetime, OID_AUTO, conversion_params,
2900     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0,
2901     0, sysctl_mach_bridge_conversion_params, "S,bt_params", "");
2902 
2903 #endif /* CONFIG_MACH_BRIDGE_RECV_TIME */
2904 
2905 #if DEVELOPMENT || DEBUG
2906 
2907 #include <pexpert/pexpert.h>
2908 extern int32_t sysctl_get_bound_cpuid(void);
2909 extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid);
2910 static int
2911 sysctl_kern_sched_thread_bind_cpu SYSCTL_HANDLER_ARGS
2912 {
2913 #pragma unused(oidp, arg1, arg2)
2914 
2915 	/*
2916 	 * DO NOT remove this bootarg guard or make this non-development.
2917 	 * This kind of binding should only be used for tests and
2918 	 * experiments in a custom configuration, never shipping code.
2919 	 */
2920 
2921 	if (!PE_parse_boot_argn("enable_skstb", NULL, 0)) {
2922 		return ENOENT;
2923 	}
2924 
2925 	int32_t cpuid = sysctl_get_bound_cpuid();
2926 
2927 	int32_t new_value;
2928 	int changed;
2929 	int error = sysctl_io_number(req, cpuid, sizeof cpuid, &new_value, &changed);
2930 	if (error) {
2931 		return error;
2932 	}
2933 
2934 	if (changed) {
2935 		kern_return_t kr = sysctl_thread_bind_cpuid(new_value);
2936 
2937 		if (kr == KERN_NOT_SUPPORTED) {
2938 			return ENOTSUP;
2939 		}
2940 
2941 		if (kr == KERN_INVALID_VALUE) {
2942 			return ERANGE;
2943 		}
2944 	}
2945 
2946 	return error;
2947 }
2948 
2949 SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cpu, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
2950     0, 0, sysctl_kern_sched_thread_bind_cpu, "I", "");
2951 
2952 #if __AMP__
2953 extern char sysctl_get_bound_cluster_type(void);
2954 extern void sysctl_thread_bind_cluster_type(char cluster_type);
2955 static int
2956 sysctl_kern_sched_thread_bind_cluster_type SYSCTL_HANDLER_ARGS
2957 {
2958 #pragma unused(oidp, arg1, arg2)
2959 	char buff[4];
2960 
2961 	if (!PE_parse_boot_argn("enable_skstb", NULL, 0)) {
2962 		return ENOENT;
2963 	}
2964 
2965 	int error = SYSCTL_IN(req, buff, 1);
2966 	if (error) {
2967 		return error;
2968 	}
2969 	char cluster_type = buff[0];
2970 
2971 	if (!req->newptr) {
2972 		goto out;
2973 	}
2974 
2975 	sysctl_thread_bind_cluster_type(cluster_type);
2976 out:
2977 	cluster_type = sysctl_get_bound_cluster_type();
2978 	buff[0] = cluster_type;
2979 
2980 	return SYSCTL_OUT(req, buff, 1);
2981 }
2982 
2983 SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cluster_type, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED,
2984     0, 0, sysctl_kern_sched_thread_bind_cluster_type, "A", "");
2985 
2986 extern char sysctl_get_task_cluster_type(void);
2987 extern void sysctl_task_set_cluster_type(char cluster_type);
2988 static int
2989 sysctl_kern_sched_task_set_cluster_type SYSCTL_HANDLER_ARGS
2990 {
2991 #pragma unused(oidp, arg1, arg2)
2992 	char buff[4];
2993 
2994 	if (!PE_parse_boot_argn("enable_skstsct", NULL, 0)) {
2995 		return ENOENT;
2996 	}
2997 
2998 	int error = SYSCTL_IN(req, buff, 1);
2999 	if (error) {
3000 		return error;
3001 	}
3002 	char cluster_type = buff[0];
3003 
3004 	if (!req->newptr) {
3005 		goto out;
3006 	}
3007 
3008 	sysctl_task_set_cluster_type(cluster_type);
3009 out:
3010 	cluster_type = sysctl_get_task_cluster_type();
3011 	buff[0] = cluster_type;
3012 
3013 	return SYSCTL_OUT(req, buff, 1);
3014 }
3015 
3016 SYSCTL_PROC(_kern, OID_AUTO, sched_task_set_cluster_type, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED,
3017     0, 0, sysctl_kern_sched_task_set_cluster_type, "A", "");
3018 
3019 extern kern_return_t thread_bind_cluster_id(thread_t thread, uint32_t cluster_id, thread_bind_option_t options);
3020 extern uint32_t thread_bound_cluster_id(thread_t);
3021 static int
3022 sysctl_kern_sched_thread_bind_cluster_id SYSCTL_HANDLER_ARGS
3023 {
3024 #pragma unused(oidp, arg1, arg2)
3025 	if (!PE_parse_boot_argn("enable_skstb", NULL, 0)) {
3026 		return ENOENT;
3027 	}
3028 
3029 	thread_t self = current_thread();
3030 	uint32_t old_value = thread_bound_cluster_id(self);
3031 	uint32_t new_value;
3032 
3033 	int error = SYSCTL_IN(req, &new_value, sizeof(new_value));
3034 	if (error) {
3035 		return error;
3036 	}
3037 	if (new_value != old_value) {
3038 		/*
3039 		 * This sysctl binds the thread to the cluster without any flags,
3040 		 * which means it will be hard bound and not check eligibility.
3041 		 */
3042 		thread_bind_cluster_id(self, new_value, 0);
3043 	}
3044 	return SYSCTL_OUT(req, &old_value, sizeof(old_value));
3045 }
3046 
3047 SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cluster_id, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
3048     0, 0, sysctl_kern_sched_thread_bind_cluster_id, "I", "");
3049 
3050 #if CONFIG_SCHED_EDGE
3051 
3052 extern int sched_edge_restrict_ut;
3053 SYSCTL_INT(_kern, OID_AUTO, sched_edge_restrict_ut, CTLFLAG_RW | CTLFLAG_LOCKED, &sched_edge_restrict_ut, 0, "Edge Scheduler Restrict UT Threads");
3054 extern int sched_edge_restrict_bg;
3055 SYSCTL_INT(_kern, OID_AUTO, sched_edge_restrict_bg, CTLFLAG_RW | CTLFLAG_LOCKED, &sched_edge_restrict_ut, 0, "Edge Scheduler Restrict BG Threads");
3056 extern int sched_edge_migrate_ipi_immediate;
3057 SYSCTL_INT(_kern, OID_AUTO, sched_edge_migrate_ipi_immediate, CTLFLAG_RW | CTLFLAG_LOCKED, &sched_edge_migrate_ipi_immediate, 0, "Edge Scheduler uses immediate IPIs for migration event based on execution latency");
3058 
3059 #endif /* CONFIG_SCHED_EDGE */
3060 
3061 #endif /* __AMP__ */
3062 
3063 #if INTERRUPT_MASKED_DEBUG
3064 
/* Tunables for the interrupt-masked watchdog (INTERRUPT_MASKED_DEBUG builds). */
SYSCTL_INT(_kern, OID_AUTO, interrupt_masked_threshold_mt, CTLFLAG_RW | CTLFLAG_LOCKED,
    &interrupt_masked_timeout, 0,
    "Interrupt masked duration after which a tracepoint is emitted or the device panics (in mach timebase units)");

SYSCTL_INT(_kern, OID_AUTO, interrupt_masked_debug_mode, CTLFLAG_RW | CTLFLAG_LOCKED,
    &interrupt_masked_debug_mode, 0,
    "Enable interrupt masked tracing or panic (0: off, 1: trace, 2: panic)");
3072 
3073 #endif /* INTERRUPT_MASKED_DEBUG */
3074 
3075 #if SCHED_PREEMPTION_DISABLE_DEBUG
3076 
/* Tunables for preemption-disablement tracking (SCHED_PREEMPTION_DISABLE_DEBUG). */
SYSCTL_QUAD(_kern, OID_AUTO, sched_preemption_disable_threshold_mt, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sched_preemption_disable_threshold_mt,
    "Preemption disablement duration after which a tracepoint is emitted or the device panics (in mach timebase units)");

SYSCTL_INT(_kern, OID_AUTO, sched_preemption_disable_debug_mode, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sched_preemption_disable_debug_mode, 0,
    "Enable preemption disablement tracing or panic (0: off, 1: trace, 2: panic)");

/* Per-CPU maximum observed preemption-disabled duration, in mach time. */
PERCPU_DECL(uint64_t, preemption_disable_max_mt);
3086 
3087 static int
sysctl_sched_preemption_disable_stats(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)3088 sysctl_sched_preemption_disable_stats(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3089 {
3090 	uint64_t stats[MAX_CPUS]; // maximum per CPU
3091 
3092 	/*
3093 	 * No synchronization here. The individual values are pretty much
3094 	 * independent, and reading/writing them is atomic.
3095 	 */
3096 
3097 	static_assert(__LP64__); /* below is racy on armv7k, reminder to change if needed there. */
3098 
3099 	int cpu = 0;
3100 	percpu_foreach(max_stat, preemption_disable_max_mt) {
3101 		stats[cpu++] = *max_stat;
3102 	}
3103 
3104 	if (req->newlen > 0) {
3105 		// writing just resets all stats.
3106 		percpu_foreach(max_stat, preemption_disable_max_mt) {
3107 			*max_stat = 0;
3108 		}
3109 	}
3110 
3111 	return sysctl_io_opaque(req, stats, cpu * sizeof(uint64_t), NULL);
3112 }
3113 
3114 SYSCTL_PROC(_kern, OID_AUTO, sched_preemption_disable_stats,
3115     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_LOCKED,
3116     0, 0, sysctl_sched_preemption_disable_stats, "I", "Preemption disablement statistics");
3117 
3118 #endif /* SCHED_PREEMPTION_DISABLE_DEBUG */
3119 
3120 
/* used for testing by exception_tests */
extern uint32_t ipc_control_port_options;
/* kern.ipc_control_port_options: read-only export of the kernel's
 * ipc_control_port_options value (consumed by exception_tests). */
SYSCTL_INT(_kern, OID_AUTO, ipc_control_port_options,
    CTLFLAG_RD | CTLFLAG_LOCKED, &ipc_control_port_options, 0, "");
3125 
3126 #endif /* DEVELOPMENT || DEBUG */
3127 
extern uint32_t task_exc_guard_default;

/* kern.task_exc_guard_default: read-only export of task_exc_guard_default
 * (presumably the default EXC_GUARD policy bits for new tasks — confirm
 * against the definition in the task subsystem). */
SYSCTL_INT(_kern, OID_AUTO, task_exc_guard_default,
    CTLFLAG_RD | CTLFLAG_LOCKED, &task_exc_guard_default, 0, "");
3132 
3133 
3134 static int
3135 sysctl_kern_tcsm_available SYSCTL_HANDLER_ARGS
3136 {
3137 #pragma unused(oidp, arg1, arg2)
3138 	uint32_t value = machine_csv(CPUVN_CI) ? 1 : 0;
3139 
3140 	if (req->newptr) {
3141 		return EINVAL;
3142 	}
3143 
3144 	return SYSCTL_OUT(req, &value, sizeof(value));
3145 }
3146 SYSCTL_PROC(_kern, OID_AUTO, tcsm_available,
3147     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY,
3148     0, 0, sysctl_kern_tcsm_available, "I", "");
3149 
3150 
3151 static int
3152 sysctl_kern_tcsm_enable SYSCTL_HANDLER_ARGS
3153 {
3154 #pragma unused(oidp, arg1, arg2)
3155 	uint32_t soflags = 0;
3156 	uint32_t old_value = thread_get_no_smt() ? 1 : 0;
3157 
3158 	int error = SYSCTL_IN(req, &soflags, sizeof(soflags));
3159 	if (error) {
3160 		return error;
3161 	}
3162 
3163 	if (soflags && machine_csv(CPUVN_CI)) {
3164 		thread_set_no_smt(true);
3165 		machine_tecs(current_thread());
3166 	}
3167 
3168 	return SYSCTL_OUT(req, &old_value, sizeof(old_value));
3169 }
3170 SYSCTL_PROC(_kern, OID_AUTO, tcsm_enable,
3171     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY,
3172     0, 0, sysctl_kern_tcsm_enable, "I", "");
3173 
3174 
3175 #if DEVELOPMENT || DEBUG
3176 extern void sysctl_task_set_no_smt(char no_smt);
3177 extern char sysctl_task_get_no_smt(void);
3178 
3179 static int
3180 sysctl_kern_sched_task_set_no_smt SYSCTL_HANDLER_ARGS
3181 {
3182 #pragma unused(oidp, arg1, arg2)
3183 	char buff[4];
3184 
3185 	int error = SYSCTL_IN(req, buff, 1);
3186 	if (error) {
3187 		return error;
3188 	}
3189 	char no_smt = buff[0];
3190 
3191 	if (!req->newptr) {
3192 		goto out;
3193 	}
3194 
3195 	sysctl_task_set_no_smt(no_smt);
3196 out:
3197 	no_smt = sysctl_task_get_no_smt();
3198 	buff[0] = no_smt;
3199 
3200 	return SYSCTL_OUT(req, buff, 1);
3201 }
3202 
3203 SYSCTL_PROC(_kern, OID_AUTO, sched_task_set_no_smt, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
3204     0, 0, sysctl_kern_sched_task_set_no_smt, "A", "");
3205 
3206 static int
sysctl_kern_sched_thread_set_no_smt(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)3207 sysctl_kern_sched_thread_set_no_smt(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3208 {
3209 	int new_value, changed;
3210 	int old_value = thread_get_no_smt() ? 1 : 0;
3211 	int error = sysctl_io_number(req, old_value, sizeof(int), &new_value, &changed);
3212 
3213 	if (changed) {
3214 		thread_set_no_smt(!!new_value);
3215 	}
3216 
3217 	return error;
3218 }
3219 
3220 SYSCTL_PROC(_kern, OID_AUTO, sched_thread_set_no_smt,
3221     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
3222     0, 0, sysctl_kern_sched_thread_set_no_smt, "I", "");
3223 
3224 
/*
 * kern.preoslog: exports the preoslog buffer (as returned by
 * sysctl_debug_get_preoslog()) to user space.  A read with a NULL old
 * pointer reports the required buffer size.  Writing a nonzero byte claims
 * "oneshot" access: only the first such claim succeeds, later ones get
 * EPERM.
 */
static int
sysctl_kern_debug_get_preoslog SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	static bool oneshot_executed = false; /* set once the oneshot claim has been made */
	size_t preoslog_size = 0;
	const char *preoslog = NULL;

	// DumpPanic passes a non-zero write value when it needs oneshot behaviour
	if (req->newptr) {
		uint8_t oneshot = 0;
		int error = SYSCTL_IN(req, &oneshot, sizeof(oneshot));
		if (error) {
			return error;
		}

		if (oneshot) {
			/*
			 * Atomically claim the oneshot; only the first caller wins.
			 * NOTE(review): OSCompareAndSwap8 operates on a bool here —
			 * assumes bool is one byte with 0/1 representation; confirm
			 * on all supported targets.
			 */
			if (!OSCompareAndSwap8(false, true, &oneshot_executed)) {
				return EPERM;
			}
		}
	}

	preoslog = sysctl_debug_get_preoslog(&preoslog_size);
	if (preoslog == NULL || preoslog_size == 0) {
		/* No buffer available: report success with no data. */
		return 0;
	}

	if (req->oldptr == USER_ADDR_NULL) {
		/* Size probe: tell the caller how big a buffer it needs. */
		req->oldidx = preoslog_size;
		return 0;
	}

	return SYSCTL_OUT(req, preoslog, preoslog_size);
}

SYSCTL_PROC(_kern, OID_AUTO, preoslog, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, sysctl_kern_debug_get_preoslog, "-", "");
3263 
3264 static int
3265 sysctl_kern_task_set_filter_msg_flag SYSCTL_HANDLER_ARGS
3266 {
3267 #pragma unused(oidp, arg1, arg2)
3268 	int new_value, changed;
3269 	int old_value = task_get_filter_msg_flag(current_task()) ? 1 : 0;
3270 	int error = sysctl_io_number(req, old_value, sizeof(int), &new_value, &changed);
3271 
3272 	if (changed) {
3273 		task_set_filter_msg_flag(current_task(), !!new_value);
3274 	}
3275 
3276 	return error;
3277 }
3278 
3279 SYSCTL_PROC(_kern, OID_AUTO, task_set_filter_msg_flag, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
3280     0, 0, sysctl_kern_task_set_filter_msg_flag, "I", "");
3281 
3282 #if CONFIG_PROC_RESOURCE_LIMITS
3283 
3284 extern mach_port_name_t current_task_get_fatal_port_name(void);
3285 
3286 static int
3287 sysctl_kern_task_get_fatal_port SYSCTL_HANDLER_ARGS
3288 {
3289 #pragma unused(oidp, arg1, arg2)
3290 	int port = 0;
3291 	int flag = 0;
3292 
3293 	if (req->oldptr == USER_ADDR_NULL) {
3294 		req->oldidx = sizeof(mach_port_t);
3295 		return 0;
3296 	}
3297 
3298 	int error = SYSCTL_IN(req, &flag, sizeof(flag));
3299 	if (error) {
3300 		return error;
3301 	}
3302 
3303 	if (flag == 1) {
3304 		port = (int)current_task_get_fatal_port_name();
3305 	}
3306 	return SYSCTL_OUT(req, &port, sizeof(port));
3307 }
3308 
3309 SYSCTL_PROC(_machdep, OID_AUTO, task_get_fatal_port, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
3310     0, 0, sysctl_kern_task_get_fatal_port, "I", "");
3311 
3312 #endif /* CONFIG_PROC_RESOURCE_LIMITS */
3313 
3314 extern unsigned int ipc_table_max_entries(void);
3315 
3316 static int
3317 sysctl_mach_max_port_table_size SYSCTL_HANDLER_ARGS
3318 {
3319 #pragma unused(oidp, arg1, arg2)
3320 	int old_value = ipc_table_max_entries();
3321 	int error = sysctl_io_number(req, old_value, sizeof(int), NULL, NULL);
3322 
3323 	return error;
3324 }
3325 
3326 SYSCTL_PROC(_machdep, OID_AUTO, max_port_table_size, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
3327     0, 0, sysctl_mach_max_port_table_size, "I", "");
3328 
3329 #endif /* DEVELOPMENT || DEBUG */
3330 
3331 #if defined(CONFIG_KDP_INTERACTIVE_DEBUGGING) && defined(CONFIG_KDP_COREDUMP_ENCRYPTION)
3332 
3333 #define COREDUMP_ENCRYPTION_KEY_ENTITLEMENT "com.apple.private.coredump-encryption-key"
3334 
3335 static int
3336 sysctl_coredump_encryption_key_update SYSCTL_HANDLER_ARGS
3337 {
3338 	kern_return_t ret = KERN_SUCCESS;
3339 	int error = 0;
3340 	struct kdp_core_encryption_key_descriptor key_descriptor = { MACH_CORE_FILEHEADER_V2_FLAG_NEXT_COREFILE_KEY_FORMAT_NIST_P256, 0, NULL };
3341 
3342 	/* Need to be root and have entitlement */
3343 	if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement(COREDUMP_ENCRYPTION_KEY_ENTITLEMENT)) {
3344 		return EPERM;
3345 	}
3346 
3347 	// Sanity-check the given key length
3348 	if (req->newlen > UINT16_MAX) {
3349 		return EINVAL;
3350 	}
3351 
3352 	// It is allowed for the caller to pass in a NULL buffer. This indicates that they want us to forget about any public key
3353 	// we might have.
3354 	if (req->newptr) {
3355 		key_descriptor.kcekd_size = (uint16_t) req->newlen;
3356 
3357 		ret = kmem_alloc(kernel_map, (vm_offset_t*) &(key_descriptor.kcekd_key), key_descriptor.kcekd_size, VM_KERN_MEMORY_DIAG);
3358 		if (KERN_SUCCESS != ret) {
3359 			return ENOMEM;
3360 		}
3361 
3362 		error = SYSCTL_IN(req, key_descriptor.kcekd_key, key_descriptor.kcekd_size);
3363 		if (error) {
3364 			return error;
3365 		}
3366 	}
3367 
3368 	// If successful, kdp_core will take ownership of the 'kcekd_key' pointer
3369 	ret = IOProvideCoreFileAccess(kdp_core_handle_new_encryption_key, (void *)&key_descriptor);
3370 	if (KERN_SUCCESS != ret) {
3371 		printf("Failed to handle the new encryption key. Error 0x%x", ret);
3372 		if (key_descriptor.kcekd_key) {
3373 			kmem_free(kernel_map, (vm_offset_t) key_descriptor.kcekd_key, key_descriptor.kcekd_size);
3374 		}
3375 		return EFAULT;
3376 	}
3377 
3378 	return 0;
3379 }
3380 
3381 SYSCTL_PROC(_kern, OID_AUTO, coredump_encryption_key, CTLTYPE_OPAQUE | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3382     0, 0, &sysctl_coredump_encryption_key_update, "-", "Set a new encryption key for coredumps");
3383 
3384 #endif /* CONFIG_KDP_INTERACTIVE_DEBUGGING && CONFIG_KDP_COREDUMP_ENCRYPTION*/
3385