xref: /xnu-8020.121.3/bsd/kern/sys_generic.c (revision fdd8201d7b966f0c3ea610489d29bd841d358941)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30  * Copyright (c) 1982, 1986, 1989, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  * (c) UNIX System Laboratories, Inc.
33  * All or some portions of this file are derived from material licensed
34  * to the University of California by American Telephone and Telegraph
35  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36  * the permission of UNIX System Laboratories, Inc.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. All advertising materials mentioning features or use of this software
47  *    must display the following acknowledgement:
48  *	This product includes software developed by the University of
49  *	California, Berkeley and its contributors.
50  * 4. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
67  */
68 /*
69  * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
70  * support for mandatory and extensible security protections.  This notice
71  * is included in support of clause 2.2 (b) of the Apple Public License,
72  * Version 2.0.
73  */
74 
75 #include <sys/param.h>
76 #include <sys/systm.h>
77 #include <sys/filedesc.h>
78 #include <sys/ioctl.h>
79 #include <sys/file_internal.h>
80 #include <sys/proc_internal.h>
81 #include <sys/socketvar.h>
82 #include <sys/uio_internal.h>
83 #include <sys/kernel.h>
84 #include <sys/guarded.h>
85 #include <sys/stat.h>
86 #include <sys/malloc.h>
87 #include <sys/sysproto.h>
88 
89 #include <sys/mount_internal.h>
90 #include <sys/protosw.h>
91 #include <sys/ev.h>
92 #include <sys/user.h>
93 #include <sys/kdebug.h>
94 #include <sys/poll.h>
95 #include <sys/event.h>
96 #include <sys/eventvar.h>
97 #include <sys/proc.h>
98 #include <sys/kauth.h>
99 
100 #include <machine/smp.h>
101 #include <mach/mach_types.h>
102 #include <kern/kern_types.h>
103 #include <kern/assert.h>
104 #include <kern/kalloc.h>
105 #include <kern/thread.h>
106 #include <kern/clock.h>
107 #include <kern/ledger.h>
108 #include <kern/monotonic.h>
109 #include <kern/task.h>
110 #include <kern/telemetry.h>
111 #include <kern/waitq.h>
112 #include <kern/sched_hygiene.h>
113 #include <kern/sched_prim.h>
114 #include <kern/mpsc_queue.h>
115 #include <kern/debug.h>
116 
117 #include <sys/mbuf.h>
118 #include <sys/domain.h>
119 #include <sys/socket.h>
120 #include <sys/socketvar.h>
121 #include <sys/errno.h>
122 #include <sys/syscall.h>
123 #include <sys/pipe.h>
124 
125 #include <security/audit/audit.h>
126 
127 #include <net/if.h>
128 #include <net/route.h>
129 
130 #include <netinet/in.h>
131 #include <netinet/in_systm.h>
132 #include <netinet/ip.h>
133 #include <netinet/in_pcb.h>
134 #include <netinet/ip_var.h>
135 #include <netinet/ip6.h>
136 #include <netinet/tcp.h>
137 #include <netinet/tcp_fsm.h>
138 #include <netinet/tcp_seq.h>
139 #include <netinet/tcp_timer.h>
140 #include <netinet/tcp_var.h>
141 #include <netinet/tcpip.h>
142 #include <netinet/tcp_debug.h>
143 /* for wait queue based select */
144 #include <kern/waitq.h>
145 #include <sys/vnode_internal.h>
146 /* for remote time api*/
147 #include <kern/remote_time.h>
148 #include <os/log.h>
149 #include <sys/log_data.h>
150 
151 #if CONFIG_MACF
152 #include <security/mac_framework.h>
153 #endif
154 
155 #ifdef CONFIG_KDP_INTERACTIVE_DEBUGGING
156 #include <mach_debug/mach_debug_types.h>
157 #endif
158 
159 #if MONOTONIC
160 #include <machine/monotonic.h>
161 #endif /* MONOTONIC */
162 
163 /* for entitlement check */
164 #include <IOKit/IOBSD.h>
165 /*
166  * If you need accounting for KM_SELECT consider using
167  * KALLOC_HEAP_DEFINE to define a view.
168  */
169 #define KM_SELECT       KHEAP_DEFAULT
170 
171 /* XXX should be in a header file somewhere */
172 extern kern_return_t IOBSDGetPlatformUUID(__darwin_uuid_t uuid, mach_timespec_t timeoutp);
173 
174 int rd_uio(struct proc *p, int fdes, uio_t uio, int is_preadv, user_ssize_t *retval);
175 int wr_uio(struct proc *p, int fdes, uio_t uio, int is_pwritev, user_ssize_t *retval);
176 int do_uiowrite(struct proc *p, struct fileproc *fp, uio_t uio, int flags, user_ssize_t *retval);
177 
178 __private_extern__ int  dofileread(vfs_context_t ctx, struct fileproc *fp,
179     user_addr_t bufp, user_size_t nbyte,
180     off_t offset, int flags, user_ssize_t *retval);
181 __private_extern__ int  dofilewrite(vfs_context_t ctx, struct fileproc *fp,
182     user_addr_t bufp, user_size_t nbyte,
183     off_t offset, int flags, user_ssize_t *retval);
184 static int preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_vnode);
185 
186 #define f_flag fp_glob->fg_flag
187 #define f_type fp_glob->fg_ops->fo_type
188 #define f_cred fp_glob->fg_cred
189 #define f_ops fp_glob->fg_ops
190 
191 /*
192  * Validate if the file can be used for random access (pread, pwrite, etc).
193  *
194  * Conditions:
195  *		proc_fdlock is held
196  *
197  * Returns:    0                       Success
198  *             ESPIPE
199  *             ENXIO
200  */
201 static int
valid_for_random_access(struct fileproc * fp)202 valid_for_random_access(struct fileproc *fp)
203 {
204 	if (__improbable(fp->f_type != DTYPE_VNODE)) {
205 		return ESPIPE;
206 	}
207 
208 	vnode_t vp = (struct vnode *)fp_get_data(fp);
209 	if (__improbable(vnode_isfifo(vp))) {
210 		return ESPIPE;
211 	}
212 
213 	if (__improbable(vp->v_flag & VISTTY)) {
214 		return ENXIO;
215 	}
216 
217 	return 0;
218 }
219 
220 /*
221  * Read system call.
222  *
223  * Returns:	0			Success
224  *	preparefileread:EBADF
225  *	preparefileread:ESPIPE
226  *	preparefileread:ENXIO
227  *	preparefileread:EBADF
228  *	dofileread:???
229  */
230 int
read(struct proc * p,struct read_args * uap,user_ssize_t * retval)231 read(struct proc *p, struct read_args *uap, user_ssize_t *retval)
232 {
233 	__pthread_testcancel(1);
234 	return read_nocancel(p, (struct read_nocancel_args *)uap, retval);
235 }
236 
237 int
read_nocancel(struct proc * p,struct read_nocancel_args * uap,user_ssize_t * retval)238 read_nocancel(struct proc *p, struct read_nocancel_args *uap, user_ssize_t *retval)
239 {
240 	struct fileproc *fp;
241 	int error;
242 	int fd = uap->fd;
243 	struct vfs_context context;
244 
245 	if ((error = preparefileread(p, &fp, fd, 0))) {
246 		return error;
247 	}
248 
249 	context = *(vfs_context_current());
250 	context.vc_ucred = fp->fp_glob->fg_cred;
251 
252 	error = dofileread(&context, fp, uap->cbuf, uap->nbyte,
253 	    (off_t)-1, 0, retval);
254 
255 	fp_drop(p, fd, fp, 0);
256 
257 	return error;
258 }
259 
260 /*
261  * Pread system call
262  *
263  * Returns:	0			Success
264  *	preparefileread:EBADF
265  *	preparefileread:ESPIPE
266  *	preparefileread:ENXIO
267  *	preparefileread:EBADF
268  *	dofileread:???
269  */
270 int
pread(struct proc * p,struct pread_args * uap,user_ssize_t * retval)271 pread(struct proc *p, struct pread_args *uap, user_ssize_t *retval)
272 {
273 	__pthread_testcancel(1);
274 	return pread_nocancel(p, (struct pread_nocancel_args *)uap, retval);
275 }
276 
277 int
pread_nocancel(struct proc * p,struct pread_nocancel_args * uap,user_ssize_t * retval)278 pread_nocancel(struct proc *p, struct pread_nocancel_args *uap, user_ssize_t *retval)
279 {
280 	struct fileproc *fp = NULL;     /* fp set by preparefileread() */
281 	int fd = uap->fd;
282 	int error;
283 	struct vfs_context context;
284 
285 	if ((error = preparefileread(p, &fp, fd, 1))) {
286 		goto out;
287 	}
288 
289 	context = *(vfs_context_current());
290 	context.vc_ucred = fp->fp_glob->fg_cred;
291 
292 	error = dofileread(&context, fp, uap->buf, uap->nbyte,
293 	    uap->offset, FOF_OFFSET, retval);
294 
295 	fp_drop(p, fd, fp, 0);
296 
297 	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE),
298 	    uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
299 
300 out:
301 	return error;
302 }
303 
304 /*
305  * Code common for read and pread
306  */
307 
308 /*
309  * Returns:	0			Success
310  *		EBADF
311  *		ESPIPE
312  *		ENXIO
313  *	fp_lookup:EBADF
314  *  valid_for_random_access:ESPIPE
315  *  valid_for_random_access:ENXIO
316  */
static int
preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pread)
{
	int     error;
	struct fileproc *fp;

	AUDIT_ARG(fd, fd);

	/*
	 * Spin variant of the fd table lock: the critical section below is
	 * short and never blocks.
	 */
	proc_fdlock_spin(p);

	/* Final argument 1: tells fp_lookup the fd table lock is already held. */
	error = fp_lookup(p, fd, &fp, 1);

	if (error) {
		proc_fdunlock(p);
		return error;
	}
	/* The descriptor must be open for reading. */
	if ((fp->f_flag & FREAD) == 0) {
		error = EBADF;
		goto out;
	}
	/* pread()/preadv() additionally require a seekable, non-tty vnode. */
	if (check_for_pread) {
		if ((error = valid_for_random_access(fp))) {
			goto out;
		}
	}

	/* Success: the fileproc reference taken by fp_lookup passes to the caller. */
	*fp_ret = fp;

	proc_fdunlock(p);
	return 0;

out:
	/* Failure: release the reference (last argument 1: lock still held). */
	fp_drop(p, fd, fp, 1);
	proc_fdunlock(p);
	return error;
}
353 
354 
355 /*
356  * Returns:	0			Success
357  *		EINVAL
358  *	fo_read:???
359  */
360 __private_extern__ int
dofileread(vfs_context_t ctx,struct fileproc * fp,user_addr_t bufp,user_size_t nbyte,off_t offset,int flags,user_ssize_t * retval)361 dofileread(vfs_context_t ctx, struct fileproc *fp,
362     user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
363     user_ssize_t *retval)
364 {
365 	uio_t auio;
366 	user_ssize_t bytecnt;
367 	int error = 0;
368 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
369 
370 	if (nbyte > INT_MAX) {
371 		return EINVAL;
372 	}
373 
374 	if (vfs_context_is64bit(ctx)) {
375 		auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
376 		    &uio_buf[0], sizeof(uio_buf));
377 	} else {
378 		auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
379 		    &uio_buf[0], sizeof(uio_buf));
380 	}
381 	if (uio_addiov(auio, bufp, nbyte) != 0) {
382 		*retval = 0;
383 		return EINVAL;
384 	}
385 
386 	bytecnt = nbyte;
387 
388 	if ((error = fo_read(fp, auio, flags, ctx))) {
389 		if (uio_resid(auio) != bytecnt && (error == ERESTART ||
390 		    error == EINTR || error == EWOULDBLOCK)) {
391 			error = 0;
392 		}
393 	}
394 	bytecnt -= uio_resid(auio);
395 
396 	*retval = bytecnt;
397 
398 	return error;
399 }
400 
401 /*
402  * Vector read.
403  *
404  * Returns:    0                       Success
405  *             EINVAL
406  *             ENOMEM
407  *     preparefileread:EBADF
408  *     preparefileread:ESPIPE
409  *     preparefileread:ENXIO
410  *     preparefileread:EBADF
411  *     copyin:EFAULT
412  *     rd_uio:???
413  */
414 static int
readv_preadv_uio(struct proc * p,int fdes,user_addr_t user_iovp,int iovcnt,off_t offset,int is_preadv,user_ssize_t * retval)415 readv_preadv_uio(struct proc *p, int fdes,
416     user_addr_t user_iovp, int iovcnt, off_t offset, int is_preadv,
417     user_ssize_t *retval)
418 {
419 	uio_t auio = NULL;
420 	int error;
421 	struct user_iovec *iovp;
422 
423 	/* Verify range before calling uio_create() */
424 	if (iovcnt <= 0 || iovcnt > UIO_MAXIOV) {
425 		return EINVAL;
426 	}
427 
428 	/* allocate a uio large enough to hold the number of iovecs passed */
429 	auio = uio_create(iovcnt, offset,
430 	    (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
431 	    UIO_READ);
432 
433 	/* get location of iovecs within the uio.  then copyin the iovecs from
434 	 * user space.
435 	 */
436 	iovp = uio_iovsaddr(auio);
437 	if (iovp == NULL) {
438 		error = ENOMEM;
439 		goto ExitThisRoutine;
440 	}
441 	error = copyin_user_iovec_array(user_iovp,
442 	    IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
443 	    iovcnt, iovp);
444 	if (error) {
445 		goto ExitThisRoutine;
446 	}
447 
448 	/* finalize uio_t for use and do the IO
449 	 */
450 	error = uio_calculateresid(auio);
451 	if (error) {
452 		goto ExitThisRoutine;
453 	}
454 	error = rd_uio(p, fdes, auio, is_preadv, retval);
455 
456 ExitThisRoutine:
457 	if (auio != NULL) {
458 		uio_free(auio);
459 	}
460 	return error;
461 }
462 
463 /*
464  * Scatter read system call.
465  */
466 int
readv(struct proc * p,struct readv_args * uap,user_ssize_t * retval)467 readv(struct proc *p, struct readv_args *uap, user_ssize_t *retval)
468 {
469 	__pthread_testcancel(1);
470 	return readv_nocancel(p, (struct readv_nocancel_args *)uap, retval);
471 }
472 
473 int
readv_nocancel(struct proc * p,struct readv_nocancel_args * uap,user_ssize_t * retval)474 readv_nocancel(struct proc *p, struct readv_nocancel_args *uap, user_ssize_t *retval)
475 {
476 	return readv_preadv_uio(p, uap->fd, uap->iovp, uap->iovcnt, 0, 0, retval);
477 }
478 
479 /*
480  * Preadv system call
481  */
482 int
sys_preadv(struct proc * p,struct preadv_args * uap,user_ssize_t * retval)483 sys_preadv(struct proc *p, struct preadv_args *uap, user_ssize_t *retval)
484 {
485 	__pthread_testcancel(1);
486 	return sys_preadv_nocancel(p, (struct preadv_nocancel_args *)uap, retval);
487 }
488 
489 int
sys_preadv_nocancel(struct proc * p,struct preadv_nocancel_args * uap,user_ssize_t * retval)490 sys_preadv_nocancel(struct proc *p, struct preadv_nocancel_args *uap, user_ssize_t *retval)
491 {
492 	return readv_preadv_uio(p, uap->fd, uap->iovp, uap->iovcnt, uap->offset, 1, retval);
493 }
494 
495 /*
496  * Write system call
497  *
498  * Returns:	0			Success
499  *		EBADF
500  *	fp_lookup:EBADF
501  *	dofilewrite:???
502  */
503 int
write(struct proc * p,struct write_args * uap,user_ssize_t * retval)504 write(struct proc *p, struct write_args *uap, user_ssize_t *retval)
505 {
506 	__pthread_testcancel(1);
507 	return write_nocancel(p, (struct write_nocancel_args *)uap, retval);
508 }
509 
510 int
write_nocancel(struct proc * p,struct write_nocancel_args * uap,user_ssize_t * retval)511 write_nocancel(struct proc *p, struct write_nocancel_args *uap, user_ssize_t *retval)
512 {
513 	struct fileproc *fp;
514 	int error;
515 	int fd = uap->fd;
516 
517 	AUDIT_ARG(fd, fd);
518 
519 	error = fp_lookup(p, fd, &fp, 0);
520 	if (error) {
521 		return error;
522 	}
523 	if ((fp->f_flag & FWRITE) == 0) {
524 		error = EBADF;
525 	} else if (fp_isguarded(fp, GUARD_WRITE)) {
526 		proc_fdlock(p);
527 		error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
528 		proc_fdunlock(p);
529 	} else {
530 		struct vfs_context context = *(vfs_context_current());
531 		context.vc_ucred = fp->fp_glob->fg_cred;
532 
533 		error = dofilewrite(&context, fp, uap->cbuf, uap->nbyte,
534 		    (off_t)-1, 0, retval);
535 	}
536 	fp_drop(p, fd, fp, 0);
537 	return error;
538 }
539 
540 /*
541  * pwrite system call
542  *
543  * Returns:	0			Success
544  *		EBADF
545  *		ESPIPE
546  *		ENXIO
547  *		EINVAL
548  *	fp_lookup:EBADF
549  *	dofilewrite:???
550  */
551 int
pwrite(struct proc * p,struct pwrite_args * uap,user_ssize_t * retval)552 pwrite(struct proc *p, struct pwrite_args *uap, user_ssize_t *retval)
553 {
554 	__pthread_testcancel(1);
555 	return pwrite_nocancel(p, (struct pwrite_nocancel_args *)uap, retval);
556 }
557 
int
pwrite_nocancel(struct proc *p, struct pwrite_nocancel_args *uap, user_ssize_t *retval)
{
	struct fileproc *fp;
	int error;
	int fd = uap->fd;
	vnode_t vp  = (vnode_t)0;

	AUDIT_ARG(fd, fd);

	/* Only vnodes support positional writes; anything else gets ESPIPE. */
	error = fp_get_ftype(p, fd, DTYPE_VNODE, ESPIPE, &fp);
	if (error) {
		return error;
	}

	if ((fp->f_flag & FWRITE) == 0) {
		/* Descriptor not open for writing. */
		error = EBADF;
	} else if (fp_isguarded(fp, GUARD_WRITE)) {
		/* Write-guarded fd: deliver the guard exception instead. */
		proc_fdlock(p);
		error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
		proc_fdunlock(p);
	} else {
		struct vfs_context context = *vfs_context_current();
		context.vc_ucred = fp->fp_glob->fg_cred;

		/* FIFOs and ttys have no notion of a file offset. */
		vp = (vnode_t)fp_get_data(fp);
		if (vnode_isfifo(vp)) {
			error = ESPIPE;
			goto errout;
		}
		if ((vp->v_flag & VISTTY)) {
			error = ENXIO;
			goto errout;
		}
		/* Offset -1 is reserved (means "current position" internally). */
		if (uap->offset == (off_t)-1) {
			error = EINVAL;
			goto errout;
		}

		/* FOF_OFFSET: write at uap->offset without moving the position. */
		error = dofilewrite(&context, fp, uap->buf, uap->nbyte,
		    uap->offset, FOF_OFFSET, retval);
	}
errout:
	fp_drop(p, fd, fp, 0);

	/* Trace the call; the 64-bit offset is split into two 32-bit words. */
	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE),
	    uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);

	return error;
}
608 
609 /*
610  * Returns:	0			Success
611  *		EINVAL
612  *	<fo_write>:EPIPE
613  *	<fo_write>:???			[indirect through struct fileops]
614  */
615 __private_extern__ int
dofilewrite(vfs_context_t ctx,struct fileproc * fp,user_addr_t bufp,user_size_t nbyte,off_t offset,int flags,user_ssize_t * retval)616 dofilewrite(vfs_context_t ctx, struct fileproc *fp,
617     user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
618     user_ssize_t *retval)
619 {
620 	uio_t auio;
621 	int error = 0;
622 	user_ssize_t bytecnt;
623 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
624 
625 	if (nbyte > INT_MAX) {
626 		*retval = 0;
627 		return EINVAL;
628 	}
629 
630 	if (vfs_context_is64bit(ctx)) {
631 		auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
632 		    &uio_buf[0], sizeof(uio_buf));
633 	} else {
634 		auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
635 		    &uio_buf[0], sizeof(uio_buf));
636 	}
637 	if (uio_addiov(auio, bufp, nbyte) != 0) {
638 		*retval = 0;
639 		return EINVAL;
640 	}
641 
642 	bytecnt = nbyte;
643 	if ((error = fo_write(fp, auio, flags, ctx))) {
644 		if (uio_resid(auio) != bytecnt && (error == ERESTART ||
645 		    error == EINTR || error == EWOULDBLOCK)) {
646 			error = 0;
647 		}
648 		/* The socket layer handles SIGPIPE */
649 		if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
650 		    (fp->fp_glob->fg_lflags & FG_NOSIGPIPE) == 0) {
651 			/* XXX Raise the signal on the thread? */
652 			psignal(vfs_context_proc(ctx), SIGPIPE);
653 		}
654 	}
655 	bytecnt -= uio_resid(auio);
656 	if (bytecnt) {
657 		os_atomic_or(&fp->fp_glob->fg_flag, FWASWRITTEN, relaxed);
658 	}
659 	*retval = bytecnt;
660 
661 	return error;
662 }
663 
664 /*
665  * Returns:	0			Success
666  *		EBADF
667  *		ESPIPE
668  *		ENXIO
669  *	fp_lookup:EBADF
670  *	fp_guard_exception:???
671  *  valid_for_random_access:ESPIPE
672  *  valid_for_random_access:ENXIO
673  */
static int
preparefilewrite(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pwrite)
{
	int error;
	struct fileproc *fp;

	AUDIT_ARG(fd, fd);

	/*
	 * Spin variant of the fd table lock: the critical section below is
	 * short and never blocks.
	 */
	proc_fdlock_spin(p);

	/* Final argument 1: tells fp_lookup the fd table lock is already held. */
	error = fp_lookup(p, fd, &fp, 1);

	if (error) {
		proc_fdunlock(p);
		return error;
	}
	/* The descriptor must be open for writing. */
	if ((fp->f_flag & FWRITE) == 0) {
		error = EBADF;
		goto ExitThisRoutine;
	}
	/* Write-guarded fd: deliver the guard exception instead of writing. */
	if (fp_isguarded(fp, GUARD_WRITE)) {
		if ((error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE))) {
			goto ExitThisRoutine;
		}
	}
	/* pwrite()/pwritev() additionally require a seekable, non-tty vnode. */
	if (check_for_pwrite) {
		if ((error = valid_for_random_access(fp))) {
			goto ExitThisRoutine;
		}
	}

	/* Success: the fileproc reference taken by fp_lookup passes to the caller. */
	*fp_ret = fp;

	proc_fdunlock(p);
	return 0;

ExitThisRoutine:
	/* Failure: release the reference (last argument 1: lock still held). */
	fp_drop(p, fd, fp, 1);
	proc_fdunlock(p);
	return error;
}
715 
716 static int
writev_prwritev_uio(struct proc * p,int fd,user_addr_t user_iovp,int iovcnt,off_t offset,int is_pwritev,user_ssize_t * retval)717 writev_prwritev_uio(struct proc *p, int fd,
718     user_addr_t user_iovp, int iovcnt, off_t offset, int is_pwritev,
719     user_ssize_t *retval)
720 {
721 	uio_t auio = NULL;
722 	int error;
723 	struct user_iovec *iovp;
724 
725 	/* Verify range before calling uio_create() */
726 	if (iovcnt <= 0 || iovcnt > UIO_MAXIOV || offset < 0) {
727 		return EINVAL;
728 	}
729 
730 	/* allocate a uio large enough to hold the number of iovecs passed */
731 	auio = uio_create(iovcnt, offset,
732 	    (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
733 	    UIO_WRITE);
734 
735 	/* get location of iovecs within the uio.  then copyin the iovecs from
736 	 * user space.
737 	 */
738 	iovp = uio_iovsaddr(auio);
739 	if (iovp == NULL) {
740 		error = ENOMEM;
741 		goto ExitThisRoutine;
742 	}
743 	error = copyin_user_iovec_array(user_iovp,
744 	    IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
745 	    iovcnt, iovp);
746 	if (error) {
747 		goto ExitThisRoutine;
748 	}
749 
750 	/* finalize uio_t for use and do the IO
751 	 */
752 	error = uio_calculateresid(auio);
753 	if (error) {
754 		goto ExitThisRoutine;
755 	}
756 
757 	error = wr_uio(p, fd, auio, is_pwritev, retval);
758 
759 ExitThisRoutine:
760 	if (auio != NULL) {
761 		uio_free(auio);
762 	}
763 	return error;
764 }
765 
766 /*
767  * Gather write system call
768  */
769 int
writev(struct proc * p,struct writev_args * uap,user_ssize_t * retval)770 writev(struct proc *p, struct writev_args *uap, user_ssize_t *retval)
771 {
772 	__pthread_testcancel(1);
773 	return writev_nocancel(p, (struct writev_nocancel_args *)uap, retval);
774 }
775 
776 int
writev_nocancel(struct proc * p,struct writev_nocancel_args * uap,user_ssize_t * retval)777 writev_nocancel(struct proc *p, struct writev_nocancel_args *uap, user_ssize_t *retval)
778 {
779 	return writev_prwritev_uio(p, uap->fd, uap->iovp, uap->iovcnt, 0, 0, retval);
780 }
781 
782 /*
783  * Pwritev system call
784  */
785 int
sys_pwritev(struct proc * p,struct pwritev_args * uap,user_ssize_t * retval)786 sys_pwritev(struct proc *p, struct pwritev_args *uap, user_ssize_t *retval)
787 {
788 	__pthread_testcancel(1);
789 	return sys_pwritev_nocancel(p, (struct pwritev_nocancel_args *)uap, retval);
790 }
791 
792 int
sys_pwritev_nocancel(struct proc * p,struct pwritev_nocancel_args * uap,user_ssize_t * retval)793 sys_pwritev_nocancel(struct proc *p, struct pwritev_nocancel_args *uap, user_ssize_t *retval)
794 {
795 	return writev_prwritev_uio(p, uap->fd, uap->iovp, uap->iovcnt, uap->offset, 1, retval);
796 }
797 
798 /*
799  * Returns:	0			Success
800  *	preparefilewrite:EBADF
801  *	preparefilewrite:ESPIPE
802  *	preparefilewrite:ENXIO
803  *	preparefilewrite:???
804  *	fo_write:???
805  */
806 int
wr_uio(struct proc * p,int fd,uio_t uio,int is_pwritev,user_ssize_t * retval)807 wr_uio(struct proc *p, int fd, uio_t uio, int is_pwritev, user_ssize_t *retval)
808 {
809 	struct fileproc *fp;
810 	int error;
811 	int flags;
812 
813 	if ((error = preparefilewrite(p, &fp, fd, is_pwritev))) {
814 		return error;
815 	}
816 
817 	flags = is_pwritev ? FOF_OFFSET : 0;
818 	error = do_uiowrite(p, fp, uio, flags, retval);
819 
820 	fp_drop(p, fd, fp, 0);
821 
822 	return error;
823 }
824 
825 int
do_uiowrite(struct proc * p,struct fileproc * fp,uio_t uio,int flags,user_ssize_t * retval)826 do_uiowrite(struct proc *p, struct fileproc *fp, uio_t uio, int flags, user_ssize_t *retval)
827 {
828 	int error;
829 	user_ssize_t count;
830 	struct vfs_context context = *vfs_context_current();
831 
832 	count = uio_resid(uio);
833 
834 	context.vc_ucred = fp->f_cred;
835 	error = fo_write(fp, uio, flags, &context);
836 	if (error) {
837 		if (uio_resid(uio) != count && (error == ERESTART ||
838 		    error == EINTR || error == EWOULDBLOCK)) {
839 			error = 0;
840 		}
841 		/* The socket layer handles SIGPIPE */
842 		if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
843 		    (fp->fp_glob->fg_lflags & FG_NOSIGPIPE) == 0) {
844 			psignal(p, SIGPIPE);
845 		}
846 	}
847 	count -= uio_resid(uio);
848 	if (count) {
849 		os_atomic_or(&fp->fp_glob->fg_flag, FWASWRITTEN, relaxed);
850 	}
851 	*retval = count;
852 
853 	return error;
854 }
855 
856 /*
857  * Returns:	0			Success
858  *	preparefileread:EBADF
859  *	preparefileread:ESPIPE
860  *	preparefileread:ENXIO
861  *	fo_read:???
862  */
863 int
rd_uio(struct proc * p,int fdes,uio_t uio,int is_preadv,user_ssize_t * retval)864 rd_uio(struct proc *p, int fdes, uio_t uio, int is_preadv, user_ssize_t *retval)
865 {
866 	struct fileproc *fp;
867 	int error;
868 	user_ssize_t count;
869 	struct vfs_context context = *vfs_context_current();
870 
871 	if ((error = preparefileread(p, &fp, fdes, is_preadv))) {
872 		return error;
873 	}
874 
875 	count = uio_resid(uio);
876 
877 	context.vc_ucred = fp->f_cred;
878 
879 	int flags = is_preadv ? FOF_OFFSET : 0;
880 	error = fo_read(fp, uio, flags, &context);
881 
882 	if (error) {
883 		if (uio_resid(uio) != count && (error == ERESTART ||
884 		    error == EINTR || error == EWOULDBLOCK)) {
885 			error = 0;
886 		}
887 	}
888 	*retval = count - uio_resid(uio);
889 
890 	fp_drop(p, fdes, fp, 0);
891 
892 	return error;
893 }
894 
895 /*
896  * Ioctl system call
897  *
898  * Returns:	0			Success
899  *		EBADF
900  *		ENOTTY
901  *		ENOMEM
902  *		ESRCH
903  *	copyin:EFAULT
904  *	copyout:EFAULT
905  *	fp_lookup:EBADF			Bad file descriptor
906  *	fo_ioctl:???
907  */
int
ioctl(struct proc *p, struct ioctl_args *uap, __unused int32_t *retval)
{
	struct fileproc *fp = NULL;
	int error = 0;
	u_int size = 0;
	caddr_t datap = NULL, memp = NULL;
	boolean_t is64bit = FALSE;
	int tmp = 0;
#define STK_PARAMS      128
	char stkbuf[STK_PARAMS] = {};
	int fd = uap->fd;
	u_long com = uap->com;
	struct vfs_context context = *vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(addr, uap->data);

	is64bit = proc_is64bit(p);
#if CONFIG_AUDIT
	if (is64bit) {
		AUDIT_ARG(value64, com);
	} else {
		AUDIT_ARG(cmd, CAST_DOWN_EXPLICIT(int, com));
	}
#endif /* CONFIG_AUDIT */

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		return ENOTTY;
	}
	/* Argument area: small commands use the stack, large ones the heap. */
	if (size > sizeof(stkbuf)) {
		memp = (caddr_t)kalloc_data(size, Z_WAITOK);
		if (memp == 0) {
			return ENOMEM;
		}
		datap = memp;
	} else {
		datap = &stkbuf[0];
	}
	if (com & IOC_IN) {
		if (size) {
			/* Input command: fetch the argument from user space. */
			error = copyin(uap->data, datap, size);
			if (error) {
				goto out_nofp;
			}
		} else {
			/* XXX - IOC_IN and no size?  we should probably return an error here!! */
			if (is64bit) {
				*(user_addr_t *)datap = uap->data;
			} else {
				*(uint32_t *)datap = (uint32_t)uap->data;
			}
		}
	} else if ((com & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(datap, size);
	} else if (com & IOC_VOID) {
		/* XXX - this is odd since IOC_VOID means no parameters */
		if (is64bit) {
			*(user_addr_t *)datap = uap->data;
		} else {
			*(uint32_t *)datap = (uint32_t)uap->data;
		}
	}

	/* The fd table lock is held across the command dispatch below. */
	proc_fdlock(p);
	error = fp_lookup(p, fd, &fp, 1);
	if (error) {
		proc_fdunlock(p);
		goto out_nofp;
	}

	AUDIT_ARG(file, p, fp);

	/* The descriptor must be open for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		goto out;
	}

	/* Dispatch under the file's own credential, not the caller's. */
	context.vc_ucred = fp->fp_glob->fg_cred;

#if CONFIG_MACF
	error = mac_file_check_ioctl(context.vc_ucred, fp->fp_glob, com);
	if (error) {
		goto out;
	}
#endif

	switch (com) {
	case FIONCLEX:
		/* Clear close-on-exec on the descriptor. */
		fp->fp_flags &= ~FP_CLOEXEC;
		break;

	case FIOCLEX:
		/* Set close-on-exec on the descriptor. */
		fp->fp_flags |= FP_CLOEXEC;
		break;

	case FIONBIO:
		// FIXME (rdar://54898652)
		//
		// this code is broken if fnctl(F_SETFL), ioctl() are
		// called concurrently for the same fileglob.
		if ((tmp = *(int *)datap)) {
			os_atomic_or(&fp->f_flag, FNONBLOCK, relaxed);
		} else {
			os_atomic_andnot(&fp->f_flag, FNONBLOCK, relaxed);
		}
		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, &context);
		break;

	case FIOASYNC:
		// FIXME (rdar://54898652)
		//
		// this code is broken if fnctl(F_SETFL), ioctl() are
		// called concurrently for the same fileglob.
		if ((tmp = *(int *)datap)) {
			os_atomic_or(&fp->f_flag, FASYNC, relaxed);
		} else {
			os_atomic_andnot(&fp->f_flag, FASYNC, relaxed);
		}
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, &context);
		break;

	case FIOSETOWN:
		tmp = *(int *)datap;
		if (fp->f_type == DTYPE_SOCKET) {
			/* Sockets store the pgid directly. */
			((struct socket *)fp_get_data(fp))->so_pgid = tmp;
			break;
		}
		if (fp->f_type == DTYPE_PIPE) {
			error = fo_ioctl(fp, TIOCSPGRP, (caddr_t)&tmp, &context);
			break;
		}
		/* Positive values name a process; translate to its pgid. */
		if (tmp <= 0) {
			tmp = -tmp;
		} else {
			struct proc *p1 = proc_find(tmp);
			if (p1 == 0) {
				error = ESRCH;
				break;
			}
			tmp = p1->p_pgrpid;
			proc_rele(p1);
		}
		error = fo_ioctl(fp, TIOCSPGRP, (caddr_t)&tmp, &context);
		break;

	case FIOGETOWN:
		if (fp->f_type == DTYPE_SOCKET) {
			*(int *)datap = ((struct socket *)fp_get_data(fp))->so_pgid;
			break;
		}
		error = fo_ioctl(fp, TIOCGPGRP, datap, &context);
		/* TIOCGPGRP returns a pgid; FIOGETOWN reports its negation. */
		*(int *)datap = -*(int *)datap;
		break;

	default:
		error = fo_ioctl(fp, com, datap, &context);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com & IOC_OUT) && size) {
			error = copyout(datap, uap->data, (u_int)size);
		}
		break;
	}
out:
	/* Release the fp reference (last argument 1: lock still held). */
	fp_drop(p, fd, fp, 1);
	proc_fdunlock(p);

out_nofp:
	if (memp) {
		kfree_data(memp, size);
	}
	return error;
}
1093 
int     selwait;

/*
 * selprocess() pass numbers: SEL_FIRSTPASS is the initial scan (which
 * also links the thread's select set to each fd via sellinkfp());
 * SEL_SECONDPASS is the rescan after a wakeup (which unlinks again
 * via selunlinkfp()).
 */
#define SEL_FIRSTPASS 1
#define SEL_SECONDPASS 2

/* forward declarations for the select() machinery below */
static int selprocess(struct proc *p, int error, int sel_pass);
static int selscan(struct proc *p, struct _select * sel, struct _select_data * seldata,
    int nfd, int32_t *retval, int sel_pass, struct select_set *selset);
static int selcount(struct proc *p, u_int32_t *ibits, int nfd, int *count);
static int seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup);
static int seldrop(struct proc *p, u_int32_t *ibits, int nfd, int lim);
static int select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeout, int32_t *retval);
1104 
1105 /*
1106  * This is used for the special device nodes that do not implement
1107  * a proper kevent filter (see filt_specattach).
1108  *
1109  * In order to enable kevents on those, the spec_filtops will pretend
1110  * to call select, and try to sniff the selrecord(), if it observes one,
1111  * the knote is attached, which pairs with selwakeup() or selthreadclear().
1112  *
1113  * The last issue remaining, is that we need to serialize filt_specdetach()
1114  * with this, but it really can't know the "selinfo" or any locking domain.
 * To make up for this, we protect knote list operations with a global lock,
 * which gives us a safe shared locking domain.
1117  *
1118  * Note: It is a little distasteful, but we really have very few of those.
1119  *       The big problem here is that sharing a lock domain without
1120  *       any kind of shared knowledge is a little complicated.
1121  *
1122  *       1. filters can really implement their own kqueue integration
1123  *          to side step this,
1124  *
1125  *       2. There's an opportunity to pick a private lock in selspec_attach()
1126  *          because both the selinfo and the knote are locked at that time.
1127  *          The cleanup story is however a little complicated.
1128  */
1129 static LCK_GRP_DECLARE(selspec_grp, "spec_filtops");
1130 static LCK_SPIN_DECLARE(selspec_lock, &selspec_grp);
1131 
/*
 * Conditions:
 *	The "primitive" lock (the one guarding the selinfo) is held.
 *	The knote lock is held.
 */
void
selspec_attach(struct knote *kn, struct selinfo *si)
{
	struct selinfo *cur = os_atomic_load(&kn->kn_hook, relaxed);

	if (cur == NULL) {
		/*
		 * First attach: flag the selinfo so the wakeup path knows
		 * knotes are hanging off si_note, then hook the knote under
		 * the global selspec lock (the shared locking domain
		 * described above).
		 */
		si->si_flags |= SI_SELSPEC;
		lck_spin_lock(&selspec_lock);
		kn->kn_hook = si;
		KNOTE_ATTACH(&si->si_note, kn);
		lck_spin_unlock(&selspec_lock);
	} else {
		/*
		 * selspec_attach() can be called from e.g. filt_spectouch()
		 * which might be called before any event was dequeued.
		 *
		 * It is hence not impossible for the knote to already be
		 * hooked.
		 *
		 * Note that selwakeup_internal() could possibly
		 * already have cleared this pointer. This is a race
		 * that filt_specprocess will debounce.
		 */
		assert(si->si_flags & SI_SELSPEC);
		assert(cur == si);
	}
}
1162 
/*
 * Conditions:
 *	The "primitive" lock is _not_ held.
 *	The knote lock is held.
 */
void
selspec_detach(struct knote *kn)
{
	/*
	 * kn_hook always becomes non NULL under the knote lock.
	 * Seeing "NULL" can't be a false positive.
	 */
	if (kn->kn_hook == NULL) {
		return;
	}

	/*
	 * Re-check under selspec_lock: the wakeup side (see the
	 * selspec_attach() comment re selwakeup_internal()) may have
	 * cleared kn_hook between the lockless check above and here.
	 */
	lck_spin_lock(&selspec_lock);
	if (kn->kn_hook) {
		struct selinfo *sip = kn->kn_hook;

		kn->kn_hook = NULL;
		KNOTE_DETACH(&sip->si_note, kn);
	}
	lck_spin_unlock(&selspec_lock);
}
1187 
/*
 * Select system call.
 *
 * Cancellation-point wrapper: tests for a pending pthread cancellation,
 * then forwards to select_nocancel() (the two argument structures share
 * the same layout, hence the cast).
 *
 * Returns:	0			Success
 *		EINVAL			Invalid argument
 *		EAGAIN			Nonconformant error if allocation fails
 */
int
select(struct proc *p, struct select_args *uap, int32_t *retval)
{
	__pthread_testcancel(1);
	return select_nocancel(p, (struct select_nocancel_args *)uap, retval);
}
1201 
1202 int
select_nocancel(struct proc * p,struct select_nocancel_args * uap,int32_t * retval)1203 select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retval)
1204 {
1205 	uint64_t timeout = 0;
1206 
1207 	if (uap->tv) {
1208 		int err;
1209 		struct timeval atv;
1210 		if (IS_64BIT_PROCESS(p)) {
1211 			struct user64_timeval atv64;
1212 			err = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
1213 			/* Loses resolution - assume timeout < 68 years */
1214 			atv.tv_sec = (__darwin_time_t)atv64.tv_sec;
1215 			atv.tv_usec = atv64.tv_usec;
1216 		} else {
1217 			struct user32_timeval atv32;
1218 			err = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
1219 			atv.tv_sec = atv32.tv_sec;
1220 			atv.tv_usec = atv32.tv_usec;
1221 		}
1222 		if (err) {
1223 			return err;
1224 		}
1225 
1226 		if (itimerfix(&atv)) {
1227 			err = EINVAL;
1228 			return err;
1229 		}
1230 
1231 		clock_absolutetime_interval_to_deadline(tvtoabstime(&atv), &timeout);
1232 	}
1233 
1234 	return select_internal(p, uap, timeout, retval);
1235 }
1236 
/*
 * pselect system call.
 *
 * Cancellation-point wrapper around pselect_nocancel(); the two
 * argument structures share the same layout, hence the cast.
 */
int
pselect(struct proc *p, struct pselect_args *uap, int32_t *retval)
{
	__pthread_testcancel(1);
	return pselect_nocancel(p, (struct pselect_nocancel_args *)uap, retval);
}
1243 
/*
 * pselect_nocancel
 *
 * Core of pselect: unpack the timespec into an absolute deadline,
 * temporarily install the caller-supplied signal mask (saving the old
 * one in the uthread), run the common select path, and restore the
 * mask on direct return.
 *
 * Note: if select_internal() blocks and is resumed as a continuation,
 * selprocess() performs the equivalent mask restoration itself (see the
 * UT_SAS_OLDMASK handling at the end of selprocess()).
 */
int
pselect_nocancel(struct proc *p, struct pselect_nocancel_args *uap, int32_t *retval)
{
	int err;
	struct uthread *ut;
	uint64_t timeout = 0;

	if (uap->ts) {
		struct timespec ts;

		if (IS_64BIT_PROCESS(p)) {
			struct user64_timespec ts64;
			err = copyin(uap->ts, (caddr_t)&ts64, sizeof(ts64));
			ts.tv_sec = (__darwin_time_t)ts64.tv_sec;
			ts.tv_nsec = (long)ts64.tv_nsec;
		} else {
			struct user32_timespec ts32;
			err = copyin(uap->ts, (caddr_t)&ts32, sizeof(ts32));
			ts.tv_sec = ts32.tv_sec;
			ts.tv_nsec = ts32.tv_nsec;
		}
		if (err) {
			return err;
		}

		if (!timespec_is_valid(&ts)) {
			return EINVAL;
		}
		clock_absolutetime_interval_to_deadline(tstoabstime(&ts), &timeout);
	}

	ut = current_uthread();

	if (uap->mask != USER_ADDR_NULL) {
		/* save current mask, then copyin and set new mask */
		sigset_t newset;
		err = copyin(uap->mask, &newset, sizeof(sigset_t));
		if (err) {
			return err;
		}
		ut->uu_oldmask = ut->uu_sigmask;
		ut->uu_flag |= UT_SAS_OLDMASK;
		/* never allow unmaskable signals to be unblocked */
		ut->uu_sigmask = (newset & ~sigcantmask);
	}

	err = select_internal(p, (struct select_nocancel_args *)uap, timeout, retval);

	if (err != EINTR && ut->uu_flag & UT_SAS_OLDMASK) {
		/*
		 * Restore old mask (direct return case). NOTE: EINTR can also be returned
		 * if the thread is cancelled. In that case, we don't reset the signal
		 * mask to its original value (which usually happens in the signal
		 * delivery path). This behavior is permitted by POSIX.
		 */
		ut->uu_sigmask = ut->uu_oldmask;
		ut->uu_oldmask = 0;
		ut->uu_flag &= ~UT_SAS_OLDMASK;
	}

	return err;
}
1305 
1306 void
select_cleanup_uthread(struct _select * sel)1307 select_cleanup_uthread(struct _select *sel)
1308 {
1309 	kfree_data(sel->ibits, 2 * sel->nbytes);
1310 	sel->ibits = sel->obits = NULL;
1311 	sel->nbytes = 0;
1312 }
1313 
1314 static int
select_grow_uthread_cache(struct _select * sel,uint32_t nbytes)1315 select_grow_uthread_cache(struct _select *sel, uint32_t nbytes)
1316 {
1317 	uint32_t *buf;
1318 
1319 	buf = kalloc_data(2 * nbytes, Z_WAITOK | Z_ZERO);
1320 	if (buf) {
1321 		select_cleanup_uthread(sel);
1322 		sel->ibits = buf;
1323 		sel->obits = buf + nbytes / sizeof(uint32_t);
1324 		sel->nbytes = nbytes;
1325 		return true;
1326 	}
1327 	return false;
1328 }
1329 
/*
 * Zero both the input and output bit vectors (they share one allocation
 * of 2 * nbytes bytes starting at ibits) so the cached buffers can be
 * reused for a fresh select call.
 */
static void
select_bzero_uthread_cache(struct _select *sel)
{
	bzero(sel->ibits, sel->nbytes * 2);
}
1335 
/*
 * Generic implementation of {,p}select. Care: we type-pun uap across the two
 * syscalls, which differ slightly. The first 4 arguments (nfds and the fd sets)
 * are identical. The 5th (timeout) argument points to different types, so we
 * unpack in the syscall-specific code, but the generic code still does a null
 * check on this argument to determine if a timeout was specified.
 */
static int
select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeout, int32_t *retval)
{
	struct uthread *uth = current_uthread();
	struct _select *sel = &uth->uu_select;
	struct _select_data *seldata = &uth->uu_save.uus_select_data;
	int error = 0;
	u_int ni, nw;

	*retval = 0;

	/*
	 * Stash the arguments in the uthread save area: selprocess() may
	 * be re-entered as a continuation (via selcontinue()) after the
	 * kernel stack has been discarded.
	 */
	seldata->abstime = timeout;
	seldata->args = uap;
	seldata->retval = retval;
	seldata->count = 0;

	if (uap->nd < 0) {
		return EINVAL;
	}

	if (uap->nd > p->p_fd.fd_nfiles) {
		uap->nd = p->p_fd.fd_nfiles; /* forgiving; slightly wrong */
	}
	nw = howmany(uap->nd, NFDBITS); /* fd_mask words per bit vector */
	ni = nw * sizeof(fd_mask);      /* bytes per bit vector */

	/*
	 * if the previously allocated space for the bits is smaller than
	 * what is requested or no space has yet been allocated for this
	 * thread, allocate enough space now.
	 *
	 * Note: If this process fails, select() will return EAGAIN; this
	 * is the same thing poll() returns in a no-memory situation, but
	 * it is not a POSIX compliant error code for select().
	 */
	if (sel->nbytes >= (3 * ni)) {
		select_bzero_uthread_cache(sel);
	} else if (!select_grow_uthread_cache(sel, 3 * ni)) {
		return EAGAIN;
	}

	/*
	 * get the bits from the user address space; a NULL user pointer
	 * leaves the corresponding vector all-zero
	 */
#define getbits(name, x) \
	(uap->name ? copyin(uap->name, &sel->ibits[(x) * nw], ni) : 0)

	if ((error = getbits(in, 0))) {
		return error;
	}
	if ((error = getbits(ou, 1))) {
		return error;
	}
	if ((error = getbits(ex, 2))) {
		return error;
	}
#undef  getbits

	/* take an fp_iocount reference on every fd named in the vectors */
	if ((error = selcount(p, sel->ibits, uap->nd, &seldata->count))) {
		return error;
	}

	/* lazily allocate the per-thread select waitq set */
	if (uth->uu_selset == NULL) {
		uth->uu_selset = select_set_alloc();
	}
	return selprocess(p, 0, SEL_FIRSTPASS);
}
1410 
/*
 * Continuation entry point: selprocess() blocks via tsleep1() with this
 * continuation, so when the thread wakes it restarts here (on a fresh
 * kernel stack) and resumes as the second pass.
 */
static int
selcontinue(int error)
{
	return selprocess(current_proc(), error, SEL_SECONDPASS);
}
1416 
1417 
/*
 * selprocess
 *
 * Scan the descriptors set up by select_internal() and either return
 * the ready set or block until an event fires or the deadline passes.
 * All per-call state lives in the uthread (uu_select /
 * uu_save.uus_select_data) because this function may be resumed as a
 * continuation (selcontinue) after the kernel stack is gone.
 *
 * Parameters:	error			The error code from our caller
 *		sel_pass		The pass we are on; SEL_FIRSTPASS on
 *					entry from select_internal(),
 *					SEL_SECONDPASS when resumed after
 *					a wait
 */
int
selprocess(struct proc *p, int error, int sel_pass)
{
	struct uthread *uth = current_uthread();
	struct _select *sel = &uth->uu_select;
	struct _select_data *seldata = &uth->uu_save.uus_select_data;
	struct select_nocancel_args *uap = seldata->args;
	int *retval = seldata->retval;

	int unwind = 1;
	int prepost = 0;
	int somewakeup = 0;
	int doretry = 0;
	wait_result_t wait_result;

	/*
	 * Only unwind (drop the fp_iocounts taken by selcount()) when the
	 * first pass actually set things up and there is something to drop.
	 */
	if ((error != 0) && (sel_pass == SEL_FIRSTPASS)) {
		unwind = 0;
	}
	if (seldata->count == 0) {
		unwind = 0;
	}
retry:
	if (error != 0) {
		goto done;
	}

	OSBitOrAtomic(P_SELECT, &p->p_flag);

	/* skip scans if the select is just for timeouts */
	if (seldata->count) {
		error = selscan(p, sel, seldata, uap->nd, retval, sel_pass,
		    uth->uu_selset);
		if (error || *retval) {
			goto done;
		}
		if (prepost || somewakeup) {
			/*
			 * We woke up (preposted event or plain wakeup) but
			 * the rescan found nothing ready - someone else may
			 * already have consumed the data; go around and
			 * scan again if time permits.
			 */
			prepost = 0;
			somewakeup = 0;
			doretry = 1;
		}
	}

	if (uap->tv) {
		uint64_t        now;

		clock_get_uptime(&now);
		/* deadline has passed: return with whatever we have */
		if (now >= seldata->abstime) {
			goto done;
		}
	}

	if (doretry) {
		/* cleanup obits and try again */
		doretry = 0;
		sel_pass = SEL_FIRSTPASS;
		goto retry;
	}

	/*
	 * To effect a poll, the timeout argument should be
	 * non-nil, pointing to a zero-valued timeval structure.
	 */
	if (uap->tv && seldata->abstime == 0) {
		goto done;
	}

	/* No spurious wakeups due to collisions, no need to check for them */
	if ((sel_pass == SEL_SECONDPASS) || ((p->p_flag & P_SELECT) == 0)) {
		sel_pass = SEL_FIRSTPASS;
		goto retry;
	}

	OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);

	/* if the select is just for timeout skip check */
	if (seldata->count && (sel_pass == SEL_SECONDPASS)) {
		panic("selprocess: 2nd pass assertwaiting");
	}

	/*
	 * Assert-wait on the per-thread select set; if an event was
	 * already preposted this fails to block and we can go straight
	 * back to scanning.
	 */
	wait_result = waitq_assert_wait64_leeway(uth->uu_selset,
	    NO_EVENT64, THREAD_ABORTSAFE,
	    TIMEOUT_URGENCY_USER_NORMAL,
	    seldata->abstime,
	    TIMEOUT_NO_LEEWAY);
	if (wait_result != THREAD_AWAKENED) {
		/* there are no preposted events */
		error = tsleep1(NULL, PSOCK | PCATCH,
		    "select", 0, selcontinue);
	} else {
		prepost = 1;
		error = 0;
	}

	if (error == 0) {
		sel_pass = SEL_SECONDPASS;
		if (!prepost) {
			somewakeup = 1;
		}
		goto retry;
	}
done:
	if (unwind) {
		seldrop(p, sel->ibits, uap->nd, seldata->count);
		select_set_reset(uth->uu_selset);
	}
	OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
	/* select is not restarted after signals... */
	if (error == ERESTART) {
		error = EINTR;
	}
	if (error == EWOULDBLOCK) {
		error = 0;
	}

	if (error == 0) {
		uint32_t nw = howmany(uap->nd, NFDBITS);
		uint32_t ni = nw * sizeof(fd_mask);

		/* copy the output bit vectors back to user space */
#define putbits(name, x) \
	(uap->name ? copyout(&sel->obits[(x) * nw], uap->name, ni) : 0)
		int e0 = putbits(in, 0);
		int e1 = putbits(ou, 1);
		int e2 = putbits(ex, 2);

		error = e0 ?: e1 ?: e2;
#undef putbits
	}

	if (error != EINTR && sel_pass == SEL_SECONDPASS && uth->uu_flag & UT_SAS_OLDMASK) {
		/* restore signal mask - continuation case */
		uth->uu_sigmask = uth->uu_oldmask;
		uth->uu_oldmask = 0;
		uth->uu_flag &= ~UT_SAS_OLDMASK;
	}

	return error;
}
1566 
1567 
1568 /**
1569  * remove the fileproc's underlying waitq from the supplied waitq set;
1570  * clear FP_INSELECT when appropriate
1571  *
1572  * Parameters:
1573  *		fp	File proc that is potentially currently in select
1574  *		selset	Waitq set to which the fileproc may belong
1575  *			(usually this is the thread's private waitq set)
1576  * Conditions:
1577  *		proc_fdlock is held
1578  */
1579 static void
selunlinkfp(struct fileproc * fp,struct select_set * selset)1580 selunlinkfp(struct fileproc *fp, struct select_set *selset)
1581 {
1582 	if (fp->fp_flags & FP_INSELECT) {
1583 		if (fp->fp_guard_attrs) {
1584 			if (fp->fp_guard->fpg_wset == selset) {
1585 				fp->fp_guard->fpg_wset = NULL;
1586 				fp->fp_flags &= ~FP_INSELECT;
1587 			}
1588 		} else {
1589 			if (fp->fp_wset == selset) {
1590 				fp->fp_wset = NULL;
1591 				fp->fp_flags &= ~FP_INSELECT;
1592 			}
1593 		}
1594 	}
1595 }
1596 
1597 /**
1598  * connect a fileproc to the given selset, potentially bridging to a waitq
1599  * pointed to indirectly by wq_data
1600  *
1601  * Parameters:
1602  *		fp	File proc potentially currently in select
1603  *		selset	Waitq set to which the fileproc should now belong
1604  *			(usually this is the thread's private waitq set)
1605  *
1606  * Conditions:
1607  *		proc_fdlock is held
1608  */
1609 static void
sellinkfp(struct fileproc * fp,struct select_set * selset,waitq_link_t * linkp)1610 sellinkfp(struct fileproc *fp, struct select_set *selset, waitq_link_t *linkp)
1611 {
1612 	if ((fp->fp_flags & FP_INSELECT) == 0) {
1613 		if (fp->fp_guard_attrs) {
1614 			fp->fp_guard->fpg_wset = selset;
1615 		} else {
1616 			fp->fp_wset = selset;
1617 		}
1618 		fp->fp_flags |= FP_INSELECT;
1619 	} else {
1620 		fp->fp_flags |= FP_SELCONFLICT;
1621 		if (linkp->wqlh == NULL) {
1622 			*linkp = waitq_link_alloc(WQT_SELECT_SET);
1623 		}
1624 		select_set_link(&select_conflict_queue, selset, linkp);
1625 	}
1626 }
1627 
1628 
/*
 * selscan
 *
 * Parameters:	p			Process performing the select
 *		sel			The per-thread select context structure
 *		seldata			Per-thread continuation save area
 *		nfd			The number of file descriptors to scan
 *		retval			The per thread system call return area
 *		sel_pass		Which pass this is; allowed values are
 *						SEL_FIRSTPASS and SEL_SECONDPASS
 *		selset			The per thread wait queue set
 *
 * Returns:	0			Success
 *		EIO			Invalid p->p_fd field XXX Obsolete?
 *		EBADF			One of the files in the bit vector is
 *						invalid.
 */
static int
selscan(struct proc *p, struct _select *sel, struct _select_data * seldata,
    int nfd, int32_t *retval, int sel_pass, struct select_set *selset)
{
	int msk, i, j, fd;
	u_int32_t bits;
	struct fileproc *fp;
	int n = 0;              /* count of bits */
	int nc = 0;             /* bit vector offset (nc'th bit) */
	static int flag[3] = { FREAD, FWRITE, 0 }; /* read, write, except */
	u_int32_t *iptr, *optr;
	u_int nw;
	u_int32_t *ibits, *obits;
	int count;
	struct vfs_context context = {
		.vc_thread = current_thread(),
	};
	waitq_link_t link = WQL_NULL;
	void *s_data;

	ibits = sel->ibits;
	obits = sel->obits;

	nw = howmany(nfd, NFDBITS);

	count = seldata->count;

	nc = 0;
	if (!count) {
		*retval = 0;
		return 0;
	}

	if (sel_pass == SEL_FIRSTPASS) {
		/*
		 * Make sure the waitq-set is all clean:
		 *
		 * select loops until it finds at least one event, however it
		 * doesn't mean that the event that woke up select is still
		 * fired by the time the second pass runs, and then
		 * select_internal will loop back to a first pass.
		 */
		select_set_reset(selset);
		s_data = &link;
	} else {
		/* second pass: do not record the thread in any waitq */
		s_data = NULL;
	}

	proc_fdlock(p);
	for (msk = 0; msk < 3; msk++) {
		iptr = (u_int32_t *)&ibits[msk * nw];
		optr = (u_int32_t *)&obits[msk * nw];

		for (i = 0; i < nfd; i += NFDBITS) {
			bits = iptr[i / NFDBITS];

			/* visit each set bit in this word, lowest first */
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1U << j);

				fp = fp_get_noref_locked(p, fd);
				if (fp == NULL) {
					/*
					 * If we abort because of a bad
					 * fd, let the caller unwind...
					 */
					proc_fdunlock(p);
					return EBADF;
				}
				if (sel_pass == SEL_SECONDPASS) {
					selunlinkfp(fp, selset);
				} else if (link.wqlh == NULL) {
					/* pre-allocate for sellinkfp() below */
					link = waitq_link_alloc(WQT_SELECT_SET);
				}

				context.vc_ucred = fp->f_cred;

				/* The select; set the bit, if true */
				if (fo_select(fp, flag[msk], s_data, &context)) {
					optr[fd / NFDBITS] |= (1U << (fd % NFDBITS));
					n++;
				}
				if (sel_pass == SEL_FIRSTPASS) {
					/*
					 * Hook up the thread's waitq set either to
					 * the fileproc structure, or to the global
					 * conflict queue: but only on the first
					 * select pass.
					 */
					sellinkfp(fp, selset, &link);
				}
				nc++;
			}
		}
	}
	proc_fdunlock(p);

	/* release the pre-allocated link if it was never consumed */
	if (link.wqlh) {
		waitq_link_free(WQT_SELECT_SET, link);
	}

	*retval = n;
	return 0;
}
1748 
1749 static int poll_callback(struct kevent_qos_s *, kevent_ctx_t);
1750 
/*
 * poll system call: cancellation-point wrapper around poll_nocancel()
 * (the two argument structures share the same layout, hence the cast).
 */
int
poll(struct proc *p, struct poll_args *uap, int32_t *retval)
{
	__pthread_testcancel(1);
	return poll_nocancel(p, (struct poll_nocancel_args *)uap, retval);
}
1757 
1758 
/*
 * poll_nocancel
 *
 * Implement poll() on top of kqueue: register one oneshot kevent per
 * requested condition for each pollfd, scan (and possibly wait) for
 * triggered events, then let poll_callback() translate each delivered
 * kevent back into revents bits.
 */
int
poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
{
	struct pollfd *fds = NULL;
	struct kqueue *kq = NULL;
	int error = 0;
	u_int nfds = uap->nfds;
	u_int rfds = 0; /* number of fds with non-zero revents */
	rlim_t nofile = proc_limitgetcur(p, RLIMIT_NOFILE);
	size_t ni = nfds * sizeof(struct pollfd);

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if (nfds > OPEN_MAX ||
	    (nfds > nofile && (proc_suser(p) || nfds > FD_SETSIZE))) {
		return EINVAL;
	}

	/* a private kqueue backs this poll call only */
	kq = kqueue_alloc(p);
	if (kq == NULL) {
		return EAGAIN;
	}

	if (nfds) {
		fds = (struct pollfd *)kalloc_data(ni, Z_WAITOK);
		if (NULL == fds) {
			error = EAGAIN;
			goto out;
		}

		error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd));
		if (error) {
			goto out;
		}
	}

	/* JMM - all this P_SELECT stuff is bogus */
	OSBitOrAtomic(P_SELECT, &p->p_flag);
	for (u_int i = 0; i < nfds; i++) {
		short events = fds[i].events;
		__assert_only int rc;

		/* per spec, ignore fd values below zero */
		if (fds[i].fd < 0) {
			fds[i].revents = 0;
			continue;
		}

		/*
		 * convert the poll event into a kqueue kevent; the pollfd
		 * pointer rides along in udata so poll_callback() can find
		 * it again
		 */
		struct kevent_qos_s kev = {
			.ident = fds[i].fd,
			.flags = EV_ADD | EV_ONESHOT | EV_POLL,
			.udata = CAST_USER_ADDR_T(&fds[i])
		};

		/* Handle input events */
		if (events & (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND | POLLHUP)) {
			kev.filter = EVFILT_READ;
			if (events & (POLLPRI | POLLRDBAND)) {
				kev.flags |= EV_OOBAND;
			}
			rc = kevent_register(kq, &kev, NULL);
			assert((rc & FILTER_REGISTER_WAIT) == 0);
		}

		/*
		 * Handle output events; a registration failure is reported
		 * by kevent_register() via EV_ERROR in kev.flags, in which
		 * case we stop registering further filters for this fd.
		 */
		if ((kev.flags & EV_ERROR) == 0 &&
		    (events & (POLLOUT | POLLWRNORM | POLLWRBAND))) {
			kev.filter = EVFILT_WRITE;
			rc = kevent_register(kq, &kev, NULL);
			assert((rc & FILTER_REGISTER_WAIT) == 0);
		}

		/* Handle BSD extension vnode events */
		if ((kev.flags & EV_ERROR) == 0 &&
		    (events & (POLLEXTEND | POLLATTRIB | POLLNLINK | POLLWRITE))) {
			kev.filter = EVFILT_VNODE;
			kev.fflags = 0;
			if (events & POLLEXTEND) {
				kev.fflags |= NOTE_EXTEND;
			}
			if (events & POLLATTRIB) {
				kev.fflags |= NOTE_ATTRIB;
			}
			if (events & POLLNLINK) {
				kev.fflags |= NOTE_LINK;
			}
			if (events & POLLWRITE) {
				kev.fflags |= NOTE_WRITE;
			}
			rc = kevent_register(kq, &kev, NULL);
			assert((rc & FILTER_REGISTER_WAIT) == 0);
		}

		/* a registration failure for this fd is reported as POLLNVAL */
		if (kev.flags & EV_ERROR) {
			fds[i].revents = POLLNVAL;
			rfds++;
		} else {
			fds[i].revents = 0;
		}
	}

	/*
	 * Did we have any trouble registering?
	 * If user space passed 0 FDs, then respect any timeout value passed.
	 * This is an extremely inefficient sleep. If user space passed one or
	 * more FDs, and we had trouble registering _all_ of them, then bail
	 * out. If a subset of the provided FDs failed to register, then we
	 * will still call the kqueue_scan function.
	 */
	if (nfds && (rfds == nfds)) {
		goto done;
	}

	/* scan for, and possibly wait for, the kevents to trigger */
	kevent_ctx_t kectx = kevent_get_context(current_thread());
	*kectx = (struct kevent_ctx_s){
		.kec_process_noutputs = rfds,
		.kec_process_flags    = KEVENT_FLAG_POLL,
		.kec_deadline         = 0, /* wait forever */
	};

	/*
	 * If any events have trouble registering, an event has fired and we
	 * shouldn't wait for events in kqueue_scan.
	 */
	if (rfds) {
		kectx->kec_process_flags |= KEVENT_FLAG_IMMEDIATE;
	} else if (uap->timeout != -1) {
		clock_interval_to_deadline(uap->timeout, NSEC_PER_MSEC,
		    &kectx->kec_deadline);
	}

	error = kqueue_scan(kq, kectx->kec_process_flags, kectx, poll_callback);
	rfds = kectx->kec_process_noutputs;

done:
	OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
	/* poll is not restarted after signals... */
	if (error == ERESTART) {
		error = EINTR;
	}
	if (error == 0) {
		error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd));
		*retval = rfds;
	}

out:
	kfree_data(fds, ni);

	kqueue_dealloc(kq);
	return error;
}
1917 
1918 static int
poll_callback(struct kevent_qos_s * kevp,kevent_ctx_t kectx)1919 poll_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
1920 {
1921 	struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata);
1922 	short prev_revents = fds->revents;
1923 	short mask = 0;
1924 
1925 	/* convert the results back into revents */
1926 	if (kevp->flags & EV_EOF) {
1927 		fds->revents |= POLLHUP;
1928 	}
1929 	if (kevp->flags & EV_ERROR) {
1930 		fds->revents |= POLLERR;
1931 	}
1932 
1933 	switch (kevp->filter) {
1934 	case EVFILT_READ:
1935 		if (fds->revents & POLLHUP) {
1936 			mask = (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND);
1937 		} else {
1938 			mask = (POLLIN | POLLRDNORM);
1939 			if (kevp->flags & EV_OOBAND) {
1940 				mask |= (POLLPRI | POLLRDBAND);
1941 			}
1942 		}
1943 		fds->revents |= (fds->events & mask);
1944 		break;
1945 
1946 	case EVFILT_WRITE:
1947 		if (!(fds->revents & POLLHUP)) {
1948 			fds->revents |= (fds->events & (POLLOUT | POLLWRNORM | POLLWRBAND));
1949 		}
1950 		break;
1951 
1952 	case EVFILT_VNODE:
1953 		if (kevp->fflags & NOTE_EXTEND) {
1954 			fds->revents |= (fds->events & POLLEXTEND);
1955 		}
1956 		if (kevp->fflags & NOTE_ATTRIB) {
1957 			fds->revents |= (fds->events & POLLATTRIB);
1958 		}
1959 		if (kevp->fflags & NOTE_LINK) {
1960 			fds->revents |= (fds->events & POLLNLINK);
1961 		}
1962 		if (kevp->fflags & NOTE_WRITE) {
1963 			fds->revents |= (fds->events & POLLWRITE);
1964 		}
1965 		break;
1966 	}
1967 
1968 	if (fds->revents != 0 && prev_revents == 0) {
1969 		kectx->kec_process_noutputs++;
1970 	}
1971 
1972 	return 0;
1973 }
1974 
/*
 * seltrue
 *
 * Select handler for devices that are always ready: unconditionally
 * reports the descriptor as ready for the requested operation.
 */
int
seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p)
{
	return 1;
}
1980 
/*
 * selcount
 *
 * Count the number of bits set in the input bit vector, and establish an
 * outstanding fp->fp_iocount for each of the descriptors which will be in
 * use in the select operation.
 *
 * Parameters:	p			The process doing the select
 *		ibits			The input bit vector
 *		nfd			The number of fd's in the vector
 *		countp			Pointer to where to store the bit count
 *
 * Returns:	0			Success
 *		EBADF			One of the bits in the input bit vector
 *						references an invalid fd
 *
 * Implicit:	*countp (modified)	Count of fd's
 *
 * Notes:	This function is the first pass under the proc_fdlock() that
 *		permits us to recognize invalid descriptors in the bit vector;
 *		they may, however, not remain valid through the drop and
 *		later reacquisition of the proc_fdlock().
 */
static int
selcount(struct proc *p, u_int32_t *ibits, int nfd, int *countp)
{
	int msk, i, j, fd;
	u_int32_t bits;
	struct fileproc *fp;
	int n = 0;
	u_int32_t *iptr;
	u_int nw;
	int error = 0;
	int need_wakeup = 0;

	nw = howmany(nfd, NFDBITS);

	proc_fdlock(p);
	for (msk = 0; msk < 3; msk++) {
		iptr = (u_int32_t *)&ibits[msk * nw];
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = iptr[i / NFDBITS];
			/* visit each set bit in this word, lowest first */
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1U << j);

				fp = fp_get_noref_locked(p, fd);
				if (fp == NULL) {
					*countp = 0;
					error = EBADF;
					goto bad;
				}
				/* pin the fileproc for the life of the select */
				os_ref_retain_locked(&fp->fp_iocount);
				n++;
			}
		}
	}
	proc_fdunlock(p);

	*countp = n;
	return 0;

bad:
	/* undo the iocounts taken so far before reporting EBADF */
	if (n == 0) {
		goto out;
	}
	/* Ignore error return; it's already EBADF */
	(void)seldrop_locked(p, ibits, nfd, n, &need_wakeup);

out:
	proc_fdunlock(p);
	if (need_wakeup) {
		wakeup(&p->p_fd.fd_fpdrainwait);
	}
	return error;
}
2057 
2058 
2059 /*
2060  * seldrop_locked
2061  *
2062  * Drop outstanding wait queue references set up during selscan(); drop the
2063  * outstanding per fileproc fp_iocount picked up during the selcount().
2064  *
2065  * Parameters:	p			Process performing the select
2066  *		ibits			Input bit bector of fd's
2067  *		nfd			Number of fd's
2068  *		lim			Limit to number of vector entries to
2069  *						consider, or -1 for "all"
2070  *		inselect		True if
2071  *		need_wakeup		Pointer to flag to set to do a wakeup
2072  *					if f_iocont on any descriptor goes to 0
2073  *
2074  * Returns:	0			Success
2075  *		EBADF			One or more fds in the bit vector
2076  *						were invalid, but the rest
2077  *						were successfully dropped
2078  *
2079  * Notes:	An fd make become bad while the proc_fdlock() is not held,
2080  *		if a multithreaded application closes the fd out from under
2081  *		the in progress select.  In this case, we still have to
2082  *		clean up after the set up on the remaining fds.
2083  */
2084 static int
seldrop_locked(struct proc * p,u_int32_t * ibits,int nfd,int lim,int * need_wakeup)2085 seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup)
2086 {
2087 	int msk, i, j, nc, fd;
2088 	u_int32_t bits;
2089 	struct fileproc *fp;
2090 	u_int32_t *iptr;
2091 	u_int nw;
2092 	int error = 0;
2093 	uthread_t uth = current_uthread();
2094 	struct _select_data *seldata;
2095 
2096 	*need_wakeup = 0;
2097 
2098 	nw = howmany(nfd, NFDBITS);
2099 	seldata = &uth->uu_save.uus_select_data;
2100 
2101 	nc = 0;
2102 	for (msk = 0; msk < 3; msk++) {
2103 		iptr = (u_int32_t *)&ibits[msk * nw];
2104 		for (i = 0; i < nfd; i += NFDBITS) {
2105 			bits = iptr[i / NFDBITS];
2106 			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
2107 				bits &= ~(1U << j);
2108 				/*
2109 				 * If we've already dropped as many as were
2110 				 * counted/scanned, then we are done.
2111 				 */
2112 				if (nc >= lim) {
2113 					goto done;
2114 				}
2115 
2116 				/*
2117 				 * We took an I/O reference in selcount,
2118 				 * so the fp can't possibly be NULL.
2119 				 */
2120 				fp = fp_get_noref_locked_with_iocount(p, fd);
2121 				selunlinkfp(fp, uth->uu_selset);
2122 
2123 				nc++;
2124 
2125 				const os_ref_count_t refc = os_ref_release_locked(&fp->fp_iocount);
2126 				if (0 == refc) {
2127 					panic("fp_iocount overdecrement!");
2128 				}
2129 
2130 				if (1 == refc) {
2131 					/*
2132 					 * The last iocount is responsible for clearing
2133 					 * selconfict flag - even if we didn't set it -
2134 					 * and is also responsible for waking up anyone
2135 					 * waiting on iocounts to drain.
2136 					 */
2137 					if (fp->fp_flags & FP_SELCONFLICT) {
2138 						fp->fp_flags &= ~FP_SELCONFLICT;
2139 					}
2140 					if (p->p_fd.fd_fpdrainwait) {
2141 						p->p_fd.fd_fpdrainwait = 0;
2142 						*need_wakeup = 1;
2143 					}
2144 				}
2145 			}
2146 		}
2147 	}
2148 done:
2149 	return error;
2150 }
2151 
2152 
2153 static int
seldrop(struct proc * p,u_int32_t * ibits,int nfd,int lim)2154 seldrop(struct proc *p, u_int32_t *ibits, int nfd, int lim)
2155 {
2156 	int error;
2157 	int need_wakeup = 0;
2158 
2159 	proc_fdlock(p);
2160 	error = seldrop_locked(p, ibits, nfd, lim, &need_wakeup);
2161 	proc_fdunlock(p);
2162 	if (need_wakeup) {
2163 		wakeup(&p->p_fd.fd_fpdrainwait);
2164 	}
2165 	return error;
2166 }
2167 
2168 /*
2169  * Record a select request.
2170  */
2171 void
selrecord(__unused struct proc * selector,struct selinfo * sip,void * s_data)2172 selrecord(__unused struct proc *selector, struct selinfo *sip, void *s_data)
2173 {
2174 	struct select_set *selset = current_uthread()->uu_selset;
2175 
2176 	/* do not record if this is second pass of select */
2177 	if (!s_data) {
2178 		return;
2179 	}
2180 
2181 	if (selset == SELSPEC_RECORD_MARKER) {
2182 		/*
2183 		 * The kevent subsystem is trying to sniff
2184 		 * the selinfo::si_note to attach to.
2185 		 */
2186 		((selspec_record_hook_t)s_data)(sip);
2187 	} else {
2188 		waitq_link_t *linkp = s_data;
2189 
2190 		if (!waitq_is_valid(&sip->si_waitq)) {
2191 			waitq_init(&sip->si_waitq, WQT_SELECT, SYNC_POLICY_FIFO);
2192 		}
2193 
2194 		/* note: this checks for pre-existing linkage */
2195 		select_set_link(&sip->si_waitq, selset, linkp);
2196 	}
2197 }
2198 
2199 static void
selwakeup_internal(struct selinfo * sip,long hint,wait_result_t wr)2200 selwakeup_internal(struct selinfo *sip, long hint, wait_result_t wr)
2201 {
2202 	if (sip->si_flags & SI_SELSPEC) {
2203 		/*
2204 		 * The "primitive" lock is held.
2205 		 * The knote lock is not held.
2206 		 *
2207 		 * All knotes will transition their kn_hook to NULL.
2208 		 */
2209 		lck_spin_lock(&selspec_lock);
2210 		KNOTE(&sip->si_note, hint);
2211 		klist_init(&sip->si_note);
2212 		lck_spin_unlock(&selspec_lock);
2213 		sip->si_flags &= ~SI_SELSPEC;
2214 	}
2215 
2216 	/*
2217 	 * After selrecord() has been called, selinfo owners must call
2218 	 * at least one of selwakeup() or selthreadclear().
2219 	 *
2220 	 * Use this opportunity to deinit the waitq
2221 	 * so that all linkages are garbage collected
2222 	 * in a combined wakeup-all + unlink + deinit call.
2223 	 */
2224 	select_waitq_wakeup_and_deinit(&sip->si_waitq, NO_EVENT64, wr,
2225 	    WAITQ_ALL_PRIORITIES);
2226 }
2227 
2228 
2229 void
selwakeup(struct selinfo * sip)2230 selwakeup(struct selinfo *sip)
2231 {
2232 	selwakeup_internal(sip, 0, THREAD_AWAKENED);
2233 }
2234 
2235 void
selthreadclear(struct selinfo * sip)2236 selthreadclear(struct selinfo *sip)
2237 {
2238 	selwakeup_internal(sip, NOTE_REVOKE, THREAD_RESTART);
2239 }
2240 
2241 
2242 /*
2243  * gethostuuid
2244  *
2245  * Description:	Get the host UUID from IOKit and return it to user space.
2246  *
2247  * Parameters:	uuid_buf		Pointer to buffer to receive UUID
2248  *		timeout			Timespec for timout
2249  *
2250  * Returns:	0			Success
2251  *		EWOULDBLOCK		Timeout is too short
2252  *		copyout:EFAULT		Bad user buffer
2253  *		mac_system_check_info:EPERM		Client not allowed to perform this operation
2254  *
2255  * Notes:	A timeout seems redundant, since if it's tolerable to not
2256  *		have a system UUID in hand, then why ask for one?
2257  */
2258 int
gethostuuid(struct proc * p,struct gethostuuid_args * uap,__unused int32_t * retval)2259 gethostuuid(struct proc *p, struct gethostuuid_args *uap, __unused int32_t *retval)
2260 {
2261 	kern_return_t kret;
2262 	int error;
2263 	mach_timespec_t mach_ts;        /* for IOKit call */
2264 	__darwin_uuid_t uuid_kern = {}; /* for IOKit call */
2265 
2266 	/* Check entitlement */
2267 	if (!IOCurrentTaskHasEntitlement("com.apple.private.getprivatesysid")) {
2268 #if !defined(XNU_TARGET_OS_OSX)
2269 #if CONFIG_MACF
2270 		if ((error = mac_system_check_info(kauth_cred_get(), "hw.uuid")) != 0) {
2271 			/* EPERM invokes userspace upcall if present */
2272 			return error;
2273 		}
2274 #endif
2275 #endif
2276 	}
2277 
2278 	/* Convert the 32/64 bit timespec into a mach_timespec_t */
2279 	if (proc_is64bit(p)) {
2280 		struct user64_timespec ts;
2281 		error = copyin(uap->timeoutp, &ts, sizeof(ts));
2282 		if (error) {
2283 			return error;
2284 		}
2285 		mach_ts.tv_sec = (unsigned int)ts.tv_sec;
2286 		mach_ts.tv_nsec = (clock_res_t)ts.tv_nsec;
2287 	} else {
2288 		struct user32_timespec ts;
2289 		error = copyin(uap->timeoutp, &ts, sizeof(ts));
2290 		if (error) {
2291 			return error;
2292 		}
2293 		mach_ts.tv_sec = ts.tv_sec;
2294 		mach_ts.tv_nsec = ts.tv_nsec;
2295 	}
2296 
2297 	/* Call IOKit with the stack buffer to get the UUID */
2298 	kret = IOBSDGetPlatformUUID(uuid_kern, mach_ts);
2299 
2300 	/*
2301 	 * If we get it, copy out the data to the user buffer; note that a
2302 	 * uuid_t is an array of characters, so this is size invariant for
2303 	 * 32 vs. 64 bit.
2304 	 */
2305 	if (kret == KERN_SUCCESS) {
2306 		error = copyout(uuid_kern, uap->uuid_buf, sizeof(uuid_kern));
2307 	} else {
2308 		error = EWOULDBLOCK;
2309 	}
2310 
2311 	return error;
2312 }
2313 
2314 /*
2315  * ledger
2316  *
2317  * Description:	Omnibus system call for ledger operations
2318  */
2319 int
ledger(struct proc * p,struct ledger_args * args,__unused int32_t * retval)2320 ledger(struct proc *p, struct ledger_args *args, __unused int32_t *retval)
2321 {
2322 #if !CONFIG_MACF
2323 #pragma unused(p)
2324 #endif
2325 	int rval, pid, len, error;
2326 #ifdef LEDGER_DEBUG
2327 	struct ledger_limit_args lla;
2328 #endif
2329 	task_t task;
2330 	proc_t proc;
2331 
2332 	/* Finish copying in the necessary args before taking the proc lock */
2333 	error = 0;
2334 	len = 0;
2335 	if (args->cmd == LEDGER_ENTRY_INFO) {
2336 		error = copyin(args->arg3, (char *)&len, sizeof(len));
2337 	} else if (args->cmd == LEDGER_TEMPLATE_INFO) {
2338 		error = copyin(args->arg2, (char *)&len, sizeof(len));
2339 	} else if (args->cmd == LEDGER_LIMIT)
2340 #ifdef LEDGER_DEBUG
2341 	{ error = copyin(args->arg2, (char *)&lla, sizeof(lla));}
2342 #else
2343 	{ return EINVAL; }
2344 #endif
2345 	else if ((args->cmd < 0) || (args->cmd > LEDGER_MAX_CMD)) {
2346 		return EINVAL;
2347 	}
2348 
2349 	if (error) {
2350 		return error;
2351 	}
2352 	if (len < 0) {
2353 		return EINVAL;
2354 	}
2355 
2356 	rval = 0;
2357 	if (args->cmd != LEDGER_TEMPLATE_INFO) {
2358 		pid = (int)args->arg1;
2359 		proc = proc_find(pid);
2360 		if (proc == NULL) {
2361 			return ESRCH;
2362 		}
2363 
2364 #if CONFIG_MACF
2365 		error = mac_proc_check_ledger(p, proc, args->cmd);
2366 		if (error) {
2367 			proc_rele(proc);
2368 			return error;
2369 		}
2370 #endif
2371 
2372 		task = proc->task;
2373 	}
2374 
2375 	switch (args->cmd) {
2376 #ifdef LEDGER_DEBUG
2377 	case LEDGER_LIMIT: {
2378 		if (!kauth_cred_issuser(kauth_cred_get())) {
2379 			rval = EPERM;
2380 		}
2381 		rval = ledger_limit(task, &lla);
2382 		proc_rele(proc);
2383 		break;
2384 	}
2385 #endif
2386 	case LEDGER_INFO: {
2387 		struct ledger_info info = {};
2388 
2389 		rval = ledger_info(task, &info);
2390 		proc_rele(proc);
2391 		if (rval == 0) {
2392 			rval = copyout(&info, args->arg2,
2393 			    sizeof(info));
2394 		}
2395 		break;
2396 	}
2397 
2398 	case LEDGER_ENTRY_INFO: {
2399 		void *buf;
2400 		int sz;
2401 
2402 #if CONFIG_MEMORYSTATUS
2403 		task_ledger_settle_dirty_time(task);
2404 #endif /* CONFIG_MEMORYSTATUS */
2405 
2406 		rval = ledger_get_task_entry_info_multiple(task, &buf, &len);
2407 		proc_rele(proc);
2408 		if ((rval == 0) && (len >= 0)) {
2409 			sz = len * sizeof(struct ledger_entry_info);
2410 			rval = copyout(buf, args->arg2, sz);
2411 			kfree_data(buf, sz);
2412 		}
2413 		if (rval == 0) {
2414 			rval = copyout(&len, args->arg3, sizeof(len));
2415 		}
2416 		break;
2417 	}
2418 
2419 	case LEDGER_TEMPLATE_INFO: {
2420 		void *buf;
2421 		int sz;
2422 
2423 		rval = ledger_template_info(&buf, &len);
2424 		if ((rval == 0) && (len >= 0)) {
2425 			sz = len * sizeof(struct ledger_template_info);
2426 			rval = copyout(buf, args->arg1, sz);
2427 			kfree_data(buf, sz);
2428 		}
2429 		if (rval == 0) {
2430 			rval = copyout(&len, args->arg2, sizeof(len));
2431 		}
2432 		break;
2433 	}
2434 
2435 	default:
2436 		panic("ledger syscall logic error -- command type %d", args->cmd);
2437 		proc_rele(proc);
2438 		rval = EINVAL;
2439 	}
2440 
2441 	return rval;
2442 }
2443 
2444 int
telemetry(__unused struct proc * p,struct telemetry_args * args,__unused int32_t * retval)2445 telemetry(__unused struct proc *p, struct telemetry_args *args, __unused int32_t *retval)
2446 {
2447 	int error = 0;
2448 
2449 	switch (args->cmd) {
2450 #if CONFIG_TELEMETRY
2451 	case TELEMETRY_CMD_TIMER_EVENT:
2452 		error = telemetry_timer_event(args->deadline, args->interval, args->leeway);
2453 		break;
2454 	case TELEMETRY_CMD_PMI_SETUP:
2455 		error = telemetry_pmi_setup((enum telemetry_pmi)args->deadline, args->interval);
2456 		break;
2457 #endif /* CONFIG_TELEMETRY */
2458 	case TELEMETRY_CMD_VOUCHER_NAME:
2459 		if (thread_set_voucher_name((mach_port_name_t)args->deadline)) {
2460 			error = EINVAL;
2461 		}
2462 		break;
2463 
2464 	default:
2465 		error = EINVAL;
2466 		break;
2467 	}
2468 
2469 	return error;
2470 }
2471 
2472 /*
2473  * Logging
2474  *
2475  * Description: syscall to access kernel logging from userspace
2476  *
2477  * Args:
2478  *	tag - used for syncing with userspace on the version.
2479  *	flags - flags used by the syscall.
2480  *	buffer - userspace address of string to copy.
2481  *	size - size of buffer.
2482  */
2483 int
log_data(__unused struct proc * p,struct log_data_args * args,int * retval)2484 log_data(__unused struct proc *p, struct log_data_args *args, int *retval)
2485 {
2486 	unsigned int tag = args->tag;
2487 	unsigned int flags = args->flags;
2488 	user_addr_t buffer = args->buffer;
2489 	unsigned int size = args->size;
2490 	int ret = 0;
2491 	*retval = 0;
2492 
2493 	/* Only DEXTs are suppose to use this syscall. */
2494 	if (!task_is_driver(current_task())) {
2495 		return EPERM;
2496 	}
2497 
2498 	/*
2499 	 * Tag synchronize the syscall version with userspace.
2500 	 * Tag == 0 => flags == OS_LOG_TYPE
2501 	 */
2502 	if (tag != 0) {
2503 		return EINVAL;
2504 	}
2505 
2506 	/*
2507 	 * OS_LOG_TYPE are defined in libkern/os/log.h
2508 	 * In userspace they are defined in libtrace/os/log.h
2509 	 */
2510 	if (flags != OS_LOG_TYPE_DEFAULT &&
2511 	    flags != OS_LOG_TYPE_INFO &&
2512 	    flags != OS_LOG_TYPE_DEBUG &&
2513 	    flags != OS_LOG_TYPE_ERROR &&
2514 	    flags != OS_LOG_TYPE_FAULT) {
2515 		return EINVAL;
2516 	}
2517 
2518 	if (size == 0) {
2519 		return EINVAL;
2520 	}
2521 
2522 	/* truncate to OS_LOG_DATA_MAX_SIZE */
2523 	if (size > OS_LOG_DATA_MAX_SIZE) {
2524 		printf("%s: WARNING msg is going to be truncated from %u to %u\n",
2525 		    __func__, size, OS_LOG_DATA_MAX_SIZE);
2526 		size = OS_LOG_DATA_MAX_SIZE;
2527 	}
2528 
2529 	char *log_msg = (char *)kalloc_data(size, Z_WAITOK);
2530 	if (!log_msg) {
2531 		return ENOMEM;
2532 	}
2533 
2534 	if (copyin(buffer, log_msg, size) != 0) {
2535 		ret = EFAULT;
2536 		goto out;
2537 	}
2538 	log_msg[size - 1] = '\0';
2539 
2540 	/*
2541 	 * This will log to dmesg and logd.
2542 	 * The call will fail if the current
2543 	 * process is not a driverKit process.
2544 	 */
2545 	os_log_driverKit(&ret, OS_LOG_DEFAULT, (os_log_type_t)flags, "%s", log_msg);
2546 
2547 out:
2548 	if (log_msg != NULL) {
2549 		kfree_data(log_msg, size);
2550 	}
2551 
2552 	return ret;
2553 }
2554 
2555 #if DEVELOPMENT || DEBUG
2556 
2557 static int
2558 sysctl_mpsc_test_pingpong SYSCTL_HANDLER_ARGS
2559 {
2560 #pragma unused(oidp, arg1, arg2)
2561 	uint64_t value = 0;
2562 	int error;
2563 
2564 	error = SYSCTL_IN(req, &value, sizeof(value));
2565 	if (error) {
2566 		return error;
2567 	}
2568 
2569 	if (error == 0 && req->newptr) {
2570 		error = mpsc_test_pingpong(value, &value);
2571 		if (error == 0) {
2572 			error = SYSCTL_OUT(req, &value, sizeof(value));
2573 		}
2574 	}
2575 
2576 	return error;
2577 }
2578 SYSCTL_PROC(_kern, OID_AUTO, mpsc_test_pingpong, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
2579     0, 0, sysctl_mpsc_test_pingpong, "Q", "MPSC tests: pingpong");
2580 
2581 #endif /* DEVELOPMENT || DEBUG */
2582 
/* Telemetry, microstackshots */

SYSCTL_NODE(_kern, OID_AUTO, microstackshot, CTLFLAG_RD | CTLFLAG_LOCKED, 0,
    "microstackshot info");

/* Rate (Hz) used by the interrupt-based microstackshot sampler (read-only). */
extern uint32_t telemetry_sample_rate;
SYSCTL_UINT(_kern_microstackshot, OID_AUTO, interrupt_sample_rate,
    CTLFLAG_RD | CTLFLAG_LOCKED, &telemetry_sample_rate, 0,
    "interrupt-based sampling rate in Hz");

#if defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES)

/* PMI-based microstackshot sampling parameters (read-only). */
extern uint64_t mt_microstackshot_period;
SYSCTL_QUAD(_kern_microstackshot, OID_AUTO, pmi_sample_period,
    CTLFLAG_RD | CTLFLAG_LOCKED, &mt_microstackshot_period,
    "PMI sampling rate");
extern unsigned int mt_microstackshot_ctr;
SYSCTL_UINT(_kern_microstackshot, OID_AUTO, pmi_sample_counter,
    CTLFLAG_RD | CTLFLAG_LOCKED, &mt_microstackshot_ctr, 0,
    "PMI counter");

#endif /* defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES) */

/* Remote Time api */
SYSCTL_NODE(_machdep, OID_AUTO, remotetime, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "Remote time api");

#if DEVELOPMENT || DEBUG
#if CONFIG_MACH_BRIDGE_SEND_TIME
/* Non-zero once the mach-bridge timestamp machinery has initialized. */
extern _Atomic uint32_t bt_init_flag;
extern uint32_t mach_bridge_timer_enable(uint32_t, int);

SYSCTL_INT(_machdep_remotetime, OID_AUTO, bridge_timer_init_flag,
    CTLFLAG_RD | CTLFLAG_LOCKED, &bt_init_flag, 0, "");
2616 
2617 static int sysctl_mach_bridge_timer_enable SYSCTL_HANDLER_ARGS
2618 {
2619 #pragma unused(oidp, arg1, arg2)
2620 	uint32_t value = 0;
2621 	int error = 0;
2622 	/* User is querying buffer size */
2623 	if (req->oldptr == USER_ADDR_NULL && req->newptr == USER_ADDR_NULL) {
2624 		req->oldidx = sizeof(value);
2625 		return 0;
2626 	}
2627 	if (os_atomic_load(&bt_init_flag, acquire)) {
2628 		if (req->newptr) {
2629 			int new_value = 0;
2630 			error = SYSCTL_IN(req, &new_value, sizeof(new_value));
2631 			if (error) {
2632 				return error;
2633 			}
2634 			if (new_value == 0 || new_value == 1) {
2635 				value = mach_bridge_timer_enable(new_value, 1);
2636 			} else {
2637 				return EPERM;
2638 			}
2639 		} else {
2640 			value = mach_bridge_timer_enable(0, 0);
2641 		}
2642 	}
2643 	error = SYSCTL_OUT(req, &value, sizeof(value));
2644 	return error;
2645 }
2646 
2647 SYSCTL_PROC(_machdep_remotetime, OID_AUTO, bridge_timer_enable,
2648     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
2649     0, 0, sysctl_mach_bridge_timer_enable, "I", "");
2650 
2651 #endif /* CONFIG_MACH_BRIDGE_SEND_TIME */
2652 
2653 static int sysctl_mach_bridge_remote_time SYSCTL_HANDLER_ARGS
2654 {
2655 #pragma unused(oidp, arg1, arg2)
2656 	uint64_t ltime = 0, rtime = 0;
2657 	if (req->oldptr == USER_ADDR_NULL) {
2658 		req->oldidx = sizeof(rtime);
2659 		return 0;
2660 	}
2661 	if (req->newptr) {
2662 		int error = SYSCTL_IN(req, &ltime, sizeof(ltime));
2663 		if (error) {
2664 			return error;
2665 		}
2666 	}
2667 	rtime = mach_bridge_remote_time(ltime);
2668 	return SYSCTL_OUT(req, &rtime, sizeof(rtime));
2669 }
2670 SYSCTL_PROC(_machdep_remotetime, OID_AUTO, mach_bridge_remote_time,
2671     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
2672     0, 0, sysctl_mach_bridge_remote_time, "Q", "");
2673 
2674 #endif /* DEVELOPMENT || DEBUG */
2675 
2676 #if CONFIG_MACH_BRIDGE_RECV_TIME
2677 extern struct bt_params bt_params_get_latest(void);
2678 
2679 static int sysctl_mach_bridge_conversion_params SYSCTL_HANDLER_ARGS
2680 {
2681 #pragma unused(oidp, arg1, arg2)
2682 	struct bt_params params = {};
2683 	if (req->oldptr == USER_ADDR_NULL) {
2684 		req->oldidx = sizeof(struct bt_params);
2685 		return 0;
2686 	}
2687 	if (req->newptr) {
2688 		return EPERM;
2689 	}
2690 	params = bt_params_get_latest();
2691 	return SYSCTL_OUT(req, &params, MIN(sizeof(params), req->oldlen));
2692 }
2693 
2694 SYSCTL_PROC(_machdep_remotetime, OID_AUTO, conversion_params,
2695     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0,
2696     0, sysctl_mach_bridge_conversion_params, "S,bt_params", "");
2697 
2698 #endif /* CONFIG_MACH_BRIDGE_RECV_TIME */
2699 
2700 #if DEVELOPMENT || DEBUG
2701 
2702 #include <pexpert/pexpert.h>
2703 extern int32_t sysctl_get_bound_cpuid(void);
2704 extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid);
2705 static int
2706 sysctl_kern_sched_thread_bind_cpu SYSCTL_HANDLER_ARGS
2707 {
2708 #pragma unused(oidp, arg1, arg2)
2709 
2710 	/*
2711 	 * DO NOT remove this bootarg guard or make this non-development.
2712 	 * This kind of binding should only be used for tests and
2713 	 * experiments in a custom configuration, never shipping code.
2714 	 */
2715 
2716 	if (!PE_parse_boot_argn("enable_skstb", NULL, 0)) {
2717 		return ENOENT;
2718 	}
2719 
2720 	int32_t cpuid = sysctl_get_bound_cpuid();
2721 
2722 	int32_t new_value;
2723 	int changed;
2724 	int error = sysctl_io_number(req, cpuid, sizeof cpuid, &new_value, &changed);
2725 	if (error) {
2726 		return error;
2727 	}
2728 
2729 	if (changed) {
2730 		kern_return_t kr = sysctl_thread_bind_cpuid(new_value);
2731 
2732 		if (kr == KERN_NOT_SUPPORTED) {
2733 			return ENOTSUP;
2734 		}
2735 
2736 		if (kr == KERN_INVALID_VALUE) {
2737 			return ERANGE;
2738 		}
2739 	}
2740 
2741 	return error;
2742 }
2743 
2744 SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cpu, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
2745     0, 0, sysctl_kern_sched_thread_bind_cpu, "I", "");
2746 
2747 #if __AMP__
2748 extern char sysctl_get_bound_cluster_type(void);
2749 extern void sysctl_thread_bind_cluster_type(char cluster_type);
2750 static int
2751 sysctl_kern_sched_thread_bind_cluster_type SYSCTL_HANDLER_ARGS
2752 {
2753 #pragma unused(oidp, arg1, arg2)
2754 	char buff[4];
2755 
2756 	if (!PE_parse_boot_argn("enable_skstb", NULL, 0)) {
2757 		return ENOENT;
2758 	}
2759 
2760 	int error = SYSCTL_IN(req, buff, 1);
2761 	if (error) {
2762 		return error;
2763 	}
2764 	char cluster_type = buff[0];
2765 
2766 	if (!req->newptr) {
2767 		goto out;
2768 	}
2769 
2770 	sysctl_thread_bind_cluster_type(cluster_type);
2771 out:
2772 	cluster_type = sysctl_get_bound_cluster_type();
2773 	buff[0] = cluster_type;
2774 
2775 	return SYSCTL_OUT(req, buff, 1);
2776 }
2777 
2778 SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cluster_type, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED,
2779     0, 0, sysctl_kern_sched_thread_bind_cluster_type, "A", "");
2780 
2781 extern char sysctl_get_task_cluster_type(void);
2782 extern void sysctl_task_set_cluster_type(char cluster_type);
2783 static int
2784 sysctl_kern_sched_task_set_cluster_type SYSCTL_HANDLER_ARGS
2785 {
2786 #pragma unused(oidp, arg1, arg2)
2787 	char buff[4];
2788 
2789 	if (!PE_parse_boot_argn("enable_skstsct", NULL, 0)) {
2790 		return ENOENT;
2791 	}
2792 
2793 	int error = SYSCTL_IN(req, buff, 1);
2794 	if (error) {
2795 		return error;
2796 	}
2797 	char cluster_type = buff[0];
2798 
2799 	if (!req->newptr) {
2800 		goto out;
2801 	}
2802 
2803 	sysctl_task_set_cluster_type(cluster_type);
2804 out:
2805 	cluster_type = sysctl_get_task_cluster_type();
2806 	buff[0] = cluster_type;
2807 
2808 	return SYSCTL_OUT(req, buff, 1);
2809 }
2810 
2811 SYSCTL_PROC(_kern, OID_AUTO, sched_task_set_cluster_type, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED,
2812     0, 0, sysctl_kern_sched_task_set_cluster_type, "A", "");
2813 
2814 extern kern_return_t thread_bind_cluster_id(thread_t thread, uint32_t cluster_id, thread_bind_option_t options);
2815 extern uint32_t thread_bound_cluster_id(thread_t);
2816 static int
2817 sysctl_kern_sched_thread_bind_cluster_id SYSCTL_HANDLER_ARGS
2818 {
2819 #pragma unused(oidp, arg1, arg2)
2820 	if (!PE_parse_boot_argn("enable_skstb", NULL, 0)) {
2821 		return ENOENT;
2822 	}
2823 
2824 	thread_t self = current_thread();
2825 	uint32_t old_value = thread_bound_cluster_id(self);
2826 	uint32_t new_value;
2827 
2828 	int error = SYSCTL_IN(req, &new_value, sizeof(new_value));
2829 	if (error) {
2830 		return error;
2831 	}
2832 	if (new_value != old_value) {
2833 		/*
2834 		 * This sysctl binds the thread to the cluster without any flags,
2835 		 * which means it will be hard bound and not check eligibility.
2836 		 */
2837 		thread_bind_cluster_id(self, new_value, 0);
2838 	}
2839 	return SYSCTL_OUT(req, &old_value, sizeof(old_value));
2840 }
2841 
2842 SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cluster_id, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
2843     0, 0, sysctl_kern_sched_thread_bind_cluster_id, "I", "");
2844 
2845 #if CONFIG_SCHED_EDGE
2846 
2847 extern int sched_edge_restrict_ut;
2848 SYSCTL_INT(_kern, OID_AUTO, sched_edge_restrict_ut, CTLFLAG_RW | CTLFLAG_LOCKED, &sched_edge_restrict_ut, 0, "Edge Scheduler Restrict UT Threads");
2849 extern int sched_edge_restrict_bg;
2850 SYSCTL_INT(_kern, OID_AUTO, sched_edge_restrict_bg, CTLFLAG_RW | CTLFLAG_LOCKED, &sched_edge_restrict_ut, 0, "Edge Scheduler Restrict BG Threads");
2851 extern int sched_edge_migrate_ipi_immediate;
2852 SYSCTL_INT(_kern, OID_AUTO, sched_edge_migrate_ipi_immediate, CTLFLAG_RW | CTLFLAG_LOCKED, &sched_edge_migrate_ipi_immediate, 0, "Edge Scheduler uses immediate IPIs for migration event based on execution latency");
2853 
2854 #endif /* CONFIG_SCHED_EDGE */
2855 
2856 #endif /* __AMP__ */
2857 
2858 #if INTERRUPT_MASKED_DEBUG
2859 
/* Threshold and mode knobs for interrupt-masked-too-long detection. */
SYSCTL_INT(_kern, OID_AUTO, interrupt_masked_threshold_mt, CTLFLAG_RW | CTLFLAG_LOCKED,
    &interrupt_masked_timeout, 0,
    "Interrupt masked duration after which a tracepoint is emitted or the device panics (in mach timebase units)");

SYSCTL_INT(_kern, OID_AUTO, interrupt_masked_debug_mode, CTLFLAG_RW | CTLFLAG_LOCKED,
    &interrupt_masked_debug_mode, 0,
    "Enable interrupt masked tracing or panic (0: off, 1: trace, 2: panic)");
2867 
2868 #endif /* INTERRUPT_MASKED_DEBUG */
2869 
2870 #if SCHED_PREEMPTION_DISABLE_DEBUG
2871 
/* Threshold and mode knobs for preemption-disabled-too-long detection. */
SYSCTL_QUAD(_kern, OID_AUTO, sched_preemption_disable_threshold_mt, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sched_preemption_disable_threshold_mt,
    "Preemption disablement duration after which a tracepoint is emitted or the device panics (in mach timebase units)");

SYSCTL_INT(_kern, OID_AUTO, sched_preemption_disable_debug_mode, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sched_preemption_disable_debug_mode, 0,
    "Enable preemption disablement tracing or panic (0: off, 1: trace, 2: panic)");

/* Per-CPU maxima consumed by sysctl_sched_preemption_disable_stats() below. */
PERCPU_DECL(uint64_t, preemption_disable_max_mt);
2881 
2882 static int
sysctl_sched_preemption_disable_stats(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)2883 sysctl_sched_preemption_disable_stats(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
2884 {
2885 	uint64_t stats[MAX_CPUS]; // maximum per CPU
2886 
2887 	/*
2888 	 * No synchronization here. The individual values are pretty much
2889 	 * independent, and reading/writing them is atomic.
2890 	 */
2891 
2892 	static_assert(__LP64__); /* below is racy on armv7k, reminder to change if needed there. */
2893 
2894 	int cpu = 0;
2895 	percpu_foreach(max_stat, preemption_disable_max_mt) {
2896 		stats[cpu++] = *max_stat;
2897 	}
2898 
2899 	if (req->newlen > 0) {
2900 		// writing just resets all stats.
2901 		percpu_foreach(max_stat, preemption_disable_max_mt) {
2902 			*max_stat = 0;
2903 		}
2904 	}
2905 
2906 	return sysctl_io_opaque(req, stats, cpu * sizeof(uint64_t), NULL);
2907 }
2908 
2909 SYSCTL_PROC(_kern, OID_AUTO, sched_preemption_disable_stats,
2910     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_LOCKED,
2911     0, 0, sysctl_sched_preemption_disable_stats, "I", "Preemption disablement statistics");
2912 
2913 #endif /* SCHED_PREEMPTION_DISABLE_DEBUG */
2914 
2915 
/* used for testing by exception_tests */
extern uint32_t ipc_control_port_options;
SYSCTL_INT(_kern, OID_AUTO, ipc_control_port_options,
    CTLFLAG_RD | CTLFLAG_LOCKED, &ipc_control_port_options, 0, "");

#endif /* DEVELOPMENT || DEBUG */

/* Read-only view of the default task exception-guard behavior. */
extern uint32_t task_exc_guard_default;

SYSCTL_INT(_kern, OID_AUTO, task_exc_guard_default,
    CTLFLAG_RD | CTLFLAG_LOCKED, &task_exc_guard_default, 0, "");
2927 
2928 
2929 static int
2930 sysctl_kern_tcsm_available SYSCTL_HANDLER_ARGS
2931 {
2932 #pragma unused(oidp, arg1, arg2)
2933 	uint32_t value = machine_csv(CPUVN_CI) ? 1 : 0;
2934 
2935 	if (req->newptr) {
2936 		return EINVAL;
2937 	}
2938 
2939 	return SYSCTL_OUT(req, &value, sizeof(value));
2940 }
2941 SYSCTL_PROC(_kern, OID_AUTO, tcsm_available,
2942     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY,
2943     0, 0, sysctl_kern_tcsm_available, "I", "");
2944 
2945 
2946 static int
2947 sysctl_kern_tcsm_enable SYSCTL_HANDLER_ARGS
2948 {
2949 #pragma unused(oidp, arg1, arg2)
2950 	uint32_t soflags = 0;
2951 	uint32_t old_value = thread_get_no_smt() ? 1 : 0;
2952 
2953 	int error = SYSCTL_IN(req, &soflags, sizeof(soflags));
2954 	if (error) {
2955 		return error;
2956 	}
2957 
2958 	if (soflags && machine_csv(CPUVN_CI)) {
2959 		thread_set_no_smt(true);
2960 		machine_tecs(current_thread());
2961 	}
2962 
2963 	return SYSCTL_OUT(req, &old_value, sizeof(old_value));
2964 }
2965 SYSCTL_PROC(_kern, OID_AUTO, tcsm_enable,
2966     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY,
2967     0, 0, sysctl_kern_tcsm_enable, "I", "");
2968 
/*
 * Copy the preoslog buffer out to userspace and free it once consumed.
 * Writing a non-zero byte requests "oneshot" behaviour (used by
 * DumpPanic): only the first such request is allowed through.
 */
static int
sysctl_kern_debug_get_preoslog SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	static bool oneshot_executed = false;
	size_t preoslog_size = 0;
	const char *preoslog = NULL;
	int ret = 0;

	// DumpPanic passes a non-zero write value when it needs oneshot behaviour
	if (req->newptr != USER_ADDR_NULL) {
		uint8_t oneshot = 0;
		int error = SYSCTL_IN(req, &oneshot, sizeof(oneshot));
		if (error) {
			return error;
		}

		if (oneshot) {
			/* Atomically claim the single oneshot slot; later callers get EPERM. */
			if (!os_atomic_cmpxchg(&oneshot_executed, false, true, acq_rel)) {
				return EPERM;
			}
		}
	}

	preoslog = sysctl_debug_get_preoslog(&preoslog_size);
	/* A non-NULL buffer with zero size carries no data; release it now. */
	if (preoslog != NULL && preoslog_size == 0) {
		sysctl_debug_free_preoslog();
		return 0;
	}

	if (preoslog == NULL || preoslog_size == 0) {
		return 0;
	}

	/* Size probe: tell the caller how large a buffer it needs. */
	if (req->oldptr == USER_ADDR_NULL) {
		req->oldidx = preoslog_size;
		return 0;
	}

	ret = SYSCTL_OUT(req, preoslog, preoslog_size);
	sysctl_debug_free_preoslog();
	return ret;
}

SYSCTL_PROC(_kern, OID_AUTO, preoslog, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, sysctl_kern_debug_get_preoslog, "-", "");
3015 
3016 #if DEVELOPMENT || DEBUG
3017 extern void sysctl_task_set_no_smt(char no_smt);
3018 extern char sysctl_task_get_no_smt(void);
3019 
3020 static int
3021 sysctl_kern_sched_task_set_no_smt SYSCTL_HANDLER_ARGS
3022 {
3023 #pragma unused(oidp, arg1, arg2)
3024 	char buff[4];
3025 
3026 	int error = SYSCTL_IN(req, buff, 1);
3027 	if (error) {
3028 		return error;
3029 	}
3030 	char no_smt = buff[0];
3031 
3032 	if (!req->newptr) {
3033 		goto out;
3034 	}
3035 
3036 	sysctl_task_set_no_smt(no_smt);
3037 out:
3038 	no_smt = sysctl_task_get_no_smt();
3039 	buff[0] = no_smt;
3040 
3041 	return SYSCTL_OUT(req, buff, 1);
3042 }
3043 
3044 SYSCTL_PROC(_kern, OID_AUTO, sched_task_set_no_smt, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
3045     0, 0, sysctl_kern_sched_task_set_no_smt, "A", "");
3046 
3047 static int
sysctl_kern_sched_thread_set_no_smt(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)3048 sysctl_kern_sched_thread_set_no_smt(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3049 {
3050 	int new_value, changed;
3051 	int old_value = thread_get_no_smt() ? 1 : 0;
3052 	int error = sysctl_io_number(req, old_value, sizeof(int), &new_value, &changed);
3053 
3054 	if (changed) {
3055 		thread_set_no_smt(!!new_value);
3056 	}
3057 
3058 	return error;
3059 }
3060 
3061 SYSCTL_PROC(_kern, OID_AUTO, sched_thread_set_no_smt,
3062     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
3063     0, 0, sysctl_kern_sched_thread_set_no_smt, "I", "");
3064 
3065 
3066 static int
3067 sysctl_kern_task_set_filter_msg_flag SYSCTL_HANDLER_ARGS
3068 {
3069 #pragma unused(oidp, arg1, arg2)
3070 	int new_value, changed;
3071 	int old_value = task_get_filter_msg_flag(current_task()) ? 1 : 0;
3072 	int error = sysctl_io_number(req, old_value, sizeof(int), &new_value, &changed);
3073 
3074 	if (changed) {
3075 		task_set_filter_msg_flag(current_task(), !!new_value);
3076 	}
3077 
3078 	return error;
3079 }
3080 
3081 SYSCTL_PROC(_kern, OID_AUTO, task_set_filter_msg_flag, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
3082     0, 0, sysctl_kern_task_set_filter_msg_flag, "I", "");
3083 
3084 #if CONFIG_PROC_RESOURCE_LIMITS
3085 
3086 extern mach_port_name_t current_task_get_fatal_port_name(void);
3087 
3088 static int
3089 sysctl_kern_task_get_fatal_port SYSCTL_HANDLER_ARGS
3090 {
3091 #pragma unused(oidp, arg1, arg2)
3092 	int port = 0;
3093 	int flag = 0;
3094 
3095 	if (req->oldptr == USER_ADDR_NULL) {
3096 		req->oldidx = sizeof(mach_port_t);
3097 		return 0;
3098 	}
3099 
3100 	int error = SYSCTL_IN(req, &flag, sizeof(flag));
3101 	if (error) {
3102 		return error;
3103 	}
3104 
3105 	if (flag == 1) {
3106 		port = (int)current_task_get_fatal_port_name();
3107 	}
3108 	return SYSCTL_OUT(req, &port, sizeof(port));
3109 }
3110 
3111 SYSCTL_PROC(_machdep, OID_AUTO, task_get_fatal_port, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
3112     0, 0, sysctl_kern_task_get_fatal_port, "I", "");
3113 
3114 #endif /* CONFIG_PROC_RESOURCE_LIMITS */
3115 
3116 extern unsigned int ipc_table_max_entries(void);
3117 
3118 static int
3119 sysctl_mach_max_port_table_size SYSCTL_HANDLER_ARGS
3120 {
3121 #pragma unused(oidp, arg1, arg2)
3122 	int old_value = ipc_table_max_entries();
3123 	int error = sysctl_io_number(req, old_value, sizeof(int), NULL, NULL);
3124 
3125 	return error;
3126 }
3127 
/*
 * machdep.max_port_table_size: maximum IPC port table entry count.
 * NOTE(review): marked CTLFLAG_RW, but the handler passes NULL for the
 * new value, so writes are presumably rejected by sysctl_io_number —
 * confirm whether CTLFLAG_RD was intended.
 */
SYSCTL_PROC(_machdep, OID_AUTO, max_port_table_size, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, sysctl_mach_max_port_table_size, "I", "");
3130 
3131 #endif /* DEVELOPMENT || DEBUG */
3132 
3133 #if defined(CONFIG_KDP_INTERACTIVE_DEBUGGING) && defined(CONFIG_KDP_COREDUMP_ENCRYPTION)
3134 
3135 #define COREDUMP_ENCRYPTION_KEY_ENTITLEMENT "com.apple.private.coredump-encryption-key"
3136 
3137 static int
3138 sysctl_coredump_encryption_key_update SYSCTL_HANDLER_ARGS
3139 {
3140 	kern_return_t ret = KERN_SUCCESS;
3141 	int error = 0;
3142 	struct kdp_core_encryption_key_descriptor key_descriptor = {
3143 		.kcekd_format = MACH_CORE_FILEHEADER_V2_FLAG_NEXT_COREFILE_KEY_FORMAT_NIST_P256,
3144 	};
3145 
3146 	/* Need to be root and have entitlement */
3147 	if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement(COREDUMP_ENCRYPTION_KEY_ENTITLEMENT)) {
3148 		return EPERM;
3149 	}
3150 
3151 	// Sanity-check the given key length
3152 	if (req->newlen > UINT16_MAX) {
3153 		return EINVAL;
3154 	}
3155 
3156 	// It is allowed for the caller to pass in a NULL buffer.
3157 	// This indicates that they want us to forget about any public key we might have.
3158 	if (req->newptr) {
3159 		key_descriptor.kcekd_size = (uint16_t) req->newlen;
3160 		key_descriptor.kcekd_key = kalloc_data(key_descriptor.kcekd_size, Z_WAITOK);
3161 
3162 		if (key_descriptor.kcekd_key == NULL) {
3163 			return ENOMEM;
3164 		}
3165 
3166 		error = SYSCTL_IN(req, key_descriptor.kcekd_key, key_descriptor.kcekd_size);
3167 		if (error) {
3168 			goto out;
3169 		}
3170 	}
3171 
3172 	ret = IOProvideCoreFileAccess(kdp_core_handle_new_encryption_key, (void *)&key_descriptor);
3173 	if (KERN_SUCCESS != ret) {
3174 		printf("Failed to handle the new encryption key. Error 0x%x", ret);
3175 		error = EFAULT;
3176 	}
3177 
3178 out:
3179 	kfree_data(key_descriptor.kcekd_key, key_descriptor.kcekd_size);
3180 	return 0;
3181 }
3182 
/*
 * kern.coredump_encryption_key: write-only (CTLFLAG_WR), opaque blob;
 * CTLFLAG_MASKED keeps it out of sysctl name listings.
 */
SYSCTL_PROC(_kern, OID_AUTO, coredump_encryption_key, CTLTYPE_OPAQUE | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_coredump_encryption_key_update, "-", "Set a new encryption key for coredumps");
3185 
3186 #endif /* CONFIG_KDP_INTERACTIVE_DEBUGGING && CONFIG_KDP_COREDUMP_ENCRYPTION*/
3187