1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1989, 1993
31 * The Regents of the University of California. All rights reserved.
32 * (c) UNIX System Laboratories, Inc.
33 * All or some portions of this file are derived from material licensed
34 * to the University of California by American Telephone and Telegraph
35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36 * the permission of UNIX System Laboratories, Inc.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the University of
49 * California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
67 */
68 /*
69 * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
70 * support for mandatory and extensible security protections. This notice
71 * is included in support of clause 2.2 (b) of the Apple Public License,
72 * Version 2.0.
73 */
74
75 #include <sys/param.h>
76 #include <sys/systm.h>
77 #include <sys/filedesc.h>
78 #include <sys/ioctl.h>
79 #include <sys/file_internal.h>
80 #include <sys/proc_internal.h>
81 #include <sys/socketvar.h>
82 #include <sys/uio_internal.h>
83 #include <sys/kernel.h>
84 #include <sys/guarded.h>
85 #include <sys/stat.h>
86 #include <sys/malloc.h>
87 #include <sys/sysproto.h>
88
89 #include <sys/mount_internal.h>
90 #include <sys/protosw.h>
91 #include <sys/ev.h>
92 #include <sys/user.h>
93 #include <sys/kdebug.h>
94 #include <sys/poll.h>
95 #include <sys/event.h>
96 #include <sys/eventvar.h>
97 #include <sys/proc.h>
98 #include <sys/kauth.h>
99
100 #include <machine/smp.h>
101 #include <mach/mach_types.h>
102 #include <kern/kern_types.h>
103 #include <kern/assert.h>
104 #include <kern/kalloc.h>
105 #include <kern/thread.h>
106 #include <kern/clock.h>
107 #include <kern/ledger.h>
108 #include <kern/monotonic.h>
109 #include <kern/task.h>
110 #include <kern/telemetry.h>
111 #include <kern/waitq.h>
112 #include <kern/sched_hygiene.h>
113 #include <kern/sched_prim.h>
114 #include <kern/mpsc_queue.h>
115 #include <kern/debug.h>
116
117 #include <sys/mbuf.h>
118 #include <sys/domain.h>
119 #include <sys/socket.h>
120 #include <sys/socketvar.h>
121 #include <sys/errno.h>
122 #include <sys/syscall.h>
123 #include <sys/pipe.h>
124
125 #include <security/audit/audit.h>
126
127 #include <net/if.h>
128 #include <net/route.h>
129
130 #include <netinet/in.h>
131 #include <netinet/in_systm.h>
132 #include <netinet/ip.h>
133 #include <netinet/in_pcb.h>
134 #include <netinet/ip_var.h>
135 #include <netinet/ip6.h>
136 #include <netinet/tcp.h>
137 #include <netinet/tcp_fsm.h>
138 #include <netinet/tcp_seq.h>
139 #include <netinet/tcp_timer.h>
140 #include <netinet/tcp_var.h>
141 #include <netinet/tcpip.h>
142 #include <netinet/tcp_debug.h>
143 /* for wait queue based select */
144 #include <kern/waitq.h>
145 #include <sys/vnode_internal.h>
146 /* for remote time api*/
147 #include <kern/remote_time.h>
148 #include <os/log.h>
149 #include <sys/log_data.h>
150
151 #if CONFIG_MACF
152 #include <security/mac_framework.h>
153 #endif
154
155 #ifdef CONFIG_KDP_INTERACTIVE_DEBUGGING
156 #include <mach_debug/mach_debug_types.h>
157 #endif
158
159 #if MONOTONIC
160 #include <machine/monotonic.h>
161 #endif /* MONOTONIC */
162
163 /* for entitlement check */
164 #include <IOKit/IOBSD.h>
165 /*
166 * If you need accounting for KM_SELECT consider using
167 * KALLOC_HEAP_DEFINE to define a view.
168 */
169 #define KM_SELECT KHEAP_DEFAULT
170
171 /* XXX should be in a header file somewhere */
172 extern kern_return_t IOBSDGetPlatformUUID(__darwin_uuid_t uuid, mach_timespec_t timeoutp);
173
174 int rd_uio(struct proc *p, int fdes, uio_t uio, int is_preadv, user_ssize_t *retval);
175 int wr_uio(struct proc *p, int fdes, uio_t uio, int is_pwritev, user_ssize_t *retval);
176 int do_uiowrite(struct proc *p, struct fileproc *fp, uio_t uio, int flags, user_ssize_t *retval);
177
178 __private_extern__ int dofileread(vfs_context_t ctx, struct fileproc *fp,
179 user_addr_t bufp, user_size_t nbyte,
180 off_t offset, int flags, user_ssize_t *retval);
181 __private_extern__ int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
182 user_addr_t bufp, user_size_t nbyte,
183 off_t offset, int flags, user_ssize_t *retval);
184 static int preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_vnode);
185
186 #define f_flag fp_glob->fg_flag
187 #define f_type fp_glob->fg_ops->fo_type
188 #define f_cred fp_glob->fg_cred
189 #define f_ops fp_glob->fg_ops
190
191 /*
192 * Validate if the file can be used for random access (pread, pwrite, etc).
193 *
194 * Conditions:
195 * proc_fdlock is held
196 *
197 * Returns: 0 Success
198 * ESPIPE
199 * ENXIO
200 */
201 static int
valid_for_random_access(struct fileproc * fp)202 valid_for_random_access(struct fileproc *fp)
203 {
204 if (__improbable(fp->f_type != DTYPE_VNODE)) {
205 return ESPIPE;
206 }
207
208 vnode_t vp = (struct vnode *)fp_get_data(fp);
209 if (__improbable(vnode_isfifo(vp))) {
210 return ESPIPE;
211 }
212
213 if (__improbable(vp->v_flag & VISTTY)) {
214 return ENXIO;
215 }
216
217 return 0;
218 }
219
220 /*
221 * Read system call.
222 *
223 * Returns: 0 Success
224 * preparefileread:EBADF
225 * preparefileread:ESPIPE
226 * preparefileread:ENXIO
227 * preparefileread:EBADF
228 * dofileread:???
229 */
230 int
read(struct proc * p,struct read_args * uap,user_ssize_t * retval)231 read(struct proc *p, struct read_args *uap, user_ssize_t *retval)
232 {
233 __pthread_testcancel(1);
234 return read_nocancel(p, (struct read_nocancel_args *)uap, retval);
235 }
236
237 int
read_nocancel(struct proc * p,struct read_nocancel_args * uap,user_ssize_t * retval)238 read_nocancel(struct proc *p, struct read_nocancel_args *uap, user_ssize_t *retval)
239 {
240 struct fileproc *fp;
241 int error;
242 int fd = uap->fd;
243 struct vfs_context context;
244
245 if ((error = preparefileread(p, &fp, fd, 0))) {
246 return error;
247 }
248
249 context = *(vfs_context_current());
250 context.vc_ucred = fp->fp_glob->fg_cred;
251
252 error = dofileread(&context, fp, uap->cbuf, uap->nbyte,
253 (off_t)-1, 0, retval);
254
255 fp_drop(p, fd, fp, 0);
256
257 return error;
258 }
259
260 /*
261 * Pread system call
262 *
263 * Returns: 0 Success
264 * preparefileread:EBADF
265 * preparefileread:ESPIPE
266 * preparefileread:ENXIO
267 * preparefileread:EBADF
268 * dofileread:???
269 */
270 int
pread(struct proc * p,struct pread_args * uap,user_ssize_t * retval)271 pread(struct proc *p, struct pread_args *uap, user_ssize_t *retval)
272 {
273 __pthread_testcancel(1);
274 return pread_nocancel(p, (struct pread_nocancel_args *)uap, retval);
275 }
276
/*
 * Non-cancellable pread(): positioned read at uap->offset; the fd must
 * refer to a seekable vnode (enforced by preparefileread()).
 */
int
pread_nocancel(struct proc *p, struct pread_nocancel_args *uap, user_ssize_t *retval)
{
	struct fileproc *fp = NULL;     /* fp set by preparefileread() */
	int fd = uap->fd;
	int error;
	struct vfs_context context;

	/* check_for_pread == 1: also reject non-vnode, FIFO, and tty fds. */
	if ((error = preparefileread(p, &fp, fd, 1))) {
		goto out;
	}

	/* Issue the I/O under the credential the file was opened with. */
	context = *(vfs_context_current());
	context.vc_ucred = fp->fp_glob->fg_cred;

	error = dofileread(&context, fp, uap->buf, uap->nbyte,
	    uap->offset, FOF_OFFSET, retval);

	fp_drop(p, fd, fp, 0);

	/* kdebug trace; the 64-bit offset is split into hi/lo 32-bit words. */
	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE),
	    uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);

out:
	return error;
}
303
304 /*
305 * Code common for read and pread
306 */
307
308 /*
309 * Returns: 0 Success
310 * EBADF
311 * ESPIPE
312 * ENXIO
313 * fp_lookup:EBADF
314 * valid_for_random_access:ESPIPE
315 * valid_for_random_access:ENXIO
316 */
317 static int
preparefileread(struct proc * p,struct fileproc ** fp_ret,int fd,int check_for_pread)318 preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pread)
319 {
320 int error;
321 struct fileproc *fp;
322
323 AUDIT_ARG(fd, fd);
324
325 proc_fdlock_spin(p);
326
327 error = fp_lookup(p, fd, &fp, 1);
328
329 if (error) {
330 proc_fdunlock(p);
331 return error;
332 }
333 if ((fp->f_flag & FREAD) == 0) {
334 error = EBADF;
335 goto out;
336 }
337 if (check_for_pread) {
338 if ((error = valid_for_random_access(fp))) {
339 goto out;
340 }
341 }
342
343 *fp_ret = fp;
344
345 proc_fdunlock(p);
346 return 0;
347
348 out:
349 fp_drop(p, fd, fp, 1);
350 proc_fdunlock(p);
351 return error;
352 }
353
354
355 /*
356 * Returns: 0 Success
357 * EINVAL
358 * fo_read:???
359 */
360 __private_extern__ int
dofileread(vfs_context_t ctx,struct fileproc * fp,user_addr_t bufp,user_size_t nbyte,off_t offset,int flags,user_ssize_t * retval)361 dofileread(vfs_context_t ctx, struct fileproc *fp,
362 user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
363 user_ssize_t *retval)
364 {
365 uio_t auio;
366 user_ssize_t bytecnt;
367 int error = 0;
368 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
369
370 if (nbyte > INT_MAX) {
371 return EINVAL;
372 }
373
374 if (vfs_context_is64bit(ctx)) {
375 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
376 &uio_buf[0], sizeof(uio_buf));
377 } else {
378 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
379 &uio_buf[0], sizeof(uio_buf));
380 }
381 if (uio_addiov(auio, bufp, nbyte) != 0) {
382 *retval = 0;
383 return EINVAL;
384 }
385
386 bytecnt = nbyte;
387
388 if ((error = fo_read(fp, auio, flags, ctx))) {
389 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
390 error == EINTR || error == EWOULDBLOCK)) {
391 error = 0;
392 }
393 }
394 bytecnt -= uio_resid(auio);
395
396 *retval = bytecnt;
397
398 return error;
399 }
400
401 /*
402 * Vector read.
403 *
404 * Returns: 0 Success
405 * EINVAL
406 * ENOMEM
407 * preparefileread:EBADF
408 * preparefileread:ESPIPE
409 * preparefileread:ENXIO
410 * preparefileread:EBADF
411 * copyin:EFAULT
412 * rd_uio:???
413 */
414 static int
readv_preadv_uio(struct proc * p,int fdes,user_addr_t user_iovp,int iovcnt,off_t offset,int is_preadv,user_ssize_t * retval)415 readv_preadv_uio(struct proc *p, int fdes,
416 user_addr_t user_iovp, int iovcnt, off_t offset, int is_preadv,
417 user_ssize_t *retval)
418 {
419 uio_t auio = NULL;
420 int error;
421 struct user_iovec *iovp;
422
423 /* Verify range before calling uio_create() */
424 if (iovcnt <= 0 || iovcnt > UIO_MAXIOV) {
425 return EINVAL;
426 }
427
428 /* allocate a uio large enough to hold the number of iovecs passed */
429 auio = uio_create(iovcnt, offset,
430 (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
431 UIO_READ);
432
433 /* get location of iovecs within the uio. then copyin the iovecs from
434 * user space.
435 */
436 iovp = uio_iovsaddr(auio);
437 if (iovp == NULL) {
438 error = ENOMEM;
439 goto ExitThisRoutine;
440 }
441 error = copyin_user_iovec_array(user_iovp,
442 IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
443 iovcnt, iovp);
444 if (error) {
445 goto ExitThisRoutine;
446 }
447
448 /* finalize uio_t for use and do the IO
449 */
450 error = uio_calculateresid(auio);
451 if (error) {
452 goto ExitThisRoutine;
453 }
454 error = rd_uio(p, fdes, auio, is_preadv, retval);
455
456 ExitThisRoutine:
457 if (auio != NULL) {
458 uio_free(auio);
459 }
460 return error;
461 }
462
463 /*
464 * Scatter read system call.
465 */
466 int
readv(struct proc * p,struct readv_args * uap,user_ssize_t * retval)467 readv(struct proc *p, struct readv_args *uap, user_ssize_t *retval)
468 {
469 __pthread_testcancel(1);
470 return readv_nocancel(p, (struct readv_nocancel_args *)uap, retval);
471 }
472
473 int
readv_nocancel(struct proc * p,struct readv_nocancel_args * uap,user_ssize_t * retval)474 readv_nocancel(struct proc *p, struct readv_nocancel_args *uap, user_ssize_t *retval)
475 {
476 return readv_preadv_uio(p, uap->fd, uap->iovp, uap->iovcnt, 0, 0, retval);
477 }
478
479 /*
480 * Preadv system call
481 */
482 int
sys_preadv(struct proc * p,struct preadv_args * uap,user_ssize_t * retval)483 sys_preadv(struct proc *p, struct preadv_args *uap, user_ssize_t *retval)
484 {
485 __pthread_testcancel(1);
486 return sys_preadv_nocancel(p, (struct preadv_nocancel_args *)uap, retval);
487 }
488
489 int
sys_preadv_nocancel(struct proc * p,struct preadv_nocancel_args * uap,user_ssize_t * retval)490 sys_preadv_nocancel(struct proc *p, struct preadv_nocancel_args *uap, user_ssize_t *retval)
491 {
492 return readv_preadv_uio(p, uap->fd, uap->iovp, uap->iovcnt, uap->offset, 1, retval);
493 }
494
495 /*
496 * Write system call
497 *
498 * Returns: 0 Success
499 * EBADF
500 * fp_lookup:EBADF
501 * dofilewrite:???
502 */
503 int
write(struct proc * p,struct write_args * uap,user_ssize_t * retval)504 write(struct proc *p, struct write_args *uap, user_ssize_t *retval)
505 {
506 __pthread_testcancel(1);
507 return write_nocancel(p, (struct write_nocancel_args *)uap, retval);
508 }
509
/*
 * Non-cancellable write(): sequential write at the current file position.
 * Guarded fds (GUARD_WRITE) raise a guard exception instead of writing.
 */
int
write_nocancel(struct proc *p, struct write_nocancel_args *uap, user_ssize_t *retval)
{
	struct fileproc *fp;
	int error;
	int fd = uap->fd;

	AUDIT_ARG(fd, fd);

	error = fp_lookup(p, fd, &fp, 0);
	if (error) {
		return error;
	}
	/* The file must have been opened for writing. */
	if ((fp->f_flag & FWRITE) == 0) {
		error = EBADF;
	} else if (fp_isguarded(fp, GUARD_WRITE)) {
		/* Deliver the guard exception under the fd table lock. */
		proc_fdlock(p);
		error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
		proc_fdunlock(p);
	} else {
		/* Issue the I/O under the credential the file was opened with. */
		struct vfs_context context = *(vfs_context_current());
		context.vc_ucred = fp->fp_glob->fg_cred;

		error = dofilewrite(&context, fp, uap->cbuf, uap->nbyte,
		    (off_t)-1, 0, retval);
	}
	fp_drop(p, fd, fp, 0);
	return error;
}
539
540 /*
541 * pwrite system call
542 *
543 * Returns: 0 Success
544 * EBADF
545 * ESPIPE
546 * ENXIO
547 * EINVAL
548 * fp_lookup:EBADF
549 * dofilewrite:???
550 */
551 int
pwrite(struct proc * p,struct pwrite_args * uap,user_ssize_t * retval)552 pwrite(struct proc *p, struct pwrite_args *uap, user_ssize_t *retval)
553 {
554 __pthread_testcancel(1);
555 return pwrite_nocancel(p, (struct pwrite_nocancel_args *)uap, retval);
556 }
557
/*
 * Non-cancellable pwrite(): positioned write at uap->offset.  The fd must
 * be a seekable vnode (not a FIFO or tty) and the offset non-negative.
 */
int
pwrite_nocancel(struct proc *p, struct pwrite_nocancel_args *uap, user_ssize_t *retval)
{
	struct fileproc *fp;
	int error;
	int fd = uap->fd;
	vnode_t vp = (vnode_t)0;

	AUDIT_ARG(fd, fd);

	/* Only vnode-backed fds may pwrite; anything else yields ESPIPE. */
	error = fp_get_ftype(p, fd, DTYPE_VNODE, ESPIPE, &fp);
	if (error) {
		return error;
	}

	/* The file must have been opened for writing. */
	if ((fp->f_flag & FWRITE) == 0) {
		error = EBADF;
	} else if (fp_isguarded(fp, GUARD_WRITE)) {
		/* Guarded fds deliver a guard exception instead of writing. */
		proc_fdlock(p);
		error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
		proc_fdunlock(p);
	} else {
		struct vfs_context context = *vfs_context_current();
		context.vc_ucred = fp->fp_glob->fg_cred;

		/* FIFOs and terminals have no seekable position. */
		vp = (vnode_t)fp_get_data(fp);
		if (vnode_isfifo(vp)) {
			error = ESPIPE;
			goto errout;
		}
		if ((vp->v_flag & VISTTY)) {
			error = ENXIO;
			goto errout;
		}
		/* -1 is the "no offset" sentinel; reject it for pwrite. */
		if (uap->offset == (off_t)-1) {
			error = EINVAL;
			goto errout;
		}

		error = dofilewrite(&context, fp, uap->buf, uap->nbyte,
		    uap->offset, FOF_OFFSET, retval);
	}
errout:
	fp_drop(p, fd, fp, 0);

	/* kdebug trace; the 64-bit offset is split into hi/lo 32-bit words. */
	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE),
	    uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);

	return error;
}
608
609 /*
610 * Returns: 0 Success
611 * EINVAL
612 * <fo_write>:EPIPE
613 * <fo_write>:??? [indirect through struct fileops]
614 */
615 __private_extern__ int
dofilewrite(vfs_context_t ctx,struct fileproc * fp,user_addr_t bufp,user_size_t nbyte,off_t offset,int flags,user_ssize_t * retval)616 dofilewrite(vfs_context_t ctx, struct fileproc *fp,
617 user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
618 user_ssize_t *retval)
619 {
620 uio_t auio;
621 int error = 0;
622 user_ssize_t bytecnt;
623 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
624
625 if (nbyte > INT_MAX) {
626 *retval = 0;
627 return EINVAL;
628 }
629
630 if (vfs_context_is64bit(ctx)) {
631 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
632 &uio_buf[0], sizeof(uio_buf));
633 } else {
634 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
635 &uio_buf[0], sizeof(uio_buf));
636 }
637 if (uio_addiov(auio, bufp, nbyte) != 0) {
638 *retval = 0;
639 return EINVAL;
640 }
641
642 bytecnt = nbyte;
643 if ((error = fo_write(fp, auio, flags, ctx))) {
644 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
645 error == EINTR || error == EWOULDBLOCK)) {
646 error = 0;
647 }
648 /* The socket layer handles SIGPIPE */
649 if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
650 (fp->fp_glob->fg_lflags & FG_NOSIGPIPE) == 0) {
651 /* XXX Raise the signal on the thread? */
652 psignal(vfs_context_proc(ctx), SIGPIPE);
653 }
654 }
655 bytecnt -= uio_resid(auio);
656 if (bytecnt) {
657 os_atomic_or(&fp->fp_glob->fg_flag, FWASWRITTEN, relaxed);
658 }
659 *retval = bytecnt;
660
661 return error;
662 }
663
664 /*
665 * Returns: 0 Success
666 * EBADF
667 * ESPIPE
668 * ENXIO
669 * fp_lookup:EBADF
670 * fp_guard_exception:???
671 * valid_for_random_access:ESPIPE
672 * valid_for_random_access:ENXIO
673 */
674 static int
preparefilewrite(struct proc * p,struct fileproc ** fp_ret,int fd,int check_for_pwrite)675 preparefilewrite(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pwrite)
676 {
677 int error;
678 struct fileproc *fp;
679
680 AUDIT_ARG(fd, fd);
681
682 proc_fdlock_spin(p);
683
684 error = fp_lookup(p, fd, &fp, 1);
685
686 if (error) {
687 proc_fdunlock(p);
688 return error;
689 }
690 if ((fp->f_flag & FWRITE) == 0) {
691 error = EBADF;
692 goto ExitThisRoutine;
693 }
694 if (fp_isguarded(fp, GUARD_WRITE)) {
695 if ((error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE))) {
696 goto ExitThisRoutine;
697 }
698 }
699 if (check_for_pwrite) {
700 if ((error = valid_for_random_access(fp))) {
701 goto ExitThisRoutine;
702 }
703 }
704
705 *fp_ret = fp;
706
707 proc_fdunlock(p);
708 return 0;
709
710 ExitThisRoutine:
711 fp_drop(p, fd, fp, 1);
712 proc_fdunlock(p);
713 return error;
714 }
715
716 static int
writev_prwritev_uio(struct proc * p,int fd,user_addr_t user_iovp,int iovcnt,off_t offset,int is_pwritev,user_ssize_t * retval)717 writev_prwritev_uio(struct proc *p, int fd,
718 user_addr_t user_iovp, int iovcnt, off_t offset, int is_pwritev,
719 user_ssize_t *retval)
720 {
721 uio_t auio = NULL;
722 int error;
723 struct user_iovec *iovp;
724
725 /* Verify range before calling uio_create() */
726 if (iovcnt <= 0 || iovcnt > UIO_MAXIOV || offset < 0) {
727 return EINVAL;
728 }
729
730 /* allocate a uio large enough to hold the number of iovecs passed */
731 auio = uio_create(iovcnt, offset,
732 (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
733 UIO_WRITE);
734
735 /* get location of iovecs within the uio. then copyin the iovecs from
736 * user space.
737 */
738 iovp = uio_iovsaddr(auio);
739 if (iovp == NULL) {
740 error = ENOMEM;
741 goto ExitThisRoutine;
742 }
743 error = copyin_user_iovec_array(user_iovp,
744 IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
745 iovcnt, iovp);
746 if (error) {
747 goto ExitThisRoutine;
748 }
749
750 /* finalize uio_t for use and do the IO
751 */
752 error = uio_calculateresid(auio);
753 if (error) {
754 goto ExitThisRoutine;
755 }
756
757 error = wr_uio(p, fd, auio, is_pwritev, retval);
758
759 ExitThisRoutine:
760 if (auio != NULL) {
761 uio_free(auio);
762 }
763 return error;
764 }
765
766 /*
767 * Gather write system call
768 */
769 int
writev(struct proc * p,struct writev_args * uap,user_ssize_t * retval)770 writev(struct proc *p, struct writev_args *uap, user_ssize_t *retval)
771 {
772 __pthread_testcancel(1);
773 return writev_nocancel(p, (struct writev_nocancel_args *)uap, retval);
774 }
775
776 int
writev_nocancel(struct proc * p,struct writev_nocancel_args * uap,user_ssize_t * retval)777 writev_nocancel(struct proc *p, struct writev_nocancel_args *uap, user_ssize_t *retval)
778 {
779 return writev_prwritev_uio(p, uap->fd, uap->iovp, uap->iovcnt, 0, 0, retval);
780 }
781
782 /*
783 * Pwritev system call
784 */
785 int
sys_pwritev(struct proc * p,struct pwritev_args * uap,user_ssize_t * retval)786 sys_pwritev(struct proc *p, struct pwritev_args *uap, user_ssize_t *retval)
787 {
788 __pthread_testcancel(1);
789 return sys_pwritev_nocancel(p, (struct pwritev_nocancel_args *)uap, retval);
790 }
791
792 int
sys_pwritev_nocancel(struct proc * p,struct pwritev_nocancel_args * uap,user_ssize_t * retval)793 sys_pwritev_nocancel(struct proc *p, struct pwritev_nocancel_args *uap, user_ssize_t *retval)
794 {
795 return writev_prwritev_uio(p, uap->fd, uap->iovp, uap->iovcnt, uap->offset, 1, retval);
796 }
797
798 /*
799 * Returns: 0 Success
800 * preparefileread:EBADF
801 * preparefileread:ESPIPE
802 * preparefileread:ENXIO
803 * preparefileread:???
804 * fo_write:???
805 */
806 int
wr_uio(struct proc * p,int fd,uio_t uio,int is_pwritev,user_ssize_t * retval)807 wr_uio(struct proc *p, int fd, uio_t uio, int is_pwritev, user_ssize_t *retval)
808 {
809 struct fileproc *fp;
810 int error;
811 int flags;
812
813 if ((error = preparefilewrite(p, &fp, fd, is_pwritev))) {
814 return error;
815 }
816
817 flags = is_pwritev ? FOF_OFFSET : 0;
818 error = do_uiowrite(p, fp, uio, flags, retval);
819
820 fp_drop(p, fd, fp, 0);
821
822 return error;
823 }
824
825 int
do_uiowrite(struct proc * p,struct fileproc * fp,uio_t uio,int flags,user_ssize_t * retval)826 do_uiowrite(struct proc *p, struct fileproc *fp, uio_t uio, int flags, user_ssize_t *retval)
827 {
828 int error;
829 user_ssize_t count;
830 struct vfs_context context = *vfs_context_current();
831
832 count = uio_resid(uio);
833
834 context.vc_ucred = fp->f_cred;
835 error = fo_write(fp, uio, flags, &context);
836 if (error) {
837 if (uio_resid(uio) != count && (error == ERESTART ||
838 error == EINTR || error == EWOULDBLOCK)) {
839 error = 0;
840 }
841 /* The socket layer handles SIGPIPE */
842 if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
843 (fp->fp_glob->fg_lflags & FG_NOSIGPIPE) == 0) {
844 psignal(p, SIGPIPE);
845 }
846 }
847 count -= uio_resid(uio);
848 if (count) {
849 os_atomic_or(&fp->fp_glob->fg_flag, FWASWRITTEN, relaxed);
850 }
851 *retval = count;
852
853 return error;
854 }
855
856 /*
857 * Returns: 0 Success
858 * preparefileread:EBADF
859 * preparefileread:ESPIPE
860 * preparefileread:ENXIO
861 * fo_read:???
862 */
863 int
rd_uio(struct proc * p,int fdes,uio_t uio,int is_preadv,user_ssize_t * retval)864 rd_uio(struct proc *p, int fdes, uio_t uio, int is_preadv, user_ssize_t *retval)
865 {
866 struct fileproc *fp;
867 int error;
868 user_ssize_t count;
869 struct vfs_context context = *vfs_context_current();
870
871 if ((error = preparefileread(p, &fp, fdes, is_preadv))) {
872 return error;
873 }
874
875 count = uio_resid(uio);
876
877 context.vc_ucred = fp->f_cred;
878
879 int flags = is_preadv ? FOF_OFFSET : 0;
880 error = fo_read(fp, uio, flags, &context);
881
882 if (error) {
883 if (uio_resid(uio) != count && (error == ERESTART ||
884 error == EINTR || error == EWOULDBLOCK)) {
885 error = 0;
886 }
887 }
888 *retval = count - uio_resid(uio);
889
890 fp_drop(p, fdes, fp, 0);
891
892 return error;
893 }
894
895 /*
896 * Ioctl system call
897 *
898 * Returns: 0 Success
899 * EBADF
900 * ENOTTY
901 * ENOMEM
902 * ESRCH
903 * copyin:EFAULT
904 * copyoutEFAULT
905 * fp_lookup:EBADF Bad file descriptor
906 * fo_ioctl:???
907 */
908 int
ioctl(struct proc * p,struct ioctl_args * uap,__unused int32_t * retval)909 ioctl(struct proc *p, struct ioctl_args *uap, __unused int32_t *retval)
910 {
911 struct fileproc *fp = NULL;
912 int error = 0;
913 u_int size = 0;
914 caddr_t datap = NULL, memp = NULL;
915 boolean_t is64bit = FALSE;
916 int tmp = 0;
917 #define STK_PARAMS 128
918 char stkbuf[STK_PARAMS] = {};
919 int fd = uap->fd;
920 u_long com = uap->com;
921 struct vfs_context context = *vfs_context_current();
922
923 AUDIT_ARG(fd, uap->fd);
924 AUDIT_ARG(addr, uap->data);
925
926 is64bit = proc_is64bit(p);
927 #if CONFIG_AUDIT
928 if (is64bit) {
929 AUDIT_ARG(value64, com);
930 } else {
931 AUDIT_ARG(cmd, CAST_DOWN_EXPLICIT(int, com));
932 }
933 #endif /* CONFIG_AUDIT */
934
935 /*
936 * Interpret high order word to find amount of data to be
937 * copied to/from the user's address space.
938 */
939 size = IOCPARM_LEN(com);
940 if (size > IOCPARM_MAX) {
941 return ENOTTY;
942 }
943 if (size > sizeof(stkbuf)) {
944 memp = (caddr_t)kalloc_data(size, Z_WAITOK);
945 if (memp == 0) {
946 return ENOMEM;
947 }
948 datap = memp;
949 } else {
950 datap = &stkbuf[0];
951 }
952 if (com & IOC_IN) {
953 if (size) {
954 error = copyin(uap->data, datap, size);
955 if (error) {
956 goto out_nofp;
957 }
958 } else {
959 /* XXX - IOC_IN and no size? we should proably return an error here!! */
960 if (is64bit) {
961 *(user_addr_t *)datap = uap->data;
962 } else {
963 *(uint32_t *)datap = (uint32_t)uap->data;
964 }
965 }
966 } else if ((com & IOC_OUT) && size) {
967 /*
968 * Zero the buffer so the user always
969 * gets back something deterministic.
970 */
971 bzero(datap, size);
972 } else if (com & IOC_VOID) {
973 /* XXX - this is odd since IOC_VOID means no parameters */
974 if (is64bit) {
975 *(user_addr_t *)datap = uap->data;
976 } else {
977 *(uint32_t *)datap = (uint32_t)uap->data;
978 }
979 }
980
981 proc_fdlock(p);
982 error = fp_lookup(p, fd, &fp, 1);
983 if (error) {
984 proc_fdunlock(p);
985 goto out_nofp;
986 }
987
988 AUDIT_ARG(file, p, fp);
989
990 if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
991 error = EBADF;
992 goto out;
993 }
994
995 context.vc_ucred = fp->fp_glob->fg_cred;
996
997 #if CONFIG_MACF
998 error = mac_file_check_ioctl(context.vc_ucred, fp->fp_glob, com);
999 if (error) {
1000 goto out;
1001 }
1002 #endif
1003
1004 switch (com) {
1005 case FIONCLEX:
1006 fp->fp_flags &= ~FP_CLOEXEC;
1007 break;
1008
1009 case FIOCLEX:
1010 fp->fp_flags |= FP_CLOEXEC;
1011 break;
1012
1013 case FIONBIO:
1014 // FIXME (rdar://54898652)
1015 //
1016 // this code is broken if fnctl(F_SETFL), ioctl() are
1017 // called concurrently for the same fileglob.
1018 if ((tmp = *(int *)datap)) {
1019 os_atomic_or(&fp->f_flag, FNONBLOCK, relaxed);
1020 } else {
1021 os_atomic_andnot(&fp->f_flag, FNONBLOCK, relaxed);
1022 }
1023 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, &context);
1024 break;
1025
1026 case FIOASYNC:
1027 // FIXME (rdar://54898652)
1028 //
1029 // this code is broken if fnctl(F_SETFL), ioctl() are
1030 // called concurrently for the same fileglob.
1031 if ((tmp = *(int *)datap)) {
1032 os_atomic_or(&fp->f_flag, FASYNC, relaxed);
1033 } else {
1034 os_atomic_andnot(&fp->f_flag, FASYNC, relaxed);
1035 }
1036 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, &context);
1037 break;
1038
1039 case FIOSETOWN:
1040 tmp = *(int *)datap;
1041 if (fp->f_type == DTYPE_SOCKET) {
1042 ((struct socket *)fp_get_data(fp))->so_pgid = tmp;
1043 break;
1044 }
1045 if (fp->f_type == DTYPE_PIPE) {
1046 error = fo_ioctl(fp, TIOCSPGRP, (caddr_t)&tmp, &context);
1047 break;
1048 }
1049 if (tmp <= 0) {
1050 tmp = -tmp;
1051 } else {
1052 struct proc *p1 = proc_find(tmp);
1053 if (p1 == 0) {
1054 error = ESRCH;
1055 break;
1056 }
1057 tmp = p1->p_pgrpid;
1058 proc_rele(p1);
1059 }
1060 error = fo_ioctl(fp, TIOCSPGRP, (caddr_t)&tmp, &context);
1061 break;
1062
1063 case FIOGETOWN:
1064 if (fp->f_type == DTYPE_SOCKET) {
1065 *(int *)datap = ((struct socket *)fp_get_data(fp))->so_pgid;
1066 break;
1067 }
1068 error = fo_ioctl(fp, TIOCGPGRP, datap, &context);
1069 *(int *)datap = -*(int *)datap;
1070 break;
1071
1072 default:
1073 error = fo_ioctl(fp, com, datap, &context);
1074 /*
1075 * Copy any data to user, size was
1076 * already set and checked above.
1077 */
1078 if (error == 0 && (com & IOC_OUT) && size) {
1079 error = copyout(datap, uap->data, (u_int)size);
1080 }
1081 break;
1082 }
1083 out:
1084 fp_drop(p, fd, fp, 1);
1085 proc_fdunlock(p);
1086
1087 out_nofp:
1088 if (memp) {
1089 kfree_data(memp, size);
1090 }
1091 return error;
1092 }
1093
1094 int selwait;
1095 #define SEL_FIRSTPASS 1
1096 #define SEL_SECONDPASS 2
1097 static int selprocess(struct proc *p, int error, int sel_pass);
1098 static int selscan(struct proc *p, struct _select * sel, struct _select_data * seldata,
1099 int nfd, int32_t *retval, int sel_pass, struct select_set *selset);
1100 static int selcount(struct proc *p, u_int32_t *ibits, int nfd, int *count);
1101 static int seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup);
1102 static int seldrop(struct proc *p, u_int32_t *ibits, int nfd, int lim);
1103 static int select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeout, int32_t *retval);
1104
1105 /*
1106 * This is used for the special device nodes that do not implement
1107 * a proper kevent filter (see filt_specattach).
1108 *
1109 * In order to enable kevents on those, the spec_filtops will pretend
1110 * to call select, and try to sniff the selrecord(), if it observes one,
1111 * the knote is attached, which pairs with selwakeup() or selthreadclear().
1112 *
1113 * The last issue remaining, is that we need to serialize filt_specdetach()
1114 * with this, but it really can't know the "selinfo" or any locking domain.
1115 * To make up for this, We protect knote list operations with a global lock,
1116 * which give us a safe shared locking domain.
1117 *
1118 * Note: It is a little distasteful, but we really have very few of those.
1119 * The big problem here is that sharing a lock domain without
1120 * any kind of shared knowledge is a little complicated.
1121 *
1122 * 1. filters can really implement their own kqueue integration
1123 * to side step this,
1124 *
1125 * 2. There's an opportunity to pick a private lock in selspec_attach()
1126 * because both the selinfo and the knote are locked at that time.
1127 * The cleanup story is however a little complicated.
1128 */
/* Global spin lock serializing si_note knote-list operations for SI_SELSPEC
 * selinfos (see the block comment above for why a shared domain is needed). */
static LCK_GRP_DECLARE(selspec_grp, "spec_filtops");
static LCK_SPIN_DECLARE(selspec_lock, &selspec_grp);
1131
/*
 * selspec_attach
 *
 * Hook a knote up to a selinfo so that selwakeup()/selthreadclear() can
 * post events to it (see the SI_SELSPEC discussion above).
 *
 * Conditions:
 *	The "primitive" lock is held.
 *	The knote lock is held.
 */
void
selspec_attach(struct knote *kn, struct selinfo *si)
{
	struct selinfo *cur = os_atomic_load(&kn->kn_hook, relaxed);

	if (cur == NULL) {
		/*
		 * First attach: flag the selinfo as kevent-sniffed, then
		 * publish the hook and enqueue the knote under the global
		 * selspec lock (the shared locking domain).
		 */
		si->si_flags |= SI_SELSPEC;
		lck_spin_lock(&selspec_lock);
		kn->kn_hook = si;
		KNOTE_ATTACH(&si->si_note, kn);
		lck_spin_unlock(&selspec_lock);
	} else {
		/*
		 * selspec_attach() can be called from e.g. filt_spectouch()
		 * which might be called before any event was dequeued.
		 *
		 * It is hence not impossible for the knote already be hooked.
		 *
		 * Note that selwakeup_internal() could possibly
		 * already have cleared this pointer.  This is a race
		 * that filt_specprocess will debounce.
		 */
		assert(si->si_flags & SI_SELSPEC);
		assert(cur == si);
	}
}
1162
/*
 * selspec_detach
 *
 * Undo selspec_attach(): unhook the knote from its selinfo, if any.
 *
 * Conditions:
 *	The "primitive" lock is _not_ held.
 *	The knote lock is held.
 */
void
selspec_detach(struct knote *kn)
{
	/*
	 * kn_hook always becomes non NULL under the knote lock.
	 * Seeing "NULL" can't be a false positive.
	 */
	if (kn->kn_hook == NULL) {
		return;
	}

	lck_spin_lock(&selspec_lock);
	if (kn->kn_hook) {
		/* re-check under the lock: selwakeup_internal() may have
		 * raced with us and already cleared the hook */
		struct selinfo *sip = kn->kn_hook;

		kn->kn_hook = NULL;
		KNOTE_DETACH(&sip->si_note, kn);
	}
	lck_spin_unlock(&selspec_lock);
}
1187
/*
 * Select system call.
 *
 * Cancellation-point wrapper around select_nocancel().
 *
 * Returns:	0			Success
 *		EINVAL			Invalid argument
 *		EAGAIN			Nonconformant error if allocation fails
 */
int
select(struct proc *p, struct select_args *uap, int32_t *retval)
{
	/* mark a pthread cancellation point before doing any work */
	__pthread_testcancel(1);
	return select_nocancel(p, (struct select_nocancel_args *)uap, retval);
}
1201
1202 int
select_nocancel(struct proc * p,struct select_nocancel_args * uap,int32_t * retval)1203 select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retval)
1204 {
1205 uint64_t timeout = 0;
1206
1207 if (uap->tv) {
1208 int err;
1209 struct timeval atv;
1210 if (IS_64BIT_PROCESS(p)) {
1211 struct user64_timeval atv64;
1212 err = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
1213 /* Loses resolution - assume timeout < 68 years */
1214 atv.tv_sec = (__darwin_time_t)atv64.tv_sec;
1215 atv.tv_usec = atv64.tv_usec;
1216 } else {
1217 struct user32_timeval atv32;
1218 err = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
1219 atv.tv_sec = atv32.tv_sec;
1220 atv.tv_usec = atv32.tv_usec;
1221 }
1222 if (err) {
1223 return err;
1224 }
1225
1226 if (itimerfix(&atv)) {
1227 err = EINVAL;
1228 return err;
1229 }
1230
1231 clock_absolutetime_interval_to_deadline(tvtoabstime(&atv), &timeout);
1232 }
1233
1234 return select_internal(p, uap, timeout, retval);
1235 }
1236
/*
 * pselect system call.
 *
 * Cancellation-point wrapper around pselect_nocancel().
 */
int
pselect(struct proc *p, struct pselect_args *uap, int32_t *retval)
{
	/* mark a pthread cancellation point before doing any work */
	__pthread_testcancel(1);
	return pselect_nocancel(p, (struct pselect_nocancel_args *)uap, retval);
}
1243
/*
 * pselect_nocancel
 *
 * Copy in the optional timeout and signal mask, temporarily install the
 * mask, and run the common select path.
 *
 * NOTE: select_internal() may not return here directly — if the thread
 * blocks it resumes through selcontinue()/selprocess() on a fresh stack,
 * and selprocess() also restores the mask (via UT_SAS_OLDMASK).  That is
 * why the restore below is conditional on the flag still being set.
 */
int
pselect_nocancel(struct proc *p, struct pselect_nocancel_args *uap, int32_t *retval)
{
	int err;
	struct uthread *ut;
	uint64_t timeout = 0;

	if (uap->ts) {
		struct timespec ts;

		/* ABI-specific copyin of the caller's struct timespec */
		if (IS_64BIT_PROCESS(p)) {
			struct user64_timespec ts64;
			err = copyin(uap->ts, (caddr_t)&ts64, sizeof(ts64));
			ts.tv_sec = (__darwin_time_t)ts64.tv_sec;
			ts.tv_nsec = (long)ts64.tv_nsec;
		} else {
			struct user32_timespec ts32;
			err = copyin(uap->ts, (caddr_t)&ts32, sizeof(ts32));
			ts.tv_sec = ts32.tv_sec;
			ts.tv_nsec = ts32.tv_nsec;
		}
		if (err) {
			return err;
		}

		if (!timespec_is_valid(&ts)) {
			return EINVAL;
		}
		clock_absolutetime_interval_to_deadline(tstoabstime(&ts), &timeout);
	}

	ut = current_uthread();

	if (uap->mask != USER_ADDR_NULL) {
		/* save current mask, then copyin and set new mask */
		sigset_t newset;
		err = copyin(uap->mask, &newset, sizeof(sigset_t));
		if (err) {
			return err;
		}
		ut->uu_oldmask = ut->uu_sigmask;
		ut->uu_flag |= UT_SAS_OLDMASK;
		/* never allow blocking the unblockable signals */
		ut->uu_sigmask = (newset & ~sigcantmask);
	}

	err = select_internal(p, (struct select_nocancel_args *)uap, timeout, retval);

	if (err != EINTR && ut->uu_flag & UT_SAS_OLDMASK) {
		/*
		 * Restore old mask (direct return case). NOTE: EINTR can also be returned
		 * if the thread is cancelled. In that case, we don't reset the signal
		 * mask to its original value (which usually happens in the signal
		 * delivery path). This behavior is permitted by POSIX.
		 */
		ut->uu_sigmask = ut->uu_oldmask;
		ut->uu_oldmask = 0;
		ut->uu_flag &= ~UT_SAS_OLDMASK;
	}

	return err;
}
1305
1306 void
select_cleanup_uthread(struct _select * sel)1307 select_cleanup_uthread(struct _select *sel)
1308 {
1309 kfree_data(sel->ibits, 2 * sel->nbytes);
1310 sel->ibits = sel->obits = NULL;
1311 sel->nbytes = 0;
1312 }
1313
1314 static int
select_grow_uthread_cache(struct _select * sel,uint32_t nbytes)1315 select_grow_uthread_cache(struct _select *sel, uint32_t nbytes)
1316 {
1317 uint32_t *buf;
1318
1319 buf = kalloc_data(2 * nbytes, Z_WAITOK | Z_ZERO);
1320 if (buf) {
1321 select_cleanup_uthread(sel);
1322 sel->ibits = buf;
1323 sel->obits = buf + nbytes / sizeof(uint32_t);
1324 sel->nbytes = nbytes;
1325 return true;
1326 }
1327 return false;
1328 }
1329
1330 static void
select_bzero_uthread_cache(struct _select * sel)1331 select_bzero_uthread_cache(struct _select *sel)
1332 {
1333 bzero(sel->ibits, sel->nbytes * 2);
1334 }
1335
/*
 * Generic implementation of {,p}select. Care: we type-pun uap across the two
 * syscalls, which differ slightly. The first 4 arguments (nfds and the fd sets)
 * are identical. The 5th (timeout) argument points to different types, so we
 * unpack in the syscall-specific code, but the generic code still does a null
 * check on this argument to determine if a timeout was specified.
 *
 * Parameters:	p			Process performing the select
 *		uap			Syscall arguments (type-punned, see above)
 *		timeout			Absolute deadline, or 0 for none
 *		retval			Per-thread syscall return area
 */
static int
select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeout, int32_t *retval)
{
	struct uthread *uth = current_uthread();
	struct _select *sel = &uth->uu_select;
	struct _select_data *seldata = &uth->uu_save.uus_select_data;
	int error = 0;
	u_int ni, nw;

	*retval = 0;

	/* stash arguments where selcontinue()/selprocess() can find them */
	seldata->abstime = timeout;
	seldata->args = uap;
	seldata->retval = retval;
	seldata->count = 0;

	if (uap->nd < 0) {
		return EINVAL;
	}

	if (uap->nd > p->p_fd.fd_nfiles) {
		uap->nd = p->p_fd.fd_nfiles; /* forgiving; slightly wrong */
	}
	/* nw: fd_mask words per set; ni: the same, in bytes */
	nw = howmany(uap->nd, NFDBITS);
	ni = nw * sizeof(fd_mask);

	/*
	 * if the previously allocated space for the bits is smaller than
	 * what is requested or no space has yet been allocated for this
	 * thread, allocate enough space now.
	 *
	 * Note: If this process fails, select() will return EAGAIN; this
	 * is the same thing pool() returns in a no-memory situation, but
	 * it is not a POSIX compliant error code for select().
	 */
	if (sel->nbytes >= (3 * ni)) {
		select_bzero_uthread_cache(sel);
	} else if (!select_grow_uthread_cache(sel, 3 * ni)) {
		return EAGAIN;
	}

	/*
	 * get the bits from the user address space
	 */
#define getbits(name, x) \
	(uap->name ? copyin(uap->name, &sel->ibits[(x) * nw], ni) : 0)

	if ((error = getbits(in, 0))) {
		return error;
	}
	if ((error = getbits(ou, 1))) {
		return error;
	}
	if ((error = getbits(ex, 2))) {
		return error;
	}
#undef getbits

	/* take an fp_iocount reference on every fd named in the input sets */
	if ((error = selcount(p, sel->ibits, uap->nd, &seldata->count))) {
		return error;
	}

	/* lazily allocate this thread's private select waitq set */
	if (uth->uu_selset == NULL) {
		uth->uu_selset = select_set_alloc();
	}
	return selprocess(p, 0, SEL_FIRSTPASS);
}
1410
/*
 * selcontinue
 *
 * Continuation handed to tsleep1() by selprocess(): the thread resumes
 * here (on a fresh kernel stack) after blocking and re-enters
 * selprocess() for the second pass.
 */
static int
selcontinue(int error)
{
	return selprocess(current_proc(), error, SEL_SECONDPASS);
}
1416
1417
/*
 * selprocess
 *
 * Core select() state machine, entered directly from select_internal()
 * (SEL_FIRSTPASS) and again via the selcontinue() continuation after a
 * block (SEL_SECONDPASS).  Loops between scanning and sleeping until an
 * event fires, the deadline passes, or an error/signal occurs.
 *
 * Parameters:	p			Process performing the select
 *		error			The error code from our caller
 *		sel_pass		The pass we are on
 */
int
selprocess(struct proc *p, int error, int sel_pass)
{
	struct uthread *uth = current_uthread();
	struct _select *sel = &uth->uu_select;
	struct _select_data *seldata = &uth->uu_save.uus_select_data;
	struct select_nocancel_args *uap = seldata->args;
	int *retval = seldata->retval;

	int unwind = 1;
	int prepost = 0;
	int somewakeup = 0;
	int doretry = 0;
	wait_result_t wait_result;

	/*
	 * No fp_iocount references to unwind if we errored out before the
	 * first scan, or if no descriptors were counted to begin with.
	 */
	if ((error != 0) && (sel_pass == SEL_FIRSTPASS)) {
		unwind = 0;
	}
	if (seldata->count == 0) {
		unwind = 0;
	}
retry:
	if (error != 0) {
		goto done;
	}

	OSBitOrAtomic(P_SELECT, &p->p_flag);

	/* skip scans if the select is just for timeouts */
	if (seldata->count) {
		error = selscan(p, sel, seldata, uap->nd, retval, sel_pass,
		    uth->uu_selset);
		if (error || *retval) {
			goto done;
		}
		if (prepost || somewakeup) {
			/*
			 * if the select of log, then we can wakeup and
			 * discover some one else already read the data;
			 * go to select again if time permits
			 */
			prepost = 0;
			somewakeup = 0;
			doretry = 1;
		}
	}

	/* deadline already passed?  then we are done */
	if (uap->tv) {
		uint64_t now;

		clock_get_uptime(&now);
		if (now >= seldata->abstime) {
			goto done;
		}
	}

	if (doretry) {
		/* cleanup obits and try again */
		doretry = 0;
		sel_pass = SEL_FIRSTPASS;
		goto retry;
	}

	/*
	 * To effect a poll, the timeout argument should be
	 * non-nil, pointing to a zero-valued timeval structure.
	 */
	if (uap->tv && seldata->abstime == 0) {
		goto done;
	}

	/* No spurious wakeups due to colls,no need to check for them */
	if ((sel_pass == SEL_SECONDPASS) || ((p->p_flag & P_SELECT) == 0)) {
		sel_pass = SEL_FIRSTPASS;
		goto retry;
	}

	OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);

	/* if the select is just for timeout skip check */
	if (seldata->count && (sel_pass == SEL_SECONDPASS)) {
		panic("selprocess: 2nd pass assertwaiting");
	}

	/* park on the thread's select set; a prepost means events fired */
	wait_result = waitq_assert_wait64_leeway(uth->uu_selset,
	    NO_EVENT64, THREAD_ABORTSAFE,
	    TIMEOUT_URGENCY_USER_NORMAL,
	    seldata->abstime,
	    TIMEOUT_NO_LEEWAY);
	if (wait_result != THREAD_AWAKENED) {
		/* there are no preposted events */
		error = tsleep1(NULL, PSOCK | PCATCH,
		    "select", 0, selcontinue);
	} else {
		prepost = 1;
		error = 0;
	}

	if (error == 0) {
		sel_pass = SEL_SECONDPASS;
		if (!prepost) {
			somewakeup = 1;
		}
		goto retry;
	}
done:
	if (unwind) {
		seldrop(p, sel->ibits, uap->nd, seldata->count);
		select_set_reset(uth->uu_selset);
	}
	OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
	/* select is not restarted after signals... */
	if (error == ERESTART) {
		error = EINTR;
	}
	if (error == EWOULDBLOCK) {
		error = 0;
	}

	/* copy the output bit vectors back out to user space */
	if (error == 0) {
		uint32_t nw = howmany(uap->nd, NFDBITS);
		uint32_t ni = nw * sizeof(fd_mask);

#define putbits(name, x) \
	(uap->name ? copyout(&sel->obits[(x) * nw], uap->name, ni) : 0)
		int e0 = putbits(in, 0);
		int e1 = putbits(ou, 1);
		int e2 = putbits(ex, 2);

		error = e0 ?: e1 ?: e2;
#undef putbits
	}

	if (error != EINTR && sel_pass == SEL_SECONDPASS && uth->uu_flag & UT_SAS_OLDMASK) {
		/* restore signal mask - continuation case */
		uth->uu_sigmask = uth->uu_oldmask;
		uth->uu_oldmask = 0;
		uth->uu_flag &= ~UT_SAS_OLDMASK;
	}

	return error;
}
1566
1567
1568 /**
1569 * remove the fileproc's underlying waitq from the supplied waitq set;
1570 * clear FP_INSELECT when appropriate
1571 *
1572 * Parameters:
1573 * fp File proc that is potentially currently in select
1574 * selset Waitq set to which the fileproc may belong
1575 * (usually this is the thread's private waitq set)
1576 * Conditions:
1577 * proc_fdlock is held
1578 */
1579 static void
selunlinkfp(struct fileproc * fp,struct select_set * selset)1580 selunlinkfp(struct fileproc *fp, struct select_set *selset)
1581 {
1582 if (fp->fp_flags & FP_INSELECT) {
1583 if (fp->fp_guard_attrs) {
1584 if (fp->fp_guard->fpg_wset == selset) {
1585 fp->fp_guard->fpg_wset = NULL;
1586 fp->fp_flags &= ~FP_INSELECT;
1587 }
1588 } else {
1589 if (fp->fp_wset == selset) {
1590 fp->fp_wset = NULL;
1591 fp->fp_flags &= ~FP_INSELECT;
1592 }
1593 }
1594 }
1595 }
1596
1597 /**
1598 * connect a fileproc to the given selset, potentially bridging to a waitq
1599 * pointed to indirectly by wq_data
1600 *
1601 * Parameters:
1602 * fp File proc potentially currently in select
1603 * selset Waitq set to which the fileproc should now belong
1604 * (usually this is the thread's private waitq set)
1605 *
1606 * Conditions:
1607 * proc_fdlock is held
1608 */
1609 static void
sellinkfp(struct fileproc * fp,struct select_set * selset,waitq_link_t * linkp)1610 sellinkfp(struct fileproc *fp, struct select_set *selset, waitq_link_t *linkp)
1611 {
1612 if ((fp->fp_flags & FP_INSELECT) == 0) {
1613 if (fp->fp_guard_attrs) {
1614 fp->fp_guard->fpg_wset = selset;
1615 } else {
1616 fp->fp_wset = selset;
1617 }
1618 fp->fp_flags |= FP_INSELECT;
1619 } else {
1620 fp->fp_flags |= FP_SELCONFLICT;
1621 if (linkp->wqlh == NULL) {
1622 *linkp = waitq_link_alloc(WQT_SELECT_SET);
1623 }
1624 select_set_link(&select_conflict_queue, selset, linkp);
1625 }
1626 }
1627
1628
/*
 * selscan
 *
 * Scan every fd named in the input bit vectors, calling fo_select() on
 * each; set the corresponding output bit for descriptors that are ready.
 * On the first pass, also link each fileproc to the thread's waitq set
 * so a later wakeup can find us.
 *
 * Parameters:	p			Process performing the select
 *		sel			The per-thread select context structure
 *		seldata			Saved select state (count, deadline, ...)
 *		nfd			The number of file descriptors to scan
 *		retval			The per thread system call return area
 *		sel_pass		Which pass this is; allowed values are
 *						SEL_FIRSTPASS and SEL_SECONDPASS
 *		selset			The per thread wait queue set
 *
 * Returns:	0			Success
 *		EIO			Invalid p->p_fd field XXX Obsolete?
 *		EBADF			One of the files in the bit vector is
 *						invalid.
 */
static int
selscan(struct proc *p, struct _select *sel, struct _select_data * seldata,
    int nfd, int32_t *retval, int sel_pass, struct select_set *selset)
{
	int msk, i, j, fd;
	u_int32_t bits;
	struct fileproc *fp;
	int n = 0;              /* count of bits */
	int nc = 0;             /* bit vector offset (nc'th bit) */
	static int flag[3] = { FREAD, FWRITE, 0 };
	u_int32_t *iptr, *optr;
	u_int nw;
	u_int32_t *ibits, *obits;
	int count;
	struct vfs_context context = {
		.vc_thread = current_thread(),
	};
	waitq_link_t link = WQL_NULL;
	void *s_data;

	ibits = sel->ibits;
	obits = sel->obits;

	nw = howmany(nfd, NFDBITS);

	count = seldata->count;

	nc = 0;
	if (!count) {
		*retval = 0;
		return 0;
	}

	if (sel_pass == SEL_FIRSTPASS) {
		/*
		 * Make sure the waitq-set is all clean:
		 *
		 * select loops until it finds at least one event, however it
		 * doesn't mean that the event that woke up select is still
		 * fired by the time the second pass runs, and then
		 * select_internal will loop back to a first pass.
		 */
		select_set_reset(selset);
		s_data = &link;
	} else {
		/* NULL s_data tells selrecord() not to record on pass two */
		s_data = NULL;
	}

	proc_fdlock(p);
	for (msk = 0; msk < 3; msk++) {
		/* msk indexes the read/write/except sets, in that order */
		iptr = (u_int32_t *)&ibits[msk * nw];
		optr = (u_int32_t *)&obits[msk * nw];

		for (i = 0; i < nfd; i += NFDBITS) {
			bits = iptr[i / NFDBITS];

			/* visit each set bit in this word, lowest first */
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1U << j);

				fp = fp_get_noref_locked(p, fd);
				if (fp == NULL) {
					/*
					 * If we abort because of a bad
					 * fd, let the caller unwind...
					 */
					proc_fdunlock(p);
					return EBADF;
				}
				if (sel_pass == SEL_SECONDPASS) {
					selunlinkfp(fp, selset);
				} else if (link.wqlh == NULL) {
					/* pre-allocate a link for sellinkfp()/selrecord() */
					link = waitq_link_alloc(WQT_SELECT_SET);
				}

				context.vc_ucred = fp->f_cred;

				/* The select; set the bit, if true */
				if (fo_select(fp, flag[msk], s_data, &context)) {
					optr[fd / NFDBITS] |= (1U << (fd % NFDBITS));
					n++;
				}
				if (sel_pass == SEL_FIRSTPASS) {
					/*
					 * Hook up the thread's waitq set either to
					 * the fileproc structure, or to the global
					 * conflict queue: but only on the first
					 * select pass.
					 */
					sellinkfp(fp, selset, &link);
				}
				nc++;
			}
		}
	}
	proc_fdunlock(p);

	/* free the pre-allocated link if nothing ended up consuming it */
	if (link.wqlh) {
		waitq_link_free(WQT_SELECT_SET, link);
	}

	*retval = n;
	return 0;
}
1748
1749 static int poll_callback(struct kevent_qos_s *, kevent_ctx_t);
1750
/*
 * poll system call.
 *
 * Cancellation-point wrapper around poll_nocancel().
 */
int
poll(struct proc *p, struct poll_args *uap, int32_t *retval)
{
	/* mark a pthread cancellation point before doing any work */
	__pthread_testcancel(1);
	return poll_nocancel(p, (struct poll_nocancel_args *)uap, retval);
}
1757
1758
/*
 * poll_nocancel
 *
 * Implement poll() on top of kqueue: copy in the pollfd array, register
 * one kevent per requested event class per fd, scan (and possibly wait)
 * for the kevents to fire via poll_callback(), then copy the updated
 * revents back out.
 *
 * Returns:	0			Success (retval = # of fds with events)
 *		EINVAL			nfds exceeds the allowed limits
 *		EAGAIN			kqueue or pollfd allocation failed
 *	copyin/copyout:EFAULT		Bad user pollfd array
 *	kqueue_scan:???			Errors from the scan/wait
 */
int
poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
{
	struct pollfd *fds = NULL;
	struct kqueue *kq = NULL;
	int error = 0;
	u_int nfds = uap->nfds;
	u_int rfds = 0;
	rlim_t nofile = proc_limitgetcur(p, RLIMIT_NOFILE);
	size_t ni = nfds * sizeof(struct pollfd);

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if (nfds > OPEN_MAX ||
	    (nfds > nofile && (proc_suser(p) || nfds > FD_SETSIZE))) {
		return EINVAL;
	}

	kq = kqueue_alloc(p);
	if (kq == NULL) {
		return EAGAIN;
	}

	if (nfds) {
		fds = (struct pollfd *)kalloc_data(ni, Z_WAITOK);
		if (NULL == fds) {
			error = EAGAIN;
			goto out;
		}

		error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd));
		if (error) {
			goto out;
		}
	}

	/* JMM - all this P_SELECT stuff is bogus */
	OSBitOrAtomic(P_SELECT, &p->p_flag);
	for (u_int i = 0; i < nfds; i++) {
		short events = fds[i].events;
		__assert_only int rc;

		/* per spec, ignore fd values below zero */
		if (fds[i].fd < 0) {
			fds[i].revents = 0;
			continue;
		}

		/* convert the poll event into a kqueue kevent */
		struct kevent_qos_s kev = {
			.ident = fds[i].fd,
			.flags = EV_ADD | EV_ONESHOT | EV_POLL,
			.udata = CAST_USER_ADDR_T(&fds[i])
		};

		/* Handle input events */
		if (events & (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND | POLLHUP)) {
			kev.filter = EVFILT_READ;
			if (events & (POLLPRI | POLLRDBAND)) {
				kev.flags |= EV_OOBAND;
			}
			rc = kevent_register(kq, &kev, NULL);
			assert((rc & FILTER_REGISTER_WAIT) == 0);
		}

		/* Handle output events */
		if ((kev.flags & EV_ERROR) == 0 &&
		    (events & (POLLOUT | POLLWRNORM | POLLWRBAND))) {
			kev.filter = EVFILT_WRITE;
			rc = kevent_register(kq, &kev, NULL);
			assert((rc & FILTER_REGISTER_WAIT) == 0);
		}

		/* Handle BSD extension vnode events */
		if ((kev.flags & EV_ERROR) == 0 &&
		    (events & (POLLEXTEND | POLLATTRIB | POLLNLINK | POLLWRITE))) {
			kev.filter = EVFILT_VNODE;
			kev.fflags = 0;
			if (events & POLLEXTEND) {
				kev.fflags |= NOTE_EXTEND;
			}
			if (events & POLLATTRIB) {
				kev.fflags |= NOTE_ATTRIB;
			}
			if (events & POLLNLINK) {
				kev.fflags |= NOTE_LINK;
			}
			if (events & POLLWRITE) {
				kev.fflags |= NOTE_WRITE;
			}
			rc = kevent_register(kq, &kev, NULL);
			assert((rc & FILTER_REGISTER_WAIT) == 0);
		}

		/* EV_ERROR on any registration marks the fd POLLNVAL */
		if (kev.flags & EV_ERROR) {
			fds[i].revents = POLLNVAL;
			rfds++;
		} else {
			fds[i].revents = 0;
		}
	}

	/*
	 * Did we have any trouble registering?
	 * If user space passed 0 FDs, then respect any timeout value passed.
	 * This is an extremely inefficient sleep. If user space passed one or
	 * more FDs, and we had trouble registering _all_ of them, then bail
	 * out. If a subset of the provided FDs failed to register, then we
	 * will still call the kqueue_scan function.
	 */
	if (nfds && (rfds == nfds)) {
		goto done;
	}

	/* scan for, and possibly wait for, the kevents to trigger */
	kevent_ctx_t kectx = kevent_get_context(current_thread());
	*kectx = (struct kevent_ctx_s){
		.kec_process_noutputs = rfds,
		.kec_process_flags = KEVENT_FLAG_POLL,
		.kec_deadline = 0, /* wait forever */
	};

	/*
	 * If any events have trouble registering, an event has fired and we
	 * shouldn't wait for events in kqueue_scan.
	 */
	if (rfds) {
		kectx->kec_process_flags |= KEVENT_FLAG_IMMEDIATE;
	} else if (uap->timeout != -1) {
		clock_interval_to_deadline(uap->timeout, NSEC_PER_MSEC,
		    &kectx->kec_deadline);
	}

	error = kqueue_scan(kq, kectx->kec_process_flags, kectx, poll_callback);
	rfds = kectx->kec_process_noutputs;

done:
	OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
	/* poll is not restarted after signals... */
	if (error == ERESTART) {
		error = EINTR;
	}
	if (error == 0) {
		error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd));
		*retval = rfds;
	}

out:
	kfree_data(fds, ni);

	kqueue_dealloc(kq);
	return error;
}
1917
1918 static int
poll_callback(struct kevent_qos_s * kevp,kevent_ctx_t kectx)1919 poll_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
1920 {
1921 struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata);
1922 short prev_revents = fds->revents;
1923 short mask = 0;
1924
1925 /* convert the results back into revents */
1926 if (kevp->flags & EV_EOF) {
1927 fds->revents |= POLLHUP;
1928 }
1929 if (kevp->flags & EV_ERROR) {
1930 fds->revents |= POLLERR;
1931 }
1932
1933 switch (kevp->filter) {
1934 case EVFILT_READ:
1935 if (fds->revents & POLLHUP) {
1936 mask = (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND);
1937 } else {
1938 mask = (POLLIN | POLLRDNORM);
1939 if (kevp->flags & EV_OOBAND) {
1940 mask |= (POLLPRI | POLLRDBAND);
1941 }
1942 }
1943 fds->revents |= (fds->events & mask);
1944 break;
1945
1946 case EVFILT_WRITE:
1947 if (!(fds->revents & POLLHUP)) {
1948 fds->revents |= (fds->events & (POLLOUT | POLLWRNORM | POLLWRBAND));
1949 }
1950 break;
1951
1952 case EVFILT_VNODE:
1953 if (kevp->fflags & NOTE_EXTEND) {
1954 fds->revents |= (fds->events & POLLEXTEND);
1955 }
1956 if (kevp->fflags & NOTE_ATTRIB) {
1957 fds->revents |= (fds->events & POLLATTRIB);
1958 }
1959 if (kevp->fflags & NOTE_LINK) {
1960 fds->revents |= (fds->events & POLLNLINK);
1961 }
1962 if (kevp->fflags & NOTE_WRITE) {
1963 fds->revents |= (fds->events & POLLWRITE);
1964 }
1965 break;
1966 }
1967
1968 if (fds->revents != 0 && prev_revents == 0) {
1969 kectx->kec_process_noutputs++;
1970 }
1971
1972 return 0;
1973 }
1974
/*
 * seltrue
 *
 * Device select routine for devices that are always ready: reports the
 * descriptor as selectable unconditionally.
 */
int
seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p)
{
	return 1;
}
1980
/*
 * selcount
 *
 * Count the number of bits set in the input bit vector, and establish an
 * outstanding fp->fp_iocount for each of the descriptors which will be in
 * use in the select operation.
 *
 * Parameters:	p			The process doing the select
 *		ibits			The input bit vector
 *		nfd			The number of fd's in the vector
 *		countp			Pointer to where to store the bit count
 *
 * Returns:	0			Success
 *		EIO			Bad per process open file table
 *		EBADF			One of the bits in the input bit vector
 *						references an invalid fd
 *
 * Implicit:	*countp (modified)	Count of fd's
 *
 * Notes:	This function is the first pass under the proc_fdlock() that
 *		permits us to recognize invalid descriptors in the bit vector;
 *		they may, however, not remain valid through the drop and
 *		later reacquisition of the proc_fdlock().
 */
static int
selcount(struct proc *p, u_int32_t *ibits, int nfd, int *countp)
{
	int msk, i, j, fd;
	u_int32_t bits;
	struct fileproc *fp;
	int n = 0;
	u_int32_t *iptr;
	u_int nw;
	int error = 0;
	int need_wakeup = 0;

	nw = howmany(nfd, NFDBITS);

	proc_fdlock(p);
	for (msk = 0; msk < 3; msk++) {
		iptr = (u_int32_t *)&ibits[msk * nw];
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = iptr[i / NFDBITS];
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1U << j);

				fp = fp_get_noref_locked(p, fd);
				if (fp == NULL) {
					*countp = 0;
					error = EBADF;
					goto bad;
				}
				/* one fp_iocount reference per set *bit*, not per fd */
				os_ref_retain_locked(&fp->fp_iocount);
				n++;
			}
		}
	}
	proc_fdunlock(p);

	*countp = n;
	return 0;

bad:
	/* unwind the n references taken before the bad fd was hit */
	if (n == 0) {
		goto out;
	}
	/* Ignore error return; it's already EBADF */
	(void)seldrop_locked(p, ibits, nfd, n, &need_wakeup);

out:
	proc_fdunlock(p);
	if (need_wakeup) {
		wakeup(&p->p_fd.fd_fpdrainwait);
	}
	return error;
}
2057
2058
/*
 * seldrop_locked
 *
 * Drop outstanding wait queue references set up during selscan(); drop the
 * outstanding per fileproc fp_iocount picked up during the selcount().
 *
 * Parameters:	p			Process performing the select
 *		ibits			Input bit vector of fd's
 *		nfd			Number of fd's
 *		lim			Limit to number of vector entries to
 *						consider, or -1 for "all"
 *		need_wakeup		Pointer to flag to set to do a wakeup
 *						if fp_iocount on any descriptor
 *						goes to 0
 *
 * Returns:	0			Success
 *		EBADF			One or more fds in the bit vector
 *						were invalid, but the rest
 *						were successfully dropped
 *
 * Notes:	An fd may become bad while the proc_fdlock() is not held,
 *		if a multithreaded application closes the fd out from under
 *		the in progress select.  In this case, we still have to
 *		clean up after the set up on the remaining fds.
 */
static int
seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup)
{
	int msk, i, j, nc, fd;
	u_int32_t bits;
	struct fileproc *fp;
	u_int32_t *iptr;
	u_int nw;
	int error = 0;
	uthread_t uth = current_uthread();
	struct _select_data *seldata;

	*need_wakeup = 0;

	nw = howmany(nfd, NFDBITS);
	/* NOTE(review): seldata is initialized but not referenced below */
	seldata = &uth->uu_save.uus_select_data;

	nc = 0;
	for (msk = 0; msk < 3; msk++) {
		iptr = (u_int32_t *)&ibits[msk * nw];
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = iptr[i / NFDBITS];
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1U << j);
				/*
				 * If we've already dropped as many as were
				 * counted/scanned, then we are done.
				 */
				if (nc >= lim) {
					goto done;
				}

				/*
				 * We took an I/O reference in selcount,
				 * so the fp can't possibly be NULL.
				 */
				fp = fp_get_noref_locked_with_iocount(p, fd);
				selunlinkfp(fp, uth->uu_selset);

				nc++;

				const os_ref_count_t refc = os_ref_release_locked(&fp->fp_iocount);
				if (0 == refc) {
					panic("fp_iocount overdecrement!");
				}

				if (1 == refc) {
					/*
					 * The last iocount is responsible for clearing
					 * selconfict flag - even if we didn't set it -
					 * and is also responsible for waking up anyone
					 * waiting on iocounts to drain.
					 */
					if (fp->fp_flags & FP_SELCONFLICT) {
						fp->fp_flags &= ~FP_SELCONFLICT;
					}
					if (p->p_fd.fd_fpdrainwait) {
						p->p_fd.fd_fpdrainwait = 0;
						*need_wakeup = 1;
					}
				}
			}
		}
	}
done:
	return error;
}
2151
2152
2153 static int
seldrop(struct proc * p,u_int32_t * ibits,int nfd,int lim)2154 seldrop(struct proc *p, u_int32_t *ibits, int nfd, int lim)
2155 {
2156 int error;
2157 int need_wakeup = 0;
2158
2159 proc_fdlock(p);
2160 error = seldrop_locked(p, ibits, nfd, lim, &need_wakeup);
2161 proc_fdunlock(p);
2162 if (need_wakeup) {
2163 wakeup(&p->p_fd.fd_fpdrainwait);
2164 }
2165 return error;
2166 }
2167
2168 /*
2169 * Record a select request.
2170 */
2171 void
selrecord(__unused struct proc * selector,struct selinfo * sip,void * s_data)2172 selrecord(__unused struct proc *selector, struct selinfo *sip, void *s_data)
2173 {
2174 struct select_set *selset = current_uthread()->uu_selset;
2175
2176 /* do not record if this is second pass of select */
2177 if (!s_data) {
2178 return;
2179 }
2180
2181 if (selset == SELSPEC_RECORD_MARKER) {
2182 /*
2183 * The kevent subsystem is trying to sniff
2184 * the selinfo::si_note to attach to.
2185 */
2186 ((selspec_record_hook_t)s_data)(sip);
2187 } else {
2188 waitq_link_t *linkp = s_data;
2189
2190 if (!waitq_is_valid(&sip->si_waitq)) {
2191 waitq_init(&sip->si_waitq, WQT_SELECT, SYNC_POLICY_FIFO);
2192 }
2193
2194 /* note: this checks for pre-existing linkage */
2195 select_set_link(&sip->si_waitq, selset, linkp);
2196 }
2197 }
2198
/*
 * selwakeup_internal
 *
 * Common implementation of selwakeup() and selthreadclear(): first post
 * to any kevent sniffers attached via selspec_attach(), then wake (and
 * unlink) every thread parked on the selinfo's waitq.
 *
 * Conditions:	the selinfo owner's "primitive" lock is held.
 */
static void
selwakeup_internal(struct selinfo *sip, long hint, wait_result_t wr)
{
	if (sip->si_flags & SI_SELSPEC) {
		/*
		 * The "primitive" lock is held.
		 * The knote lock is not held.
		 *
		 * All knotes will transition their kn_hook to NULL.
		 */
		lck_spin_lock(&selspec_lock);
		KNOTE(&sip->si_note, hint);
		klist_init(&sip->si_note);
		lck_spin_unlock(&selspec_lock);
		sip->si_flags &= ~SI_SELSPEC;
	}

	/*
	 * After selrecord() has been called, selinfo owners must call
	 * at least one of selwakeup() or selthreadclear().
	 *
	 * Use this opportunity to deinit the waitq
	 * so that all linkages are garbage collected
	 * in a combined wakeup-all + unlink + deinit call.
	 */
	select_waitq_wakeup_and_deinit(&sip->si_waitq, NO_EVENT64, wr,
	    WAITQ_ALL_PRIORITIES);
}
2227
2228
/*
 * Wake up all threads waiting in select() on this selinfo
 * (wait result THREAD_AWAKENED, no knote hint).
 */
void
selwakeup(struct selinfo *sip)
{
	selwakeup_internal(sip, 0, THREAD_AWAKENED);
}
2234
/*
 * Clear out a selinfo: wake all waiters with THREAD_RESTART and
 * deliver NOTE_REVOKE to any attached knotes.
 */
void
selthreadclear(struct selinfo *sip)
{
	selwakeup_internal(sip, NOTE_REVOKE, THREAD_RESTART);
}
2240
2241
2242 /*
2243 * gethostuuid
2244 *
2245 * Description: Get the host UUID from IOKit and return it to user space.
2246 *
2247 * Parameters: uuid_buf Pointer to buffer to receive UUID
2248 * timeout Timespec for timout
2249 *
2250 * Returns: 0 Success
2251 * EWOULDBLOCK Timeout is too short
2252 * copyout:EFAULT Bad user buffer
2253 * mac_system_check_info:EPERM Client not allowed to perform this operation
2254 *
2255 * Notes: A timeout seems redundant, since if it's tolerable to not
2256 * have a system UUID in hand, then why ask for one?
2257 */
2258 int
gethostuuid(struct proc * p,struct gethostuuid_args * uap,__unused int32_t * retval)2259 gethostuuid(struct proc *p, struct gethostuuid_args *uap, __unused int32_t *retval)
2260 {
2261 kern_return_t kret;
2262 int error;
2263 mach_timespec_t mach_ts; /* for IOKit call */
2264 __darwin_uuid_t uuid_kern = {}; /* for IOKit call */
2265
2266 /* Check entitlement */
2267 if (!IOCurrentTaskHasEntitlement("com.apple.private.getprivatesysid")) {
2268 #if !defined(XNU_TARGET_OS_OSX)
2269 #if CONFIG_MACF
2270 if ((error = mac_system_check_info(kauth_cred_get(), "hw.uuid")) != 0) {
2271 /* EPERM invokes userspace upcall if present */
2272 return error;
2273 }
2274 #endif
2275 #endif
2276 }
2277
2278 /* Convert the 32/64 bit timespec into a mach_timespec_t */
2279 if (proc_is64bit(p)) {
2280 struct user64_timespec ts;
2281 error = copyin(uap->timeoutp, &ts, sizeof(ts));
2282 if (error) {
2283 return error;
2284 }
2285 mach_ts.tv_sec = (unsigned int)ts.tv_sec;
2286 mach_ts.tv_nsec = (clock_res_t)ts.tv_nsec;
2287 } else {
2288 struct user32_timespec ts;
2289 error = copyin(uap->timeoutp, &ts, sizeof(ts));
2290 if (error) {
2291 return error;
2292 }
2293 mach_ts.tv_sec = ts.tv_sec;
2294 mach_ts.tv_nsec = ts.tv_nsec;
2295 }
2296
2297 /* Call IOKit with the stack buffer to get the UUID */
2298 kret = IOBSDGetPlatformUUID(uuid_kern, mach_ts);
2299
2300 /*
2301 * If we get it, copy out the data to the user buffer; note that a
2302 * uuid_t is an array of characters, so this is size invariant for
2303 * 32 vs. 64 bit.
2304 */
2305 if (kret == KERN_SUCCESS) {
2306 error = copyout(uuid_kern, uap->uuid_buf, sizeof(uuid_kern));
2307 } else {
2308 error = EWOULDBLOCK;
2309 }
2310
2311 return error;
2312 }
2313
2314 /*
2315 * ledger
2316 *
2317 * Description: Omnibus system call for ledger operations
2318 */
2319 int
ledger(struct proc * p,struct ledger_args * args,__unused int32_t * retval)2320 ledger(struct proc *p, struct ledger_args *args, __unused int32_t *retval)
2321 {
2322 #if !CONFIG_MACF
2323 #pragma unused(p)
2324 #endif
2325 int rval, pid, len, error;
2326 #ifdef LEDGER_DEBUG
2327 struct ledger_limit_args lla;
2328 #endif
2329 task_t task;
2330 proc_t proc;
2331
2332 /* Finish copying in the necessary args before taking the proc lock */
2333 error = 0;
2334 len = 0;
2335 if (args->cmd == LEDGER_ENTRY_INFO) {
2336 error = copyin(args->arg3, (char *)&len, sizeof(len));
2337 } else if (args->cmd == LEDGER_TEMPLATE_INFO) {
2338 error = copyin(args->arg2, (char *)&len, sizeof(len));
2339 } else if (args->cmd == LEDGER_LIMIT)
2340 #ifdef LEDGER_DEBUG
2341 { error = copyin(args->arg2, (char *)&lla, sizeof(lla));}
2342 #else
2343 { return EINVAL; }
2344 #endif
2345 else if ((args->cmd < 0) || (args->cmd > LEDGER_MAX_CMD)) {
2346 return EINVAL;
2347 }
2348
2349 if (error) {
2350 return error;
2351 }
2352 if (len < 0) {
2353 return EINVAL;
2354 }
2355
2356 rval = 0;
2357 if (args->cmd != LEDGER_TEMPLATE_INFO) {
2358 pid = (int)args->arg1;
2359 proc = proc_find(pid);
2360 if (proc == NULL) {
2361 return ESRCH;
2362 }
2363
2364 #if CONFIG_MACF
2365 error = mac_proc_check_ledger(p, proc, args->cmd);
2366 if (error) {
2367 proc_rele(proc);
2368 return error;
2369 }
2370 #endif
2371
2372 task = proc->task;
2373 }
2374
2375 switch (args->cmd) {
2376 #ifdef LEDGER_DEBUG
2377 case LEDGER_LIMIT: {
2378 if (!kauth_cred_issuser(kauth_cred_get())) {
2379 rval = EPERM;
2380 }
2381 rval = ledger_limit(task, &lla);
2382 proc_rele(proc);
2383 break;
2384 }
2385 #endif
2386 case LEDGER_INFO: {
2387 struct ledger_info info = {};
2388
2389 rval = ledger_info(task, &info);
2390 proc_rele(proc);
2391 if (rval == 0) {
2392 rval = copyout(&info, args->arg2,
2393 sizeof(info));
2394 }
2395 break;
2396 }
2397
2398 case LEDGER_ENTRY_INFO: {
2399 void *buf;
2400 int sz;
2401
2402 #if CONFIG_MEMORYSTATUS
2403 task_ledger_settle_dirty_time(task);
2404 #endif /* CONFIG_MEMORYSTATUS */
2405
2406 rval = ledger_get_task_entry_info_multiple(task, &buf, &len);
2407 proc_rele(proc);
2408 if ((rval == 0) && (len >= 0)) {
2409 sz = len * sizeof(struct ledger_entry_info);
2410 rval = copyout(buf, args->arg2, sz);
2411 kfree_data(buf, sz);
2412 }
2413 if (rval == 0) {
2414 rval = copyout(&len, args->arg3, sizeof(len));
2415 }
2416 break;
2417 }
2418
2419 case LEDGER_TEMPLATE_INFO: {
2420 void *buf;
2421 int sz;
2422
2423 rval = ledger_template_info(&buf, &len);
2424 if ((rval == 0) && (len >= 0)) {
2425 sz = len * sizeof(struct ledger_template_info);
2426 rval = copyout(buf, args->arg1, sz);
2427 kfree_data(buf, sz);
2428 }
2429 if (rval == 0) {
2430 rval = copyout(&len, args->arg2, sizeof(len));
2431 }
2432 break;
2433 }
2434
2435 default:
2436 panic("ledger syscall logic error -- command type %d", args->cmd);
2437 proc_rele(proc);
2438 rval = EINVAL;
2439 }
2440
2441 return rval;
2442 }
2443
2444 int
telemetry(__unused struct proc * p,struct telemetry_args * args,__unused int32_t * retval)2445 telemetry(__unused struct proc *p, struct telemetry_args *args, __unused int32_t *retval)
2446 {
2447 int error = 0;
2448
2449 switch (args->cmd) {
2450 #if CONFIG_TELEMETRY
2451 case TELEMETRY_CMD_TIMER_EVENT:
2452 error = telemetry_timer_event(args->deadline, args->interval, args->leeway);
2453 break;
2454 case TELEMETRY_CMD_PMI_SETUP:
2455 error = telemetry_pmi_setup((enum telemetry_pmi)args->deadline, args->interval);
2456 break;
2457 #endif /* CONFIG_TELEMETRY */
2458 case TELEMETRY_CMD_VOUCHER_NAME:
2459 if (thread_set_voucher_name((mach_port_name_t)args->deadline)) {
2460 error = EINVAL;
2461 }
2462 break;
2463
2464 default:
2465 error = EINVAL;
2466 break;
2467 }
2468
2469 return error;
2470 }
2471
2472 /*
2473 * Logging
2474 *
2475 * Description: syscall to access kernel logging from userspace
2476 *
2477 * Args:
2478 * tag - used for syncing with userspace on the version.
2479 * flags - flags used by the syscall.
2480 * buffer - userspace address of string to copy.
2481 * size - size of buffer.
2482 */
2483 int
log_data(__unused struct proc * p,struct log_data_args * args,int * retval)2484 log_data(__unused struct proc *p, struct log_data_args *args, int *retval)
2485 {
2486 unsigned int tag = args->tag;
2487 unsigned int flags = args->flags;
2488 user_addr_t buffer = args->buffer;
2489 unsigned int size = args->size;
2490 int ret = 0;
2491 *retval = 0;
2492
2493 /* Only DEXTs are suppose to use this syscall. */
2494 if (!task_is_driver(current_task())) {
2495 return EPERM;
2496 }
2497
2498 /*
2499 * Tag synchronize the syscall version with userspace.
2500 * Tag == 0 => flags == OS_LOG_TYPE
2501 */
2502 if (tag != 0) {
2503 return EINVAL;
2504 }
2505
2506 /*
2507 * OS_LOG_TYPE are defined in libkern/os/log.h
2508 * In userspace they are defined in libtrace/os/log.h
2509 */
2510 if (flags != OS_LOG_TYPE_DEFAULT &&
2511 flags != OS_LOG_TYPE_INFO &&
2512 flags != OS_LOG_TYPE_DEBUG &&
2513 flags != OS_LOG_TYPE_ERROR &&
2514 flags != OS_LOG_TYPE_FAULT) {
2515 return EINVAL;
2516 }
2517
2518 if (size == 0) {
2519 return EINVAL;
2520 }
2521
2522 /* truncate to OS_LOG_DATA_MAX_SIZE */
2523 if (size > OS_LOG_DATA_MAX_SIZE) {
2524 printf("%s: WARNING msg is going to be truncated from %u to %u\n",
2525 __func__, size, OS_LOG_DATA_MAX_SIZE);
2526 size = OS_LOG_DATA_MAX_SIZE;
2527 }
2528
2529 char *log_msg = (char *)kalloc_data(size, Z_WAITOK);
2530 if (!log_msg) {
2531 return ENOMEM;
2532 }
2533
2534 if (copyin(buffer, log_msg, size) != 0) {
2535 ret = EFAULT;
2536 goto out;
2537 }
2538 log_msg[size - 1] = '\0';
2539
2540 /*
2541 * This will log to dmesg and logd.
2542 * The call will fail if the current
2543 * process is not a driverKit process.
2544 */
2545 os_log_driverKit(&ret, OS_LOG_DEFAULT, (os_log_type_t)flags, "%s", log_msg);
2546
2547 out:
2548 if (log_msg != NULL) {
2549 kfree_data(log_msg, size);
2550 }
2551
2552 return ret;
2553 }
2554
2555 #if DEVELOPMENT || DEBUG
2556
2557 static int
2558 sysctl_mpsc_test_pingpong SYSCTL_HANDLER_ARGS
2559 {
2560 #pragma unused(oidp, arg1, arg2)
2561 uint64_t value = 0;
2562 int error;
2563
2564 error = SYSCTL_IN(req, &value, sizeof(value));
2565 if (error) {
2566 return error;
2567 }
2568
2569 if (error == 0 && req->newptr) {
2570 error = mpsc_test_pingpong(value, &value);
2571 if (error == 0) {
2572 error = SYSCTL_OUT(req, &value, sizeof(value));
2573 }
2574 }
2575
2576 return error;
2577 }
2578 SYSCTL_PROC(_kern, OID_AUTO, mpsc_test_pingpong, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
2579 0, 0, sysctl_mpsc_test_pingpong, "Q", "MPSC tests: pingpong");
2580
2581 #endif /* DEVELOPMENT || DEBUG */
2582
/* Telemetry, microstackshots */

SYSCTL_NODE(_kern, OID_AUTO, microstackshot, CTLFLAG_RD | CTLFLAG_LOCKED, 0,
    "microstackshot info");

/* Read-only view of the interrupt-based sampling rate variable. */
extern uint32_t telemetry_sample_rate;
SYSCTL_UINT(_kern_microstackshot, OID_AUTO, interrupt_sample_rate,
    CTLFLAG_RD | CTLFLAG_LOCKED, &telemetry_sample_rate, 0,
    "interrupt-based sampling rate in Hz");

#if defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES)

/* PMI-driven microstackshot sampling: period and counter selection (read-only). */
extern uint64_t mt_microstackshot_period;
SYSCTL_QUAD(_kern_microstackshot, OID_AUTO, pmi_sample_period,
    CTLFLAG_RD | CTLFLAG_LOCKED, &mt_microstackshot_period,
    "PMI sampling rate");
extern unsigned int mt_microstackshot_ctr;
SYSCTL_UINT(_kern_microstackshot, OID_AUTO, pmi_sample_counter,
    CTLFLAG_RD | CTLFLAG_LOCKED, &mt_microstackshot_ctr, 0,
    "PMI counter");

#endif /* defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES) */

/* Remote Time API */
SYSCTL_NODE(_machdep, OID_AUTO, remotetime, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "Remote time api");
2608
2609 #if DEVELOPMENT || DEBUG
2610 #if CONFIG_MACH_BRIDGE_SEND_TIME
extern _Atomic uint32_t bt_init_flag;
extern uint32_t mach_bridge_timer_enable(uint32_t, int);

/* Read-only flag; presumably nonzero once the bridge timer is initialized
 * (see the bt_init_flag check in sysctl_mach_bridge_timer_enable below). */
SYSCTL_INT(_machdep_remotetime, OID_AUTO, bridge_timer_init_flag,
    CTLFLAG_RD | CTLFLAG_LOCKED, &bt_init_flag, 0, "");
2616
2617 static int sysctl_mach_bridge_timer_enable SYSCTL_HANDLER_ARGS
2618 {
2619 #pragma unused(oidp, arg1, arg2)
2620 uint32_t value = 0;
2621 int error = 0;
2622 /* User is querying buffer size */
2623 if (req->oldptr == USER_ADDR_NULL && req->newptr == USER_ADDR_NULL) {
2624 req->oldidx = sizeof(value);
2625 return 0;
2626 }
2627 if (os_atomic_load(&bt_init_flag, acquire)) {
2628 if (req->newptr) {
2629 int new_value = 0;
2630 error = SYSCTL_IN(req, &new_value, sizeof(new_value));
2631 if (error) {
2632 return error;
2633 }
2634 if (new_value == 0 || new_value == 1) {
2635 value = mach_bridge_timer_enable(new_value, 1);
2636 } else {
2637 return EPERM;
2638 }
2639 } else {
2640 value = mach_bridge_timer_enable(0, 0);
2641 }
2642 }
2643 error = SYSCTL_OUT(req, &value, sizeof(value));
2644 return error;
2645 }
2646
2647 SYSCTL_PROC(_machdep_remotetime, OID_AUTO, bridge_timer_enable,
2648 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
2649 0, 0, sysctl_mach_bridge_timer_enable, "I", "");
2650
2651 #endif /* CONFIG_MACH_BRIDGE_SEND_TIME */
2652
2653 static int sysctl_mach_bridge_remote_time SYSCTL_HANDLER_ARGS
2654 {
2655 #pragma unused(oidp, arg1, arg2)
2656 uint64_t ltime = 0, rtime = 0;
2657 if (req->oldptr == USER_ADDR_NULL) {
2658 req->oldidx = sizeof(rtime);
2659 return 0;
2660 }
2661 if (req->newptr) {
2662 int error = SYSCTL_IN(req, <ime, sizeof(ltime));
2663 if (error) {
2664 return error;
2665 }
2666 }
2667 rtime = mach_bridge_remote_time(ltime);
2668 return SYSCTL_OUT(req, &rtime, sizeof(rtime));
2669 }
2670 SYSCTL_PROC(_machdep_remotetime, OID_AUTO, mach_bridge_remote_time,
2671 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
2672 0, 0, sysctl_mach_bridge_remote_time, "Q", "");
2673
2674 #endif /* DEVELOPMENT || DEBUG */
2675
2676 #if CONFIG_MACH_BRIDGE_RECV_TIME
2677 extern struct bt_params bt_params_get_latest(void);
2678
2679 static int sysctl_mach_bridge_conversion_params SYSCTL_HANDLER_ARGS
2680 {
2681 #pragma unused(oidp, arg1, arg2)
2682 struct bt_params params = {};
2683 if (req->oldptr == USER_ADDR_NULL) {
2684 req->oldidx = sizeof(struct bt_params);
2685 return 0;
2686 }
2687 if (req->newptr) {
2688 return EPERM;
2689 }
2690 params = bt_params_get_latest();
2691 return SYSCTL_OUT(req, ¶ms, MIN(sizeof(params), req->oldlen));
2692 }
2693
2694 SYSCTL_PROC(_machdep_remotetime, OID_AUTO, conversion_params,
2695 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0,
2696 0, sysctl_mach_bridge_conversion_params, "S,bt_params", "");
2697
2698 #endif /* CONFIG_MACH_BRIDGE_RECV_TIME */
2699
2700 #if DEVELOPMENT || DEBUG
2701
2702 #include <pexpert/pexpert.h>
2703 extern int32_t sysctl_get_bound_cpuid(void);
2704 extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid);
2705 static int
2706 sysctl_kern_sched_thread_bind_cpu SYSCTL_HANDLER_ARGS
2707 {
2708 #pragma unused(oidp, arg1, arg2)
2709
2710 /*
2711 * DO NOT remove this bootarg guard or make this non-development.
2712 * This kind of binding should only be used for tests and
2713 * experiments in a custom configuration, never shipping code.
2714 */
2715
2716 if (!PE_parse_boot_argn("enable_skstb", NULL, 0)) {
2717 return ENOENT;
2718 }
2719
2720 int32_t cpuid = sysctl_get_bound_cpuid();
2721
2722 int32_t new_value;
2723 int changed;
2724 int error = sysctl_io_number(req, cpuid, sizeof cpuid, &new_value, &changed);
2725 if (error) {
2726 return error;
2727 }
2728
2729 if (changed) {
2730 kern_return_t kr = sysctl_thread_bind_cpuid(new_value);
2731
2732 if (kr == KERN_NOT_SUPPORTED) {
2733 return ENOTSUP;
2734 }
2735
2736 if (kr == KERN_INVALID_VALUE) {
2737 return ERANGE;
2738 }
2739 }
2740
2741 return error;
2742 }
2743
2744 SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cpu, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
2745 0, 0, sysctl_kern_sched_thread_bind_cpu, "I", "");
2746
2747 #if __AMP__
2748 extern char sysctl_get_bound_cluster_type(void);
2749 extern void sysctl_thread_bind_cluster_type(char cluster_type);
2750 static int
2751 sysctl_kern_sched_thread_bind_cluster_type SYSCTL_HANDLER_ARGS
2752 {
2753 #pragma unused(oidp, arg1, arg2)
2754 char buff[4];
2755
2756 if (!PE_parse_boot_argn("enable_skstb", NULL, 0)) {
2757 return ENOENT;
2758 }
2759
2760 int error = SYSCTL_IN(req, buff, 1);
2761 if (error) {
2762 return error;
2763 }
2764 char cluster_type = buff[0];
2765
2766 if (!req->newptr) {
2767 goto out;
2768 }
2769
2770 sysctl_thread_bind_cluster_type(cluster_type);
2771 out:
2772 cluster_type = sysctl_get_bound_cluster_type();
2773 buff[0] = cluster_type;
2774
2775 return SYSCTL_OUT(req, buff, 1);
2776 }
2777
2778 SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cluster_type, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED,
2779 0, 0, sysctl_kern_sched_thread_bind_cluster_type, "A", "");
2780
2781 extern char sysctl_get_task_cluster_type(void);
2782 extern void sysctl_task_set_cluster_type(char cluster_type);
2783 static int
2784 sysctl_kern_sched_task_set_cluster_type SYSCTL_HANDLER_ARGS
2785 {
2786 #pragma unused(oidp, arg1, arg2)
2787 char buff[4];
2788
2789 if (!PE_parse_boot_argn("enable_skstsct", NULL, 0)) {
2790 return ENOENT;
2791 }
2792
2793 int error = SYSCTL_IN(req, buff, 1);
2794 if (error) {
2795 return error;
2796 }
2797 char cluster_type = buff[0];
2798
2799 if (!req->newptr) {
2800 goto out;
2801 }
2802
2803 sysctl_task_set_cluster_type(cluster_type);
2804 out:
2805 cluster_type = sysctl_get_task_cluster_type();
2806 buff[0] = cluster_type;
2807
2808 return SYSCTL_OUT(req, buff, 1);
2809 }
2810
2811 SYSCTL_PROC(_kern, OID_AUTO, sched_task_set_cluster_type, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED,
2812 0, 0, sysctl_kern_sched_task_set_cluster_type, "A", "");
2813
2814 extern kern_return_t thread_bind_cluster_id(thread_t thread, uint32_t cluster_id, thread_bind_option_t options);
2815 extern uint32_t thread_bound_cluster_id(thread_t);
2816 static int
2817 sysctl_kern_sched_thread_bind_cluster_id SYSCTL_HANDLER_ARGS
2818 {
2819 #pragma unused(oidp, arg1, arg2)
2820 if (!PE_parse_boot_argn("enable_skstb", NULL, 0)) {
2821 return ENOENT;
2822 }
2823
2824 thread_t self = current_thread();
2825 uint32_t old_value = thread_bound_cluster_id(self);
2826 uint32_t new_value;
2827
2828 int error = SYSCTL_IN(req, &new_value, sizeof(new_value));
2829 if (error) {
2830 return error;
2831 }
2832 if (new_value != old_value) {
2833 /*
2834 * This sysctl binds the thread to the cluster without any flags,
2835 * which means it will be hard bound and not check eligibility.
2836 */
2837 thread_bind_cluster_id(self, new_value, 0);
2838 }
2839 return SYSCTL_OUT(req, &old_value, sizeof(old_value));
2840 }
2841
2842 SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cluster_id, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
2843 0, 0, sysctl_kern_sched_thread_bind_cluster_id, "I", "");
2844
2845 #if CONFIG_SCHED_EDGE
2846
2847 extern int sched_edge_restrict_ut;
2848 SYSCTL_INT(_kern, OID_AUTO, sched_edge_restrict_ut, CTLFLAG_RW | CTLFLAG_LOCKED, &sched_edge_restrict_ut, 0, "Edge Scheduler Restrict UT Threads");
2849 extern int sched_edge_restrict_bg;
2850 SYSCTL_INT(_kern, OID_AUTO, sched_edge_restrict_bg, CTLFLAG_RW | CTLFLAG_LOCKED, &sched_edge_restrict_ut, 0, "Edge Scheduler Restrict BG Threads");
2851 extern int sched_edge_migrate_ipi_immediate;
2852 SYSCTL_INT(_kern, OID_AUTO, sched_edge_migrate_ipi_immediate, CTLFLAG_RW | CTLFLAG_LOCKED, &sched_edge_migrate_ipi_immediate, 0, "Edge Scheduler uses immediate IPIs for migration event based on execution latency");
2853
2854 #endif /* CONFIG_SCHED_EDGE */
2855
2856 #endif /* __AMP__ */
2857
2858 #if INTERRUPT_MASKED_DEBUG
2859
/* Tunables for interrupt-masked-duration debugging (threshold and action). */
SYSCTL_INT(_kern, OID_AUTO, interrupt_masked_threshold_mt, CTLFLAG_RW | CTLFLAG_LOCKED,
    &interrupt_masked_timeout, 0,
    "Interrupt masked duration after which a tracepoint is emitted or the device panics (in mach timebase units)");

SYSCTL_INT(_kern, OID_AUTO, interrupt_masked_debug_mode, CTLFLAG_RW | CTLFLAG_LOCKED,
    &interrupt_masked_debug_mode, 0,
    "Enable interrupt masked tracing or panic (0: off, 1: trace, 2: panic)");
2867
2868 #endif /* INTERRUPT_MASKED_DEBUG */
2869
2870 #if SCHED_PREEMPTION_DISABLE_DEBUG
2871
/* Tunables for preemption-disablement debugging (threshold and action). */
SYSCTL_QUAD(_kern, OID_AUTO, sched_preemption_disable_threshold_mt, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sched_preemption_disable_threshold_mt,
    "Preemption disablement duration after which a tracepoint is emitted or the device panics (in mach timebase units)");

SYSCTL_INT(_kern, OID_AUTO, sched_preemption_disable_debug_mode, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sched_preemption_disable_debug_mode, 0,
    "Enable preemption disablement tracing or panic (0: off, 1: trace, 2: panic)");

/* Per-CPU maximum observed preemption-disabled duration (mach time units). */
PERCPU_DECL(uint64_t, preemption_disable_max_mt);
2881
2882 static int
sysctl_sched_preemption_disable_stats(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)2883 sysctl_sched_preemption_disable_stats(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
2884 {
2885 uint64_t stats[MAX_CPUS]; // maximum per CPU
2886
2887 /*
2888 * No synchronization here. The individual values are pretty much
2889 * independent, and reading/writing them is atomic.
2890 */
2891
2892 static_assert(__LP64__); /* below is racy on armv7k, reminder to change if needed there. */
2893
2894 int cpu = 0;
2895 percpu_foreach(max_stat, preemption_disable_max_mt) {
2896 stats[cpu++] = *max_stat;
2897 }
2898
2899 if (req->newlen > 0) {
2900 // writing just resets all stats.
2901 percpu_foreach(max_stat, preemption_disable_max_mt) {
2902 *max_stat = 0;
2903 }
2904 }
2905
2906 return sysctl_io_opaque(req, stats, cpu * sizeof(uint64_t), NULL);
2907 }
2908
2909 SYSCTL_PROC(_kern, OID_AUTO, sched_preemption_disable_stats,
2910 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_LOCKED,
2911 0, 0, sysctl_sched_preemption_disable_stats, "I", "Preemption disablement statistics");
2912
2913 #endif /* SCHED_PREEMPTION_DISABLE_DEBUG */
2914
2915
/* used for testing by exception_tests */
extern uint32_t ipc_control_port_options;
SYSCTL_INT(_kern, OID_AUTO, ipc_control_port_options,
    CTLFLAG_RD | CTLFLAG_LOCKED, &ipc_control_port_options, 0, "");

#endif /* DEVELOPMENT || DEBUG */

/* Read-only export of the default task exception-guard setting. */
extern uint32_t task_exc_guard_default;

SYSCTL_INT(_kern, OID_AUTO, task_exc_guard_default,
    CTLFLAG_RD | CTLFLAG_LOCKED, &task_exc_guard_default, 0, "");
2927
2928
2929 static int
2930 sysctl_kern_tcsm_available SYSCTL_HANDLER_ARGS
2931 {
2932 #pragma unused(oidp, arg1, arg2)
2933 uint32_t value = machine_csv(CPUVN_CI) ? 1 : 0;
2934
2935 if (req->newptr) {
2936 return EINVAL;
2937 }
2938
2939 return SYSCTL_OUT(req, &value, sizeof(value));
2940 }
2941 SYSCTL_PROC(_kern, OID_AUTO, tcsm_available,
2942 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY,
2943 0, 0, sysctl_kern_tcsm_available, "I", "");
2944
2945
2946 static int
2947 sysctl_kern_tcsm_enable SYSCTL_HANDLER_ARGS
2948 {
2949 #pragma unused(oidp, arg1, arg2)
2950 uint32_t soflags = 0;
2951 uint32_t old_value = thread_get_no_smt() ? 1 : 0;
2952
2953 int error = SYSCTL_IN(req, &soflags, sizeof(soflags));
2954 if (error) {
2955 return error;
2956 }
2957
2958 if (soflags && machine_csv(CPUVN_CI)) {
2959 thread_set_no_smt(true);
2960 machine_tecs(current_thread());
2961 }
2962
2963 return SYSCTL_OUT(req, &old_value, sizeof(old_value));
2964 }
2965 SYSCTL_PROC(_kern, OID_AUTO, tcsm_enable,
2966 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY,
2967 0, 0, sysctl_kern_tcsm_enable, "I", "");
2968
2969 static int
2970 sysctl_kern_debug_get_preoslog SYSCTL_HANDLER_ARGS
2971 {
2972 #pragma unused(oidp, arg1, arg2)
2973 static bool oneshot_executed = false;
2974 size_t preoslog_size = 0;
2975 const char *preoslog = NULL;
2976 int ret = 0;
2977
2978 // DumpPanic passes a non-zero write value when it needs oneshot behaviour
2979 if (req->newptr != USER_ADDR_NULL) {
2980 uint8_t oneshot = 0;
2981 int error = SYSCTL_IN(req, &oneshot, sizeof(oneshot));
2982 if (error) {
2983 return error;
2984 }
2985
2986 if (oneshot) {
2987 if (!os_atomic_cmpxchg(&oneshot_executed, false, true, acq_rel)) {
2988 return EPERM;
2989 }
2990 }
2991 }
2992
2993 preoslog = sysctl_debug_get_preoslog(&preoslog_size);
2994 if (preoslog != NULL && preoslog_size == 0) {
2995 sysctl_debug_free_preoslog();
2996 return 0;
2997 }
2998
2999 if (preoslog == NULL || preoslog_size == 0) {
3000 return 0;
3001 }
3002
3003 if (req->oldptr == USER_ADDR_NULL) {
3004 req->oldidx = preoslog_size;
3005 return 0;
3006 }
3007
3008 ret = SYSCTL_OUT(req, preoslog, preoslog_size);
3009 sysctl_debug_free_preoslog();
3010 return ret;
3011 }
3012
3013 SYSCTL_PROC(_kern, OID_AUTO, preoslog, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_LOCKED,
3014 0, 0, sysctl_kern_debug_get_preoslog, "-", "");
3015
3016 #if DEVELOPMENT || DEBUG
3017 extern void sysctl_task_set_no_smt(char no_smt);
3018 extern char sysctl_task_get_no_smt(void);
3019
3020 static int
3021 sysctl_kern_sched_task_set_no_smt SYSCTL_HANDLER_ARGS
3022 {
3023 #pragma unused(oidp, arg1, arg2)
3024 char buff[4];
3025
3026 int error = SYSCTL_IN(req, buff, 1);
3027 if (error) {
3028 return error;
3029 }
3030 char no_smt = buff[0];
3031
3032 if (!req->newptr) {
3033 goto out;
3034 }
3035
3036 sysctl_task_set_no_smt(no_smt);
3037 out:
3038 no_smt = sysctl_task_get_no_smt();
3039 buff[0] = no_smt;
3040
3041 return SYSCTL_OUT(req, buff, 1);
3042 }
3043
3044 SYSCTL_PROC(_kern, OID_AUTO, sched_task_set_no_smt, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
3045 0, 0, sysctl_kern_sched_task_set_no_smt, "A", "");
3046
3047 static int
sysctl_kern_sched_thread_set_no_smt(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)3048 sysctl_kern_sched_thread_set_no_smt(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3049 {
3050 int new_value, changed;
3051 int old_value = thread_get_no_smt() ? 1 : 0;
3052 int error = sysctl_io_number(req, old_value, sizeof(int), &new_value, &changed);
3053
3054 if (changed) {
3055 thread_set_no_smt(!!new_value);
3056 }
3057
3058 return error;
3059 }
3060
3061 SYSCTL_PROC(_kern, OID_AUTO, sched_thread_set_no_smt,
3062 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
3063 0, 0, sysctl_kern_sched_thread_set_no_smt, "I", "");
3064
3065
3066 static int
3067 sysctl_kern_task_set_filter_msg_flag SYSCTL_HANDLER_ARGS
3068 {
3069 #pragma unused(oidp, arg1, arg2)
3070 int new_value, changed;
3071 int old_value = task_get_filter_msg_flag(current_task()) ? 1 : 0;
3072 int error = sysctl_io_number(req, old_value, sizeof(int), &new_value, &changed);
3073
3074 if (changed) {
3075 task_set_filter_msg_flag(current_task(), !!new_value);
3076 }
3077
3078 return error;
3079 }
3080
3081 SYSCTL_PROC(_kern, OID_AUTO, task_set_filter_msg_flag, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
3082 0, 0, sysctl_kern_task_set_filter_msg_flag, "I", "");
3083
3084 #if CONFIG_PROC_RESOURCE_LIMITS
3085
3086 extern mach_port_name_t current_task_get_fatal_port_name(void);
3087
3088 static int
3089 sysctl_kern_task_get_fatal_port SYSCTL_HANDLER_ARGS
3090 {
3091 #pragma unused(oidp, arg1, arg2)
3092 int port = 0;
3093 int flag = 0;
3094
3095 if (req->oldptr == USER_ADDR_NULL) {
3096 req->oldidx = sizeof(mach_port_t);
3097 return 0;
3098 }
3099
3100 int error = SYSCTL_IN(req, &flag, sizeof(flag));
3101 if (error) {
3102 return error;
3103 }
3104
3105 if (flag == 1) {
3106 port = (int)current_task_get_fatal_port_name();
3107 }
3108 return SYSCTL_OUT(req, &port, sizeof(port));
3109 }
3110
3111 SYSCTL_PROC(_machdep, OID_AUTO, task_get_fatal_port, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
3112 0, 0, sysctl_kern_task_get_fatal_port, "I", "");
3113
3114 #endif /* CONFIG_PROC_RESOURCE_LIMITS */
3115
3116 extern unsigned int ipc_table_max_entries(void);
3117
3118 static int
3119 sysctl_mach_max_port_table_size SYSCTL_HANDLER_ARGS
3120 {
3121 #pragma unused(oidp, arg1, arg2)
3122 int old_value = ipc_table_max_entries();
3123 int error = sysctl_io_number(req, old_value, sizeof(int), NULL, NULL);
3124
3125 return error;
3126 }
3127
3128 SYSCTL_PROC(_machdep, OID_AUTO, max_port_table_size, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
3129 0, 0, sysctl_mach_max_port_table_size, "I", "");
3130
3131 #endif /* DEVELOPMENT || DEBUG */
3132
3133 #if defined(CONFIG_KDP_INTERACTIVE_DEBUGGING) && defined(CONFIG_KDP_COREDUMP_ENCRYPTION)
3134
3135 #define COREDUMP_ENCRYPTION_KEY_ENTITLEMENT "com.apple.private.coredump-encryption-key"
3136
3137 static int
3138 sysctl_coredump_encryption_key_update SYSCTL_HANDLER_ARGS
3139 {
3140 kern_return_t ret = KERN_SUCCESS;
3141 int error = 0;
3142 struct kdp_core_encryption_key_descriptor key_descriptor = {
3143 .kcekd_format = MACH_CORE_FILEHEADER_V2_FLAG_NEXT_COREFILE_KEY_FORMAT_NIST_P256,
3144 };
3145
3146 /* Need to be root and have entitlement */
3147 if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement(COREDUMP_ENCRYPTION_KEY_ENTITLEMENT)) {
3148 return EPERM;
3149 }
3150
3151 // Sanity-check the given key length
3152 if (req->newlen > UINT16_MAX) {
3153 return EINVAL;
3154 }
3155
3156 // It is allowed for the caller to pass in a NULL buffer.
3157 // This indicates that they want us to forget about any public key we might have.
3158 if (req->newptr) {
3159 key_descriptor.kcekd_size = (uint16_t) req->newlen;
3160 key_descriptor.kcekd_key = kalloc_data(key_descriptor.kcekd_size, Z_WAITOK);
3161
3162 if (key_descriptor.kcekd_key == NULL) {
3163 return ENOMEM;
3164 }
3165
3166 error = SYSCTL_IN(req, key_descriptor.kcekd_key, key_descriptor.kcekd_size);
3167 if (error) {
3168 goto out;
3169 }
3170 }
3171
3172 ret = IOProvideCoreFileAccess(kdp_core_handle_new_encryption_key, (void *)&key_descriptor);
3173 if (KERN_SUCCESS != ret) {
3174 printf("Failed to handle the new encryption key. Error 0x%x", ret);
3175 error = EFAULT;
3176 }
3177
3178 out:
3179 kfree_data(key_descriptor.kcekd_key, key_descriptor.kcekd_size);
3180 return 0;
3181 }
3182
3183 SYSCTL_PROC(_kern, OID_AUTO, coredump_encryption_key, CTLTYPE_OPAQUE | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3184 0, 0, &sysctl_coredump_encryption_key_update, "-", "Set a new encryption key for coredumps");
3185
3186 #endif /* CONFIG_KDP_INTERACTIVE_DEBUGGING && CONFIG_KDP_COREDUMP_ENCRYPTION*/
3187