xref: /xnu-10063.141.1/bsd/vfs/vfs_syscalls.c (revision d8b80295118ef25ac3a784134bcf95cd8e88109f)
1 /*
2  * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1989, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  * (c) UNIX System Laboratories, Inc.
32  * All or some portions of this file are derived from material licensed
33  * to the University of California by American Telephone and Telegraph
34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35  * the permission of UNIX System Laboratories, Inc.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.41 (Berkeley) 6/15/95
66  */
67 /*
68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69  * support for mandatory and extensible security protections.  This notice
70  * is included in support of clause 2.2 (b) of the Apple Public License,
71  * Version 2.0.
72  */
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112 
113 #include <vfs/vfs_disk_conditioner.h>
114 #if CONFIG_EXCLAVES
115 #include <vfs/vfs_exclave_fs.h>
116 #endif
117 
118 #include <security/audit/audit.h>
119 #include <bsm/audit_kevents.h>
120 
121 #include <mach/mach_types.h>
122 #include <kern/kern_types.h>
123 #include <kern/kalloc.h>
124 #include <kern/task.h>
125 
126 #include <vm/vm_pageout.h>
127 #include <vm/vm_protos.h>
128 
129 #include <libkern/OSAtomic.h>
130 #include <os/atomic_private.h>
131 #include <pexpert/pexpert.h>
132 #include <IOKit/IOBSD.h>
133 
134 // deps for MIG call
135 #include <kern/host.h>
136 #include <kern/ipc_misc.h>
137 #include <mach/host_priv.h>
138 #include <mach/vfs_nspace.h>
139 #include <os/log.h>
140 
141 #include <nfs/nfs_conf.h>
142 
143 #if ROUTEFS
144 #include <miscfs/routefs/routefs.h>
145 #endif /* ROUTEFS */
146 
147 #if CONFIG_MACF
148 #include <security/mac.h>
149 #include <security/mac_framework.h>
150 #endif
151 
152 #if CONFIG_FSE
153 #define GET_PATH(x) \
154 	((x) = get_pathbuff())
155 #define RELEASE_PATH(x) \
156 	release_pathbuff(x)
157 #else
158 #define GET_PATH(x)     \
159 	((x) = zalloc(ZV_NAMEI))
160 #define RELEASE_PATH(x) \
161 	zfree(ZV_NAMEI, x)
162 #endif /* CONFIG_FSE */
163 
164 #ifndef HFS_GET_BOOT_INFO
165 #define HFS_GET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00004)
166 #endif
167 
168 #ifndef HFS_SET_BOOT_INFO
169 #define HFS_SET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00005)
170 #endif
171 
172 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
173 #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
174 #endif
175 
176 extern void disk_conditioner_unmount(mount_t mp);
177 
178 /* struct for checkdirs iteration */
179 struct cdirargs {
180 	vnode_t olddp;
181 	vnode_t newdp;
182 };
183 /* callback  for checkdirs iteration */
184 static int checkdirs_callback(proc_t p, void * arg);
185 
186 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
187 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
188 void enablequotas(struct mount *mp, vfs_context_t ctx);
189 static int getfsstat_callback(mount_t mp, void * arg);
190 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
191 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
192 static int sync_callback(mount_t, void *);
193 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
194     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
195     boolean_t partial_copy);
196 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
197 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
198     struct componentname *cnp, user_addr_t fsmountargs,
199     int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
200 void vfs_notify_mount(vnode_t pdvp);
201 
202 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
203 
204 struct fd_vn_data * fg_vn_data_alloc(void);
205 
206 /*
207  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
208  * Concurrent lookups (or lookups by ids) on hard links can cause the
209  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
210  * does) to return ENOENT as the path cannot be returned from the name cache
211  * alone. We have no option but to retry and hope to get one namei->reverse path
212  * generation done without an intervening lookup, lookup by id on the hard link
213  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
214  * which currently are the MAC hooks for rename, unlink and rmdir.
215  */
216 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
217 
218 /* Max retry limit for rename due to vnode recycling. */
219 #define MAX_RENAME_ERECYCLE_RETRIES 1024
220 
221 /* Max retries for concurrent mounts on the same covered vnode. */
222 #define MAX_MOUNT_RETRIES       10
223 
224 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
225     int unlink_flags);
226 
227 #ifdef CONFIG_IMGSRC_ACCESS
228 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
229 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
230 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
231 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
232 static void mount_end_update(mount_t mp);
233 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
234 #endif /* CONFIG_IMGSRC_ACCESS */
235 
236 //snapshot functions
237 #if CONFIG_MNT_ROOTSNAP
238 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
239 #else
240 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
241 #endif
242 
243 __private_extern__
244 int sync_internal(void);
245 
246 __private_extern__
247 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
248 
249 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
250 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
251 
252 /* vars for sync mutex */
253 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
254 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
255 
256 extern lck_rw_t rootvnode_rw_lock;
257 
258 VFS_SMR_DECLARE;
259 extern uint32_t nc_smr_enabled;
260 
261 /*
262  * incremented each time a mount or unmount operation occurs
263  * used to invalidate the cached value of the rootvp in the
264  * mount structure utilized by cache_lookup_path
265  */
266 uint32_t mount_generation = 0;
267 
268 /* counts number of mount and unmount operations */
269 unsigned int vfs_nummntops = 0;
270 
271 /* system-wide, per-boot unique mount ID */
272 static _Atomic uint64_t mount_unique_id = 1;
273 
274 extern const struct fileops vnops;
275 #if CONFIG_APPLEDOUBLE
276 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
277 #endif /* CONFIG_APPLEDOUBLE */
278 
279 /* Maximum buffer length supported by fsgetpath(2) */
280 #define FSGETPATH_MAXBUFLEN  8192
281 
282 /*
283  * Virtual File System System Calls
284  */
285 
286 /*
287  * Private in-kernel mounting spi (specific use-cases only)
288  */
289 boolean_t
vfs_iskernelmount(mount_t mp)290 vfs_iskernelmount(mount_t mp)
291 {
292 	return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
293 }
294 
/*
 * kernel_mount:
 *	In-kernel mount entry point (specific use-cases only).
 *
 *	If 'vp' is NULLVP the mount-on vnode and its parent are looked up
 *	from 'path' via namei() and dropped again before returning.
 *	Otherwise the caller supplies both 'vp' and 'pvp' (with iocounts
 *	held by the caller) and only the component-name buffer is filled in
 *	from 'path'.
 *
 *	'kern_flags' is sanitized against KERNEL_MOUNT_SANITIZE_MASK and
 *	KERNEL_MOUNT_KMOUNT is always added before calling mount_common().
 *
 *	Returns 0 on success or an errno value.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Drop any caller-supplied kernel flags we do not allow through. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Only log for snapshot / role-based volume mounts. */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the vnodes; just point the componentname
		 * at the caller's path buffer for mount_common()'s benefit.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Release the iocounts taken by namei() (caller owns them otherwise). */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
344 
345 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)346 vfs_mount_at_path(const char *fstype, const char *path,
347     vnode_t pvp, vnode_t vp, void *data, size_t datalen,
348     int mnt_flags, int flags)
349 {
350 	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
351 	int error, km_flags = 0;
352 	vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
353 
354 	/*
355 	 * This call is currently restricted to specific use cases.
356 	 */
357 	if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
358 		return ENOTSUP;
359 	}
360 
361 #if !defined(XNU_TARGET_OS_OSX)
362 	if (strcmp(fstype, "lifs") == 0) {
363 		syscall_flags |= MNT_NOEXEC;
364 	}
365 #endif
366 
367 	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
368 		km_flags |= KERNEL_MOUNT_NOAUTH;
369 	}
370 	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
371 		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
372 	}
373 
374 	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
375 	    syscall_flags, km_flags, ctx);
376 	if (error) {
377 		printf("%s: mount on %s failed, error %d\n", __func__, path,
378 		    error);
379 	}
380 
381 	return error;
382 }
383 
384 /*
385  * Mount a file system.
386  */
387 /* ARGSUSED */
388 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)389 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
390 {
391 	struct __mac_mount_args muap;
392 
393 	muap.type = uap->type;
394 	muap.path = uap->path;
395 	muap.flags = uap->flags;
396 	muap.data = uap->data;
397 	muap.mac_p = USER_ADDR_NULL;
398 	return __mac_mount(p, &muap, retval);
399 }
400 
401 int
fmount(__unused proc_t p,struct fmount_args * uap,__unused int32_t * retval)402 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
403 {
404 	struct componentname    cn;
405 	vfs_context_t           ctx = vfs_context_current();
406 	size_t                  dummy = 0;
407 	int                     error;
408 	int                     flags = uap->flags;
409 	char                    fstypename[MFSNAMELEN];
410 	char                    *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
411 	vnode_t                 pvp;
412 	vnode_t                 vp;
413 
414 	AUDIT_ARG(fd, uap->fd);
415 	AUDIT_ARG(fflags, flags);
416 	/* fstypename will get audited by mount_common */
417 
418 	/* Sanity check the flags */
419 	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
420 		return ENOTSUP;
421 	}
422 
423 	if (flags & MNT_UNION) {
424 		return EPERM;
425 	}
426 
427 	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
428 	if (error) {
429 		return error;
430 	}
431 
432 	if ((error = file_vnode(uap->fd, &vp)) != 0) {
433 		return error;
434 	}
435 
436 	if ((error = vnode_getwithref(vp)) != 0) {
437 		file_drop(uap->fd);
438 		return error;
439 	}
440 
441 	pvp = vnode_getparent(vp);
442 	if (pvp == NULL) {
443 		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
444 			error = EBUSY;
445 		} else {
446 			error = EINVAL;
447 		}
448 		vnode_put(vp);
449 		file_drop(uap->fd);
450 		return error;
451 	}
452 
453 	memset(&cn, 0, sizeof(struct componentname));
454 	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
455 	cn.cn_pnlen = MAXPATHLEN;
456 
457 	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
458 		zfree(ZV_NAMEI, cn.cn_pnbuf);
459 		vnode_put(pvp);
460 		vnode_put(vp);
461 		file_drop(uap->fd);
462 		return error;
463 	}
464 
465 	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);
466 
467 	zfree(ZV_NAMEI, cn.cn_pnbuf);
468 	vnode_put(pvp);
469 	vnode_put(vp);
470 	file_drop(uap->fd);
471 
472 	return error;
473 }
474 
475 #define MAX_GRAFT_METADATA_SIZE             16384 /* bytes */
476 
477 /*
478  * Get the size of a graft file (a manifest or payload file).
479  * The vp should be an iocounted vnode.
480  */
481 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)482 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
483 {
484 	struct stat64 sb = {};
485 	int error;
486 
487 	*size = 0;
488 
489 	error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
490 	if (error) {
491 		return error;
492 	}
493 
494 	if (sb.st_size == 0) {
495 		error = ENODATA;
496 	} else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
497 		error = EFBIG;
498 	} else {
499 		*size = (size_t) sb.st_size;
500 	}
501 
502 	return error;
503 }
504 
505 /*
506  * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
507  * `size` must already be validated.
508  */
509 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)510 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
511 {
512 	return vn_rdwr(UIO_READ, graft_vp,
513 	           (caddr_t) buf, (int) size, /* offset */ 0,
514 	           UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
515 	           vfs_context_ucred(vctx), /* resid */ NULL,
516 	           vfs_context_proc(vctx));
517 }
518 
519 /*
520  * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
521  * and read it into `buf`.
522  */
523 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,size_t * size,void * buf)524 graft_secureboot_read_fd(int fd, vfs_context_t vctx, size_t *size, void *buf)
525 {
526 	vnode_t metadata_vp = NULLVP;
527 	int error;
528 
529 	// Convert this graft fd to a vnode.
530 	if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
531 		goto out;
532 	}
533 
534 	// Get (and validate) size information.
535 	if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
536 		goto out;
537 	}
538 
539 	// Read each file into the provided buffer - we must get the expected amount of bytes.
540 	if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
541 		goto out;
542 	}
543 
544 out:
545 	if (metadata_vp) {
546 		vnode_put(metadata_vp);
547 		metadata_vp = NULLVP;
548 	}
549 
550 	return error;
551 }
552 
553 /*
554  * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
555  * provided in `gfs`, saving the size of data read in `gfs`.
556  */
557 static int
graft_secureboot_read_metadata(secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)558 graft_secureboot_read_metadata(secure_boot_cryptex_args_t *sbc_args, vfs_context_t vctx,
559     fsioc_graft_fs_t *gfs)
560 {
561 	int error;
562 
563 	// Read the authentic manifest.
564 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
565 	    &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
566 		return error;
567 	}
568 
569 	// The user manifest is currently unused, but set its size.
570 	gfs->user_manifest_size = 0;
571 
572 	// Read the payload.
573 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
574 	    &gfs->payload_size, gfs->payload))) {
575 		return error;
576 	}
577 
578 	return 0;
579 }
580 
581 /*
582  * Call into the filesystem to verify and graft a cryptex.
583  */
/*
 * graft_secureboot_cryptex:
 *	Validate the caller's secure-boot cryptex arguments, read the
 *	manifest/payload files into kernel buffers, translate the SBC_*
 *	flags into FSCTL_GRAFT_* flags, and hand everything to the
 *	filesystem via FSIOC_GRAFT_FS on `cryptex_vp`.
 *
 *	`cryptex_vp` must be iocounted; `mounton_vp` (optional) is the
 *	directory to graft upon and, when supplied, must be a directory on
 *	the same volume as `cryptex_vp`.
 *
 *	Returns 0 on success or an errno value.
 */
static int
graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
{
	fsioc_graft_fs_t gfs = {};
	uint64_t graft_dir_ino = 0;
	struct stat64 sb = {};
	int error;

	// Pre-flight arguments.
	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
		// Make sure that this graft version matches what we support.
		return ENOTSUP;
	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
		// For this type, cryptex VP must live on same volume as the target of graft.
		return EXDEV;
	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
		// We cannot graft upon non-directories.
		return ENOTDIR;
	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
	    sbc_args->sbc_payload_fd < 0) {
		// We cannot graft without a manifest and payload.
		return EINVAL;
	}

	if (mounton_vp) {
		// Get the mounton's inode number.
		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
		if (error) {
			return error;
		}
		graft_dir_ino = (uint64_t) sb.st_ino;
	}

	// Create buffers (of our maximum-defined size) to store authentication info.
	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);

	if (!gfs.authentic_manifest || !gfs.payload) {
		error = ENOMEM;
		goto out;
	}

	// Read our fd's into our buffers.
	// (Note that this will set the buffer size fields in `gfs`.)
	error = graft_secureboot_read_metadata(sbc_args, vctx, &gfs);
	if (error) {
		goto out;
	}

	// Translate caller-visible SBC_* flags into the FSCTL_GRAFT_* set.
	gfs.graft_version = FSIOC_GRAFT_VERSION;
	gfs.graft_type = graft_type;
	gfs.graft_4cc = sbc_args->sbc_4cc;
	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
	}
	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
	}
	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
	}
	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
	}
	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
	}
	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
	}
	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)

	// Call into the FS to perform the graft (and validation).
	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);

out:
	// kfree_data() the buffers on every exit; NULL the pointers defensively.
	if (gfs.authentic_manifest) {
		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
		gfs.authentic_manifest = NULL;
	}
	if (gfs.payload) {
		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
		gfs.payload = NULL;
	}

	return error;
}
672 
673 #define GRAFTDMG_ENTITLEMENT  "com.apple.private.vfs.graftdmg"
674 
675 /*
676  * Graft a cryptex disk image (via FD) onto the appropriate mount-point
677  * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
678  */
/*
 * graftdmg(2):
 *	Entitlement-gated syscall that grafts a cryptex disk image (given by
 *	fd) onto the optional mount-on directory.  Copies the caller's
 *	graft arguments in, resolves the fd and mount-on path to vnodes, and
 *	defers the real work to graft_secureboot_cryptex().
 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	/* Private entitlement required; everyone else gets EPERM. */
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy the full argument union in before touching any of it. */
	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		/* On namei() failure it cleans up after itself; plain return is safe. */
		error = namei(&nd);
		if (error) {
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		goto graftout;
	}

	/* Only currently-defined graft types are accepted. */
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
	}

graftout:
	/* Drop iocounts and finish the (successful) name lookup, if any. */
	if (cryptex_vp) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(mounton_vp);
		mounton_vp = NULLVP;
	}
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
746 
747 /*
748  * Ungraft a cryptex disk image (via mount dir FD)
749  * { int ungraftdmg(const char *mountdir, uint64_t flags); }
750  */
751 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)752 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
753 {
754 	int error = 0;
755 	user_addr_t ua_mountdir = uap->mountdir;
756 	fsioc_ungraft_fs_t ugfs;
757 	vnode_t mounton_vp = NULLVP;
758 	struct nameidata nd = {};
759 	vfs_context_t ctx = vfs_context_current();
760 
761 	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
762 		return EPERM;
763 	}
764 
765 	if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
766 		return EINVAL;
767 	}
768 
769 	ugfs.ungraft_flags = 0;
770 
771 	// Acquire vnode for mount-on path
772 	NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
773 	    UIO_USERSPACE, ua_mountdir, ctx);
774 
775 	error = namei(&nd);
776 	if (error) {
777 		return error;
778 	}
779 	mounton_vp = nd.ni_vp;
780 
781 	// Call into the FS to perform the ungraft
782 	error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
783 
784 	vnode_put(mounton_vp);
785 	nameidone(&nd);
786 
787 	return error;
788 }
789 
790 
/*
 * vfs_notify_mount:
 *	Announce that a mount has occurred: broadcast a VQ_MOUNT vfs event
 *	and post a NOTE_WRITE knote on `pdvp`, the parent directory of the
 *	covered vnode.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
797 
798 /*
799  * __mac_mount:
800  *	Mount a file system taking into account MAC label behavior.
801  *	See mount(2) man page for more information
802  *
803  * Parameters:    p                        Process requesting the mount
804  *                uap                      User argument descriptor (see below)
805  *                retval                   (ignored)
806  *
807  * Indirect:      uap->type                Filesystem type
808  *                uap->path                Path to mount
809  *                uap->data                Mount arguments
810  *                uap->mac_p               MAC info
811  *                uap->flags               Mount flags
812  *
813  *
814  * Returns:        0                       Success
815  *                !0                       Not success
816  */
817 boolean_t root_fs_upgrade_try = FALSE;
818 
819 int
__mac_mount(struct proc * p,register struct __mac_mount_args * uap,__unused int32_t * retval)820 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
821 {
822 	vnode_t pvp = NULL;
823 	vnode_t vp = NULL;
824 	int need_nameidone = 0;
825 	vfs_context_t ctx = vfs_context_current();
826 	char fstypename[MFSNAMELEN];
827 	struct nameidata nd;
828 	size_t dummy = 0;
829 	char *labelstr = NULL;
830 	size_t labelsz = 0;
831 	int flags = uap->flags;
832 	int error;
833 	int num_retries = 0;
834 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
835 	boolean_t is_64bit = IS_64BIT_PROCESS(p);
836 #else
837 #pragma unused(p)
838 #endif
839 	/*
840 	 * Get the fs type name from user space
841 	 */
842 	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
843 	if (error) {
844 		return error;
845 	}
846 
847 retry:
848 	/*
849 	 * Get the vnode to be covered
850 	 */
851 	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
852 	    UIO_USERSPACE, uap->path, ctx);
853 	if (flags & MNT_NOFOLLOW) {
854 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
855 	}
856 	error = namei(&nd);
857 	if (error) {
858 		goto out;
859 	}
860 	need_nameidone = 1;
861 	vp = nd.ni_vp;
862 	pvp = nd.ni_dvp;
863 
864 #ifdef CONFIG_IMGSRC_ACCESS
865 	/* Mounting image source cannot be batched with other operations */
866 	if (flags == MNT_IMGSRC_BY_INDEX) {
867 		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
868 		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
869 		goto out;
870 	}
871 #endif /* CONFIG_IMGSRC_ACCESS */
872 
873 #if CONFIG_MACF
874 	/*
875 	 * Get the label string (if any) from user space
876 	 */
877 	if (uap->mac_p != USER_ADDR_NULL) {
878 		struct user_mac mac;
879 		size_t ulen = 0;
880 
881 		if (is_64bit) {
882 			struct user64_mac mac64;
883 			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
884 			mac.m_buflen = (user_size_t)mac64.m_buflen;
885 			mac.m_string = (user_addr_t)mac64.m_string;
886 		} else {
887 			struct user32_mac mac32;
888 			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
889 			mac.m_buflen = mac32.m_buflen;
890 			mac.m_string = mac32.m_string;
891 		}
892 		if (error) {
893 			goto out;
894 		}
895 		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
896 		    (mac.m_buflen < 2)) {
897 			error = EINVAL;
898 			goto out;
899 		}
900 		labelsz = mac.m_buflen;
901 		labelstr = kalloc_data(labelsz, Z_WAITOK);
902 		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
903 		if (error) {
904 			goto out;
905 		}
906 		AUDIT_ARG(mac_string, labelstr);
907 	}
908 #endif /* CONFIG_MACF */
909 
910 	AUDIT_ARG(fflags, flags);
911 
912 #if !CONFIG_UNION_MOUNTS
913 	if (flags & MNT_UNION) {
914 		error = EPERM;
915 		goto out;
916 	}
917 #endif
918 
919 	if ((vp->v_flag & VROOT) &&
920 	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
921 #if CONFIG_UNION_MOUNTS
922 		if (!(flags & MNT_UNION)) {
923 			flags |= MNT_UPDATE;
924 		} else {
925 			/*
926 			 * For a union mount on '/', treat it as fresh
927 			 * mount instead of update.
928 			 * Otherwise, union mouting on '/' used to panic the
929 			 * system before, since mnt_vnodecovered was found to
930 			 * be NULL for '/' which is required for unionlookup
931 			 * after it gets ENOENT on union mount.
932 			 */
933 			flags = (flags & ~(MNT_UPDATE));
934 		}
935 #else
936 		flags |= MNT_UPDATE;
937 #endif /* CONFIG_UNION_MOUNTS */
938 
939 #if SECURE_KERNEL
940 		if ((flags & MNT_RDONLY) == 0) {
941 			/* Release kernels are not allowed to mount "/" as rw */
942 			error = EPERM;
943 			goto out;
944 		}
945 #endif
946 
947 		/*
948 		 * See 7392553 for more details on why this check exists.
949 		 * Suffice to say: If this check is ON and something tries
950 		 * to mount the rootFS RW, we'll turn off the codesign
951 		 * bitmap optimization.
952 		 */
953 #if CHECK_CS_VALIDATION_BITMAP
954 		if ((flags & MNT_RDONLY) == 0) {
955 			root_fs_upgrade_try = TRUE;
956 		}
957 #endif
958 	}
959 
960 	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
961 	    labelstr, ctx);
962 
963 out:
964 
965 #if CONFIG_MACF
966 	kfree_data(labelstr, labelsz);
967 #endif /* CONFIG_MACF */
968 
969 	if (vp) {
970 		vnode_put(vp);
971 	}
972 	if (pvp) {
973 		vnode_put(pvp);
974 	}
975 	if (need_nameidone) {
976 		nameidone(&nd);
977 	}
978 
979 	if (error == EBUSY) {
980 		/* Retry the lookup and mount again due to concurrent mounts. */
981 		if (++num_retries < MAX_MOUNT_RETRIES) {
982 			goto retry;
983 		}
984 	}
985 
986 	return error;
987 }
988 
989 /*
990  * common mount implementation (final stage of mounting)
991  *
992  * Arguments:
 *  fstypename	file system type (i.e. its vfs name)
994  *  pvp		parent of covered vnode
995  *  vp		covered vnode
996  *  cnp		component name (ie path) of covered vnode
997  *  flags	generic mount flags
998  *  fsmountargs	file system specific data
999  *  labelstr	optional MAC label
1000  *  kernelmount	TRUE for mounts initiated from inside the kernel
1001  *  ctx		caller's context
1002  */
static int
mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
    struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
    char *labelstr, vfs_context_t ctx)
{
#if !CONFIG_MACF
#pragma unused(labelstr)
#endif
	struct vnode *devvp = NULLVP;		/* vnode of the backing block device (has an iocount while set) */
	struct vnode *device_vnode = NULLVP;	/* devvp once opened for this mount; passed to VFS_MOUNT */
#if CONFIG_MACF
	struct vnode *rvp;
#endif
	struct mount *mp = NULL;
	struct vfstable *vfsp = (struct vfstable *)0;
	struct proc *p = vfs_context_proc(ctx);
	int error, flag = 0;			/* 'flag' saves mnt_flag for restore on a failed update */
	bool flag_set = false;
	user_addr_t devpath = USER_ADDR_NULL;
	int ronly = 0;
	int mntalloc = 0;			/* nonzero once 'mp' was allocated here (fresh mount) */
	boolean_t vfsp_ref = FALSE;		/* took a vfc_refcount on 'vfsp'; must drop on error */
	boolean_t is_rwlock_locked = FALSE;	/* holding mp->mnt_rwlock exclusive */
	boolean_t did_rele = FALSE;		/* already dropped devvp's usecount in out4 */
	boolean_t have_usecount = FALSE;	/* took a usecount on covered vp */
	boolean_t did_set_lmount = FALSE;	/* set MNT_LMOUNT on mp; must clear before return */
	boolean_t did_set_vmount = FALSE;	/* prepare_coveredvp() set VMOUNT on vp */
	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
	/* Check for mutually-exclusive flag bits */
	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
	int bitcount = 0;
	/* Kernighan's trick: each iteration clears the lowest set bit */
	while (checkflags != 0) {
		checkflags &= (checkflags - 1);
		bitcount++;
	}

	if (bitcount > 1) {
		//not allowed to request multiple mount-by-role flags
		error = EINVAL;
		goto out1;
	}
#endif

	/*
	 * Process an update for an existing mount
	 */
	if (flags & MNT_UPDATE) {
		/* Updates are only permitted through the root vnode of the mount. */
		if ((vp->v_flag & VROOT) == 0) {
			error = EINVAL;
			goto out1;
		}
		mp = vp->v_mount;

		/* if unmount or mount in progress, return error */
		mount_lock_spin(mp);
		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
			mount_unlock(mp);
			error = EBUSY;
			goto out1;
		}
		/* Claim the mount-in-progress slot; cleared on every exit path below. */
		mp->mnt_lflag |= MNT_LMOUNT;
		did_set_lmount = TRUE;
		mount_unlock(mp);
		lck_rw_lock_exclusive(&mp->mnt_rwlock);
		is_rwlock_locked = TRUE;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * If content protection is enabled, update mounts are not
		 * allowed to turn it off.
		 */
		if ((mp->mnt_flag & MNT_CPROTECT) &&
		    ((flags & MNT_CPROTECT) == 0)) {
			error = EINVAL;
			goto out1;
		}

		/*
		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
		 * failure to return an error for this so we'll just silently
		 * add it if it is not passed in.
		 */
		if ((mp->mnt_flag & MNT_REMOVABLE) &&
		    ((flags & MNT_REMOVABLE) == 0)) {
			flags |= MNT_REMOVABLE;
		}

		/* Can't downgrade the backer of the root FS */
		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
			goto out1;
		}
#if CONFIG_MACF
		error = mac_mount_check_remount(ctx, mp);
		if (error != 0) {
			goto out1;
		}
#endif
		/*
		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
		 */
		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
			flags |= MNT_NOSUID | MNT_NODEV;
			if (mp->mnt_flag & MNT_NOEXEC) {
				flags |= MNT_NOEXEC;
			}
		}
		/* Snapshot current flags so a failed update can restore them. */
		flag = mp->mnt_flag;
		flag_set = true;



		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);

		vfsp = mp->mnt_vtable;
		goto update;
	} // MNT_UPDATE

	/*
	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
	 */
	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
		flags |= MNT_NOSUID | MNT_NODEV;
		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
			flags |= MNT_NOEXEC;
		}
	}

	/* XXXAUDIT: Should we capture the type on the error path as well? */
	/* XXX cast-away const (audit_arg_text() does not modify its input) */
	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
	/*
	 * Look up the filesystem type and take a reference on its vfstable
	 * entry so the FS cannot be unregistered while we mount.
	 */
	mount_list_lock();
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
			vfsp->vfc_refcount++;
			vfsp_ref = TRUE;
			break;
		}
	}
	mount_list_unlock();
	if (vfsp == NULL) {
		error = ENODEV;
		goto out1;
	}

	/*
	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
	 * except in ROSV configs and for the initial BaseSystem root.
	 */
	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
		error = EINVAL;  /* unsupported request */
		goto out1;
	}

	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
	if (error != 0) {
		goto out1;
	}

	/*
	 * Upon successful return of prepare_coveredvp(), VMOUNT is set for the
	 * covered vp.
	 */
	did_set_vmount = TRUE;

	/*
	 * Allocate and initialize the filesystem (mount_t)
	 */
	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	mntalloc = 1;

	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;

	/* Mark the new mount as in-progress, same protocol as the update path. */
	mp->mnt_lflag |= MNT_LMOUNT;
	did_set_lmount = TRUE;

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);
	mount_lock_init(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	is_rwlock_locked = TRUE;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vtable = vfsp;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	/* Record the mount-on path; fall back to the caller's pathname buffer. */
	do {
		size_t pathlen = MAXPATHLEN;

		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
		}
	} while (0);
	mp->mnt_vnodecovered = vp;
	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;
	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);

	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);

	if (kernelmount) {
		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
	}
	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
	}

	if (KERNEL_MOUNT_DEVFS & internal_flags) {
		// kernel mounted devfs
		mp->mnt_kern_flag |= MNTK_SYSTEM;
	}

update:
	/*
	 * Common path for both fresh mounts and updates from here on:
	 * set the mount level flags.
	 */
	if (flags & MNT_RDONLY) {
		mp->mnt_flag |= MNT_RDONLY;
	} else if (mp->mnt_flag & MNT_RDONLY) {
		// disallow read/write upgrades of file systems that
		// had the TYPENAME_OVERRIDE feature set.
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			error = EPERM;
			goto out1;
		}
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	}
	/* Clear then re-apply the caller-controllable flag set. */
	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if SECURE_KERNEL
#if !CONFIG_MNT_SUID
	/*
	 * On release builds of iOS based platforms, always enforce NOSUID on
	 * all mounts. We do this here because we can catch update mounts as well as
	 * non-update mounts in this case.
	 */
	mp->mnt_flag |= (MNT_NOSUID);
#endif
#endif

	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if CONFIG_MACF
	if (flags & MNT_MULTILABEL) {
		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
			error = EINVAL;
			goto out1;
		}
		mp->mnt_flag |= MNT_MULTILABEL;
	}
#endif
	/*
	 * Process device path for local file systems if requested.
	 *
	 * Snapshot and mount-by-role mounts do not use this path; they are
	 * passing other opaque data in the device path field.
	 *
	 * Basesystemroot mounts pass a device path to be resolved here,
	 * but it's just a char * already inside the kernel, which
	 * kernel_mount() shoved into a user_addr_t to call us. So for such
	 * mounts we must skip copyin (both of the address and of the string
	 * (in NDINIT).
	 */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
		boolean_t do_copyin_devpath = true;
#if CONFIG_BASESYSTEMROOT
		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
			// We have been passed fsmountargs, which is typed as a user_addr_t,
			// but is actually a char ** pointing to a (kernelspace) string.
			// We manually unpack it with a series of casts and dereferences
			// that reverses what was done just above us on the stack in
			// imageboot_pivot_image().
			// After retrieving the path to the dev node (which we will NDINIT
			// in a moment), we pass NULL fsmountargs on to the filesystem.
			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
			char **devnamepp = (char **)fsmountargs;
			char *devnamep = *devnamepp;
			devpath = CAST_USER_ADDR_T(devnamep);
			do_copyin_devpath = false;
			fsmountargs = USER_ADDR_NULL;

			//Now that we have a mp, denote that this mount is for the basesystem.
			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
		}
#endif // CONFIG_BASESYSTEMROOT

		if (do_copyin_devpath) {
			/* First element of fsmountargs is the device path pointer. */
			if (vfs_context_is64bit(ctx)) {
				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
					goto out1;
				}
				fsmountargs += sizeof(devpath);
			} else {
				user32_addr_t tmp;
				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
					goto out1;
				}
				/* munge into LP64 addr */
				devpath = CAST_USER_ADDR_T(tmp);
				fsmountargs += sizeof(tmp);
			}
		}

		/* Lookup device and authorize access to it */
		if ((devpath)) {
			struct nameidata nd;

			enum uio_seg seg = UIO_USERSPACE;
#if CONFIG_BASESYSTEMROOT
			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
				/* devpath is a kernel string in this case; see above */
				seg = UIO_SYSSPACE;
			}
#endif // CONFIG_BASESYSTEMROOT

			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
			if ((error = namei(&nd))) {
				goto out1;
			}

			devvp = nd.ni_vp;

			if (devvp->v_type != VBLK) {
				error = ENOTBLK;
				nameidone(&nd);
				goto out2;
			}
			if (major(devvp->v_rdev) >= nblkdev) {
				error = ENXIO;
				nameidone(&nd);
				goto out2;
			}
			/*
			 * If mount by non-root, then verify that user has necessary
			 * permissions on the device.
			 */
			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
				kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;

				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
					accessmode |= KAUTH_VNODE_WRITE_DATA;
				}
				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
					nameidone(&nd);
					goto out2;
				}
			}

			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
			nameidone(&nd);
		}
		/* On first mount, preflight and open device */
		if (devpath && ((flags & MNT_UPDATE) == 0)) {
			if ((error = vnode_ref(devvp))) {
				goto out2;
			}
			/*
			 * Disallow multiple mounts of the same device.
			 * Disallow mounting of a device that is currently in use
			 * (except for root, which might share swap device for miniroot).
			 * Flush out any old buffers remaining from a previous use.
			 */
			if ((error = vfs_setmounting(devvp))) {
				vnode_rele(devvp);
				goto out2;
			}

			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
				error = EBUSY;
				goto out3;
			}
			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
				error = ENOTBLK;
				goto out3;
			}
			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
				goto out3;
			}

			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
#if CONFIG_MACF
			error = mac_vnode_check_open(ctx,
			    devvp,
			    ronly ? FREAD : FREAD | FWRITE);
			if (error) {
				goto out3;
			}
#endif /* MAC */
			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
				goto out3;
			}

			mp->mnt_devvp = devvp;
			device_vnode = devvp;
		} else if ((mp->mnt_flag & MNT_RDONLY) &&
		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
		    (device_vnode = mp->mnt_devvp)) {
			dev_t dev;
			int maj;
			/*
			 * If upgrade to read-write by non-root, then verify
			 * that user has necessary permissions on the device.
			 */
			vnode_getalways(device_vnode);

			if (suser(vfs_context_ucred(ctx), NULL) &&
			    (error = vnode_authorize(device_vnode, NULL,
			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
			    ctx)) != 0) {
				vnode_put(device_vnode);
				goto out2;
			}

			/* Tell the device that we're upgrading */
			dev = (dev_t)device_vnode->v_rdev;
			maj = major(dev);

			if ((u_int)maj >= (u_int)nblkdev) {
				panic("Volume mounted on a device with invalid major number.");
			}

			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
			vnode_put(device_vnode);
			device_vnode = NULLVP;
			if (error != 0) {
				goto out2;
			}
		}
	} // localargs && !(snapshot | data | vm)

#if CONFIG_MACF
	if ((flags & MNT_UPDATE) == 0) {
		mac_mount_label_init(mp);
		mac_mount_label_associate(ctx, mp);
	}
	if (labelstr) {
		if ((flags & MNT_UPDATE) != 0) {
			error = mac_mount_check_label_update(ctx, mp);
			if (error != 0) {
				goto out3;
			}
		}
	}
#endif
	/*
	 * Mount the filesystem.  We already asserted that internal_flags
	 * cannot have more than one mount-by-role bit set.
	 */
	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
		    (caddr_t)fsmountargs, 0, ctx);
	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
#if CONFIG_ROSV_STARTUP
		/* fsmountargs carries the origin (system) mount for role mounts */
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
		} else {
			/* Mark volume associated with system volume */
			mp->mnt_kern_flag |= MNTK_SYSTEM;

			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);
					// now set `device_vnode` to the devvp that was acquired.
					// this is needed in order to ensure vfs_init_io_attributes is invoked.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
#if CONFIG_MOUNT_VM
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
		} else {
			/* Mark volume associated with system volume and a swap mount */
			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);

					// now set `device_vnode` to the devvp that was acquired.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
#if CONFIG_MOUNT_PREBOOTRECOVERY
		struct mount *origin_mp = (struct mount*)fsmountargs;
		uint32_t mount_role = 0;
		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
			mount_role = VFS_PREBOOT_ROLE;
		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
			mount_role = VFS_RECOVERY_ROLE;
		}

		if (mount_role != 0) {
			fs_role_mount_args_t frma = {origin_mp, mount_role};
			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
			if (error) {
				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
			} else {
				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
				/* Mark volume associated with system volume */
				//mp->mnt_kern_flag |= MNTK_SYSTEM;
				/* Attempt to acquire the mnt_devvp and set it up */
				struct vnode *mp_devvp = NULL;
				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
					    0, &mp_devvp, vfs_context_kernel());
					if (!lerr) {
						mp->mnt_devvp = mp_devvp;
						//vnode_lookup took an iocount, need to drop it.
						vnode_put(mp_devvp);

						// now set `device_vnode` to the devvp that was acquired.
						// note that though the iocount above was dropped, the mount acquires
						// an implicit reference against the device.
						device_vnode = mp_devvp;
					}
				}
			}
		} else {
			/*
			 * NOTE(review): 'error' is printed here before being assigned
			 * below, so the logged value is stale — confirm intent.
			 */
			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
			error = EINVAL;
		}
#else
		error = EINVAL;
#endif
	} else {
		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
	}

	if (flags & MNT_UPDATE) {
		/* Commit or roll back the update, then we're done (no list insert). */
		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
			mp->mnt_flag &= ~MNT_RDONLY;
		}
		mp->mnt_flag &= ~
		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
		if (error) {
			mp->mnt_flag = flag;  /* restore flag value */
		}
		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;
		if (!error) {
			enablequotas(mp, ctx);
		}
		goto exit;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	if (error == 0) {
		struct vfs_attr vfsattr;
		if (device_vnode) {
			/*
			 *   cache the IO attributes for the underlying physical media...
			 *   an error return indicates the underlying driver doesn't
			 *   support all the queries necessary... however, reasonable
			 *   defaults will have been set, so no reason to bail or care
			 *
			 *   Need to do this before calling the MAC hook as it needs
			 *   information from this call.
			 */
			vfs_init_io_attributes(device_vnode, mp);
		}

#if CONFIG_MACF
		error = mac_mount_check_mount_late(ctx, mp);
		if (error != 0) {
			goto out4;
		}

		if (vfs_flags(mp) & MNT_MULTILABEL) {
			error = VFS_ROOT(mp, &rvp, ctx);
			if (error) {
				printf("%s() VFS_ROOT returned %d\n", __func__, error);
				goto out4;
			}
			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
			/*
			 * drop reference provided by VFS_ROOT
			 */
			vnode_put(rvp);

			if (error) {
				goto out4;
			}
		}
#endif  /* MAC */

		/* Plant the mount on the covered vnode; this publishes it to lookups. */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vp->v_mountedhere = mp;
		SET(vp->v_flag, VMOUNTEDHERE);

		/*
		 * Wakeup any waiter(s) in prepare_coveredvp() that is waiting for the
		 * 'v_mountedhere' to be planted.
		 */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);

		/*
		 * taking the name_cache_lock exclusively will
		 * ensure that everyone is out of the fast path who
		 * might be trying to use a now stale copy of
		 * vp->v_mountedhere->mnt_realrootvp
		 * bumping mount_generation causes the cached values
		 * to be invalidated
		 */
		name_cache_lock();
		mount_generation++;
		name_cache_unlock();

		error = vnode_ref(vp);
		if (error != 0) {
			goto out4;
		}

		have_usecount = TRUE;

		error = checkdirs(vp, ctx);
		if (error != 0) {
			/* Unmount the filesystem as cdir/rdirs cannot be updated */
			goto out4;
		}
		/*
		 * there is no cleanup code here so I have made it void
		 * we need to revisit this
		 */
		(void)VFS_START(mp, 0, ctx);

		if (mount_list_add(mp) != 0) {
			/*
			 * The system is shutting down trying to umount
			 * everything, so fail with a plausible errno.
			 */
			error = EBUSY;
			goto out4;
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/* Check if this mounted file system supports EAs or named streams. */
		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
#if NAMEDSTREAMS
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
			}
#endif
			/* Check if this file system supports path from id lookups. */
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			} else if (mp->mnt_flag & MNT_DOVOLFS) {
				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			}

			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
			}
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
		}
		/* increment the operations count */
		OSAddAtomic(1, &vfs_nummntops);
		enablequotas(mp, ctx);

		if (device_vnode) {
			vfs_setmountedon(device_vnode);
		}

		/* Now that mount is setup, notify the listeners */
		vfs_notify_mount(pvp);
		IOBSDMountChange(mp, kIOMountChangeMount);
	} else {
		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
		if (mp->mnt_vnodelist.tqh_first != NULL) {
			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
			    mp->mnt_vtable->vfc_name, error);
		}

		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
		mount_list_lock();
		mp->mnt_vtable->vfc_refcount--;
		mount_list_unlock();

		if (device_vnode) {
			vnode_rele(device_vnode);
			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
			vfs_clearmounting(device_vnode);
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		/*
		 * if we get here, we have a mount structure that needs to be freed,
		 * but since the coveredvp hasn't yet been updated to point at it,
		 * no need to worry about other threads holding a crossref on this mp
		 * so it's ok to just free it
		 */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		/* mp is gone; prevent the exit path from touching it */
		did_set_lmount = false;
	}
exit:
	/*
	 * drop I/O count on the device vp if there was one
	 */
	if (devpath && devvp) {
		vnode_put(devvp);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	return error;

/* Error condition exits */
out4:
	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);

	/*
	 * If the mount has been placed on the covered vp,
	 * it may have been discovered by now, so we have
	 * to treat this just like an unmount
	 */
	mount_lock_spin(mp);
	mp->mnt_lflag |= MNT_LDEAD;
	mount_unlock(mp);

	if (device_vnode != NULLVP) {
		vnode_rele(device_vnode);
		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vfs_clearmounting(device_vnode);
		did_rele = TRUE;
	}

	vnode_lock_spin(vp);

	/* crossref keeps mp alive until mount_dropcrossref() below */
	mp->mnt_crossref++;
	CLR(vp->v_flag, VMOUNTEDHERE);
	vp->v_mountedhere = (mount_t) 0;

	vnode_unlock(vp);

	if (have_usecount) {
		vnode_rele(vp);
	}
out3:
	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
		vnode_rele(devvp);
		vfs_clearmounting(devvp);
	}
out2:
	if (devpath && devvp) {
		vnode_put(devvp);
	}
out1:
	/* Release mnt_rwlock only when it was taken */
	if (is_rwlock_locked == TRUE) {
		if (flag_set) {
			mp->mnt_flag = flag;  /* restore mnt_flag value */
		}
		lck_rw_done(&mp->mnt_rwlock);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	if (did_set_vmount) {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}

	if (mntalloc) {
		if (mp->mnt_crossref) {
			mount_dropcrossref(mp, vp, 0);
		} else {
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		}
	}
	if (vfsp_ref) {
		mount_list_lock();
		vfsp->vfc_refcount--;
		mount_list_unlock();
	}

	return error;
}
1915 
1916 /*
1917  * Flush in-core data, check for competing mount attempts,
1918  * and set VMOUNT
1919  */
1920 int
prepare_coveredvp(vnode_t vp,vfs_context_t ctx,struct componentname * cnp,const char * fsname,uint32_t internal_flags)1921 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
1922 {
1923 #if !CONFIG_MACF
1924 #pragma unused(cnp,fsname)
1925 #endif
1926 	struct vnode_attr va;
1927 	int error;
1928 	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
1929 	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
1930 	boolean_t is_kmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
1931 
1932 	if (!skip_auth) {
1933 		/*
1934 		 * If the user is not root, ensure that they own the directory
1935 		 * onto which we are attempting to mount.
1936 		 */
1937 		VATTR_INIT(&va);
1938 		VATTR_WANTED(&va, va_uid);
1939 		if ((error = vnode_getattr(vp, &va, ctx)) ||
1940 		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1941 		    (!vfs_context_issuser(ctx)))) {
1942 			error = EPERM;
1943 			goto out;
1944 		}
1945 	}
1946 
1947 	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
1948 		goto out;
1949 	}
1950 
1951 	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
1952 		goto out;
1953 	}
1954 
1955 	if (vp->v_type != VDIR) {
1956 		error = ENOTDIR;
1957 		goto out;
1958 	}
1959 
1960 	vnode_lock_spin(vp);
1961 
1962 	if (is_fmount && (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL))) {
1963 		error = EBUSY;
1964 	} else if (!is_kmount && (ISSET(vp->v_flag, VMOUNT) ||
1965 	    (vp->v_mountedhere != NULL))) {
1966 		/*
1967 		 * For mount triggered from mount() call, we want to wait for the
1968 		 * current in-progress mount to complete, redo lookup and retry the
1969 		 * mount again. Similarly, we also want to retry if we lost the race
1970 		 * due to concurrent mounts and the 'VMOUNT' flag has been cleared and
1971 		 * 'v_mountedhere' has been planted after initial lookup.
1972 		 */
1973 		if (ISSET(vp->v_flag, VMOUNT)) {
1974 			vnode_lock_convert(vp);
1975 			msleep(&vp->v_flag, &vp->v_lock, PVFS, "vnode_waitformount", NULL);
1976 		}
1977 		error = EBUSY;
1978 	} else if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
1979 		error = EBUSY;
1980 	}
1981 
1982 	if (error) {
1983 		vnode_unlock(vp);
1984 		goto out;
1985 	}
1986 	SET(vp->v_flag, VMOUNT);
1987 	vnode_unlock(vp);
1988 
1989 #if CONFIG_MACF
1990 	error = mac_mount_check_mount(ctx, vp,
1991 	    cnp, fsname);
1992 	if (error != 0) {
1993 		vnode_lock_spin(vp);
1994 		CLR(vp->v_flag, VMOUNT);
1995 		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
1996 		wakeup(&vp->v_flag);
1997 		vnode_unlock(vp);
1998 	}
1999 #endif
2000 
2001 out:
2002 	return error;
2003 }
2004 
2005 #if CONFIG_IMGSRC_ACCESS
2006 
2007 #define DEBUG_IMGSRC 0
2008 
2009 #if DEBUG_IMGSRC
2010 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
2011 #else
2012 #define IMGSRC_DEBUG(args...) do { } while(0)
2013 #endif
2014 
/*
 * Look up 'devpath', verify it is a block device naming the same dev_t as
 * the device that already backs 'mp', and (for non-root callers) authorize
 * read -- and, for read/write mounts, write -- access to it.
 *
 * On success, mp's f_mntfromname is updated to the looked-up path and the
 * device vnode is returned in *devvpp with an iocount the caller must
 * release with vnode_put().  On failure the iocount from namei() is dropped
 * here and *devvpp is left untouched.
 */
static int
authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
{
	struct nameidata nd;
	vnode_t vp, realdevvp;
	kauth_action_t accessmode;
	int error;
	enum uio_seg uio = UIO_USERSPACE;

	/* A kernel context passes a kernel-space path string. */
	if (ctx == vfs_context_kernel()) {
		uio = UIO_SYSSPACE;
	}

	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
	if ((error = namei(&nd))) {
		IMGSRC_DEBUG("namei() failed with %d\n", error);
		return error;
	}

	vp = nd.ni_vp;

	if (!vnode_isblk(vp)) {
		IMGSRC_DEBUG("Not block device.\n");
		error = ENOTBLK;
		goto out;
	}

	realdevvp = mp->mnt_devvp;
	if (realdevvp == NULLVP) {
		IMGSRC_DEBUG("No device backs the mount.\n");
		error = ENXIO;
		goto out;
	}

	error = vnode_getwithref(realdevvp);
	if (error != 0) {
		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
		goto out;
	}

	/* The path must name the very same device the mount already uses. */
	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
		IMGSRC_DEBUG("Wrong dev_t.\n");
		error = ENXIO;
		goto out1;
	}

	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	if (!vfs_context_issuser(ctx)) {
		accessmode = KAUTH_VNODE_READ_DATA;
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			accessmode |= KAUTH_VNODE_WRITE_DATA;
		}
		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
			IMGSRC_DEBUG("Access denied.\n");
			goto out1;
		}
	}

	/* Success: transfer namei()'s iocount on vp to the caller. */
	*devvpp = vp;

out1:
	vnode_put(realdevvp);

out:
	nameidone(&nd);

	/* On failure, drop the iocount namei() gave us on vp. */
	if (error) {
		vnode_put(vp);
	}

	return error;
}
2092 
2093 /*
2094  * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2095  * and call checkdirs()
2096  */
2097 static int
place_mount_and_checkdirs(mount_t mp,vnode_t vp,vfs_context_t ctx)2098 place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
2099 {
2100 	int error;
2101 
2102 	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
2103 
2104 	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
2105 	    mp->mnt_vtable->vfc_name, vnode_getname(vp));
2106 
2107 	vnode_lock_spin(vp);
2108 	CLR(vp->v_flag, VMOUNT);
2109 	vp->v_mountedhere = mp;
2110 	SET(vp->v_flag, VMOUNTEDHERE);
2111 	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
2112 	wakeup(&vp->v_flag);
2113 	vnode_unlock(vp);
2114 
2115 	/*
2116 	 * taking the name_cache_lock exclusively will
2117 	 * insure that everyone is out of the fast path who
2118 	 * might be trying to use a now stale copy of
2119 	 * vp->v_mountedhere->mnt_realrootvp
2120 	 * bumping mount_generation causes the cached values
2121 	 * to be invalidated
2122 	 */
2123 	name_cache_lock();
2124 	mount_generation++;
2125 	name_cache_unlock();
2126 
2127 	error = vnode_ref(vp);
2128 	if (error != 0) {
2129 		goto out;
2130 	}
2131 
2132 	error = checkdirs(vp, ctx);
2133 	if (error != 0) {
2134 		/* Unmount the filesystem as cdir/rdirs cannot be updated */
2135 		vnode_rele(vp);
2136 		goto out;
2137 	}
2138 
2139 out:
2140 	if (error != 0) {
2141 		mp->mnt_vnodecovered = NULLVP;
2142 	}
2143 	return error;
2144 }
2145 
/*
 * Reverse place_mount_and_checkdirs(): drop the usecount it took on the
 * covered vnode, clear VMOUNT/VMOUNTEDHERE and v_mountedhere, wake any
 * threads sleeping on the mount-in-progress flag, and detach the mount
 * from the covered vnode.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2159 
2160 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2161 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2162 {
2163 	int error;
2164 
2165 	/* unmount in progress return error */
2166 	mount_lock_spin(mp);
2167 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2168 		mount_unlock(mp);
2169 		return EBUSY;
2170 	}
2171 	mount_unlock(mp);
2172 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2173 
2174 	/*
2175 	 * We only allow the filesystem to be reloaded if it
2176 	 * is currently mounted read-only.
2177 	 */
2178 	if ((flags & MNT_RELOAD) &&
2179 	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2180 		error = ENOTSUP;
2181 		goto out;
2182 	}
2183 
2184 	/*
2185 	 * Only root, or the user that did the original mount is
2186 	 * permitted to update it.
2187 	 */
2188 	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2189 	    (!vfs_context_issuser(ctx))) {
2190 		error = EPERM;
2191 		goto out;
2192 	}
2193 #if CONFIG_MACF
2194 	error = mac_mount_check_remount(ctx, mp);
2195 	if (error != 0) {
2196 		goto out;
2197 	}
2198 #endif
2199 
2200 out:
2201 	if (error) {
2202 		lck_rw_done(&mp->mnt_rwlock);
2203 	}
2204 
2205 	return error;
2206 }
2207 
/*
 * Release the exclusive mnt_rwlock taken by a successful
 * mount_begin_update().
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2213 
2214 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2215 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2216 {
2217 	vnode_t vp;
2218 
2219 	if (height >= MAX_IMAGEBOOT_NESTING) {
2220 		return EINVAL;
2221 	}
2222 
2223 	vp = imgsrc_rootvnodes[height];
2224 	if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2225 		*rvpp = vp;
2226 		return 0;
2227 	} else {
2228 		return ENOENT;
2229 	}
2230 }
2231 
/*
 * Move the imageboot source filesystem (the mount whose root vnode was
 * recorded at boot in imgsrc_rootvnodes[]) so that it is covered by 'vp'
 * instead of its original location, updating f_mntonname and adding the
 * mount to the mount list.  Root-only.  Each imageboot mount may only be
 * moved once (guarded by MNTK_HAS_MOVED under the mount rwlock).
 *
 * 'fsmountargs' is decoded per (by_index, is64bit): either a
 * user{32,64}_mnt_imgsrc_args struct giving nesting height/flags/devpath,
 * or (legacy) a bare device-path pointer implying height 0.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flag bits are currently defined for this operation. */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* Takes an iocount on rvp; released on every exit path below. */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once.  Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the validation; drop the iocount right away. */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Save the old mount-on name so out3 can restore it. */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2454 
2455 #endif /* CONFIG_IMGSRC_ACCESS */
2456 
/*
 * Turn on disk quotas for 'mp' (HFS only) if the per-type quota trigger
 * files exist on the volume.  Errors are deliberately ignored: quota
 * enablement is best-effort and must not interfere with the mount itself.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* The ".quotaops" file's presence is what opts a volume in. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue;           /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2490 
2491 
/*
 * Per-process callback for checkdirs()'s proc_iterate(): if the process's
 * current or root directory is the just-covered vnode (cdrp->olddp),
 * repoint it at the new mount's root (cdrp->newdp), fixing up usecounts.
 *
 * Always returns PROC_RETURNED so iteration continues; failures to take
 * refs on the new vnode silently leave the process unchanged.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* new_* hold the refs we took; nulled out when a ref is consumed. */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	/* old_* record displaced vnodes whose usecounts we must drop. */
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2571 
2572 
2573 
2574 /*
2575  * Scan all active processes to see if any of them have a current
2576  * or root directory onto which the new filesystem has just been
2577  * mounted. If so, replace them with the new mount point.
2578  */
2579 static int
checkdirs(vnode_t olddp,vfs_context_t ctx)2580 checkdirs(vnode_t olddp, vfs_context_t ctx)
2581 {
2582 	vnode_t newdp;
2583 	vnode_t tvp;
2584 	int err;
2585 	struct cdirargs cdr;
2586 
2587 	if (olddp->v_usecount == 1) {
2588 		return 0;
2589 	}
2590 	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);
2591 
2592 	if (err != 0) {
2593 #if DIAGNOSTIC
2594 		panic("mount: lost mount: error %d", err);
2595 #endif
2596 		return err;
2597 	}
2598 
2599 	cdr.olddp = olddp;
2600 	cdr.newdp = newdp;
2601 	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
2602 	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);
2603 
2604 	if (rootvnode == olddp) {
2605 		vnode_ref(newdp);
2606 		lck_rw_lock_exclusive(&rootvnode_rw_lock);
2607 		tvp = rootvnode;
2608 		rootvnode = newdp;
2609 		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
2610 		vnode_rele(tvp);
2611 	}
2612 
2613 	vnode_put(newdp);
2614 	return 0;
2615 }
2616 
2617 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT        \
2618 	"com.apple.private.vfs.role-account-unmount"
2619 
2620 /*
2621  * Unmount a file system.
2622  *
2623  * Note: unmount takes a path to the vnode mounted on as argument,
2624  * not special file (as before).
2625  */
2626 /* ARGSUSED */
2627 int
unmount(__unused proc_t p,struct unmount_args * uap,__unused int32_t * retval)2628 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2629 {
2630 	vnode_t vp;
2631 	struct mount *mp;
2632 	int error;
2633 	struct nameidata nd;
2634 	vfs_context_t ctx;
2635 
2636 	/*
2637 	 * If the process has the entitlement, use the kernel's context when
2638 	 * performing lookup on the mount path as the process might lack proper
2639 	 * permission to access the directory.
2640 	 */
2641 	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
2642 	    vfs_context_kernel() : vfs_context_current();
2643 
2644 	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2645 	    UIO_USERSPACE, uap->path, ctx);
2646 	error = namei(&nd);
2647 	if (error) {
2648 		return error;
2649 	}
2650 	vp = nd.ni_vp;
2651 	mp = vp->v_mount;
2652 	nameidone(&nd);
2653 
2654 	/*
2655 	 * Must be the root of the filesystem
2656 	 */
2657 	if ((vp->v_flag & VROOT) == 0) {
2658 		vnode_put(vp);
2659 		return EINVAL;
2660 	}
2661 #if CONFIG_MACF
2662 	error = mac_mount_check_umount(ctx, mp);
2663 	if (error != 0) {
2664 		vnode_put(vp);
2665 		return error;
2666 	}
2667 #endif
2668 	mount_ref(mp, 0);
2669 	vnode_put(vp);
2670 	/* safedounmount consumes the mount ref */
2671 	return safedounmount(mp, uap->flags, ctx);
2672 }
2673 
/*
 * Unmount the filesystem identified by 'fsid'.  Returns ENOENT if no
 * such mount exists; otherwise converts the lookup's iteration ref into
 * a mount ref and hands it to safedounmount(), which consumes it.
 */
int
vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
{
	mount_t mp;

	/* Lookup returns the mount with an iteration reference held. */
	mp = mount_list_lookupby_fsid(fsid, 0, 1);
	if (mp == (mount_t)0) {
		return ENOENT;
	}
	mount_ref(mp, 0);
	mount_iterdrop(mp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, flags, ctx);
}
2688 
2689 /*
2690  * The mount struct comes with a mount ref which will be consumed.
2691  * Do the actual file system unmount, prevent some common foot shooting.
2692  */
2693 int
safedounmount(struct mount * mp,int flags,vfs_context_t ctx)2694 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
2695 {
2696 	int error;
2697 	proc_t p = vfs_context_proc(ctx);
2698 
2699 	/*
2700 	 * If the file system is not responding and MNT_NOBLOCK
2701 	 * is set and not a forced unmount then return EBUSY.
2702 	 */
2703 	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
2704 	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
2705 		error = EBUSY;
2706 		goto out;
2707 	}
2708 
2709 	/*
2710 	 * Skip authorization in two cases:
2711 	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
2712 	 *   This entitlement allows non-root processes unmount volumes mounted by
2713 	 *   other processes.
2714 	 * - If the mount is tagged as permissive and this is not a forced-unmount
2715 	 *   attempt.
2716 	 */
2717 	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
2718 	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
2719 		/*
2720 		 * Only root, or the user that did the original mount is
2721 		 * permitted to unmount this filesystem.
2722 		 */
2723 		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
2724 		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
2725 			goto out;
2726 		}
2727 	}
2728 	/*
2729 	 * Don't allow unmounting the root file system, or other volumes
2730 	 * associated with it (for example, the associated VM or DATA mounts) .
2731 	 */
2732 	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
2733 		if (!(mp->mnt_flag & MNT_ROOTFS)) {
2734 			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
2735 			    mp->mnt_vfsstat.f_mntonname);
2736 		}
2737 		error = EBUSY; /* the root (or associated volumes) is always busy */
2738 		goto out;
2739 	}
2740 
2741 	/*
2742 	 * If the mount is providing the root filesystem's disk image
2743 	 * (i.e. imageboot), don't allow unmounting
2744 	 */
2745 	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
2746 		error = EBUSY;
2747 		goto out;
2748 	}
2749 
2750 	return dounmount(mp, flags, 1, ctx);
2751 
2752 out:
2753 	mount_drop(mp, 0);
2754 	return error;
2755 }
2756 
2757 /*
2758  * Do the actual file system unmount.
2759  */
2760 int
dounmount(struct mount * mp,int flags,int withref,vfs_context_t ctx)2761 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
2762 {
2763 	vnode_t coveredvp = (vnode_t)0;
2764 	int error;
2765 	int needwakeup = 0;
2766 	int forcedunmount = 0;
2767 	int lflags = 0;
2768 	struct vnode *devvp = NULLVP;
2769 #if CONFIG_TRIGGERS
2770 	proc_t p = vfs_context_proc(ctx);
2771 	int did_vflush = 0;
2772 	int pflags_save = 0;
2773 #endif /* CONFIG_TRIGGERS */
2774 
2775 #if CONFIG_FSE
2776 	if (!(flags & MNT_FORCE)) {
2777 		fsevent_unmount(mp, ctx);  /* has to come first! */
2778 	}
2779 #endif
2780 
2781 	mount_lock(mp);
2782 
2783 	/*
2784 	 * If already an unmount in progress just return EBUSY.
2785 	 * Even a forced unmount cannot override.
2786 	 */
2787 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2788 		if (withref != 0) {
2789 			mount_drop(mp, 1);
2790 		}
2791 		mount_unlock(mp);
2792 		return EBUSY;
2793 	}
2794 
2795 	if (flags & MNT_FORCE) {
2796 		forcedunmount = 1;
2797 		mp->mnt_lflag |= MNT_LFORCE;
2798 	}
2799 
2800 #if CONFIG_TRIGGERS
2801 	if (flags & MNT_NOBLOCK && p != kernproc) {
2802 		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
2803 	}
2804 #endif
2805 
2806 	mp->mnt_kern_flag |= MNTK_UNMOUNT;
2807 	mp->mnt_lflag |= MNT_LUNMOUNT;
2808 	mp->mnt_flag &= ~MNT_ASYNC;
2809 	/*
2810 	 * anyone currently in the fast path that
2811 	 * trips over the cached rootvp will be
2812 	 * dumped out and forced into the slow path
2813 	 * to regenerate a new cached value
2814 	 */
2815 	mp->mnt_realrootvp = NULLVP;
2816 	mount_unlock(mp);
2817 
2818 	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2819 		/*
2820 		 * Force unmount any mounts in this filesystem.
2821 		 * If any unmounts fail - just leave them dangling.
2822 		 * Avoids recursion.
2823 		 */
2824 		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2825 	}
2826 
2827 	/*
2828 	 * taking the name_cache_lock exclusively will
2829 	 * insure that everyone is out of the fast path who
2830 	 * might be trying to use a now stale copy of
2831 	 * vp->v_mountedhere->mnt_realrootvp
2832 	 * bumping mount_generation causes the cached values
2833 	 * to be invalidated
2834 	 */
2835 	name_cache_lock();
2836 	mount_generation++;
2837 	name_cache_unlock();
2838 
2839 
2840 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2841 	if (withref != 0) {
2842 		mount_drop(mp, 0);
2843 	}
2844 	error = 0;
2845 	if (forcedunmount == 0) {
2846 		ubc_umount(mp); /* release cached vnodes */
2847 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2848 			error = VFS_SYNC(mp, MNT_WAIT, ctx);
2849 			if (error) {
2850 				mount_lock(mp);
2851 				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2852 				mp->mnt_lflag &= ~MNT_LUNMOUNT;
2853 				mp->mnt_lflag &= ~MNT_LFORCE;
2854 				goto out;
2855 			}
2856 		}
2857 	}
2858 
2859 	IOBSDMountChange(mp, kIOMountChangeUnmount);
2860 
2861 #if CONFIG_TRIGGERS
2862 	vfs_nested_trigger_unmounts(mp, flags, ctx);
2863 	did_vflush = 1;
2864 #endif
2865 	if (forcedunmount) {
2866 		lflags |= FORCECLOSE;
2867 	}
2868 	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
2869 	if ((forcedunmount == 0) && error) {
2870 		mount_lock(mp);
2871 		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2872 		mp->mnt_lflag &= ~MNT_LUNMOUNT;
2873 		mp->mnt_lflag &= ~MNT_LFORCE;
2874 		goto out;
2875 	}
2876 
2877 	/* make sure there are no one in the mount iterations or lookup */
2878 	mount_iterdrain(mp);
2879 
2880 	error = VFS_UNMOUNT(mp, flags, ctx);
2881 	if (error) {
2882 		mount_iterreset(mp);
2883 		mount_lock(mp);
2884 		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2885 		mp->mnt_lflag &= ~MNT_LUNMOUNT;
2886 		mp->mnt_lflag &= ~MNT_LFORCE;
2887 		goto out;
2888 	}
2889 
2890 	/* increment the operations count */
2891 	if (!error) {
2892 		OSAddAtomic(1, &vfs_nummntops);
2893 	}
2894 
2895 	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2896 		/* hold an io reference and drop the usecount before close */
2897 		devvp = mp->mnt_devvp;
2898 		vnode_getalways(devvp);
2899 		vnode_rele(devvp);
2900 		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
2901 		    ctx);
2902 		vnode_clearmountedon(devvp);
2903 		vnode_put(devvp);
2904 	}
2905 	lck_rw_done(&mp->mnt_rwlock);
2906 	mount_list_remove(mp);
2907 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2908 
2909 	/* mark the mount point hook in the vp but not drop the ref yet */
2910 	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2911 		/*
2912 		 * The covered vnode needs special handling. Trying to get an
2913 		 * iocount must not block here as this may lead to deadlocks
2914 		 * if the Filesystem to which the covered vnode belongs is
2915 		 * undergoing forced unmounts. Since we hold a usecount, the
2916 		 * vnode cannot be reused (it can, however, still be terminated)
2917 		 */
2918 		vnode_getalways(coveredvp);
2919 		vnode_lock_spin(coveredvp);
2920 
2921 		mp->mnt_crossref++;
2922 		coveredvp->v_mountedhere = (struct mount *)0;
2923 		CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
2924 		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
2925 		wakeup(&coveredvp->v_flag);
2926 		vnode_unlock(coveredvp);
2927 		vnode_put(coveredvp);
2928 	}
2929 
2930 	mount_list_lock();
2931 	mp->mnt_vtable->vfc_refcount--;
2932 	mount_list_unlock();
2933 
2934 	cache_purgevfs(mp);     /* remove cache entries for this file sys */
2935 	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
2936 	mount_lock(mp);
2937 	mp->mnt_lflag |= MNT_LDEAD;
2938 
2939 	if (mp->mnt_lflag & MNT_LWAIT) {
2940 		/*
2941 		 * do the wakeup here
2942 		 * in case we block in mount_refdrain
2943 		 * which will drop the mount lock
2944 		 * and allow anyone blocked in vfs_busy
2945 		 * to wakeup and see the LDEAD state
2946 		 */
2947 		mp->mnt_lflag &= ~MNT_LWAIT;
2948 		wakeup((caddr_t)mp);
2949 	}
2950 	mount_refdrain(mp);
2951 
2952 	/* free disk_conditioner_info structure for this mount */
2953 	disk_conditioner_unmount(mp);
2954 
2955 out:
2956 	if (mp->mnt_lflag & MNT_LWAIT) {
2957 		mp->mnt_lflag &= ~MNT_LWAIT;
2958 		needwakeup = 1;
2959 	}
2960 
2961 #if CONFIG_TRIGGERS
2962 	if (flags & MNT_NOBLOCK && p != kernproc) {
2963 		// Restore P_NOREMOTEHANG bit to its previous value
2964 		if ((pflags_save & P_NOREMOTEHANG) == 0) {
2965 			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2966 		}
2967 	}
2968 
2969 	/*
2970 	 * Callback and context are set together under the mount lock, and
2971 	 * never cleared, so we're safe to examine them here, drop the lock,
2972 	 * and call out.
2973 	 */
2974 	if (mp->mnt_triggercallback != NULL) {
2975 		mount_unlock(mp);
2976 		if (error == 0) {
2977 			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2978 		} else if (did_vflush) {
2979 			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2980 		}
2981 	} else {
2982 		mount_unlock(mp);
2983 	}
2984 #else
2985 	mount_unlock(mp);
2986 #endif /* CONFIG_TRIGGERS */
2987 
2988 	lck_rw_done(&mp->mnt_rwlock);
2989 
2990 	if (needwakeup) {
2991 		wakeup((caddr_t)mp);
2992 	}
2993 
2994 	if (!error) {
2995 		if ((coveredvp != NULLVP)) {
2996 			vnode_t pvp = NULLVP;
2997 
2998 			/*
2999 			 * The covered vnode needs special handling. Trying to
3000 			 * get an iocount must not block here as this may lead
3001 			 * to deadlocks if the Filesystem to which the covered
3002 			 * vnode belongs is undergoing forced unmounts. Since we
3003 			 * hold a usecount, the  vnode cannot be reused
3004 			 * (it can, however, still be terminated).
3005 			 */
3006 			vnode_getalways(coveredvp);
3007 
3008 			mount_dropcrossref(mp, coveredvp, 0);
3009 			/*
3010 			 * We'll _try_ to detect if this really needs to be
3011 			 * done. The coveredvp can only be in termination (or
3012 			 * terminated) if the coveredvp's mount point is in a
3013 			 * forced unmount (or has been) since we still hold the
3014 			 * ref.
3015 			 */
3016 			if (!vnode_isrecycled(coveredvp)) {
3017 				pvp = vnode_getparent(coveredvp);
3018 #if CONFIG_TRIGGERS
3019 				if (coveredvp->v_resolve) {
3020 					vnode_trigger_rearm(coveredvp, ctx);
3021 				}
3022 #endif
3023 			}
3024 
3025 			vnode_rele(coveredvp);
3026 			vnode_put(coveredvp);
3027 			coveredvp = NULLVP;
3028 
3029 			if (pvp) {
3030 				lock_vnode_and_post(pvp, NOTE_WRITE);
3031 				vnode_put(pvp);
3032 			}
3033 		} else if (mp->mnt_flag & MNT_ROOTFS) {
3034 			if (nc_smr_enabled) {
3035 				vfs_smr_synchronize();
3036 			}
3037 
3038 			mount_lock_destroy(mp);
3039 #if CONFIG_MACF
3040 			mac_mount_label_destroy(mp);
3041 #endif
3042 			zfree(mount_zone, mp);
3043 		} else {
3044 			panic("dounmount: no coveredvp");
3045 		}
3046 	}
3047 	return error;
3048 }
3049 
3050 /*
3051  * Unmount any mounts in this filesystem.
3052  */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;	/* m indexes the last fsid recorded in fsids[] */
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT: cannot block for memory while holding the mount list lock. */
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		/* fsids[0..m] are mp and already-selected submounts; match means smp is a (transitive) submount. */
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			/* Trade the iteration reference for a real mount ref before unmounting. */
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	/* NOTE(review): on the allocation-failure path fsids is NULL here — assumes kfree_data() tolerates NULL. */
	kfree_data(fsids, fsids_sz);
}
3110 
/*
 * Drop one reference from mp's mnt_crossref count.  If the count reaches
 * zero and mp is no longer the mount covering dp (i.e. they have already
 * been disassociated), this was the last thing keeping the mount structure
 * alive, so it is destroyed and freed here.  With need_put set, an iocount
 * on dp is released as well (while dp's lock is still held).
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_hold(dp);		/* keep dp stable across the lock/unlock below */
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/* Last crossref on an already-detached mount: tear the mount down. */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		/* Wait out SMR readers of the name cache before freeing. */
		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3144 
3145 
3146 /*
3147  * Sync each mounted filesystem.
3148  */
#if DIAGNOSTIC
int syncprt = 0;	/* nonzero: dump buffer statistics (vfs_bufstats) after a sync */
#endif

int print_vmpage_stat = 0;	/* nonzero: print dirty-page counts (vm_countdirtypages) after a sync */
3154 
3155 /*
3156  * sync_callback:	simple wrapper that calls VFS_SYNC() on volumes
3157  *			mounted read-write with the passed waitfor value.
3158  *
3159  * Parameters:	mp	mount-point descriptor per mounted file-system instance.
3160  *		arg	user argument (please see below)
3161  *
3162  * User argument is a pointer to 32 bit unsigned integer which describes the
3163  * type of waitfor value to set for calling VFS_SYNC().  If user argument is
3164  * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3165  * waitfor value.
3166  *
3167  * Returns:		VFS_RETURNED
3168  */
3169 static int
sync_callback(mount_t mp,void * arg)3170 sync_callback(mount_t mp, void *arg)
3171 {
3172 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3173 		int asyncflag = mp->mnt_flag & MNT_ASYNC;
3174 		unsigned waitfor = MNT_NOWAIT;
3175 
3176 		if (arg) {
3177 			waitfor = *(uint32_t*)arg;
3178 		}
3179 
3180 		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
3181 		if (waitfor != MNT_WAIT &&
3182 		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
3183 		    waitfor != MNT_NOWAIT &&
3184 		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3185 		    waitfor != MNT_DWAIT &&
3186 		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3187 			panic("Passed inappropriate waitfor %u to "
3188 			    "sync_callback()", waitfor);
3189 		}
3190 
3191 		mp->mnt_flag &= ~MNT_ASYNC;
3192 		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3193 		if (asyncflag) {
3194 			mp->mnt_flag |= MNT_ASYNC;
3195 		}
3196 	}
3197 
3198 	return VFS_RETURNED;
3199 }
3200 
3201 /* ARGSUSED */
/*
 * sync() system call: kick off a flush of every read-write mounted
 * filesystem.  Passes a NULL arg, so sync_callback uses its MNT_NOWAIT
 * default and the call does not wait for the writes to complete.
 */
int
sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
{
	vfs_iterate(LK_NOWAIT, sync_callback, NULL);

	if (print_vmpage_stat) {
		/* Debug aid: report remaining dirty pages. */
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
	return 0;
}
3218 
/*
 * Media selector for sync_internal_callback(): SYNC_ALL syncs every
 * volume; the other two restrict a pass to reliable media (local and
 * not backed by a virtual device) or to everything else, respectively.
 */
typedef enum {
	SYNC_ALL = 0,
	SYNC_ONLY_RELIABLE_MEDIA = 1,
	SYNC_ONLY_UNRELIABLE_MEDIA = 2
} sync_type_t;
3224 
3225 static int
sync_internal_callback(mount_t mp,void * arg)3226 sync_internal_callback(mount_t mp, void *arg)
3227 {
3228 	if (arg) {
3229 		int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3230 		    (mp->mnt_flag & MNT_LOCAL);
3231 		sync_type_t sync_type = *((sync_type_t *)arg);
3232 
3233 		if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3234 			return VFS_RETURNED;
3235 		} else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3236 			return VFS_RETURNED;
3237 		}
3238 	}
3239 
3240 	(void)sync_callback(mp, NULL);
3241 
3242 	return VFS_RETURNED;
3243 }
3244 
/* State bits for the background sync thread; protected by sync_mtx_lck. */
int sync_thread_state = 0;
/* Upper bound on how long sync_internal() waits for the sync thread. */
int sync_timeout_seconds = 5;

#define SYNC_THREAD_RUN       0x0001    /* a(nother) sync pass has been requested */
#define SYNC_THREAD_RUNNING   0x0002    /* a sync thread is currently active */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;        /* identity of the active sync thread, or NULL */
#endif /* CONFIG_PHYS_WRITE_ACCT */
3254 
/*
 * Body of the kernel thread started by sync_internal().  Keeps making
 * sync passes while SYNC_THREAD_RUN is set — each pass syncs reliable
 * (local, non-virtual) media first, then unreliable media — and clears
 * SYNC_THREAD_RUNNING before exiting.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the run request; drop the lock for the long iteration. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3298 
/* When sync_internal() last logged a timeout; rate-limits that message. */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3300 
3301 /*
3302  * An in-kernel sync for power management to call.
3303  * This function always returns within sync_timeout seconds.
3304  */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	/* Bound the wait so power management never blocks indefinitely. */
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* Request a(nother) sync pass; spawn the worker thread if none is active. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/*
	 * Wait for the sync thread's wakeup on sync_thread_state, at most
	 * sync_timeout_seconds.  PDROP releases sync_mtx_lck for us;
	 * PCATCH lets a signal interrupt the wait.
	 */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* Timed out (or interrupted); log at most once every 120s. */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		/* Drop the thread reference returned by kernel_thread_start(). */
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
3347 
3348 /*
3349  * Change filesystem quotas.
3350  */
3351 #if QUOTA
/*
 * quotactl() system call: manipulate quotas on the volume containing
 * uap->path.  The operation is the high bits of uap->cmd (SUBCMDSHIFT);
 * per-command argument data is copied in from uap->arg before, and
 * copied back out after, the VFS_QUOTACTL() call.
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* Only the mount is needed: take a ref on it, then release the vnode. */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit callers pass a user_dqblk; munge to the kernel layout. */
			struct user_dqblk       my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	/* Only issue the filesystem call if the copyin above succeeded. */
	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Copy results back out and release the Q_QUOTAON pathname buffer. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk       my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3458 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support compiled out of this kernel. */
	return EOPNOTSUPP;
}
3464 #endif /* QUOTA */
3465 
3466 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3467 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3468 {
3469 	int error;
3470 	vfs_context_t ctx = vfs_context_current();
3471 
3472 #if CONFIG_MACF
3473 	error = mac_mount_check_stat(ctx, mp);
3474 	if (error != 0) {
3475 		return error;
3476 	}
3477 #endif
3478 
3479 	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3480 	if (error != 0) {
3481 		return error;
3482 	}
3483 
3484 	return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3485 }
3486 
3487 /*
3488  * Get filesystem statistics.
3489  *
3490  * Returns:	0			Success
3491  *	namei:???
3492  *	vfs_update_vfsstat:???
3493  *	munge_statfs:EFAULT
3494  */
3495 /* ARGSUSED */
3496 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3497 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3498 {
3499 	int error;
3500 	struct mount *mp;
3501 	struct nameidata nd;
3502 	vfs_context_t ctx = vfs_context_current();
3503 	vnode_t vp;
3504 
3505 	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3506 	    UIO_USERSPACE, uap->path, ctx);
3507 	error = namei(&nd);
3508 	if (error != 0) {
3509 		return error;
3510 	}
3511 	vp = nd.ni_vp;
3512 	mp = vp->v_mount;
3513 	nameidone(&nd);
3514 
3515 	error = statfs_internal(p, mp, uap->buf);
3516 	vnode_put(vp);
3517 
3518 	return error;
3519 }
3520 
3521 /*
3522  * Get filesystem statistics.
3523  */
3524 /* ARGSUSED */
int
fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	struct mount *mp;

	AUDIT_ARG(fd, uap->fd);

	/* vp stays NULL if file_vnode() fails, so "out" then skips file_drop(). */
	if ((error = file_vnode(uap->fd, &vp)) ||
	    (error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		/* No mount associated with the vnode — treat as a bad descriptor. */
		error = EBADF;
		goto out_vnode;
	}

	error = statfs_internal(p, mp, uap->buf);

out_vnode:
	/* Only reached once vnode_getwithref() succeeded. */
	vnode_put(vp);

out:
	if (vp != NULL) {
		file_drop(uap->fd);
	}

	return error;
}
3559 
3560 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3561 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3562 {
3563 	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3564 
3565 	bzero(sfs, sizeof(*sfs));
3566 
3567 	sfs->f_bsize = vsfs->f_bsize;
3568 	sfs->f_iosize = (int32_t)vsfs->f_iosize;
3569 	sfs->f_blocks = vsfs->f_blocks;
3570 	sfs->f_bfree = vsfs->f_bfree;
3571 	sfs->f_bavail = vsfs->f_bavail;
3572 	sfs->f_files = vsfs->f_files;
3573 	sfs->f_ffree = vsfs->f_ffree;
3574 	sfs->f_fsid = vsfs->f_fsid;
3575 	sfs->f_owner = vsfs->f_owner;
3576 	sfs->f_type = mp->mnt_vtable->vfc_typenum;
3577 	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3578 	sfs->f_fssubtype = vsfs->f_fssubtype;
3579 	sfs->f_flags_ext = 0;
3580 	if (mp->mnt_kern_flag & MNTK_SYSTEMDATA) {
3581 		sfs->f_flags_ext |= MNT_EXT_ROOT_DATA_VOL;
3582 	}
3583 	if (mp->mnt_kern_flag & MNTK_FSKIT) {
3584 		sfs->f_flags_ext |= MNT_EXT_FSKIT;
3585 	}
3586 	vfs_getfstypename(mp, sfs->f_fstypename, MFSTYPENAMELEN);
3587 	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3588 	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3589 }
3590 
3591 /*
3592  * Get file system statistics in 64-bit mode
3593  */
int
statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error;
	struct nameidata *ndp;
	struct statfs64 *sfsp;
	vfs_context_t ctxp = vfs_context_current();
	vnode_t vp;
	/*
	 * nameidata and statfs64 are both large; they are heap-allocated
	 * together (NOTE(review): presumably to limit kernel stack usage —
	 * confirm).
	 */
	struct {
		struct nameidata nd;
		struct statfs64 sfs;
	} *__nameidata_statfs64;

	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
	    Z_WAITOK);
	ndp = &__nameidata_statfs64->nd;

	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctxp);
	error = namei(ndp);
	if (error != 0) {
		goto out;
	}
	vp = ndp->ni_vp;
	mp = vp->v_mount;
	nameidone(ndp);

#if CONFIG_MACF
	error = mac_mount_check_stat(ctxp, mp);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}
#endif

	/* Refresh the cached vfsstat from the filesystem before reporting it. */
	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}

	sfsp = &__nameidata_statfs64->sfs;
	vfs_get_statfs64(mp, sfsp);
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a separate data volume mountpoint */
		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
	vnode_put(vp);

out:
	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);

	return error;
}
3651 
3652 /*
3653  * Get file system statistics in 64-bit mode
3654  */
int
fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
{
	struct vnode *vp;
	struct mount *mp;
	struct statfs64 sfs;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		/* No mount associated with the vnode — treat as a bad descriptor. */
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0) {
		goto out;
	}
#endif

	/* Refresh the cached vfsstat from the filesystem before reporting it. */
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	vfs_get_statfs64(mp, &sfs);
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a separate data volume mountpoint */
		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(&sfs, uap->buf, sizeof(sfs));

out:
	file_drop(uap->fd);
	vnode_put(vp);

	return error;
}
3708 
/*
 * Shared accumulator passed to the getfsstat*() iteration callbacks.
 */
struct getfsstat_struct {
	user_addr_t     sfsp;           /* user buffer cursor, advanced per entry */
	user_addr_t     *mp;            /* per-mount MAC label pointers, or NULL */
	int             count;          /* mounts accounted for so far */
	int             maxcount;       /* user buffer capacity, in entries */
	int             flags;          /* caller's MNT_WAIT/MNT_NOWAIT/MNT_DWAIT flags */
	int             error;          /* first error seen by the callback, if any */
};
3717 
3718 
/*
 * vfs_iterate() callback for getfsstat()/__mac_getfsstat(): copy one
 * mount's statfs data (and optionally its MAC label) to the user buffer
 * described by arg, counting every mount visited.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	/* Copy out only while the user buffer has room. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/*
			 * Dead mount or stat refresh failure: skip (and do not
			 * count) this mount, but keep iterating.
			 * NOTE(review): when MNT_LDEAD short-circuits, "error"
			 * is uninitialized here; KAUTH_DEBUG is normally
			 * compiled out — confirm.
			 */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* munge_statfs() reported the bytes written; advance the cursor. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	/* Counted even when the buffer is full, so the caller sees the total. */
	fstp->count++;
	return VFS_RETURNED;
}
3772 
3773 /*
3774  * Get statistics on all filesystems.
3775  */
3776 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3777 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3778 {
3779 	struct __mac_getfsstat_args muap;
3780 
3781 	muap.buf = uap->buf;
3782 	muap.bufsize = uap->bufsize;
3783 	muap.mac = USER_ADDR_NULL;
3784 	muap.macsize = 0;
3785 	muap.flags = uap->flags;
3786 
3787 	return __mac_getfsstat(p, &muap, retval);
3788 }
3789 
3790 /*
3791  * __mac_getfsstat: Get MAC-related file system statistics
3792  *
3793  * Parameters:    p                        (ignored)
3794  *                uap                      User argument descriptor (see below)
3795  *                retval                   Count of file system statistics (N stats)
3796  *
3797  * Indirect:      uap->bufsize             Buffer size
3798  *                uap->macsize             MAC info size
3799  *                uap->buf                 Buffer where information will be returned
3800  *                uap->mac                 MAC info
3801  *                uap->flags               File system flags
3802  *
3803  *
3804  * Returns:        0                       Success
3805  *                !0                       Not success
3806  *
3807  */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject sizes that would overflow the int fields used below. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Per-entry size depends on the caller's word size. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* One MAC label pointer is required per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		/* Widen 32-bit caller pointers; 64-bit ones are copied as-is. */
		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* Buffer too small: report its capacity; otherwise the mount count. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3901 
3902 static int
getfsstat64_callback(mount_t mp,void * arg)3903 getfsstat64_callback(mount_t mp, void * arg)
3904 {
3905 	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3906 	struct vfsstatfs *sp;
3907 	struct statfs64 sfs;
3908 	int error;
3909 
3910 	if (fstp->sfsp && fstp->count < fstp->maxcount) {
3911 #if CONFIG_MACF
3912 		error = mac_mount_check_stat(vfs_context_current(), mp);
3913 		if (error != 0) {
3914 			fstp->error = error;
3915 			return VFS_RETURNED_DONE;
3916 		}
3917 #endif
3918 		sp = &mp->mnt_vfsstat;
3919 		/*
3920 		 * If MNT_NOWAIT is specified, do not refresh the fsstat
3921 		 * cache. MNT_WAIT overrides MNT_NOWAIT.
3922 		 *
3923 		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
3924 		 * getfsstat, since the constants are out of the same
3925 		 * namespace.
3926 		 */
3927 		if ((mp->mnt_lflag & MNT_LDEAD) ||
3928 		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3929 		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3930 		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
3931 			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3932 			return VFS_RETURNED;
3933 		}
3934 
3935 		vfs_get_statfs64(mp, &sfs);
3936 		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
3937 		if (error) {
3938 			fstp->error = error;
3939 			return VFS_RETURNED_DONE;
3940 		}
3941 		fstp->sfsp += sizeof(sfs);
3942 	}
3943 	fstp->count++;
3944 	return VFS_RETURNED;
3945 }
3946 
3947 /*
3948  * Get statistics on all file systems in 64 bit mode.
3949  */
3950 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3951 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3952 {
3953 	user_addr_t sfsp;
3954 	int count, maxcount;
3955 	struct getfsstat_struct fst;
3956 
3957 	maxcount = uap->bufsize / sizeof(struct statfs64);
3958 
3959 	sfsp = uap->buf;
3960 	count = 0;
3961 
3962 	fst.sfsp = sfsp;
3963 	fst.flags = uap->flags;
3964 	fst.count = 0;
3965 	fst.error = 0;
3966 	fst.maxcount = maxcount;
3967 
3968 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3969 
3970 	if (fst.error) {
3971 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3972 		return fst.error;
3973 	}
3974 
3975 	if (fst.sfsp && fst.count > fst.maxcount) {
3976 		*retval = fst.maxcount;
3977 	} else {
3978 		*retval = fst.count;
3979 	}
3980 
3981 	return 0;
3982 }
3983 
3984 /*
3985  * gets the associated vnode with the file descriptor passed.
3986  * as input
3987  *
3988  * INPUT
3989  * ctx - vfs context of caller
3990  * fd - file descriptor for which vnode is required.
3991  * vpp - Pointer to pointer to vnode to be returned.
3992  *
3993  * The vnode is returned with an iocount so any vnode obtained
3994  * by this call needs a vnode_put
3995  *
3996  */
3997 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3998 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3999 {
4000 	int error;
4001 	vnode_t vp;
4002 	struct fileproc *fp;
4003 	proc_t p = vfs_context_proc(ctx);
4004 
4005 	*vpp =  NULLVP;
4006 
4007 	error = fp_getfvp(p, fd, &fp, &vp);
4008 	if (error) {
4009 		return error;
4010 	}
4011 
4012 	error = vnode_getwithref(vp);
4013 	if (error) {
4014 		(void)fp_drop(p, fd, fp, 0);
4015 		return error;
4016 	}
4017 
4018 	(void)fp_drop(p, fd, fp, 0);
4019 	*vpp = vp;
4020 	return error;
4021 }
4022 
4023 /*
4024  * Wrapper function around namei to start lookup from a directory
4025  * specified by a file descriptor ni_dirfd.
4026  *
4027  * In addition to all the errors returned by namei, this call can
4028  * return ENOTDIR if the file descriptor does not refer to a directory.
4029  * and EBADF if the file descriptor is not valid.
4030  */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * dirfd only matters for a fresh lookup of a relative path; a
	 * continued lookup or a caller-supplied starting vnode (USEDVP)
	 * takes precedence.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Start the lookup at dirfd's vnode via USEDVP, then restore the flag. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, continued lookup, or AT_FDCWD: plain namei(). */
	return namei(ndp);
}
4074 
4075 /*
4076  * Change current working directory to a given file descriptor.
4077  */
4078 /* ARGSUSED */
/*
 * Change the current working directory to the directory referenced by
 * file descriptor fd.  With per_thread set, only the calling thread's
 * uthread cwd is changed (and fd == -1 reverts the thread back to the
 * per-process cwd).
 */
int
fchdir(proc_t p, vfs_context_t ctx, int fd, bool per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;

	AUDIT_ARG(fd, fd);
	if (per_thread && fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding.  The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/* If a filesystem is mounted on vp, descend to its root directory. */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the transient iocount for a long-lived usecount before installing vp. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			/* Mark the process as having (at least one) per-thread cwd. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Swap the per-process cwd under the proc dirs/fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous working directory, if there was one. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(fd);

	return error;
}
4190 
4191 int
sys_fchdir(proc_t p,struct fchdir_args * uap,__unused int32_t * retval)4192 sys_fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
4193 {
4194 	return fchdir(p, vfs_context_current(), uap->fd, false);
4195 }
4196 
4197 int
__pthread_fchdir(proc_t p,struct __pthread_fchdir_args * uap,__unused int32_t * retval)4198 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
4199 {
4200 	return fchdir(p, vfs_context_current(), uap->fd, true);
4201 }
4202 
4203 
4204 /*
4205  * Change current working directory (".").
4206  *
4207  * Returns:	0			Success
4208  *	change_dir:ENOTDIR
4209  *	change_dir:???
4210  *	vnode_ref:ENOENT		No such file or directory
4211  */
4212 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* Resolve and authorize the target directory (returns with an iocount). */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Convert to a long-lived usecount for storage in proc/uthread. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		/* Install as the per-thread CWD in the uthread. */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Swap the process-wide CWD under the dirs + fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Drop the reference on the previous working directory, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4258 
4259 
4260 /*
4261  * Change current working directory (".").
4262  *
4263  * Returns:	0			Success
4264  *	chdir_internal:ENOTDIR
4265  *	chdir_internal:ENOENT		No such file or directory
4266  *	chdir_internal:???
4267  */
4268 /* ARGSUSED */
4269 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4270 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4271 {
4272 	struct nameidata nd;
4273 	vfs_context_t ctx = vfs_context_current();
4274 
4275 	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4276 	    UIO_USERSPACE, uap->path, ctx);
4277 
4278 	return chdir_internal(p, ctx, &nd, per_thread);
4279 }
4280 
4281 
4282 /*
4283  * chdir
4284  *
4285  * Change current working directory (".") for the entire process
4286  *
4287  * Parameters:  p       Process requesting the call
4288  *              uap     User argument descriptor (see below)
4289  *              retval  (ignored)
4290  *
4291  * Indirect parameters:	uap->path	Directory path
4292  *
4293  * Returns:	0			Success
4294  *              common_chdir: ENOTDIR
4295  *              common_chdir: ENOENT	No such file or directory
4296  *              common_chdir: ???
4297  *
4298  */
4299 int
sys_chdir(proc_t p,struct chdir_args * uap,__unused int32_t * retval)4300 sys_chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
4301 {
4302 	return common_chdir(p, (void *)uap, 0);
4303 }
4304 
4305 /*
4306  * __pthread_chdir
4307  *
4308  * Change current working directory (".") for a single thread
4309  *
4310  * Parameters:  p       Process requesting the call
4311  *              uap     User argument descriptor (see below)
4312  *              retval  (ignored)
4313  *
4314  * Indirect parameters:	uap->path	Directory path
4315  *
4316  * Returns:	0			Success
4317  *              common_chdir: ENOTDIR
4318  *		common_chdir: ENOENT	No such file or directory
4319  *		common_chdir: ???
4320  *
4321  */
4322 int
__pthread_chdir(proc_t p,struct __pthread_chdir_args * uap,__unused int32_t * retval)4323 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
4324 {
4325 	return common_chdir(p, (void *)uap, 1);
4326 }
4327 
4328 
4329 /*
4330  * Change notion of root (``/'') directory.
4331  */
4332 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Changing the root directory requires superuser privileges. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* Resolve and authorize the new root (returns with an iocount). */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Trade the transient iocount for a long-lived usecount. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Drop the reference on the previous root directory, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4390 
4391 #define PATHSTATICBUFLEN 256
4392 #define PIVOT_ROOT_ENTITLEMENT              \
4393        "com.apple.private.vfs.pivot-root"
4394 
4395 #if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/*
	 * Copy in the new-root path; fall back to a MAXPATHLEN heap buffer
	 * if it does not fit in the static one.
	 */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Likewise for the path where the old root will end up. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Select whichever buffer (static or heap) holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	/* Perform the actual root switch. */
	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4487 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root() is only implemented on macOS targets; reject elsewhere. */
	return nosys(p, NULL, retval);
}
4493 #endif /* XNU_TARGET_OS_OSX */
4494 
4495 /*
4496  * Common routine for chroot and chdir.
4497  *
4498  * Returns:	0			Success
4499  *		ENOTDIR			Not a directory
4500  *		namei:???		[anything namei can return]
4501  *		vnode_authorize:???	[anything vnode_authorize can return]
4502  */
4503 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4504 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4505 {
4506 	vnode_t vp;
4507 	int error;
4508 
4509 	if ((error = namei(ndp))) {
4510 		return error;
4511 	}
4512 	nameidone(ndp);
4513 	vp = ndp->ni_vp;
4514 
4515 	if (vp->v_type != VDIR) {
4516 		vnode_put(vp);
4517 		return ENOTDIR;
4518 	}
4519 
4520 #if CONFIG_MACF
4521 	error = mac_vnode_check_chdir(ctx, vp);
4522 	if (error) {
4523 		vnode_put(vp);
4524 		return error;
4525 	}
4526 #endif
4527 
4528 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4529 	if (error) {
4530 		vnode_put(vp);
4531 		return error;
4532 	}
4533 
4534 	return error;
4535 }
4536 
/*
 * Allocate the per-file-descriptor vnode data (used for directories).
 */
4540 struct fd_vn_data *
fg_vn_data_alloc(void)4541 fg_vn_data_alloc(void)
4542 {
4543 	struct fd_vn_data *fvdata;
4544 
4545 	/* Allocate per fd vnode data */
4546 	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4547 	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4548 	return fvdata;
4549 }
4550 
4551 /*
4552  * Free the vnode data (for directories) associated with the file glob.
4553  */
4554 void
fg_vn_data_free(void * fgvndata)4555 fg_vn_data_free(void *fgvndata)
4556 {
4557 	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4558 
4559 	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4560 	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4561 	kfree_type(struct fd_vn_data, fvdata);
4562 }
4563 
4564 /*
4565  * Check permissions, allocate an open file structure,
4566  * and call the device open routine if any.
4567  *
4568  * Returns:	0			Success
4569  *		EINVAL
4570  *		EINTR
4571  *	falloc:ENFILE
4572  *	falloc:EMFILE
4573  *	falloc:ENOMEM
4574  *	vn_open_auth:???
4575  *	dupfdopen:???
4576  *	VNOP_ADVLOCK:???
4577  *	vnode_setsize:???
4578  *
4579  * XXX Need to implement uid, gid
4580  */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	kauth_cred_t p_cred = current_cached_proc_cred(PROC_NULL);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	/* Convert open(2) flags to kernel f-flags; these two are kernel-set only. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a file descriptor slot and fileproc up front. */
	if ((error = falloc_withinit(p, p_cred, ctx, &fp, &indx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Optional authentication vnode for dataprotection-authenticated opens. */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/*
		 * ENODEV/ENXIO with uu_dupfd set means fdesc_open() ran:
		 * complete the open by duplicating the existing descriptor.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the opened vnode into the fileglob. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* Apply flock(2)-style locking requested via O_EXLOCK/O_SHLOCK. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	/* Decide whether this file's pages may use the secluded pool. */
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer  eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "audiomxd", len) ||
			    !strncmp(v_name, "mediaplaybackd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/*
	 * Drop the iocount; the fileglob keeps the vnode alive via the
	 * usecount taken by vn_open_auth.
	 */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor to the process. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Error path: undo any advisory lock, close the vnode, free the fd. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4883 
4884 /*
4885  * While most of the *at syscall handlers can call nameiat() which
4886  * is a wrapper around namei, the use of namei and initialisation
4887  * of nameidata are far removed and in different functions  - namei
4888  * gets called in vn_open_auth for open1. So we'll just do here what
4889  * nameiat() does.
4890  */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
    int dirfd, int authfd)
{
	/* Mirror nameiat(): honor dirfd for relative paths only. */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first byte of the path to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Takes an iocount on the vnode backing dirfd. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			/* dirfd must reference a directory. */
			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Hand the start directory to vn_open_auth via USEDVP. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
			    retval, authfd);
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, AT_FDCWD, or caller-supplied dvp: plain open1(). */
	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
}
4934 
4935 /*
4936  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4937  *
4938  * Parameters:	p			Process requesting the open
4939  *		uap			User argument descriptor (see below)
4940  *		retval			Pointer to an area to receive the
 *					return value from the system call
4942  *
4943  * Indirect:	uap->path		Path to open (same as 'open')
4944  *		uap->flags		Flags to open (same as 'open'
4945  *		uap->uid		UID to set, if creating
4946  *		uap->gid		GID to set, if creating
4947  *		uap->mode		File mode, if creating (same as 'open')
4948  *		uap->xsecurity		ACL to set, if creating
4949  *
4950  * Returns:	0			Success
4951  *		!0			errno value
4952  *
4953  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
4954  *
 * XXX:		We should enumerate the possible errno values here, and where
4956  *		in the code they originated.
4957  */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the optional ACL (host byte order) before anything else. */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
		return ciferror;
	}

	VATTR_INIT(&va);
	/* Apply the process umask; the sticky bit cannot be set at create. */
	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != NULL) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
	/* The filesec was copied into va by reference; free our copy now. */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}

	return ciferror;
}
4999 }
5000 
5001 /*
5002  * Go through the data-protected atomically controlled open (2)
5003  *
5004  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
5005  */
5006 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)5007 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5008     int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
5009 {
5010 	/*
5011 	 * Follow the same path as normal open(2)
5012 	 * Look up the item if it exists, and acquire the vnode.
5013 	 */
5014 	struct vnode_attr va;
5015 	struct nameidata nd;
5016 	int cmode;
5017 	int error;
5018 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5019 
5020 	VATTR_INIT(&va);
5021 	/* Mask off all but regular access permissions */
5022 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5023 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
5024 
5025 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
5026 	    path, ctx);
5027 
5028 	/*
5029 	 * Initialize the extra fields in vnode_attr to pass down our
5030 	 * extra fields.
5031 	 * 1. target cprotect class.
5032 	 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
5033 	 */
5034 	if (flags & O_CREAT) {
5035 		/* lower level kernel code validates that the class is valid before applying it. */
5036 		if (class != PROTECTION_CLASS_DEFAULT) {
5037 			/*
5038 			 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
5039 			 * file behave the same as open (2)
5040 			 */
5041 			VATTR_SET(&va, va_dataprotect_class, class);
5042 		}
5043 	}
5044 
5045 	if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
5046 		if (flags & (O_RDWR | O_WRONLY)) {
5047 			/*
5048 			 * Not allowed to write raw encrypted bytes or when opening authenticated.
5049 			 */
5050 			return EINVAL;
5051 		}
5052 		if (dpflags & O_DP_GETRAWENCRYPTED) {
5053 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
5054 		}
5055 		if (dpflags & O_DP_GETRAWUNENCRYPTED) {
5056 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
5057 		}
5058 		if (dpflags & O_DP_AUTHENTICATE) {
5059 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
5060 		}
5061 	}
5062 
5063 	error = open1at(vfs_context_current(), &nd, flags, &va,
5064 	    NULL, NULL, retval, fd, authfd);
5065 
5066 	return error;
5067 }
5068 
5069 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)5070 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5071 {
5072 	if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5073 		return EINVAL;
5074 	}
5075 
5076 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5077 	           uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
5078 }
5079 
5080 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)5081 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5082 {
5083 	if (uap->dpflags & O_DP_AUTHENTICATE) {
5084 		return EINVAL;
5085 	}
5086 
5087 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5088 	           uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5089 }
5090 
static int
openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
    int fd, enum uio_seg segflg, int *retval)
{
	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
	/*
	 * vnode_attr and nameidata are large; allocate them together on the
	 * heap rather than consuming kernel stack.
	 */
	struct {
		struct vnode_attr va;
		struct nameidata nd;
	} *__open_data;
	struct vnode_attr *vap;
	struct nameidata *ndp;
	int cmode;
	int error;

	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
	vap = &__open_data->va;
	ndp = &__open_data->nd;

	VATTR_INIT(vap);
	/* Mask off all but regular access permissions */
	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);

	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
	    segflg, path, ctx);

	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);

	kfree_type(typeof(*__open_data), __open_data);

	return error;
}
5123 
5124 int
open(proc_t p,struct open_args * uap,int32_t * retval)5125 open(proc_t p, struct open_args *uap, int32_t *retval)
5126 {
5127 	__pthread_testcancel(1);
5128 	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
5129 }
5130 
5131 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)5132 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
5133     int32_t *retval)
5134 {
5135 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5136 	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
5137 }
5138 
5139 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)5140 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
5141     int32_t *retval)
5142 {
5143 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5144 	           uap->mode, uap->fd, UIO_USERSPACE, retval);
5145 }
5146 
5147 int
openat(proc_t p,struct openat_args * uap,int32_t * retval)5148 openat(proc_t p, struct openat_args *uap, int32_t *retval)
5149 {
5150 	__pthread_testcancel(1);
5151 	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
5152 }
5153 
5154 #define OPEN_BY_ID_ENTITLEMENT  "com.apple.private.vfs.open-by-id"
5155 
5156 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5157 vfs_context_can_open_by_id(vfs_context_t ctx)
5158 {
5159 	if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5160 		return TRUE;
5161 	}
5162 
5163 	return IOTaskHasEntitlement(vfs_context_task(ctx),
5164 	           OPEN_BY_ID_ENTITLEMENT);
5165 }
5166 
5167 /*
5168  * openbyid_np: open a file given a file system id and a file system object id
5169  *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
 *	for file systems that don't support object ids, it is a node id (uint64_t).
5171  *
5172  * Parameters:	p			Process requesting the open
5173  *		uap			User argument descriptor (see below)
5174  *		retval			Pointer to an area to receive the
 *					return value from the system call
5176  *
5177  * Indirect:	uap->path		Path to open (same as 'open')
5178  *
5179  *		uap->fsid		id of target file system
5180  *		uap->objid		id of target file system object
5181  *		uap->flags		Flags to open (same as 'open')
5182  *
5183  * Returns:	0			Success
5184  *		!0			errno value
5185  *
5186  *
 * XXX:		We should enumerate the possible errno values here, and where
5188  *		in the code they originated.
5189  */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Open-by-id is restricted to platform binaries or entitled tasks. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*
	 * Resolve a path from the (fsid, objid) pair, growing the buffer
	 * by MAXPATHLEN each time fsgetpath_internal() reports ENOSPC.
	 * On any error the buffer is freed before retrying/returning.
	 */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* Ensure NUL termination before treating the buffer as a C string. */
	buf[pathlen] = 0;

	/* The resolved path lives in kernel space, hence UIO_SYSSPACE. */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5246 
5247 
5248 /*
5249  * Create a special file.
5250  */
5251 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5252     int fd);
5253 
/*
 * Common backend for mknod() and mknodat(): create a character or block
 * special file (FIFOs are redirected to mkfifo1()).  'vap' carries the
 * requested mode/rdev; 'fd' is the directory fd for relative lookups.
 * Returns 0 on success, else an errno value.
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes requires superuser privileges. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block special files are valid here. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	/* Authorize adding an entry to the parent directory. */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int     update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5356 
5357 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5358 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5359 {
5360 	struct vnode_attr va;
5361 
5362 	VATTR_INIT(&va);
5363 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5364 	VATTR_SET(&va, va_rdev, uap->dev);
5365 
5366 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5367 }
5368 
5369 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5370 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5371 {
5372 	struct vnode_attr va;
5373 
5374 	VATTR_INIT(&va);
5375 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5376 	VATTR_SET(&va, va_rdev, uap->dev);
5377 
5378 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5379 }
5380 
5381 /*
5382  * Create a named pipe.
5383  *
5384  * Returns:	0			Success
5385  *		EEXIST
5386  *	namei:???
5387  *	vnode_authorize:???
5388  *	vn_create:???
5389  */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	/* Look up the (not-yet-existing) target name; parent stays locked. */
	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5432 
5433 
5434 /*
5435  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5436  *
5437  * Parameters:	p			Process requesting the open
5438  *		uap			User argument descriptor (see below)
5439  *		retval			(Ignored)
5440  *
5441  * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
5442  *		uap->uid		UID to set
5443  *		uap->gid		GID to set
5444  *		uap->mode		File mode to set (same as 'mkfifo')
5445  *		uap->xsecurity		ACL to set, if creating
5446  *
5447  * Returns:	0			Success
5448  *		!0			errno value
5449  *
5450  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5451  *
 * XXX:		We should enumerate the possible errno values here, and where
5453  *		in the code they originated.
5454  */
5455 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5456 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5457 {
5458 	int ciferror;
5459 	kauth_filesec_t xsecdst;
5460 	struct vnode_attr va;
5461 
5462 	AUDIT_ARG(owner, uap->uid, uap->gid);
5463 
5464 	xsecdst = KAUTH_FILESEC_NONE;
5465 	if (uap->xsecurity != USER_ADDR_NULL) {
5466 		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5467 			return ciferror;
5468 		}
5469 	}
5470 
5471 	VATTR_INIT(&va);
5472 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5473 	if (uap->uid != KAUTH_UID_NONE) {
5474 		VATTR_SET(&va, va_uid, uap->uid);
5475 	}
5476 	if (uap->gid != KAUTH_GID_NONE) {
5477 		VATTR_SET(&va, va_gid, uap->gid);
5478 	}
5479 	if (xsecdst != KAUTH_FILESEC_NONE) {
5480 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5481 		va.va_vaflags |= VA_FILESEC_ACL;
5482 	}
5483 
5484 	ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5485 
5486 	if (xsecdst != KAUTH_FILESEC_NONE) {
5487 		kauth_filesec_free(xsecdst);
5488 	}
5489 	return ciferror;
5490 }
5491 
5492 /* ARGSUSED */
5493 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5494 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5495 {
5496 	struct vnode_attr va;
5497 
5498 	VATTR_INIT(&va);
5499 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5500 
5501 	return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5502 }
5503 
5504 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5505 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5506 {
5507 	struct vnode_attr va;
5508 
5509 	VATTR_INIT(&va);
5510 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5511 
5512 	return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5513 }
5514 
5515 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5516 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5517 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5518 
/*
 * Build a best-effort path string for 'dvp' (optionally with 'leafname'
 * appended) into 'path' (capacity '_len' bytes, callers pass MAXPATHLEN).
 * Never fails outright: if the exact path cannot be obtained, it walks up
 * v_parent (or falls back to the mount point, then "/") and reports the
 * result as truncated via *truncated_path.  'firmlink' selects whether
 * firmlinks are followed (vn_getpath) or not (vn_getpath_no_firmlink).
 * Returns the length of the string produced, including the NUL.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the trailing NUL with '/' and append the leaf. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path fit but there is no room left to append a leafname. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Walk up the parent chain until some ancestor's path fits,
		 * falling back to the mount point or "/" as a last resort.
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5586 
/* Firmlink-following variant of safe_getpath_new(); see that function. */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1);
}
5592 
/* Non-firmlink variant of safe_getpath_new(); see that function. */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0);
}
5598 
5599 /*
5600  * Make a hard file link.
5601  *
5602  * Returns:	0			Success
5603  *		EPERM
5604  *		EEXIST
5605  *		EXDEV
5606  *	namei:???
5607  *	vnode_authorize:???
5608  *	VNOP_LINK:???
5609  */
5610 /* ARGSUSED */
/*
 * Common backend for link() and linkat(): create a hard link named
 * 'link' (relative to fd2) to the object named by 'path' (relative to
 * fd1).  'flag' may carry AT_SYMLINK_FOLLOW.  Emits fsevents, kauth
 * fileop notifications and audit records as configured.
 */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char  *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node -- reuse 'nd' for the second lookup */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Only build path strings if someone will consume them. */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int  len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				/* drop the iocount taken by vnode_getparent_if_different */
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5824 
5825 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5826 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5827 {
5828 	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5829 	           AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5830 }
5831 
5832 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5833 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5834 {
5835 	if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5836 		return EINVAL;
5837 	}
5838 
5839 	return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5840 	           uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5841 }
5842 
5843 /*
5844  * Make a symbolic link.
5845  *
5846  * We could add support for ACLs here too...
5847  */
5848 /* ARGSUSED */
/*
 * Common backend for symlink() and symlinkat(): create a symbolic link
 * named 'link' (relative to fd) whose contents are the string at
 * 'path_data'.  'segflg' says whether both addresses are in user or
 * kernel space.  Returns 0 on success, else an errno value.
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/* Copy the link contents into a kernel buffer if they are in userspace. */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	/* Symlink permissions: 0777 filtered through the process umask. */
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int     update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int             len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Free the kernel copy of the link contents, if we made one. */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
6012 
6013 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)6014 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
6015 {
6016 	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
6017 	           uap->link, UIO_USERSPACE);
6018 }
6019 
6020 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)6021 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
6022     __unused int32_t *retval)
6023 {
6024 	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
6025 	           uap->path2, UIO_USERSPACE);
6026 }
6027 
6028 /*
6029  * Delete a whiteout from the filesystem.
6030  * No longer supported.
6031  */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	/* Whiteout removal is no longer supported; always fail. */
	return ENOTSUP;
}
6037 
6038 /*
6039  * Delete a name from the filesystem.
6040  */
6041 /* ARGSUSED */
6042 static int
unlinkat_internal(vfs_context_t ctx,int fd,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6043 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
6044     user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
6045 {
6046 	struct {
6047 		struct nameidata nd;
6048 #if CONFIG_FSE
6049 		struct vnode_attr va;
6050 		fse_info finfo;
6051 #endif
6052 	} *__unlink_data;
6053 	struct nameidata *ndp;
6054 	vnode_t vp, dvp;
6055 	int error;
6056 	struct componentname *cnp;
6057 	char  *path = NULL;
6058 	char  *no_firmlink_path = NULL;
6059 	int  len_path = 0;
6060 	int  len_no_firmlink_path = 0;
6061 	int flags;
6062 	int need_event;
6063 	int has_listeners;
6064 	int truncated_path;
6065 	int truncated_no_firmlink_path;
6066 	int batched;
6067 	struct vnode_attr *vap;
6068 	int do_retry;
6069 	int retry_count = 0;
6070 	int cn_flags;
6071 	int nofollow_any = 0;
6072 
6073 	cn_flags = LOCKPARENT;
6074 	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
6075 		cn_flags |= AUDITVNPATH1;
6076 	}
6077 	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
6078 		nofollow_any = NAMEI_NOFOLLOW_ANY;
6079 		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
6080 	}
6081 	/* If a starting dvp is passed, it trumps any fd passed. */
6082 	if (start_dvp) {
6083 		cn_flags |= USEDVP;
6084 	}
6085 
6086 #if NAMEDRSRCFORK
6087 	/* unlink or delete is allowed on rsrc forks and named streams */
6088 	cn_flags |= CN_ALLOWRSRCFORK;
6089 #endif
6090 
6091 	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
6092 	ndp = &__unlink_data->nd;
6093 #if CONFIG_FSE
6094 	fse_info *finfop = &__unlink_data->finfo;
6095 #endif
6096 
6097 retry:
6098 	do_retry = 0;
6099 	flags = 0;
6100 	need_event = 0;
6101 	has_listeners = 0;
6102 	truncated_path = 0;
6103 	truncated_no_firmlink_path = 0;
6104 	vap = NULL;
6105 
6106 	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
6107 
6108 	ndp->ni_dvp = start_dvp;
6109 	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any;
6110 	cnp = &ndp->ni_cnd;
6111 
6112 continue_lookup:
6113 	error = nameiat(ndp, fd);
6114 	if (error) {
6115 		goto early_out;
6116 	}
6117 
6118 	dvp = ndp->ni_dvp;
6119 	vp = ndp->ni_vp;
6120 
6121 	/* With Carbon delete semantics, busy files cannot be deleted */
6122 	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
6123 		flags |= VNODE_REMOVE_NODELETEBUSY;
6124 	}
6125 
6126 	/* Skip any potential upcalls if told to. */
6127 	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
6128 		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
6129 	}
6130 
6131 	if (vp) {
6132 		batched = vnode_compound_remove_available(vp);
6133 		/*
6134 		 * The root of a mounted filesystem cannot be deleted.
6135 		 */
6136 		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
6137 			error = EBUSY;
6138 			goto out;
6139 		}
6140 
6141 #if DEVELOPMENT || DEBUG
6142 		/*
6143 		 * XXX VSWAP: Check for entitlements or special flag here
6144 		 * so we can restrict access appropriately.
6145 		 */
6146 #else /* DEVELOPMENT || DEBUG */
6147 
6148 		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
6149 			error = EPERM;
6150 			goto out;
6151 		}
6152 #endif /* DEVELOPMENT || DEBUG */
6153 
6154 		if (!batched) {
6155 			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
6156 			if (error) {
6157 				if (error == ENOENT) {
6158 					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6159 						do_retry = 1;
6160 						retry_count++;
6161 					}
6162 				}
6163 				goto out;
6164 			}
6165 		}
6166 	} else {
6167 		batched = 1;
6168 
6169 		if (!vnode_compound_remove_available(dvp)) {
6170 			panic("No vp, but no compound remove?");
6171 		}
6172 	}
6173 
6174 #if CONFIG_FSE
6175 	need_event = need_fsevent(FSE_DELETE, dvp);
6176 	if (need_event) {
6177 		if (!batched) {
6178 			if ((vp->v_flag & VISHARDLINK) == 0) {
6179 				/* XXX need to get these data in batched VNOP */
6180 				get_fse_info(vp, finfop, ctx);
6181 			}
6182 		} else {
6183 			error =
6184 			    vfs_get_notify_attributes(&__unlink_data->va);
6185 			if (error) {
6186 				goto out;
6187 			}
6188 
6189 			vap = &__unlink_data->va;
6190 		}
6191 	}
6192 #endif
6193 	has_listeners = kauth_authorize_fileop_has_listeners();
6194 	if (need_event || has_listeners) {
6195 		if (path == NULL) {
6196 			GET_PATH(path);
6197 		}
6198 		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
6199 		if (no_firmlink_path == NULL) {
6200 			GET_PATH(no_firmlink_path);
6201 		}
6202 		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
6203 	}
6204 
6205 #if NAMEDRSRCFORK
6206 	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
6207 		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
6208 	} else
6209 #endif
6210 	{
6211 #if CONFIG_FILE_LEASES
6212 		vnode_breakdirlease(dvp, false, O_WRONLY);
6213 #endif
6214 
6215 		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
6216 		vp = ndp->ni_vp;
6217 		if (error == EKEEPLOOKING) {
6218 			if (!batched) {
6219 				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
6220 			}
6221 
6222 			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
6223 				panic("EKEEPLOOKING, but continue flag not set?");
6224 			}
6225 
6226 			if (vnode_isdir(vp)) {
6227 				error = EISDIR;
6228 				goto out;
6229 			}
6230 			goto continue_lookup;
6231 		} else if (error == ENOENT && batched) {
6232 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6233 				/*
6234 				 * For compound VNOPs, the authorization callback may
6235 				 * return ENOENT in case of racing hardlink lookups
6236 				 * hitting the name  cache, redrive the lookup.
6237 				 */
6238 				do_retry = 1;
6239 				retry_count += 1;
6240 				goto out;
6241 			}
6242 		}
6243 	}
6244 
6245 	/*
6246 	 * Call out to allow 3rd party notification of delete.
6247 	 * Ignore result of kauth_authorize_fileop call.
6248 	 */
6249 	if (!error) {
6250 		if (has_listeners) {
6251 			kauth_authorize_fileop(vfs_context_ucred(ctx),
6252 			    KAUTH_FILEOP_DELETE,
6253 			    (uintptr_t)vp,
6254 			    (uintptr_t)path);
6255 		}
6256 
6257 		if (vp->v_flag & VISHARDLINK) {
6258 			//
6259 			// if a hardlink gets deleted we want to blow away the
6260 			// v_parent link because the path that got us to this
6261 			// instance of the link is no longer valid.  this will
6262 			// force the next call to get the path to ask the file
6263 			// system instead of just following the v_parent link.
6264 			//
6265 			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
6266 		}
6267 
6268 #if CONFIG_FSE
6269 		if (need_event) {
6270 			if (vp->v_flag & VISHARDLINK) {
6271 				get_fse_info(vp, finfop, ctx);
6272 			} else if (vap) {
6273 				vnode_get_fse_info_from_vap(vp, finfop, vap);
6274 			}
6275 			if (truncated_path) {
6276 				finfop->mode |= FSE_TRUNCATED_PATH;
6277 			}
6278 			add_fsevent(FSE_DELETE, ctx,
6279 			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
6280 			    FSE_ARG_FINFO, finfop,
6281 			    FSE_ARG_DONE);
6282 		}
6283 #endif
6284 
6285 #if CONFIG_MACF
6286 		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
6287 #endif
6288 	}
6289 
6290 out:
6291 	if (path != NULL) {
6292 		RELEASE_PATH(path);
6293 		path = NULL;
6294 	}
6295 
6296 	if (no_firmlink_path != NULL) {
6297 		RELEASE_PATH(no_firmlink_path);
6298 		no_firmlink_path = NULL;
6299 	}
6300 #if NAMEDRSRCFORK
6301 	/* recycle the deleted rsrc fork vnode to force a reclaim, which
6302 	 * will cause its shadow file to go away if necessary.
6303 	 */
6304 	if (vp && (vnode_isnamedstream(vp)) &&
6305 	    (vp->v_parent != NULLVP) &&
6306 	    vnode_isshadow(vp)) {
6307 		vnode_recycle(vp);
6308 	}
6309 #endif
6310 	/*
6311 	 * nameidone has to happen before we vnode_put(dvp)
6312 	 * since it may need to release the fs_nodelock on the dvp
6313 	 */
6314 	nameidone(ndp);
6315 	vnode_put(dvp);
6316 	if (vp) {
6317 		vnode_put(vp);
6318 	}
6319 
6320 	if (do_retry) {
6321 		goto retry;
6322 	}
6323 
6324 early_out:
6325 	kfree_type(typeof(*__unlink_data), __unlink_data);
6326 	return error;
6327 }
6328 
6329 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6330 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6331     enum uio_seg segflg, int unlink_flags)
6332 {
6333 	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6334 	           unlink_flags);
6335 }
6336 
6337 /*
6338  * Delete a name from the filesystem using Carbon semantics.
6339  */
6340 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6341 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6342 {
6343 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6344 	           uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6345 }
6346 
6347 /*
6348  * Delete a name from the filesystem using POSIX semantics.
6349  */
6350 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6351 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6352 {
6353 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6354 	           uap->path, UIO_USERSPACE, 0);
6355 }
6356 
6357 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6358 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6359 {
6360 	int unlink_flags = 0;
6361 
6362 	if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY)) {
6363 		return EINVAL;
6364 	}
6365 
6366 	if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6367 		unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6368 	}
6369 
6370 	if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6371 		if (uap->flag & AT_REMOVEDIR_DATALESS) {
6372 			unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6373 		}
6374 		return rmdirat_internal(vfs_context_current(), uap->fd,
6375 		           uap->path, UIO_USERSPACE, unlink_flags);
6376 	} else {
6377 		return unlinkat_internal(vfs_context_current(), uap->fd,
6378 		           NULLVP, uap->path, UIO_USERSPACE, unlink_flags);
6379 	}
6380 }
6381 
6382 /*
6383  * Reposition read/write file offset.
6384  */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	/* Resolve the fd to a fileproc/vnode pair (takes an fd reference). */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/*
		 * ENOTSUP here means the fd is not backed by a vnode
		 * (presumably a pipe/socket — TODO confirm); lseek on such
		 * objects reports ESPIPE per POSIX.
		 */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	/* Seeking on a FIFO is meaningless; POSIX mandates ESPIPE. */
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/*
	 * lseek(fd, 0, SEEK_CUR) only reads the current offset; any other
	 * combination may change it, so pick the matching MAC check.
	 */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	/* Get an iocount on the vnode before querying/seeking it. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* Compute the tentative new offset according to 'whence'. */
	switch (uap->whence) {
	case L_INCR:
		/* SEEK_CUR: relative to the current per-open offset. */
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		/* SEEK_END: relative to the current file size. */
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		/* SEEK_SET: absolute offset, as given. */
		break;
	case SEEK_HOLE:
		/* Ask the filesystem for the next hole at/after 'offset'. */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		/* Ask the filesystem for the next data region at/after 'offset'. */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001.  Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6476 
6477 
6478 /*
6479  * Check access permissions.
6480  *
6481  * Returns:	0			Success
6482  *		vnode_authorize:???
6483  */
6484 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6485 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6486 {
6487 	kauth_action_t action;
6488 	int error;
6489 
6490 	/*
6491 	 * If just the regular access bits, convert them to something
6492 	 * that vnode_authorize will understand.
6493 	 */
6494 	if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6495 		action = 0;
6496 		if (uflags & R_OK) {
6497 			action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
6498 		}
6499 		if (uflags & W_OK) {
6500 			if (vnode_isdir(vp)) {
6501 				action |= KAUTH_VNODE_ADD_FILE |
6502 				    KAUTH_VNODE_ADD_SUBDIRECTORY;
6503 				/* might want delete rights here too */
6504 			} else {
6505 				action |= KAUTH_VNODE_WRITE_DATA;
6506 			}
6507 		}
6508 		if (uflags & X_OK) {
6509 			if (vnode_isdir(vp)) {
6510 				action |= KAUTH_VNODE_SEARCH;
6511 			} else {
6512 				action |= KAUTH_VNODE_EXECUTE;
6513 			}
6514 		}
6515 	} else {
6516 		/* take advantage of definition of uflags */
6517 		action = uflags >> 8;
6518 	}
6519 
6520 #if CONFIG_MACF
6521 	error = mac_vnode_check_access(ctx, vp, uflags);
6522 	if (error) {
6523 		return error;
6524 	}
6525 #endif /* MAC */
6526 
6527 	/* action == 0 means only check for existence */
6528 	if (action != 0) {
6529 		error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6530 	} else {
6531 		error = 0;
6532 	}
6533 
6534 	return error;
6535 }
6536 
6537 
6538 
6539 /*
6540  * access_extended: Check access permissions in bulk.
6541  *
6542  * Description:	uap->entries		Pointer to an array of accessx
6543  *                                      descriptor structs, plus one or
6544  *                                      more NULL terminated strings (see
6545  *                                      "Notes" section below).
6546  *		uap->size		Size of the area pointed to by
6547  *					uap->entries.
6548  *		uap->results		Pointer to the results array.
6549  *
6550  * Returns:	0			Success
6551  *		ENOMEM			Insufficient memory
6552  *		EINVAL			Invalid arguments
6553  *		namei:EFAULT		Bad address
6554  *		namei:ENAMETOOLONG	Filename too long
6555  *		namei:ENOENT		No such file or directory
6556  *		namei:ELOOP		Too many levels of symbolic links
6557  *		namei:EBADF		Bad file descriptor
6558  *		namei:ENOTDIR		Not a directory
6559  *		namei:???
6560  *		access1:
6561  *
6562  * Implicit returns:
6563  *		uap->results		Array contents modified
6564  *
6565  * Notes:	The uap->entries are structured as an arbitrary length array
6566  *		of accessx descriptors, followed by one or more NULL terminated
6567  *		strings
6568  *
6569  *			struct accessx_descriptor[0]
6570  *			...
6571  *			struct accessx_descriptor[n]
6572  *			char name_data[0];
6573  *
6574  *		We determine the entry count by walking the buffer containing
6575  *		the uap->entries argument descriptor.  For each descriptor we
6576  *		see, the valid values for the offset ad_name_offset will be
6577  *		in the byte range:
6578  *
6579  *			[ uap->entries + sizeof(struct accessx_descriptor) ]
6580  *						to
6581  *				[ uap->entries + uap->size - 2 ]
6582  *
6583  *		since we must have at least one string, and the string must
6584  *		be at least one character plus the NULL terminator in length.
6585  *
6586  * XXX:		Need to support the check-as uid argument
6587  */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* Cred is filled in later; NULL marks "nothing to unref" on exit. */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o	There must be sufficient room in the request for at least one
	 *	descriptor and a one byte NUL terminated string.
	 * o	The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* Small requests use the on-stack buffer; larger ones heap-allocate. */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	/* One errno_t result slot per descriptor, zero-filled (== success). */
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  Per-file failures are recorded in
		 * the result slot; anything else aborts the whole call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6829 
6830 
6831 /*
6832  * Returns:	0			Success
6833  *		namei:EFAULT		Bad address
6834  *		namei:ENAMETOOLONG	Filename too long
6835  *		namei:ENOENT		No such file or directory
6836  *		namei:ELOOP		Too many levels of symbolic links
6837  *		namei:EBADF		Bad file descriptor
6838  *		namei:ENOTDIR		Not a directory
6839  *		namei:???
6840  *		access1:
6841  */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		/* Takes a cred reference; released at 'out'. */
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		/* Borrowed from the caller's context — no extra reference. */
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* Perform the actual permission evaluation against the lookup result. */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	/* Drop the iocounts namei gave us; dvp only exists if WANTPARENT was set. */
	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6923 
6924 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6925 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6926 {
6927 	return faccessat_internal(vfs_context_current(), AT_FDCWD,
6928 	           uap->path, uap->flags, 0, UIO_USERSPACE);
6929 }
6930 
6931 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6932 faccessat(__unused proc_t p, struct faccessat_args *uap,
6933     __unused int32_t *retval)
6934 {
6935 	if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6936 		return EINVAL;
6937 	}
6938 
6939 	return faccessat_internal(vfs_context_current(), uap->fd,
6940 	           uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6941 }
6942 
6943 /*
6944  * Returns:	0			Success
6945  *		EFAULT
6946  *	copyout:EFAULT
6947  *	namei:???
6948  *	vn_stat:???
6949  */
6950 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6951 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6952     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6953     enum uio_seg segflg, int fd, int flag)
6954 {
6955 	struct nameidata *ndp = NULL;
6956 	int follow;
6957 	union {
6958 		struct stat sb;
6959 		struct stat64 sb64;
6960 	} source = {};
6961 	union {
6962 		struct user64_stat user64_sb;
6963 		struct user32_stat user32_sb;
6964 		struct user64_stat64 user64_sb64;
6965 		struct user32_stat64 user32_sb64;
6966 	} dest = {};
6967 	caddr_t sbp;
6968 	int error, my_size;
6969 	kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
6970 	size_t xsecurity_bufsize;
6971 	void * statptr;
6972 	struct fileproc *fp = NULL;
6973 	int needsrealdev = 0;
6974 
6975 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6976 	ndp = kalloc_type(struct nameidata, Z_WAITOK);
6977 	NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6978 	    segflg, path, ctx);
6979 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6980 		ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
6981 	}
6982 
6983 #if NAMEDRSRCFORK
6984 	int is_namedstream = 0;
6985 	/* stat calls are allowed for resource forks. */
6986 	ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6987 #endif
6988 
6989 	if (flag & AT_FDONLY) {
6990 		vnode_t fvp;
6991 
6992 		error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6993 		if (error) {
6994 			goto out;
6995 		}
6996 		if ((error = vnode_getwithref(fvp))) {
6997 			file_drop(fd);
6998 			goto out;
6999 		}
7000 		ndp->ni_vp = fvp;
7001 	} else {
7002 		error = nameiat(ndp, fd);
7003 		if (error) {
7004 			goto out;
7005 		}
7006 	}
7007 
7008 	statptr = (void *)&source;
7009 
7010 #if NAMEDRSRCFORK
7011 	/* Grab reference on the shadow stream file vnode to
7012 	 * force an inactive on release which will mark it
7013 	 * for recycle.
7014 	 */
7015 	if (vnode_isnamedstream(ndp->ni_vp) &&
7016 	    (ndp->ni_vp->v_parent != NULLVP) &&
7017 	    vnode_isshadow(ndp->ni_vp)) {
7018 		is_namedstream = 1;
7019 		vnode_ref(ndp->ni_vp);
7020 	}
7021 #endif
7022 
7023 	needsrealdev = flag & AT_REALDEV ? 1 : 0;
7024 	if (fp && (xsecurity == USER_ADDR_NULL)) {
7025 		/*
7026 		 * If the caller has the file open, and is not
7027 		 * requesting extended security information, we are
7028 		 * going to let them get the basic stat information.
7029 		 */
7030 		error = vn_stat_noauth(ndp->ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
7031 		    fp->fp_glob->fg_cred);
7032 	} else {
7033 		error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
7034 		    isstat64, needsrealdev, ctx);
7035 	}
7036 
7037 #if NAMEDRSRCFORK
7038 	if (is_namedstream) {
7039 		vnode_rele(ndp->ni_vp);
7040 	}
7041 #endif
7042 	vnode_put(ndp->ni_vp);
7043 	nameidone(ndp);
7044 
7045 	if (fp) {
7046 		file_drop(fd);
7047 		fp = NULL;
7048 	}
7049 
7050 	if (error) {
7051 		goto out;
7052 	}
7053 	/* Zap spare fields */
7054 	if (isstat64 != 0) {
7055 		source.sb64.st_lspare = 0;
7056 		source.sb64.st_qspare[0] = 0LL;
7057 		source.sb64.st_qspare[1] = 0LL;
7058 		if (vfs_context_is64bit(ctx)) {
7059 			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
7060 			my_size = sizeof(dest.user64_sb64);
7061 			sbp = (caddr_t)&dest.user64_sb64;
7062 		} else {
7063 			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
7064 			my_size = sizeof(dest.user32_sb64);
7065 			sbp = (caddr_t)&dest.user32_sb64;
7066 		}
7067 		/*
7068 		 * Check if we raced (post lookup) against the last unlink of a file.
7069 		 */
7070 		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7071 			source.sb64.st_nlink = 1;
7072 		}
7073 	} else {
7074 		source.sb.st_lspare = 0;
7075 		source.sb.st_qspare[0] = 0LL;
7076 		source.sb.st_qspare[1] = 0LL;
7077 		if (vfs_context_is64bit(ctx)) {
7078 			munge_user64_stat(&source.sb, &dest.user64_sb);
7079 			my_size = sizeof(dest.user64_sb);
7080 			sbp = (caddr_t)&dest.user64_sb;
7081 		} else {
7082 			munge_user32_stat(&source.sb, &dest.user32_sb);
7083 			my_size = sizeof(dest.user32_sb);
7084 			sbp = (caddr_t)&dest.user32_sb;
7085 		}
7086 
7087 		/*
7088 		 * Check if we raced (post lookup) against the last unlink of a file.
7089 		 */
7090 		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7091 			source.sb.st_nlink = 1;
7092 		}
7093 	}
7094 	if ((error = copyout(sbp, ub, my_size)) != 0) {
7095 		goto out;
7096 	}
7097 
7098 	/* caller wants extended security information? */
7099 	if (xsecurity != USER_ADDR_NULL) {
7100 		/* did we get any? */
7101 		if (fsec == KAUTH_FILESEC_NONE) {
7102 			if (susize(xsecurity_size, 0) != 0) {
7103 				error = EFAULT;
7104 				goto out;
7105 			}
7106 		} else {
7107 			/* find the user buffer size */
7108 			xsecurity_bufsize = fusize(xsecurity_size);
7109 
7110 			/* copy out the actual data size */
7111 			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7112 				error = EFAULT;
7113 				goto out;
7114 			}
7115 
7116 			/* if the caller supplied enough room, copy out to it */
7117 			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7118 				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7119 			}
7120 		}
7121 	}
7122 out:
7123 	if (ndp) {
7124 		kfree_type(struct nameidata, ndp);
7125 	}
7126 	if (fsec != KAUTH_FILESEC_NONE) {
7127 		kauth_filesec_free(fsec);
7128 	}
7129 	return error;
7130 }
7131 
7132 /*
7133  * stat_extended: Get file status; with extended security (ACL).
7134  *
7135  * Parameters:    p                       (ignored)
7136  *                uap                     User argument descriptor (see below)
7137  *                retval                  (ignored)
7138  *
7139  * Indirect:      uap->path               Path of file to get status from
7140  *                uap->ub                 User buffer (holds file status info)
7141  *                uap->xsecurity          ACL to get (extended security)
7142  *                uap->xsecurity_size     Size of ACL
7143  *
7144  * Returns:        0                      Success
7145  *                !0                      errno value
7146  *
7147  */
7148 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7149 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7150     __unused int32_t *retval)
7151 {
7152 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7153 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7154 	           0);
7155 }
7156 
7157 /*
7158  * Returns:	0			Success
7159  *	fstatat_internal:???		[see fstatat_internal() in this file]
7160  */
7161 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7162 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7163 {
7164 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7165 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7166 }
7167 
7168 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7169 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7170 {
7171 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7172 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7173 }
7174 
7175 /*
7176  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7177  *
7178  * Parameters:    p                       (ignored)
7179  *                uap                     User argument descriptor (see below)
7180  *                retval                  (ignored)
7181  *
7182  * Indirect:      uap->path               Path of file to get status from
7183  *                uap->ub                 User buffer (holds file status info)
7184  *                uap->xsecurity          ACL to get (extended security)
7185  *                uap->xsecurity_size     Size of ACL
7186  *
7187  * Returns:        0                      Success
7188  *                !0                      errno value
7189  *
7190  */
7191 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7192 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7193 {
7194 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7195 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7196 	           0);
7197 }
7198 
7199 /*
7200  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7201  *
7202  * Parameters:    p                       (ignored)
7203  *                uap                     User argument descriptor (see below)
7204  *                retval                  (ignored)
7205  *
7206  * Indirect:      uap->path               Path of file to get status from
7207  *                uap->ub                 User buffer (holds file status info)
7208  *                uap->xsecurity          ACL to get (extended security)
7209  *                uap->xsecurity_size     Size of ACL
7210  *
7211  * Returns:        0                      Success
7212  *                !0                      errno value
7213  *
7214  */
7215 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7216 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7217 {
7218 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7219 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7220 	           AT_SYMLINK_NOFOLLOW);
7221 }
7222 
7223 /*
7224  * Get file status; this version does not follow links.
7225  */
7226 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7227 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7228 {
7229 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7230 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7231 }
7232 
7233 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7234 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7235 {
7236 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7237 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7238 }
7239 
7240 /*
7241  * lstat64_extended: Get file status; can handle large inode numbers; does not
7242  * follow links; with extended security (ACL).
7243  *
7244  * Parameters:    p                       (ignored)
7245  *                uap                     User argument descriptor (see below)
7246  *                retval                  (ignored)
7247  *
7248  * Indirect:      uap->path               Path of file to get status from
7249  *                uap->ub                 User buffer (holds file status info)
7250  *                uap->xsecurity          ACL to get (extended security)
7251  *                uap->xsecurity_size     Size of ACL
7252  *
7253  * Returns:        0                      Success
7254  *                !0                      errno value
7255  *
7256  */
7257 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7258 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7259 {
7260 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7261 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7262 	           AT_SYMLINK_NOFOLLOW);
7263 }
7264 
7265 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7266 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7267 {
7268 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7269 		return EINVAL;
7270 	}
7271 
7272 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7273 	           0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7274 }
7275 
7276 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7277 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7278     __unused int32_t *retval)
7279 {
7280 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7281 		return EINVAL;
7282 	}
7283 
7284 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7285 	           0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7286 }
7287 
7288 /*
7289  * Get configurable pathname variables.
7290  *
7291  * Returns:	0			Success
7292  *	namei:???
7293  *	vn_pathconf:???
7294  *
7295  * Notes:	Global implementation  constants are intended to be
7296  *		implemented in this function directly; all other constants
7297  *		are per-FS implementation, and therefore must be handled in
7298  *		each respective FS, instead.
7299  *
7300  * XXX We implement some things globally right now that should actually be
7301  * XXX per-FS; we will need to deal with this at some point.
7302  */
7303 /* ARGSUSED */
7304 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7305 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7306 {
7307 	int error;
7308 	struct nameidata nd;
7309 	vfs_context_t ctx = vfs_context_current();
7310 
7311 	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7312 	    UIO_USERSPACE, uap->path, ctx);
7313 	error = namei(&nd);
7314 	if (error) {
7315 		return error;
7316 	}
7317 
7318 	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7319 
7320 	vnode_put(nd.ni_vp);
7321 	nameidone(&nd);
7322 	return error;
7323 }
7324 
7325 /*
7326  * Return target name of a symbolic link.
7327  */
7328 /* ARGSUSED */
7329 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7330 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7331     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7332     int *retval)
7333 {
7334 	vnode_t vp;
7335 	uio_t auio;
7336 	int error;
7337 	struct nameidata nd;
7338 	UIO_STACKBUF(uio_buf, 1);
7339 	bool put_vnode;
7340 
7341 	if (bufsize > INT32_MAX) {
7342 		return EINVAL;
7343 	}
7344 
7345 	if (lnk_vp) {
7346 		vp = lnk_vp;
7347 		put_vnode = false;
7348 	} else {
7349 		NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7350 		    seg, path, ctx);
7351 
7352 		error = nameiat(&nd, fd);
7353 		if (error) {
7354 			return error;
7355 		}
7356 		vp = nd.ni_vp;
7357 		put_vnode = true;
7358 		nameidone(&nd);
7359 	}
7360 
7361 	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7362 	    &uio_buf[0], sizeof(uio_buf));
7363 	uio_addiov(auio, buf, bufsize);
7364 	if (vp->v_type != VLNK) {
7365 		error = EINVAL;
7366 	} else {
7367 #if CONFIG_MACF
7368 		error = mac_vnode_check_readlink(ctx, vp);
7369 #endif
7370 		if (error == 0) {
7371 			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7372 			    ctx);
7373 		}
7374 		if (error == 0) {
7375 			error = VNOP_READLINK(vp, auio, ctx);
7376 		}
7377 	}
7378 
7379 	if (put_vnode) {
7380 		vnode_put(vp);
7381 	}
7382 
7383 	*retval = (int)(bufsize - uio_resid(auio));
7384 	return error;
7385 }
7386 
7387 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7388 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7389 {
7390 	enum uio_seg procseg;
7391 	vnode_t vp;
7392 	int error;
7393 
7394 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7395 
7396 	AUDIT_ARG(fd, uap->fd);
7397 
7398 	if ((error = file_vnode(uap->fd, &vp))) {
7399 		return error;
7400 	}
7401 	if ((error = vnode_getwithref(vp))) {
7402 		file_drop(uap->fd);
7403 		return error;
7404 	}
7405 
7406 	error = readlinkat_internal(vfs_context_current(), -1,
7407 	    vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7408 	    uap->bufsize, procseg, retval);
7409 
7410 	vnode_put(vp);
7411 	file_drop(uap->fd);
7412 	return error;
7413 }
7414 
7415 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7416 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7417 {
7418 	enum uio_seg procseg;
7419 
7420 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7421 	return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7422 	           CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7423 	           uap->count, procseg, retval);
7424 }
7425 
7426 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7427 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7428 {
7429 	enum uio_seg procseg;
7430 
7431 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7432 	return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7433 	           CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7434 	           retval);
7435 }
7436 
7437 /*
7438  * Change file flags, the deep inner layer.
7439  */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	/* give MAC policies first refusal before any authorization work */
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	/* apply the change via the caller-supplied setter (e.g. vnode_setattr) */
	error = (*setattr)(vp, arg, ctx);

#if CONFIG_MACF
	/* notify hook mirrors the pre-check above, but only on success */
	if (error == 0) {
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
	}
#endif

out:
	return error;
}
7478 
7479 /*
7480  * Change file flags.
7481  *
7482  * NOTE: this will vnode_put() `vp'
7483  */
static int
chflags1(vnode_t vp, int flags, vfs_context_t ctx)
{
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_flags, flags);

	/*
	 * NOTE(review): the (void *) cast presumably adapts vnode_setattr's
	 * prototype to the generic setter type chflags0() expects; `&va'
	 * doubles as the setter's opaque argument — confirm against the
	 * vnode_setattr declaration.
	 */
	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
	/* consume the caller's vnode reference, per this function's contract */
	vnode_put(vp);

	/* a filesystem that silently ignored va_flags is reported as ENOTSUP */
	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
		error = ENOTSUP;
	}

	return error;
}
7502 
7503 /*
7504  * Change flags of a file given a path name.
7505  */
7506 /* ARGSUSED */
int
chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* also obtain the parent vnode so its lease can be broken below */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(fflags, uap->flags);
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

#if CONFIG_FILE_LEASES
	/* break any lease on the parent directory, then release the parent */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	nameidone(&nd);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, ctx);

	return error;
}
7541 
7542 /*
7543  * Change flags of a file given a file descriptor.
7544  */
7545 /* ARGSUSED */
int
fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		/* release the fd reference taken by file_vnode() */
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/*
	 * NOTE(review): second argument differs from the chflags() path
	 * (true here vs. false with the parent there) — appears to mean
	 * "vp is the child, resolve its parent"; confirm against
	 * vnode_breakdirlease().
	 */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, vfs_context_current());

	file_drop(uap->fd);
	return error;
}
7575 
7576 /*
7577  * Change security information on a filesystem object.
7578  *
7579  * Returns:	0			Success
7580  *		EPERM			Operation not permitted
7581  *		vnode_authattr:???	[anything vnode_authattr can return]
7582  *		vnode_authorize:???	[anything vnode_authorize can return]
7583  *		vnode_setattr:???	[anything vnode_setattr can return]
7584  *
7585  * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
7586  *		translated to EPERM before being returned.
7587  */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* give MAC policies first refusal on each attribute being changed */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	/* -1 marks an id that is not being changed */
	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* permission failures here must surface as EPERM, not EACCES */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* success: each notify hook mirrors its pre-check above */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7655 
7656 
7657 /*
7658  * Change mode of a file given a path name.
7659  *
7660  * Returns:	0			Success
7661  *		namei:???		[anything namei can return]
7662  *		chmod_vnode:???		[anything chmod_vnode can return]
7663  */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* also obtain the parent so its lease can be broken before the change */
	wantparent = WANTPARENT;
#endif

	/* either NOFOLLOW flavor suppresses following a trailing symlink */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* refuse to traverse symlinks anywhere in the path */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	/* break any lease on the parent directory, then release the parent */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7696 
/*
 * Shared setup for chmod_extended()/fchmod_extended(): translate the
 * user-supplied mode/uid/gid/xsecurity arguments into `*pva'.
 *
 * mode == -1, uid == KAUTH_UID_NONE and gid == KAUTH_GID_NONE each mean
 * "leave that attribute alone".  `xsecurity' is NULL (no ACL change),
 * the _FILESEC_REMOVE_ACL sentinel (delete any ACL), or a user pointer
 * to a filesec to copy in; in the last case `*pxsecdst' receives the
 * kernel copy, which the caller must free with kauth_filesec_free().
 */
static int
chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
    gid_t gid, user_addr_t xsecurity)
{
	int error;

	VATTR_INIT(pva);

	if (mode != -1) {
		VATTR_SET(pva, va_mode, mode & ALLPERMS);
	} else {
		pva->va_mode = 0;
	}

	if (uid != KAUTH_UID_NONE) {
		VATTR_SET(pva, va_uid, uid);
	}

	if (gid != KAUTH_GID_NONE) {
		VATTR_SET(pva, va_gid, gid);
	}

	*pxsecdst = NULL;
	switch (xsecurity) {
	case USER_ADDR_NULL:
		/* no ACL change requested */
		break;

	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
		/* magic user address 1: request deletion of any existing ACL */
		VATTR_SET(pva, va_acl, NULL);
		break;

	default:
		if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
			return error;
		}

		/* point va_acl into the copied filesec; caller frees it */
		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
		pva->va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
		break;
	}

	return 0;
}
7741 
7742 /*
7743  * chmod_extended: Change the mode of a file given a path name; with extended
7744  * argument list (including extended security (ACL)).
7745  *
7746  * Parameters:	p			Process requesting the open
7747  *		uap			User argument descriptor (see below)
7748  *		retval			(ignored)
7749  *
7750  * Indirect:	uap->path		Path to object (same as 'chmod')
7751  *		uap->uid		UID to set
7752  *		uap->gid		GID to set
7753  *		uap->mode		File mode to set (same as 'chmod')
7754  *		uap->xsecurity		ACL to set (or delete)
7755  *
7756  * Returns:	0			Success
7757  *		!0			errno value
7758  *
7759  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
7760  *
7761  * XXX:		We should enummerate the possible errno values here, and where
7762  *		in the code they originated.
7763  */
7764 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7765 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7766 {
7767 	int error;
7768 	struct vnode_attr va;
7769 	kauth_filesec_t xsecdst = NULL;
7770 
7771 	AUDIT_ARG(owner, uap->uid, uap->gid);
7772 
7773 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7774 	    uap->gid, uap->xsecurity);
7775 
7776 	if (error) {
7777 		return error;
7778 	}
7779 
7780 	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7781 	    UIO_USERSPACE);
7782 
7783 	if (xsecdst != NULL) {
7784 		kauth_filesec_free(xsecdst);
7785 	}
7786 	return error;
7787 }
7788 
7789 /*
7790  * Returns:	0			Success
7791  *		chmodat:???		[anything chmodat can return]
7792  */
7793 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7794 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7795     int flag, enum uio_seg segflg)
7796 {
7797 	struct vnode_attr va;
7798 
7799 	VATTR_INIT(&va);
7800 	VATTR_SET(&va, va_mode, mode & ALLPERMS);
7801 
7802 	return chmodat(ctx, path, &va, fd, flag, segflg);
7803 }
7804 
7805 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7806 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7807 {
7808 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7809 	           AT_FDCWD, 0, UIO_USERSPACE);
7810 }
7811 
7812 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7813 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7814 {
7815 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7816 		return EINVAL;
7817 	}
7818 
7819 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7820 	           uap->fd, uap->flag, UIO_USERSPACE);
7821 }
7822 
7823 /*
7824  * Change mode of a file given a file descriptor.
7825  */
7826 static int
fchmod1(__unused proc_t p,int fd,struct vnode_attr * vap)7827 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
7828 {
7829 	vnode_t vp;
7830 	int error;
7831 
7832 	AUDIT_ARG(fd, fd);
7833 
7834 	if ((error = file_vnode(fd, &vp)) != 0) {
7835 		return error;
7836 	}
7837 	if ((error = vnode_getwithref(vp)) != 0) {
7838 		file_drop(fd);
7839 		return error;
7840 	}
7841 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7842 
7843 #if CONFIG_FILE_LEASES
7844 	vnode_breakdirlease(vp, true, O_WRONLY);
7845 #endif
7846 
7847 	error = chmod_vnode(vfs_context_current(), vp, vap);
7848 	(void)vnode_put(vp);
7849 	file_drop(fd);
7850 
7851 	return error;
7852 }
7853 
7854 /*
7855  * fchmod_extended: Change mode of a file given a file descriptor; with
7856  * extended argument list (including extended security (ACL)).
7857  *
7858  * Parameters:    p                       Process requesting to change file mode
7859  *                uap                     User argument descriptor (see below)
7860  *                retval                  (ignored)
7861  *
7862  * Indirect:      uap->mode               File mode to set (same as 'chmod')
7863  *                uap->uid                UID to set
7864  *                uap->gid                GID to set
7865  *                uap->xsecurity          ACL to set (or delete)
7866  *                uap->fd                 File descriptor of file to change mode
7867  *
7868  * Returns:        0                      Success
7869  *                !0                      errno value
7870  *
7871  */
7872 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)7873 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
7874 {
7875 	int error;
7876 	struct vnode_attr va;
7877 	kauth_filesec_t xsecdst = NULL;
7878 
7879 	AUDIT_ARG(owner, uap->uid, uap->gid);
7880 
7881 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7882 	    uap->gid, uap->xsecurity);
7883 
7884 	if (error) {
7885 		return error;
7886 	}
7887 
7888 	error = fchmod1(p, uap->fd, &va);
7889 
7890 	if (xsecdst != NULL) {
7891 		kauth_filesec_free(xsecdst);
7892 	}
7893 	return error;
7894 }
7895 
7896 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7897 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7898 {
7899 	struct vnode_attr va;
7900 
7901 	VATTR_INIT(&va);
7902 	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7903 
7904 	return fchmod1(p, uap->fd, &va);
7905 }
7906 
/*
 * Common ownership-change core for chown()/lchown()/fchownat()/fchown().
 * uid/gid values equal to VNOVAL mean "leave that id unchanged".
 * Authorization failures are reported as EPERM rather than EACCES.
 */
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* notify hook mirrors the pre-check above, but only on success */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
7968 
7969 /*
7970  * Set ownership given a path name.
7971  */
7972 /* ARGSUSED */
7973 static int
fchownat_internal(vfs_context_t ctx,int fd,user_addr_t path,uid_t uid,gid_t gid,int flag,enum uio_seg segflg)7974 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
7975     gid_t gid, int flag, enum uio_seg segflg)
7976 {
7977 	vnode_t vp;
7978 	int error;
7979 	struct nameidata nd;
7980 	int follow;
7981 
7982 	AUDIT_ARG(owner, uid, gid);
7983 
7984 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7985 	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
7986 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7987 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
7988 	}
7989 
7990 	error = nameiat(&nd, fd);
7991 	if (error) {
7992 		return error;
7993 	}
7994 
7995 	vp = nd.ni_vp;
7996 	error = vn_chown_internal(ctx, vp, uid, gid);
7997 
7998 	nameidone(&nd);
7999 	vnode_put(vp);
8000 	return error;
8001 }
8002 
8003 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)8004 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
8005 {
8006 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8007 	           uap->uid, uap->gid, 0, UIO_USERSPACE);
8008 }
8009 
8010 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)8011 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
8012 {
8013 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8014 	           uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
8015 }
8016 
8017 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)8018 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
8019 {
8020 	if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
8021 		return EINVAL;
8022 	}
8023 
8024 	return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
8025 	           uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
8026 }
8027 
8028 /*
8029  * Set ownership given a file descriptor.
8030  */
8031 /* ARGSUSED */
8032 int
fchown(__unused proc_t p,struct fchown_args * uap,__unused int32_t * retval)8033 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
8034 {
8035 	vfs_context_t ctx = vfs_context_current();
8036 	vnode_t vp;
8037 	int error;
8038 
8039 	AUDIT_ARG(owner, uap->uid, uap->gid);
8040 	AUDIT_ARG(fd, uap->fd);
8041 
8042 	if ((error = file_vnode(uap->fd, &vp))) {
8043 		return error;
8044 	}
8045 
8046 	if ((error = vnode_getwithref(vp))) {
8047 		file_drop(uap->fd);
8048 		return error;
8049 	}
8050 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8051 
8052 	error = vn_chown_internal(ctx, vp, uap->uid, uap->gid);
8053 
8054 	(void)vnode_put(vp);
8055 	file_drop(uap->fd);
8056 	return error;
8057 }
8058 
/*
 * Fill tsp[0] (access time) and tsp[1] (modification time) for the
 * utimes()-family syscalls.  usrtvp == USER_ADDR_NULL selects "now";
 * otherwise two struct timevals are copied in using the 32- or 64-bit
 * user layout matching the current process's ABI and converted to
 * timespecs.  Returns 0 or a copyin() error.
 */
static int
getutimes(user_addr_t usrtvp, struct timespec *tsp)
{
	int error;

	if (usrtvp == USER_ADDR_NULL) {
		struct timeval old_tv;
		/* XXX Y2038 bug because of microtime argument */
		microtime(&old_tv);
		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
		/* both timestamps get the same "now" */
		tsp[1] = tsp[0];
	} else {
		if (IS_64BIT_PROCESS(current_proc())) {
			struct user64_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
		} else {
			struct user32_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		}
	}
	return 0;
}
8091 
/*
 * Apply access (ts[0]) and modification (ts[1]) timestamps to `vp'.
 * `nullflag' is non-zero when the caller passed no explicit times
 * (utimes(path, NULL)): it sets VA_UTIMES_NULL and suppresses the
 * EACCES->EPERM translation applied to explicit-time failures below.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	/* explicit times: permission failure is EPERM, not EACCES */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* notify hook mirrors the pre-check above, but only on success */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8148 
8149 /*
8150  * Set the access and modification times of a file.
8151  */
8152 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* also obtain the parent so its lease can be broken below */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* break any lease on the parent directory before changing times */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	/* nullflag: VA_UTIMES_NULL semantics when the user passed no times */
	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
#if CONFIG_FILE_LEASES
	/* parent was held only for the lease break; release on all paths */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8201 
8202 /*
8203  * Set the access and modification times of a file.
8204  */
8205 /* ARGSUSED */
8206 int
futimes(__unused proc_t p,struct futimes_args * uap,__unused int32_t * retval)8207 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
8208 {
8209 	struct timespec ts[2];
8210 	vnode_t vp;
8211 	user_addr_t usrtvp;
8212 	int error;
8213 
8214 	AUDIT_ARG(fd, uap->fd);
8215 	usrtvp = uap->tptr;
8216 	if ((error = getutimes(usrtvp, ts)) != 0) {
8217 		return error;
8218 	}
8219 	if ((error = file_vnode(uap->fd, &vp)) != 0) {
8220 		return error;
8221 	}
8222 	if ((error = vnode_getwithref(vp))) {
8223 		file_drop(uap->fd);
8224 		return error;
8225 	}
8226 
8227 #if CONFIG_FILE_LEASES
8228 	vnode_breakdirlease(vp, true, O_WRONLY);
8229 #endif
8230 
8231 	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
8232 
8233 	vnode_put(vp);
8234 	file_drop(uap->fd);
8235 	return error;
8236 }
8237 
8238 static int
truncate_validate_common(proc_t p,off_t length)8239 truncate_validate_common(proc_t p, off_t length)
8240 {
8241 	rlim_t fsize_limit;
8242 
8243 	if (length < 0) {
8244 		return EINVAL;
8245 	}
8246 
8247 	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8248 	if ((rlim_t)length > fsize_limit) {
8249 		psignal(p, SIGXFSZ);
8250 		return EFBIG;
8251 	}
8252 
8253 	return 0;
8254 }
8255 
/*
 * Common core for truncate(2)/ftruncate(2): set va_data_size on `vp'.
 * `need_auth' is false on the ftruncate path, where write access was
 * already authorized at open time; the MACF truncate check still runs
 * in both cases.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open.  We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	/* the size change itself */
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* notify hook mirrors the pre-check above, but only on success */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8306 
8307 /*
8308  * Truncate a file given its path name.
8309  */
8310 /* ARGSUSED */
8311 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8312 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8313 {
8314 	vfs_context_t ctx = vfs_context_current();
8315 	vnode_t vp;
8316 	int error;
8317 	struct nameidata nd;
8318 
8319 	if ((error = truncate_validate_common(p, uap->length))) {
8320 		return error;
8321 	}
8322 
8323 	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8324 	    UIO_USERSPACE, uap->path, ctx);
8325 
8326 	if ((error = namei(&nd))) {
8327 		return error;
8328 	}
8329 
8330 	vp = nd.ni_vp;
8331 	nameidone(&nd);
8332 
8333 	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8334 	vnode_put(vp);
8335 
8336 	return error;
8337 }
8338 
8339 /*
8340  * Truncate a file given a file descriptor.
8341  */
/* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	/* fp_lookup() takes a reference on the fileproc; file_drop() releases it. */
	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		/* POSIX shared memory objects have their own truncate path. */
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		/* Anything else (sockets, pipes, ...) cannot be truncated. */
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/*
	 * need_auth == false: write access was effectively authorized when the
	 * file was opened, so use the open-time credential and skip re-auth.
	 */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	vnode_put(vp);

out:
	file_drop(uap->fd);
	return error;
}
8393 
8394 
8395 /*
8396  * Sync an open file with synchronized I/O _file_ integrity completion
8397  */
8398 /* ARGSUSED */
8399 int
fsync(proc_t p,struct fsync_args * uap,__unused int32_t * retval)8400 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8401 {
8402 	__pthread_testcancel(1);
8403 	return fsync_common(p, uap, MNT_WAIT);
8404 }
8405 
8406 
8407 /*
8408  * Sync an open file with synchronized I/O _file_ integrity completion
8409  *
8410  * Notes:	This is a legacy support function that does not test for
8411  *		thread cancellation points.
8412  */
8413 /* ARGSUSED */
8414 int
fsync_nocancel(proc_t p,struct fsync_nocancel_args * uap,__unused int32_t * retval)8415 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8416 {
8417 	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
8418 }
8419 
8420 
8421 /*
8422  * Sync an open file with synchronized I/O _data_ integrity completion
8423  */
8424 /* ARGSUSED */
8425 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)8426 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8427 {
8428 	__pthread_testcancel(1);
8429 	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
8430 }
8431 
8432 
8433 /*
8434  * fsync_common
8435  *
8436  * Common fsync code to support both synchronized I/O file integrity completion
8437  * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8438  *
8439  * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8440  * will only guarantee that the file data contents are retrievable.  If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8442  * includes additional metadata unnecessary for retrieving the file data
8443  * contents, such as atime, mtime, ctime, etc., also be committed to stable
8444  * storage.
8445  *
8446  * Parameters:	p				The process
8447  *		uap->fd				The descriptor to synchronize
8448  *		flags				The data integrity flags
8449  *
8450  * Returns:	int				Success
8451  *	fp_getfvp:EBADF				Bad file descriptor
8452  *	fp_getfvp:ENOTSUP			fd does not refer to a vnode
8453  *	VNOP_FSYNC:???				unspecified
8454  *
8455  * Notes:	We use struct fsync_args because it is a short name, and all
8456  *		caller argument structures are otherwise identical.
8457  */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* fp_getfvp() takes an fd reference; paired with file_drop() below. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	/* Take an iocount on the vnode; paired with vnode_put() below. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		/* Best effort: a shadow-file flush failure does not fail the fsync. */
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8495 
8496 /*
8497  * Duplicate files.  Source must be a file, target must be a file or
8498  * must not exist.
8499  *
8500  * XXX Copyfile authorisation checking is woefully inadequate, and will not
8501  *     perform inheritance correctly.
8502  */
/* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source; the iocount on fvp is released at out1. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/*
	 * Look up the target for creation.  SAVESTART keeps ni_startdir
	 * referenced so it can be released (vnode_put(sdvp)) after nameidone().
	 */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only acceptable with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Directories may be neither source nor target. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* Sockets cannot be copied (fdesc pseudo-vnodes are the exception). */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Need: read on source, delete on an existing target, add-file on tdvp. */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* Source must not be the target's parent directory. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		error = -1;	/* sentinel: translated to success below */
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* fvp == tvp: nothing was done; report success per POSIX. */
	if (error == -1) {
		return 0;
	}
	return error;
}
8610 
8611 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8612 
8613 /*
8614  * Helper function for doing clones. The caller is expected to provide an
8615  * iocounted source vnode and release it.
8616  */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted = 0;
	struct vnode_attr va;
	struct vnode_attr nva;
	uint32_t vnop_flags;

	/*
	 * Only regular files, symlinks, and directories can be cloned; a
	 * directory must not be a volume root, a mount point, or mounted on.
	 */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
	/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Look up the destination name; WANTPARENT keeps tdvp referenced. */
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* The clone target must not already exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Clones cannot cross filesystems. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		/* Caller (fclonefileat) already proved data-read access via FREAD. */
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACLs unless the CLONE_ACL
	 * flag is specified. By default, the clone file will inherit the target
	 * directory's ACLs unless the CLONE_ACL flag is specified, in which case
	 * it will inherit the source file's ACLs instead.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	if (flags & CLONE_ACL) {
		VATTR_WANTED(&va, va_acl);
	}

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;	/* kauth_acl_free() in the out path */
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE;	/* vn_attribute_cleanup() in the out path */
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	/* On success the VNOP returns the newly created vnode in tvp. */
	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int     update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&nva)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
		/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8846 
8847 /*
8848  * clone files or directories, target must not exist.
8849  */
8850 /* ARGSUSED */
8851 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8852 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8853     __unused int32_t *retval)
8854 {
8855 	vnode_t fvp;
8856 	struct nameidata fromnd;
8857 	int follow;
8858 	int error;
8859 	vfs_context_t ctx = vfs_context_current();
8860 
8861 	/* Check that the flags are valid. */
8862 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8863 		return EINVAL;
8864 	}
8865 
8866 	AUDIT_ARG(fd, uap->src_dirfd);
8867 
8868 	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8869 	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8870 	    UIO_USERSPACE, uap->src, ctx);
8871 	if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8872 		return error;
8873 	}
8874 
8875 	fvp = fromnd.ni_vp;
8876 	nameidone(&fromnd);
8877 
8878 	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8879 	    uap->flags, ctx);
8880 
8881 	vnode_put(fvp);
8882 	return error;
8883 }
8884 
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	/* fp_getfvp() takes an fd reference; released via file_drop() at out. */
	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
	if (error) {
		return error;
	}

	/* The source descriptor must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if ((error = vnode_getwithref(fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	/*
	 * data_read_authorised == TRUE: the FREAD check above stands in for
	 * KAUTH_VNODE_READ_DATA authorization on the source.
	 */
	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
8925 
8926 static int
rename_submounts_callback(mount_t mp,void * arg)8927 rename_submounts_callback(mount_t mp, void *arg)
8928 {
8929 	int error = 0;
8930 	mount_t pmp = (mount_t)arg;
8931 	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8932 
8933 	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8934 		return 0;
8935 	}
8936 
8937 	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8938 		return 0;
8939 	}
8940 
8941 	if ((error = vfs_busy(mp, LK_NOWAIT))) {
8942 		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8943 		return -1;
8944 	}
8945 
8946 	size_t pathlen = MAXPATHLEN;
8947 	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8948 		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8949 	}
8950 
8951 	vfs_unbusy(mp);
8952 
8953 	return error;
8954 }
8955 
8956 /*
8957  * Rename files.  Source and destination must either both be directories,
8958  * or both not be directories.  If target is a directory, it must be empty.
8959  */
8960 /* ARGSUSED */
8961 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)8962 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8963     int tofd, user_addr_t to, int segflg, u_int uflags)
8964 {
8965 	vnode_t tvp, tdvp;
8966 	vnode_t fvp, fdvp;
8967 	vnode_t mnt_fvp;
8968 	struct nameidata *fromnd, *tond;
8969 	int error = 0;
8970 	int do_retry;
8971 	int retry_count;
8972 	int mntrename;
8973 	int need_event;
8974 	int need_kpath2;
8975 	int has_listeners;
8976 	const char *oname = NULL;
8977 	char *from_name = NULL, *to_name = NULL;
8978 	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8979 	int from_len = 0, to_len = 0;
8980 	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8981 	int holding_mntlock;
8982 	int vn_authorize_skipped;
8983 	mount_t locked_mp = NULL;
8984 	vnode_t oparent = NULLVP;
8985 #if CONFIG_FSE
8986 	fse_info from_finfo = {}, to_finfo;
8987 #endif
8988 	int from_truncated = 0, to_truncated = 0;
8989 	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8990 	int batched = 0;
8991 	struct vnode_attr *fvap, *tvap;
8992 	int continuing = 0;
8993 	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
8994 	int32_t nofollow_any = 0;
8995 	/* carving out a chunk for structs that are too big to be on stack. */
8996 	struct {
8997 		struct nameidata from_node, to_node;
8998 		struct vnode_attr fv_attr, tv_attr;
8999 	} * __rename_data;
9000 
9001 	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
9002 	fromnd = &__rename_data->from_node;
9003 	tond = &__rename_data->to_node;
9004 
9005 	holding_mntlock = 0;
9006 	do_retry = 0;
9007 	retry_count = 0;
9008 retry:
9009 	fvp = tvp = NULL;
9010 	fdvp = tdvp = NULL;
9011 	fvap = tvap = NULL;
9012 	mnt_fvp = NULLVP;
9013 	mntrename = FALSE;
9014 	vn_authorize_skipped = FALSE;
9015 
9016 	if (uflags & RENAME_NOFOLLOW_ANY) {
9017 		nofollow_any = NAMEI_NOFOLLOW_ANY;
9018 	}
9019 	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
9020 	    segflg, from, ctx);
9021 	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
9022 
9023 	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
9024 	    segflg, to, ctx);
9025 	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
9026 
9027 continue_lookup:
9028 	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9029 		if ((error = nameiat(fromnd, fromfd))) {
9030 			goto out1;
9031 		}
9032 		fdvp = fromnd->ni_dvp;
9033 		fvp  = fromnd->ni_vp;
9034 
9035 		if (fvp && fvp->v_type == VDIR) {
9036 			tond->ni_cnd.cn_flags |= WILLBEDIR;
9037 		}
9038 	}
9039 
9040 	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9041 		if ((error = nameiat(tond, tofd))) {
9042 			/*
9043 			 * Translate error code for rename("dir1", "dir2/.").
9044 			 */
9045 			if (error == EISDIR && fvp->v_type == VDIR) {
9046 				error = EINVAL;
9047 			}
9048 			goto out1;
9049 		}
9050 		tdvp = tond->ni_dvp;
9051 		tvp  = tond->ni_vp;
9052 	}
9053 
9054 #if DEVELOPMENT || DEBUG
9055 	/*
9056 	 * XXX VSWAP: Check for entitlements or special flag here
9057 	 * so we can restrict access appropriately.
9058 	 */
9059 #else /* DEVELOPMENT || DEBUG */
9060 
9061 	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
9062 		error = EPERM;
9063 		goto out1;
9064 	}
9065 
9066 	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
9067 		error = EPERM;
9068 		goto out1;
9069 	}
9070 #endif /* DEVELOPMENT || DEBUG */
9071 
9072 	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
9073 		error = ENOENT;
9074 		goto out1;
9075 	}
9076 
9077 	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
9078 		int32_t pval = 0;
9079 		int err = 0;
9080 
9081 		/*
9082 		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
9083 		 * has the same name as target iff the following conditions are met:
9084 		 * 1. the target file system is case insensitive
9085 		 * 2. source and target directories are the same
9086 		 * 3. source and target files are the same
9087 		 * 4. name only differs in case (determined by underlying filesystem)
9088 		 */
9089 		if (fvp != tvp || fdvp != tdvp) {
9090 			error = EEXIST;
9091 			goto out1;
9092 		}
9093 
9094 		/*
9095 		 * Assume that the target file system is case sensitive if
9096 		 * _PC_CASE_SENSITIVE selector isn't supported.
9097 		 */
9098 		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
9099 		if (err != 0 || pval != 0) {
9100 			error = EEXIST;
9101 			goto out1;
9102 		}
9103 	}
9104 
9105 	batched = vnode_compound_rename_available(fdvp);
9106 
9107 #if CONFIG_FSE
9108 	need_event = need_fsevent(FSE_RENAME, fdvp);
9109 	if (need_event) {
9110 		if (fvp) {
9111 			get_fse_info(fvp, &from_finfo, ctx);
9112 		} else {
9113 			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
9114 			if (error) {
9115 				goto out1;
9116 			}
9117 
9118 			fvap = &__rename_data->fv_attr;
9119 		}
9120 
9121 		if (tvp) {
9122 			get_fse_info(tvp, &to_finfo, ctx);
9123 		} else if (batched) {
9124 			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
9125 			if (error) {
9126 				goto out1;
9127 			}
9128 
9129 			tvap = &__rename_data->tv_attr;
9130 		}
9131 	}
9132 #else
9133 	need_event = 0;
9134 #endif /* CONFIG_FSE */
9135 
9136 	has_listeners = kauth_authorize_fileop_has_listeners();
9137 
9138 	need_kpath2 = 0;
9139 #if CONFIG_AUDIT
9140 	if (AUDIT_RECORD_EXISTS()) {
9141 		need_kpath2 = 1;
9142 	}
9143 #endif
9144 
9145 	if (need_event || has_listeners) {
9146 		if (from_name == NULL) {
9147 			GET_PATH(from_name);
9148 		}
9149 
9150 		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
9151 
9152 		if (from_name_no_firmlink == NULL) {
9153 			GET_PATH(from_name_no_firmlink);
9154 		}
9155 
9156 		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
9157 	}
9158 
9159 	if (need_event || need_kpath2 || has_listeners) {
9160 		if (to_name == NULL) {
9161 			GET_PATH(to_name);
9162 		}
9163 
9164 		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
9165 
9166 		if (to_name_no_firmlink == NULL) {
9167 			GET_PATH(to_name_no_firmlink);
9168 		}
9169 
9170 		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
9171 		if (to_name && need_kpath2) {
9172 			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9173 		}
9174 	}
9175 	if (!fvp) {
9176 		/*
9177 		 * Claim: this check will never reject a valid rename.
9178 		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9179 		 * Suppose fdvp and tdvp are not on the same mount.
9180 		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
9181 		 *      then you can't move it to within another dir on the same mountpoint.
9182 		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9183 		 *
9184 		 * If this check passes, then we are safe to pass these vnodes to the same FS.
9185 		 */
9186 		if (fdvp->v_mount != tdvp->v_mount) {
9187 			error = EXDEV;
9188 			goto out1;
9189 		}
9190 		goto skipped_lookup;
9191 	}
9192 
9193 	/*
9194 	 * If the source and destination are the same (i.e. they're
9195 	 * links to the same vnode) and the target file system is
9196 	 * case sensitive, then there is nothing to do.
9197 	 *
9198 	 * XXX Come back to this.
9199 	 */
9200 	if (fvp == tvp) {
9201 		int pathconf_val;
9202 
9203 		/*
9204 		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9205 		 * then assume that this file system is case sensitive.
9206 		 */
9207 		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9208 		    pathconf_val != 0) {
9209 			vn_authorize_skipped = TRUE;
9210 			goto out1;
9211 		}
9212 	}
9213 
9214 	/*
9215 	 * Allow the renaming of mount points.
9216 	 * - target must not exist
9217 	 * - target must reside in the same directory as source
9218 	 * - union mounts cannot be renamed
9219 	 * - the root fs, and tightly-linked system volumes, cannot be renamed
9220 	 *
9221 	 * XXX Handle this in VFS after a continued lookup (if we missed
9222 	 * in the cache to start off)
9223 	 *
9224 	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9225 	 * we'll skip past here.  The file system is responsible for
9226 	 * checking that @tvp is not a descendent of @fvp and vice versa
9227 	 * so it should always return EINVAL if either @tvp or @fvp is the
9228 	 * root of a volume.
9229 	 */
9230 	if ((fvp->v_flag & VROOT) &&
9231 	    (fvp->v_type == VDIR) &&
9232 	    (tvp == NULL) &&
9233 	    (fvp->v_mountedhere == NULL) &&
9234 	    (fdvp == tdvp) &&
9235 	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9236 	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9237 	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9238 		vnode_t coveredvp;
9239 
9240 		/* switch fvp to the covered vnode */
9241 		coveredvp = fvp->v_mount->mnt_vnodecovered;
9242 		if ((vnode_getwithref(coveredvp))) {
9243 			error = ENOENT;
9244 			goto out1;
9245 		}
9246 		/*
9247 		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9248 		 * later.
9249 		 */
9250 		mnt_fvp = fvp;
9251 
9252 		fvp = coveredvp;
9253 		mntrename = TRUE;
9254 	}
9255 	/*
9256 	 * Check for cross-device rename.
9257 	 */
9258 	if ((fvp->v_mount != tdvp->v_mount) ||
9259 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
9260 		error = EXDEV;
9261 		goto out1;
9262 	}
9263 
9264 	/*
9265 	 * If source is the same as the destination (that is the
9266 	 * same inode number) then there is nothing to do...
9267 	 * EXCEPT if the underlying file system supports case
9268 	 * insensitivity and is case preserving.  In this case
9269 	 * the file system needs to handle the special case of
9270 	 * getting the same vnode as target (fvp) and source (tvp).
9271 	 *
9272 	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9273 	 * and _PC_CASE_PRESERVING can have this exception, and they need to
9274 	 * handle the special case of getting the same vnode as target and
9275 	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
9276 	 * so not to cause locking problems. There is a single reference on tvp.
9277 	 *
9278 	 * NOTE - that fvp == tvp also occurs if they are hard linked and
9279 	 * that correct behaviour then is just to return success without doing
9280 	 * anything.
9281 	 *
9282 	 * XXX filesystem should take care of this itself, perhaps...
9283 	 */
9284 	if (fvp == tvp && fdvp == tdvp) {
9285 		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9286 		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
9287 		    fromnd->ni_cnd.cn_namelen)) {
9288 			vn_authorize_skipped = TRUE;
9289 			goto out1;
9290 		}
9291 	}
9292 
9293 	if (holding_mntlock && fvp->v_mount != locked_mp) {
9294 		/*
9295 		 * we're holding a reference and lock
9296 		 * on locked_mp, but it no longer matches
9297 		 * what we want to do... so drop our hold
9298 		 */
9299 		mount_unlock_renames(locked_mp);
9300 		mount_drop(locked_mp, 0);
9301 		holding_mntlock = 0;
9302 	}
9303 	if (tdvp != fdvp && fvp->v_type == VDIR) {
9304 		/*
9305 		 * serialize renames that re-shape
9306 		 * the tree... if holding_mntlock is
9307 		 * set, then we're ready to go...
9308 		 * otherwise we
9309 		 * first need to drop the iocounts
9310 		 * we picked up, second take the
9311 		 * lock to serialize the access,
9312 		 * then finally start the lookup
9313 		 * process over with the lock held
9314 		 */
9315 		if (!holding_mntlock) {
9316 			/*
9317 			 * need to grab a reference on
9318 			 * the mount point before we
9319 			 * drop all the iocounts... once
9320 			 * the iocounts are gone, the mount
9321 			 * could follow
9322 			 */
9323 			locked_mp = fvp->v_mount;
9324 			mount_ref(locked_mp, 0);
9325 
9326 			/*
9327 			 * nameidone has to happen before we vnode_put(tvp)
9328 			 * since it may need to release the fs_nodelock on the tvp
9329 			 */
9330 			nameidone(tond);
9331 
9332 			if (tvp) {
9333 				vnode_put(tvp);
9334 			}
9335 			vnode_put(tdvp);
9336 
9337 			/*
9338 			 * nameidone has to happen before we vnode_put(fdvp)
9339 			 * since it may need to release the fs_nodelock on the fvp
9340 			 */
9341 			nameidone(fromnd);
9342 
9343 			vnode_put(fvp);
9344 			vnode_put(fdvp);
9345 
9346 			if (mnt_fvp != NULLVP) {
9347 				vnode_put(mnt_fvp);
9348 			}
9349 
9350 			mount_lock_renames(locked_mp);
9351 			holding_mntlock = 1;
9352 
9353 			goto retry;
9354 		}
9355 	} else {
9356 		/*
9357 		 * when we dropped the iocounts to take
9358 		 * the lock, we allowed the identity of
9359 		 * the various vnodes to change... if they did,
9360 		 * we may no longer be dealing with a rename
9361 		 * that reshapes the tree... once we're holding
9362 		 * the iocounts, the vnodes can't change type
9363 		 * so we're free to drop the lock at this point
9364 		 * and continue on
9365 		 */
9366 		if (holding_mntlock) {
9367 			mount_unlock_renames(locked_mp);
9368 			mount_drop(locked_mp, 0);
9369 			holding_mntlock = 0;
9370 		}
9371 	}
9372 
9373 	if (!batched) {
9374 		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
9375 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9376 		    flags, NULL);
9377 		if (error) {
9378 			if (error == ENOENT) {
9379 				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9380 					/*
9381 					 * We encountered a race where after doing the namei,
9382 					 * tvp stops being valid. If so, simply re-drive the rename
9383 					 * call from the top.
9384 					 */
9385 					do_retry = 1;
9386 					retry_count += 1;
9387 				}
9388 			}
9389 			goto out1;
9390 		}
9391 	}
9392 
9393 	/* Release the 'mnt_fvp' now that it is no longer needed. */
9394 	if (mnt_fvp != NULLVP) {
9395 		vnode_put(mnt_fvp);
9396 		mnt_fvp = NULLVP;
9397 	}
9398 
9399 	// save these off so we can later verify that fvp is the same
9400 	oname   = fvp->v_name;
9401 	oparent = fvp->v_parent;
9402 
9403 skipped_lookup:
9404 #if CONFIG_FILE_LEASES
9405 	/* Lease break needed for source's parent dir? */
9406 	vnode_breakdirlease(fdvp, false, O_WRONLY);
9407 
9408 	/* Lease break needed for target's parent dir? */
9409 	vnode_breakdirlease(tdvp, false, O_WRONLY);
9410 #endif
9411 
9412 	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
9413 	    tdvp, &tvp, &tond->ni_cnd, tvap,
9414 	    flags, ctx);
9415 
9416 	if (holding_mntlock) {
9417 		/*
9418 		 * we can drop our serialization
9419 		 * lock now
9420 		 */
9421 		mount_unlock_renames(locked_mp);
9422 		mount_drop(locked_mp, 0);
9423 		holding_mntlock = 0;
9424 	}
9425 	if (error) {
9426 		if (error == EDATALESS) {
9427 			/*
9428 			 * If we've been here before, something has gone
9429 			 * horribly wrong and we should just get out lest
9430 			 * we spiral around the drain forever.
9431 			 */
9432 			if (flags & VFS_RENAME_DATALESS) {
9433 				error = EIO;
9434 				goto out1;
9435 			}
9436 
9437 			/*
9438 			 * The object we're renaming is dataless (or has a
9439 			 * dataless descendent) and requires materialization
9440 			 * before the rename occurs.  But we're holding the
9441 			 * mount point's rename lock, so it's not safe to
9442 			 * make the upcall.
9443 			 *
9444 			 * In this case, we release the lock (above), perform
9445 			 * the materialization, and start the whole thing over.
9446 			 */
9447 			error = vfs_materialize_reparent(fvp, tdvp);
9448 			if (error == 0) {
9449 				/*
9450 				 * The next time around we need to tell the
9451 				 * file system that the materializtaion has
9452 				 * been performed.
9453 				 */
9454 				flags |= VFS_RENAME_DATALESS;
9455 				do_retry = 1;
9456 			}
9457 			goto out1;
9458 		}
9459 		if (error == EKEEPLOOKING) {
9460 			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9461 				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9462 					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
9463 				}
9464 			}
9465 
9466 			fromnd->ni_vp = fvp;
9467 			tond->ni_vp = tvp;
9468 
9469 			goto continue_lookup;
9470 		}
9471 
9472 		/*
9473 		 * We may encounter a race in the VNOP where the destination didn't
9474 		 * exist when we did the namei, but it does by the time we go and
9475 		 * try to create the entry. In this case, we should re-drive this rename
9476 		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
9477 		 * but other filesystems susceptible to this race could return it, too.
9478 		 */
9479 		if (error == ERECYCLE) {
9480 			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
9481 				do_retry = 1;
9482 				retry_count += 1;
9483 			} else {
9484 				printf("rename retry limit due to ERECYCLE reached\n");
9485 				error = ENOENT;
9486 			}
9487 		}
9488 
9489 		/*
9490 		 * For compound VNOPs, the authorization callback may return
9491 		 * ENOENT in case of racing hardlink lookups hitting the name
9492 		 * cache, redrive the lookup.
9493 		 */
9494 		if (batched && error == ENOENT) {
9495 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9496 				do_retry = 1;
9497 				retry_count += 1;
9498 			}
9499 		}
9500 
9501 		goto out1;
9502 	}
9503 
9504 	/* call out to allow 3rd party notification of rename.
9505 	 * Ignore result of kauth_authorize_fileop call.
9506 	 */
9507 	kauth_authorize_fileop(vfs_context_ucred(ctx),
9508 	    KAUTH_FILEOP_RENAME,
9509 	    (uintptr_t)from_name, (uintptr_t)to_name);
9510 	if (flags & VFS_RENAME_SWAP) {
9511 		kauth_authorize_fileop(vfs_context_ucred(ctx),
9512 		    KAUTH_FILEOP_RENAME,
9513 		    (uintptr_t)to_name, (uintptr_t)from_name);
9514 	}
9515 
9516 #if CONFIG_FSE
9517 	if (from_name != NULL && to_name != NULL) {
9518 		if (from_truncated || to_truncated) {
9519 			// set it here since only the from_finfo gets reported up to user space
9520 			from_finfo.mode |= FSE_TRUNCATED_PATH;
9521 		}
9522 
9523 		if (tvap && tvp) {
9524 			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
9525 		}
9526 		if (fvap) {
9527 			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
9528 		}
9529 
9530 		if (tvp) {
9531 			add_fsevent(FSE_RENAME, ctx,
9532 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9533 			    FSE_ARG_FINFO, &from_finfo,
9534 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9535 			    FSE_ARG_FINFO, &to_finfo,
9536 			    FSE_ARG_DONE);
9537 			if (flags & VFS_RENAME_SWAP) {
9538 				/*
9539 				 * Strictly speaking, swap is the equivalent of
9540 				 * *three* renames.  FSEvents clients should only take
9541 				 * the events as a hint, so we only bother reporting
9542 				 * two.
9543 				 */
9544 				add_fsevent(FSE_RENAME, ctx,
9545 				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9546 				    FSE_ARG_FINFO, &to_finfo,
9547 				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9548 				    FSE_ARG_FINFO, &from_finfo,
9549 				    FSE_ARG_DONE);
9550 			}
9551 		} else {
9552 			add_fsevent(FSE_RENAME, ctx,
9553 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9554 			    FSE_ARG_FINFO, &from_finfo,
9555 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9556 			    FSE_ARG_DONE);
9557 		}
9558 	}
9559 #endif /* CONFIG_FSE */
9560 
9561 	/*
9562 	 * update filesystem's mount point data
9563 	 */
9564 	if (mntrename) {
9565 		char *cp, *pathend, *mpname;
9566 		char * tobuf;
9567 		struct mount *mp;
9568 		int maxlen;
9569 		size_t len = 0;
9570 
9571 		mp = fvp->v_mountedhere;
9572 
9573 		if (vfs_busy(mp, LK_NOWAIT)) {
9574 			error = EBUSY;
9575 			goto out1;
9576 		}
9577 		tobuf = zalloc(ZV_NAMEI);
9578 
9579 		if (UIO_SEG_IS_USER_SPACE(segflg)) {
9580 			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
9581 		} else {
9582 			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
9583 		}
9584 		if (!error) {
9585 			/* find current mount point prefix */
9586 			pathend = &mp->mnt_vfsstat.f_mntonname[0];
9587 			for (cp = pathend; *cp != '\0'; ++cp) {
9588 				if (*cp == '/') {
9589 					pathend = cp + 1;
9590 				}
9591 			}
9592 			/* find last component of target name */
9593 			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
9594 				if (*cp == '/') {
9595 					mpname = cp + 1;
9596 				}
9597 			}
9598 
9599 			/* Update f_mntonname of sub mounts */
9600 			vfs_iterate(0, rename_submounts_callback, (void *)mp);
9601 
9602 			/* append name to prefix */
9603 			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
9604 			bzero(pathend, maxlen);
9605 
9606 			strlcpy(pathend, mpname, maxlen);
9607 		}
9608 		zfree(ZV_NAMEI, tobuf);
9609 
9610 		vfs_unbusy(mp);
9611 
9612 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
9613 	}
9614 	/*
9615 	 * fix up name & parent pointers.  note that we first
9616 	 * check that fvp has the same name/parent pointers it
9617 	 * had before the rename call... this is a 'weak' check
9618 	 * at best...
9619 	 *
9620 	 * XXX oparent and oname may not be set in the compound vnop case
9621 	 */
9622 	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
9623 		int update_flags;
9624 
9625 		update_flags = VNODE_UPDATE_NAME;
9626 
9627 		if (fdvp != tdvp) {
9628 			update_flags |= VNODE_UPDATE_PARENT;
9629 		}
9630 
9631 		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
9632 	}
9633 out1:
9634 	/*
9635 	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
9636 	 * skipped earlier as no actual rename was performed.
9637 	 */
9638 	if (vn_authorize_skipped && error == 0) {
9639 		error = vn_authorize_renamex_with_paths(fdvp, fvp,
9640 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9641 		    flags, NULL);
9642 		if (error && error == ENOENT) {
9643 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9644 				do_retry = 1;
9645 				retry_count += 1;
9646 			}
9647 		}
9648 	}
9649 	if (to_name != NULL) {
9650 		RELEASE_PATH(to_name);
9651 		to_name = NULL;
9652 	}
9653 	if (to_name_no_firmlink != NULL) {
9654 		RELEASE_PATH(to_name_no_firmlink);
9655 		to_name_no_firmlink = NULL;
9656 	}
9657 	if (from_name != NULL) {
9658 		RELEASE_PATH(from_name);
9659 		from_name = NULL;
9660 	}
9661 	if (from_name_no_firmlink != NULL) {
9662 		RELEASE_PATH(from_name_no_firmlink);
9663 		from_name_no_firmlink = NULL;
9664 	}
9665 	if (holding_mntlock) {
9666 		mount_unlock_renames(locked_mp);
9667 		mount_drop(locked_mp, 0);
9668 		holding_mntlock = 0;
9669 	}
9670 	if (tdvp) {
9671 		/*
9672 		 * nameidone has to happen before we vnode_put(tdvp)
9673 		 * since it may need to release the fs_nodelock on the tdvp
9674 		 */
9675 		nameidone(tond);
9676 
9677 		if (tvp) {
9678 			vnode_put(tvp);
9679 		}
9680 		vnode_put(tdvp);
9681 	}
9682 	if (fdvp) {
9683 		/*
9684 		 * nameidone has to happen before we vnode_put(fdvp)
9685 		 * since it may need to release the fs_nodelock on the fdvp
9686 		 */
9687 		nameidone(fromnd);
9688 
9689 		if (fvp) {
9690 			vnode_put(fvp);
9691 		}
9692 		vnode_put(fdvp);
9693 	}
9694 	if (mnt_fvp != NULLVP) {
9695 		vnode_put(mnt_fvp);
9696 	}
9697 	/*
9698 	 * If things changed after we did the namei, then we will re-drive
9699 	 * this rename call from the top.
9700 	 */
9701 	if (do_retry) {
9702 		do_retry = 0;
9703 		goto retry;
9704 	}
9705 
9706 	kfree_type(typeof(*__rename_data), __rename_data);
9707 	return error;
9708 }
9709 
9710 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9711 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9712 {
9713 	return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9714 	           AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9715 }
9716 
9717 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9718 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9719 {
9720 	if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9721 		return EINVAL;
9722 	}
9723 
9724 	if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9725 		return EINVAL;
9726 	}
9727 
9728 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9729 	           uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9730 }
9731 
9732 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9733 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9734 {
9735 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9736 	           uap->tofd, uap->to, UIO_USERSPACE, 0);
9737 }
9738 
9739 /*
9740  * Make a directory file.
9741  *
9742  * Returns:	0			Success
9743  *		EEXIST
9744  *	namei:???
9745  *	vnode_authorize:???
9746  *	vn_create:???
9747  */
9748 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;        /* nonzero if the FS supports compound mkdir */
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/*
	 * CREATE-intent lookup; LOCKPARENT keeps an iocount on the parent
	 * (dvp) across the create.  WILLBEDIR tells the lookup code the
	 * last component is intended to be a directory.
	 */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A vnode for the last component already exists: mkdir must fail. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop the first lookup's state before redriving it. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				/* Target exists after all: report EEXIST, not the auth error. */
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry in dvp: break any write lease on the parent dir. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued from here. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9864 
9865 /*
9866  * mkdir_extended: Create a directory; with extended security (ACL).
9867  *
9868  * Parameters:    p                       Process requesting to create the directory
9869  *                uap                     User argument descriptor (see below)
9870  *                retval                  (ignored)
9871  *
9872  * Indirect:      uap->path               Path of directory to create
9873  *                uap->mode               Access permissions to set
9874  *                uap->xsecurity          ACL to set
9875  *
9876  * Returns:        0                      Success
9877  *                !0                      Not success
9878  *
9879  */
9880 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9881 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9882 {
9883 	int ciferror;
9884 	kauth_filesec_t xsecdst;
9885 	struct vnode_attr va;
9886 
9887 	AUDIT_ARG(owner, uap->uid, uap->gid);
9888 
9889 	xsecdst = NULL;
9890 	if ((uap->xsecurity != USER_ADDR_NULL) &&
9891 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9892 		return ciferror;
9893 	}
9894 
9895 	VATTR_INIT(&va);
9896 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9897 	if (xsecdst != NULL) {
9898 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9899 		va.va_vaflags |= VA_FILESEC_ACL;
9900 	}
9901 
9902 	ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9903 	    UIO_USERSPACE);
9904 	if (xsecdst != NULL) {
9905 		kauth_filesec_free(xsecdst);
9906 	}
9907 	return ciferror;
9908 }
9909 
9910 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9911 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9912 {
9913 	struct vnode_attr va;
9914 
9915 	VATTR_INIT(&va);
9916 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9917 
9918 	return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9919 	           UIO_USERSPACE);
9920 }
9921 
9922 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9923 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9924 {
9925 	struct vnode_attr va;
9926 
9927 	VATTR_INIT(&va);
9928 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9929 
9930 	return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9931 	           UIO_USERSPACE);
9932 }
9933 
/*
 * Remove the directory named by 'dirpath' (resolved relative to 'fd').
 * Backs rmdir(2)/unlinkat(2)-on-directory.  'unlink_flags' may carry
 * VNODE_REMOVE_NOFOLLOW_ANY and VNODE_REMOVE_DATALESS_DIR.
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated to keep the large nameidata off the kernel stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char     *path = NULL;              /* full path, for fileop listeners */
	char     *no_firmlink_path = NULL;  /* firmlink-free path, for fsevents */
	int       len_path = 0;
	int       len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;              /* bounds ENOENT-race redrives */
	int batched;                        /* nonzero if FS supports compound rmdir */

	int restart_flag;
	int nofollow_any = 0;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/* Translate the removal flag into its namei equivalent. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		/* vp may be NULLVP when the FS defers to a compound rmdir. */
		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Only the kernel may remove an in-use swap directory vnode. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete.  Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						/* Racing lookup invalidated vp: redrive, bounded. */
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info  finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* No vp yet: the compound VNOP fills in vap for us. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Removing an entry from dvp: break any write lease on the parent dir. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued from here. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * Only do this if the directory is actually
			 * marked as DATALESS.
			 */
			struct vnode_attr *lvap =
			    kalloc_type(struct vnode_attr, Z_WAITOK);

			VATTR_INIT(lvap);
			VATTR_WANTED(lvap, va_flags);
			if (vnode_getattr(vp, lvap, ctx) == 0 &&
			    VATTR_IS_SUPPORTED(lvap, va_flags) &&
			    (lvap->va_flags & SF_DATALESS) != 0) {
				/*
				 * If this fails, we want to keep the original
				 * error.
				 */
				if (vn_remove(dvp, &vp, ndp,
				    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
					error = 0;
				}
			}
			kfree_type(struct vnode_attr, lvap);
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		if (restart_flag == 0) {
			/*
			 * NOTE(review): wakeup_one/tsleep on 'vp' pair with the
			 * AppleDouble restart path; vp may be NULLVP here when
			 * the lookup produced no vnode -- presumably benign,
			 * but worth confirming.
			 */
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10235 
10236 /*
10237  * Remove a directory file.
10238  */
10239 /* ARGSUSED */
10240 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10241 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10242 {
10243 	return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10244 	           CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10245 }
10246 
/*
 * Get direntry length padded to 8 byte alignment.
 * struct direntry embeds a fixed-size name buffer (presumably MAXPATHLEN
 * bytes -- declared elsewhere); subtracting (MAXPATHLEN-1) and adding the
 * actual name length yields the space this entry really needs.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment (same scheme as above,
 * with struct dirent's __DARWIN_MAXNAMLEN+1 name buffer).
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent (address of its last byte, per d_reclen) */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10258 
10259 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10260 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10261     int *numdirent, vfs_context_t ctxp)
10262 {
10263 	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
10264 	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10265 	    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10266 		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10267 	} else {
10268 		size_t bufsize;
10269 		void * bufptr;
10270 		uio_t auio;
10271 		struct direntry *entry64;
10272 		struct dirent *dep;
10273 		size_t bytesread;
10274 		int error;
10275 
10276 		/*
10277 		 * We're here because the underlying file system does not
10278 		 * support direnties or we mounted denying support so we must
10279 		 * fall back to dirents and convert them to direntries.
10280 		 *
10281 		 * Our kernel buffer needs to be smaller since re-packing will
10282 		 * expand each dirent.  The worse case (when the name length
10283 		 * is 3 or less) corresponds to a struct direntry size of 32
10284 		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10285 		 * (4-byte aligned).  So having a buffer that is 3/8 the size
10286 		 * will prevent us from reading more than we can pack.
10287 		 *
10288 		 * Since this buffer is wired memory, we will limit the
10289 		 * buffer size to a maximum of 32K. We would really like to
10290 		 * use 32K in the MIN(), but we use magic number 87371 to
10291 		 * prevent uio_resid() * 3 / 8 from overflowing.
10292 		 */
10293 		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10294 		bufptr = kalloc_data(bufsize, Z_WAITOK);
10295 		if (bufptr == NULL) {
10296 			return ENOMEM;
10297 		}
10298 
10299 		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10300 		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10301 		auio->uio_offset = uio->uio_offset;
10302 
10303 		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10304 
10305 		dep = (struct dirent *)bufptr;
10306 		bytesread = bufsize - uio_resid(auio);
10307 
10308 		entry64 = kalloc_type(struct direntry, Z_WAITOK);
10309 		/*
10310 		 * Convert all the entries and copy them out to user's buffer.
10311 		 */
10312 		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10313 			/* First check that the dirent struct up to d_name is within the buffer */
10314 			if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10315 			    /* Check that the length of the entire dirent is within the buffer */
10316 			    DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10317 			    /* Check that the actual length including the name doesn't exceed d_reclen */
10318 			    DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10319 				printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10320 				    vp->v_mount->mnt_vfsstat.f_mntonname,
10321 				    vp->v_name ? vp->v_name : "<unknown>");
10322 				error = EIO;
10323 				break;
10324 			}
10325 
10326 			size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
10327 
10328 			bzero(entry64, enbufsize);
10329 			/* Convert a dirent to a dirent64. */
10330 			entry64->d_ino = dep->d_ino;
10331 			entry64->d_seekoff = 0;
10332 			entry64->d_reclen = (uint16_t)enbufsize;
10333 			entry64->d_namlen = dep->d_namlen;
10334 			entry64->d_type = dep->d_type;
10335 			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10336 
10337 			/* Move to next entry. */
10338 			dep = (struct dirent *)((char *)dep + dep->d_reclen);
10339 
10340 			/* Copy entry64 to user's buffer. */
10341 			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10342 		}
10343 
10344 		/* Update the real offset using the offset we got from VNOP_READDIR. */
10345 		if (error == 0) {
10346 			uio->uio_offset = auio->uio_offset;
10347 		}
10348 		uio_free(auio);
10349 		kfree_data(bufptr, bufsize);
10350 		kfree_type(struct direntry, entry64);
10351 		return error;
10352 	}
10353 }
10354 
10355 #define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)
10356 
10357 /*
10358  * Read a block of directory entries in a file system independent format.
10359  */
/*
 * Common implementation of getdirentries(2) and getdirentries64(2).
 *
 * Reads directory entries from the vnode backing 'fd' into the user
 * buffer at 'bufp', advancing the file's offset under the per-file
 * offset lock.
 *
 * Parameters:	fd		Directory file descriptor
 *		bufp		User buffer address
 *		bufsize		User buffer size; silently clamped to
 *				GETDIRENTRIES_MAXBUFSIZE
 *		bytesread	Returns the number of bytes placed in bufp
 *		offset		If non-NULL, returns the directory offset at
 *				which this read started
 *		eofflag		Returns the end-of-directory indication from
 *				the file system
 *		flags		VNODE_READDIR_* flags; VNODE_READDIR_EXTENDED
 *				selects the 64-bit (struct direntry) format
 *				via vnode_readdir64()
 *
 * Returns:	0		Success
 *		EBADF		fd is not open for reading
 *		EINVAL		fd does not reference a directory
 *		???		error from fp_getfvp(), the MAC hooks,
 *				vnode_getwithref(), vnode_readdir64() or
 *				VNOP_READDIR()
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	UIO_STACKBUF(uio_buf, 1);

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * The vnode attached to the fd can be swapped out from under us by a
	 * union-mount descent (see the fp_set_data() below).  If that raced
	 * with our lookup, drop everything and retry against the fd's
	 * current vnode.  The offset lock serializes fg_offset updates.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The descriptor must be open for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Silently cap the transfer rather than failing oversized requests. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read starting at the current file offset; remember it for *offset. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	/* Either path updates fg_offset from the uio, even on error. */
	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If this layer of a union mount produced no entries at all, descend
	 * to the covered vnode, install it in the fd (with its own ref),
	 * reset the offset, and read from the lower layer instead.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10473 
10474 
10475 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10476 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10477 {
10478 	off_t offset;
10479 	ssize_t bytesread;
10480 	int error, eofflag;
10481 
10482 	AUDIT_ARG(fd, uap->fd);
10483 	error = getdirentries_common(uap->fd, uap->buf, uap->count,
10484 	    &bytesread, &offset, &eofflag, 0);
10485 
10486 	if (error == 0) {
10487 		if (proc_is64bit(p)) {
10488 			user64_long_t base = (user64_long_t)offset;
10489 			error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10490 		} else {
10491 			user32_long_t base = (user32_long_t)offset;
10492 			error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10493 		}
10494 		*retval = (int)bytesread;
10495 	}
10496 	return error;
10497 }
10498 
10499 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10500 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10501 {
10502 	off_t offset;
10503 	ssize_t bytesread;
10504 	int error, eofflag;
10505 	user_size_t bufsize;
10506 
10507 	AUDIT_ARG(fd, uap->fd);
10508 
10509 	/*
10510 	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10511 	 * then the kernel carves out the last 4 bytes to return extended
10512 	 * information to userspace (namely whether we reached EOF with this call).
10513 	 */
10514 	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10515 		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10516 	} else {
10517 		bufsize = uap->bufsize;
10518 	}
10519 
10520 	error = getdirentries_common(uap->fd, uap->buf, bufsize,
10521 	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10522 
10523 	if (error == 0) {
10524 		*retval = bytesread;
10525 		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10526 
10527 		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10528 			getdirentries64_flags_t flags = 0;
10529 			if (eofflag) {
10530 				flags |= GETDIRENTRIES64_EOF;
10531 			}
10532 			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10533 			    sizeof(flags));
10534 		}
10535 	}
10536 	return error;
10537 }
10538 
10539 
10540 /*
10541  * Set the mode mask for creation of filesystem nodes.
10542  * XXX implement xsecurity
10543  */
10544 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
10545 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10546 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10547 {
10548 	AUDIT_ARG(mask, newmask);
10549 	proc_fdlock(p);
10550 	*retval = p->p_fd.fd_cmask;
10551 	p->p_fd.fd_cmask = newmask & ALLPERMS;
10552 	proc_fdunlock(p);
10553 	return 0;
10554 }
10555 
10556 /*
10557  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10558  *
10559  * Parameters:    p                       Process requesting to set the umask
10560  *                uap                     User argument descriptor (see below)
10561  *                retval                  umask of the process (parameter p)
10562  *
10563  * Indirect:      uap->newmask            umask to set
10564  *                uap->xsecurity          ACL to set
10565  *
10566  * Returns:        0                      Success
10567  *                !0                      Not success
10568  *
10569  */
int
umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
{
	/*
	 * NOTE(review): uap->xsecurity is not consumed here — only the mode
	 * mask is applied; umask1() ignores its fsec argument entirely
	 * (xsecurity support is unimplemented, per the XXX above umask1).
	 */
	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
}
10575 
/*
 * umask: Set the file-creation mode mask; returns the previous mask via
 * retval.  Existing extended security state is left alone
 * (UMASK_NOXSECURITY).
 */
int
umask(proc_t p, struct umask_args *uap, int32_t *retval)
{
	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
}
10581 
10582 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT                               \
10583 	"com.apple.private.vfs.revoke-mounted-device"
10584 
10585 /*
10586  * Void all references to file by ripping underlying filesystem
10587  * away from vnode.
10588  */
10589 /* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	/* Resolve the path (following symlinks) to the target vnode. */
	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* revoke(2) is only supported on character and block devices. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to revoke a block device that is currently mounted on. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/*
	 * Only the owner of the node or the superuser may revoke access;
	 * fetch va_uid to make that check.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only bother revoking if someone actually holds the node open. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10642 
10643 
10644 /*
 *  HFS/HFS Plus SPECIFIC SYSTEM CALLS
10646  *  The following system calls are designed to support features
10647  *  which are specific to the HFS & HFS Plus volume formats
10648  */
10649 
10650 
10651 /*
10652  * Obtain attribute information on objects in a directory while enumerating
10653  * the directory.
10654  */
10655 /* ARGSUSED */
/*
 * getdirentriesattr: enumerate a directory while returning attribute
 * information for each entry (HFS-era API; see VNOP_READDIRATTR).
 *
 * Copies in the requested attrlist and entry count, reads entries at the
 * fd's current offset under the per-file offset lock, and copies out the
 * actual count, a directory "newstate" token, and the starting offset.
 * retval is the EOF indication (0 or 1), not an error code.
 */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	UIO_STACKBUF(uio_buf, 1);
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the caller's count so it can be re-armed on union descent. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * The fd's vnode may be swapped by a concurrent union-mount descent
	 * (fp_set_data below); if it changed before we got the offset lock,
	 * retry with the fd's current vnode.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The descriptor must be open for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else {                                                // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* return error earlier, a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10819 
10820 /*
10821  * Exchange data between two files
10822  */
10823 
10824 /* ARGSUSED */
/*
 * exchangedata: atomically swap the data forks of two regular files on the
 * same volume (HFS-era API; the heavy lifting is VNOP_EXCHANGE).
 *
 * After a successful exchange the in-kernel identities are patched up:
 * the vnodes' names and parents are swapped under the name-cache lock so
 * the namespace view stays consistent, and fsevents / kauth fileop
 * listeners are notified when present.
 */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int   flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	/* FSOPT_NOFOLLOW suppresses symlink traversal for both paths. */
	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* Caller needs read and write access to both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Only resolve full paths (expensive) if someone is listening:
	 * either an fsevents watcher or a kauth fileop listener.
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}
		/*
		 * The files swapped contents but kept their directory
		 * entries; swap the cached names/parents to match, under
		 * the name-cache lock.
		 */
		name_cache_lock();

		tmpname     = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp           = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10975 
10976 /*
10977  * Return (in MB) the amount of freespace on the given vnode's volume.
10978  */
10979 uint32_t freespace_mb(vnode_t vp);
10980 
10981 uint32_t
freespace_mb(vnode_t vp)10982 freespace_mb(vnode_t vp)
10983 {
10984 	vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10985 	return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10986 	       vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10987 }
10988 
10989 #if CONFIG_SEARCHFS
10990 
10991 /* ARGSUSED */
10992 
10993 int
searchfs(proc_t p,struct searchfs_args * uap,__unused int32_t * retval)10994 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
10995 {
10996 	vnode_t vp, tvp;
10997 	int i, error = 0;
10998 	int fserror = 0;
10999 	struct nameidata nd;
11000 	struct user64_fssearchblock searchblock;
11001 	struct searchstate *state;
11002 	struct attrlist *returnattrs;
11003 	struct timeval timelimit;
11004 	void *searchparams1, *searchparams2;
11005 	uio_t auio = NULL;
11006 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11007 	uint32_t nummatches;
11008 	size_t mallocsize;
11009 	uint32_t nameiflags;
11010 	vfs_context_t ctx = vfs_context_current();
11011 	UIO_STACKBUF(uio_buf, 1);
11012 
11013 	/* Start by copying in fsearchblock parameter list */
11014 	if (IS_64BIT_PROCESS(p)) {
11015 		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
11016 		timelimit.tv_sec = searchblock.timelimit.tv_sec;
11017 		timelimit.tv_usec = searchblock.timelimit.tv_usec;
11018 	} else {
11019 		struct user32_fssearchblock tmp_searchblock;
11020 
11021 		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
11022 		// munge into 64-bit version
11023 		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
11024 		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
11025 		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
11026 		searchblock.maxmatches = tmp_searchblock.maxmatches;
11027 		/*
11028 		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
11029 		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
11030 		 */
11031 		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
11032 		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
11033 		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
11034 		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
11035 		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
11036 		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
11037 		searchblock.searchattrs = tmp_searchblock.searchattrs;
11038 	}
11039 	if (error) {
11040 		return error;
11041 	}
11042 
11043 	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
11044 	 */
11045 	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
11046 	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
11047 		return EINVAL;
11048 	}
11049 
11050 	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
11051 	/* It all has to do into local memory and it's not that big so we might as well  put it all together. */
11052 	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
11053 	/* block.                                                                                             */
11054 	/*												      */
11055 	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
11056 	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
11057 	/*       assumes the size is still 556 bytes it will continue to work				      */
11058 
11059 	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
11060 	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
11061 
11062 	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);
11063 
11064 	/* Now set up the various pointers to the correct place in our newly allocated memory */
11065 
11066 	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
11067 	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
11068 	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
11069 
11070 	/* Now copy in the stuff given our local variables. */
11071 
11072 	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
11073 		goto freeandexit;
11074 	}
11075 
11076 	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
11077 		goto freeandexit;
11078 	}
11079 
11080 	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
11081 		goto freeandexit;
11082 	}
11083 
11084 	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
11085 		goto freeandexit;
11086 	}
11087 
11088 	/*
11089 	 * When searching a union mount, need to set the
11090 	 * start flag at the first call on each layer to
11091 	 * reset state for the new volume.
11092 	 */
11093 	if (uap->options & SRCHFS_START) {
11094 		state->ss_union_layer = 0;
11095 	} else {
11096 		uap->options |= state->ss_union_flags;
11097 	}
11098 	state->ss_union_flags = 0;
11099 
11100 	/*
11101 	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
11102 	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
11103 	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
11104 	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
11105 	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
11106 	 */
11107 
11108 	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
11109 		attrreference_t* string_ref;
11110 		u_int32_t* start_length;
11111 		user64_size_t param_length;
11112 
11113 		/* validate searchparams1 */
11114 		param_length = searchblock.sizeofsearchparams1;
11115 		/* skip the word that specifies length of the buffer */
11116 		start_length = (u_int32_t*) searchparams1;
11117 		start_length = start_length + 1;
11118 		string_ref = (attrreference_t*) start_length;
11119 
11120 		/* ensure no negative offsets or too big offsets */
11121 		if (string_ref->attr_dataoffset < 0) {
11122 			error = EINVAL;
11123 			goto freeandexit;
11124 		}
11125 		if (string_ref->attr_length > MAXPATHLEN) {
11126 			error = EINVAL;
11127 			goto freeandexit;
11128 		}
11129 
11130 		/* Check for pointer overflow in the string ref */
11131 		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
11132 			error = EINVAL;
11133 			goto freeandexit;
11134 		}
11135 
11136 		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
11137 			error = EINVAL;
11138 			goto freeandexit;
11139 		}
11140 		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
11141 			error = EINVAL;
11142 			goto freeandexit;
11143 		}
11144 	}
11145 
11146 	/* set up the uio structure which will contain the users return buffer */
11147 	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
11148 	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
11149 
11150 	nameiflags = 0;
11151 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11152 		nameiflags |= FOLLOW;
11153 	}
11154 	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
11155 	    UIO_USERSPACE, uap->path, ctx);
11156 
11157 	error = namei(&nd);
11158 	if (error) {
11159 		goto freeandexit;
11160 	}
11161 	vp = nd.ni_vp;
11162 	nameidone(&nd);
11163 
11164 	/*
11165 	 * Switch to the root vnode for the volume
11166 	 */
11167 	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
11168 	vnode_put(vp);
11169 	if (error) {
11170 		goto freeandexit;
11171 	}
11172 	vp = tvp;
11173 
11174 #if CONFIG_UNION_MOUNTS
11175 	/*
11176 	 * If it's a union mount, the path lookup takes
11177 	 * us to the top layer. But we may need to descend
11178 	 * to a lower layer. For non-union mounts the layer
11179 	 * is always zero.
11180 	 */
11181 	for (i = 0; i < (int) state->ss_union_layer; i++) {
11182 		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
11183 			break;
11184 		}
11185 		tvp = vp;
11186 		vp = vp->v_mount->mnt_vnodecovered;
11187 		if (vp == NULL) {
11188 			vnode_put(tvp);
11189 			error = ENOENT;
11190 			goto freeandexit;
11191 		}
11192 		error = vnode_getwithref(vp);
11193 		vnode_put(tvp);
11194 		if (error) {
11195 			goto freeandexit;
11196 		}
11197 	}
11198 #endif /* CONFIG_UNION_MOUNTS */
11199 
11200 #if CONFIG_MACF
11201 	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
11202 	if (error) {
11203 		vnode_put(vp);
11204 		goto freeandexit;
11205 	}
11206 #endif
11207 
11208 
11209 	/*
11210 	 * If searchblock.maxmatches == 0, then skip the search. This has happened
11211 	 * before and sometimes the underlying code doesnt deal with it well.
11212 	 */
11213 	if (searchblock.maxmatches == 0) {
11214 		nummatches = 0;
11215 		goto saveandexit;
11216 	}
11217 
11218 	/*
11219 	 * Allright, we have everything we need, so lets make that call.
11220 	 *
11221 	 * We keep special track of the return value from the file system:
11222 	 * EAGAIN is an acceptable error condition that shouldn't keep us
11223 	 * from copying out any results...
11224 	 */
11225 
11226 	fserror = VNOP_SEARCHFS(vp,
11227 	    searchparams1,
11228 	    searchparams2,
11229 	    &searchblock.searchattrs,
11230 	    (uint32_t)searchblock.maxmatches,
11231 	    &timelimit,
11232 	    returnattrs,
11233 	    &nummatches,
11234 	    (uint32_t)uap->scriptcode,
11235 	    (uint32_t)uap->options,
11236 	    auio,
11237 	    (struct searchstate *) &state->ss_fsstate,
11238 	    ctx);
11239 
11240 #if CONFIG_UNION_MOUNTS
11241 	/*
11242 	 * If it's a union mount we need to be called again
11243 	 * to search the mounted-on filesystem.
11244 	 */
11245 	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
11246 		state->ss_union_flags = SRCHFS_START;
11247 		state->ss_union_layer++;        // search next layer down
11248 		fserror = EAGAIN;
11249 	}
11250 #endif /* CONFIG_UNION_MOUNTS */
11251 
11252 saveandexit:
11253 
11254 	vnode_put(vp);
11255 
11256 	/* Now copy out the stuff that needs copying out. That means the number of matches, the
11257 	 *  search state.  Everything was already put into he return buffer by the vop call. */
11258 
11259 	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
11260 		goto freeandexit;
11261 	}
11262 
11263 	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
11264 		goto freeandexit;
11265 	}
11266 
11267 	error = fserror;
11268 
11269 freeandexit:
11270 
11271 	kfree_data(searchparams1, mallocsize);
11272 
11273 	return error;
11274 } /* end of searchfs system call */
11275 
11276 #else /* CONFIG_SEARCHFS */
11277 
/* Stub: searchfs(2) is unavailable when CONFIG_SEARCHFS is disabled. */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11283 
11284 #endif /* CONFIG_SEARCHFS */
11285 
11286 
11287 #if CONFIG_DATALESS_FILES
11288 
11289 /*
11290  * === Namespace Resolver Up-call Mechanism ===
11291  *
11292  * When I/O is performed to a dataless file or directory (read, write,
11293  * lookup-in, etc.), the file system performs an upcall to the namespace
11294  * resolver (filecoordinationd) to materialize the object.
11295  *
11296  * We need multiple up-calls to be in flight at once, and we need these
11297  * up-calls to be interruptible, thus the following implementation:
11298  *
11299  * => The nspace_resolver_request represents the in-kernel request state.
11300  *    It contains a request ID, storage space for the errno code returned
11301  *    by filecoordinationd, and flags.
11302  *
11303  * => The request ID is simply a global monotonically incrementing 32-bit
11304  *    number.  Outstanding requests are stored in a hash table, and the
11305  *    hash function is extremely simple.
11306  *
11307  * => When an upcall is to be made to filecoordinationd, a request structure
11308  *    is allocated on the stack (it is small, and needs to live only during
11309  *    the duration of the call to resolve_nspace_item_ext()).  It is
11310  *    initialized and inserted into the table.  Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11312  *    can be inserted into the table (and thus limiting the number of
11313  *    outstanding requests issued to filecoordinationd); waiting for an
11314  *    available slot is interruptible.
11315  *
11316  * => Once the request has been inserted into the table, the up-call is made
11317  *    to filecoordinationd via a MiG-generated stub.  The up-call returns
11318  *    immediately and filecoordinationd processes the request asynchronously.
11319  *
 * => The caller now waits for the request to complete.  This is achieved by
11321  *    sleeping on the address of the request structure and waiting for
11322  *    filecoordinationd to mark the request structure as complete.  This
11323  *    is an interruptible sleep call; if interrupted, the request structure
11324  *    is removed from the table and EINTR is returned to the caller.  If
11325  *    this occurs, an advisory up-call is made to filecoordinationd with
11326  *    the request ID to indicate that the request can be aborted or
11327  *    de-prioritized at the discretion of filecoordinationd.
11328  *
11329  * => When filecoordinationd has completed the request, it signals completion
11330  *    by writing to the vfs.nspace.complete sysctl node.  Only a process
11331  *    decorated as a namespace resolver can write to this sysctl node.  The
11332  *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11333  *    The request ID is looked up in the table, and if the request is found,
11334  *    the error code is stored in the request structure and a wakeup()
11335  *    issued on the address of the request structure.  If the request is not
11336  *    found, we simply drop the completion notification, assuming that the
11337  *    caller was interrupted.
11338  *
11339  * => When the waiting thread wakes up, it extracts the error code from the
11340  *    request structure, removes the request from the table, and returns the
11341  *    error code to the calling function.  Fini!
11342  */
11343 
/*
 * In-kernel state for one outstanding up-call to the namespace
 * resolver.  Allocated on the requesting thread's stack (see the
 * block comment above) and linked into the request hash table for
 * the lifetime of the request.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash bucket linkage */
	vnode_t         r_vp;           /* vnode being materialized */
	vnode_t         r_tdvp;         /* rename destination dir, or NULL */
	uint32_t        r_req_id;       /* ID used to match completions */
	int             r_resolver_error; /* errno reported by the resolver */
	int             r_flags;        /* RRF_* flags below */
};

/* r_flags values */
#define RRF_COMPLETE    0x0001  /* request has been completed */
#define RRF_COMPLETING  0x0002  /* completion in progress without the lock */

/*
 * Completion tuple written by the resolver via the vfs.nspace.complete
 * sysctl (see sysctl_nspace_complete()).
 */
struct nspace_resolver_completion_data {
	uint32_t req_id;          /* ID of the request being completed */
	int32_t  resolver_error;  /* errno result from the resolver */
	uint64_t orig_gencount;   /* expected recursive gencount, or 0 */
	uint64_t orig_syncroot;   /* expected sync-root ID, or 0 */
};
11362 
/*
 * Return the next namespace-resolver request ID from a global 32-bit
 * counter.  OSAddAtomic() returns the counter's value prior to the
 * addition, so IDs start at 0 and simply wrap on overflow.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11370 
/* Sizing of the request hash table and the back-pressure limit. */
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (<= MAX_OUTSTANDING). */
static u_int nspace_resolver_request_count;
/* True if some thread is waiting for a table slot to free up. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Mutex protecting the table and the counters above. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket (hashmask comes from hashinit()). */
#define NSPACE_RESOLVER_HASH(req_id)    \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	 nspace_resolver_request_hashmask])
11391 
11392 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id,bool skip_completing)11393 nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11394 {
11395 	struct nspace_resolver_requesthead *bucket;
11396 	struct nspace_resolver_request *req;
11397 
11398 	bucket = NSPACE_RESOLVER_HASH(req_id);
11399 	LIST_FOREACH(req, bucket, r_hashlink) {
11400 		if (req->r_req_id == req_id) {
11401 			/*
11402 			 * If this request already has a completion
11403 			 * pending, don't return it again.
11404 			 */
11405 			if ((req->r_flags & RRF_COMPLETING) != 0 &&
11406 			    skip_completing) {
11407 				req = NULL;
11408 			}
11409 			return req;
11410 		}
11411 	}
11412 
11413 	return NULL;
11414 }
11415 
/*
 * Insert 'req' into the request hash table, waiting (interruptibly)
 * for a slot if NSPACE_RESOLVER_MAX_OUTSTANDING requests are already
 * outstanding.  Returns 0 on success or the msleep() error (e.g.
 * EINTR) if the wait was interrupted.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	NSPACE_REQ_LOCK();

	/* Back-pressure: wait until there is room in the table. */
	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			NSPACE_REQ_UNLOCK();
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	NSPACE_REQ_UNLOCK();

	return 0;
}
11447 
/*
 * Called with NSPACE_REQ_LOCK held.  Sleep until any in-progress
 * completion of 'req' has finished; msleep() drops and re-acquires
 * the hash mutex around each wait.
 */
static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
{
	/*
	 * If a completion is in-progress, we have to wait for the
	 * completion handler to finish because it's still using 'req',
	 * which is allocated on our stack a couple of frames up.
	 */
	while ((req->r_flags & RRF_COMPLETING) != 0) {
		(void) msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS, "nspacecmplt", NULL);
	}
}
11461 
11462 static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request * req)11463 nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
11464 {
11465 	struct nspace_resolver_requesthead *bucket;
11466 
11467 	/* We're called with NSPACE_REQ_LOCK held. */
11468 
11469 	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
11470 #if DIAGNOSTIC
11471 	assert((req->r_flags & RRF_COMPLETING) == 0);
11472 	assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
11473 #endif /* DIAGNOSTIC */
11474 	LIST_REMOVE(req, r_hashlink);
11475 	nspace_resolver_request_count--;
11476 
11477 	if (nspace_resolver_request_wait_slot) {
11478 		nspace_resolver_request_wait_slot = false;
11479 		wakeup(&nspace_resolver_request_count);
11480 	}
11481 
11482 	nspace_resolver_req_wait_pending_completion(req);
11483 
11484 	NSPACE_REQ_UNLOCK();
11485 }
11486 
/*
 * Convenience wrapper: take NSPACE_REQ_LOCK and remove 'req'; the
 * lock is released by nspace_resolver_req_remove_and_unlock().
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
11493 
11494 static void
nspace_resolver_req_cancel(uint32_t req_id)11495 nspace_resolver_req_cancel(uint32_t req_id)
11496 {
11497 	kern_return_t kr;
11498 	mach_port_t mp;
11499 
11500 	// Failures here aren't fatal -- the cancellation message
11501 	// sent to the resolver is merely advisory.
11502 
11503 	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11504 	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11505 		return;
11506 	}
11507 
11508 	kr = send_nspace_resolve_cancel(mp, req_id);
11509 	if (kr != KERN_SUCCESS) {
11510 		os_log_error(OS_LOG_DEFAULT,
11511 		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11512 	}
11513 
11514 	ipc_port_release_send(mp);
11515 }
11516 
/*
 * Wait for 'req' to be completed by the resolver.  If the sleep is
 * interrupted (or fails for any reason other than ERESTART), the
 * request is failed locally with EINTR/ETIMEDOUT and an advisory
 * cancel message is sent to the resolver.  Always removes 'req'
 * from the table before returning.  Returns the request's final
 * resolver errno.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted: fail the request locally. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11549 
/*
 * Called with NSPACE_REQ_LOCK held.  Record the resolver's errno in
 * 'req', clear RRF_COMPLETING, set RRF_COMPLETE, and wake any thread
 * sleeping on the request.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
	wakeup(req);
}
11559 
/*
 * Called with NSPACE_REQ_LOCK held.  Flag 'req' so that removal waits
 * (and skip_completing lookups skip it) while its completion is being
 * processed with the lock dropped; see nspace_resolver_req_completed().
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
11565 
/*
 * Handle a completion tuple written by the resolver via the
 * vfs.nspace.complete sysctl: look up the request by ID, optionally
 * verify that the namespace shape (recursive gencount and/or sync-root
 * ID) still matches what the resolver saw, then mark the request
 * complete and wake the waiting thread.
 */
static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
{
	struct nspace_resolver_request *req;
	int error;
	struct vnode_attr va;
	vnode_t vp;

	NSPACE_REQ_LOCK();

	req = nspace_resolver_req_lookup(c->req_id, true);
	if (req == NULL) {
		/*
		 * If we don't find the request corresponding to our req_id,
		 * just drop the completion on the floor; it's likely that
		 * the requester interrupted with a signal, or it may already
		 * be completing.
		 */
		NSPACE_REQ_UNLOCK();
		return;
	}

	/*
	 * Get out now if the resolver reported an error.
	 */
	if ((error = c->resolver_error) != 0) {
		goto out;
	}

	/*
	 * If the resolver did not specify any namespace shape criteria
	 * for letting the operation proceed, then get out now.
	 */
	if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
		goto out;
	}

	/*
	 * We're going to have to acquire the mount rename lock and do
	 * some I/O in order to verify the criteria.  Mark the request
	 * as pending so no one else messes with it after we drop the
	 * NSPACE_REQ_LOCK.
	 */
	nspace_resolver_req_mark_completion_pending(req);
	NSPACE_REQ_UNLOCK();

	/*
	 * Lock out renames from changing the shape of the tree while
	 * we validate the criteria.
	 */
	mount_t locked_mp = req->r_vp->v_mount;
	mount_ref(locked_mp, 0);
	mount_lock_renames(locked_mp);

	if (c->orig_gencount != 0) {
		vp = req->r_vp;
		/* NOTE(review): error is always 0 here; defensive check only. */
		if (error) {
			goto out_dropmount;
		}

		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_recursive_gencount);
		error = vnode_getattr(vp, &va, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		/* Fail with EBUSY if the gencount no longer matches. */
		if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
		    va.va_recursive_gencount != c->orig_gencount) {
			printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
			    c->orig_gencount, va.va_recursive_gencount);
			error = EBUSY;
			goto out_dropmount;
		}
	}

	/*
	 * Ignore orig_syncroot if a destination directory wasn't specified
	 * in the request.
	 */
	if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
		uint64_t syncroot_id;

		/* NOTE(review): error is always 0 here as well; defensive. */
		if (error) {
			goto out_dropmount;
		}

#ifndef APFSIOC_GET_SYNC_ROOT
#define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
#endif

		error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
		    (caddr_t)&syncroot_id, 0, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		/* Fail with EBUSY if the sync-root ID no longer matches. */
		if (syncroot_id != c->orig_syncroot) {
			printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
			    c->orig_syncroot, syncroot_id);
			error = EBUSY;
			goto out_dropmount;
		}
	}

out_dropmount:
	mount_unlock_renames(locked_mp);
	mount_drop(locked_mp, 0);
	NSPACE_REQ_LOCK();

out:
	nspace_resolver_req_mark_complete(req, error);
	NSPACE_REQ_UNLOCK();
}
11678 
/* The process currently registered as the namespace resolver, if any. */
static struct proc *nspace_resolver_proc;

/*
 * Report (via 'is_resolver') whether 'p' is the registered namespace
 * resolver process.  Always returns 0.
 */
static int
nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
{
	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) ? 1 : 0;
	return 0;
}
11688 
static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);

/*
 * Register (is_resolver != 0) or unregister 'p' as the namespace
 * resolver.  The caller must be root and hold the dataless-resolver
 * entitlement.  Returns EPERM if not, EBUSY if a different process is
 * already registered, else 0.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			/* Decorate 'p' and publish it as the resolver. */
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11730 
11731 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11732 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11733 {
11734 	if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11735 	    (p->p_vfs_iopolicy &
11736 	    P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11737 		*is_prevented = 1;
11738 	} else {
11739 		*is_prevented = 0;
11740 	}
11741 	return 0;
11742 }
11743 
/*
 * Set (is_prevented != 0) or clear the materialize-dataless iopolicy
 * bit for process 'p'.  The resolver process itself may never enable
 * materialization (EBUSY).  Returns 0 on success.
 */
static int
nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
{
	if (p->p_lflag & P_LNSPACE_RESOLVER) {
		/* The resolver is always prevented from materializing. */
		return is_prevented ? 0 : EBUSY;
	}

	if (is_prevented) {
		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
	} else {
		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
	}
	return 0;
}
11758 
11759 static int
nspace_materialization_get_thread_state(int * is_prevented)11760 nspace_materialization_get_thread_state(int *is_prevented)
11761 {
11762 	uthread_t ut = current_uthread();
11763 
11764 	*is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11765 	return 0;
11766 }
11767 
11768 static int
nspace_materialization_set_thread_state(int is_prevented)11769 nspace_materialization_set_thread_state(int is_prevented)
11770 {
11771 	uthread_t ut = current_uthread();
11772 
11773 	if (is_prevented) {
11774 		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11775 	} else {
11776 		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11777 	}
11778 	return 0;
11779 }
11780 
/* The vfs.nspace sysctl branch: parent node for the resolver control knobs below. */
SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11783 
11784 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11785 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11786     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11787 {
11788 	struct proc *p = req->p;
11789 	int new_value, old_value, changed = 0;
11790 	int error;
11791 
11792 	error = nspace_resolver_get_proc_state(p, &old_value);
11793 	if (error) {
11794 		return error;
11795 	}
11796 
11797 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11798 	    &changed);
11799 	if (error == 0 && changed) {
11800 		error = nspace_resolver_set_proc_state(p, new_value);
11801 	}
11802 	return error;
11803 }
11804 
/* vfs.nspace.resolver: decorate this process as the dataless file resolver */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_resolver, "I", "");
11809 
11810 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11811 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11812     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11813 {
11814 	struct proc *p = req->p;
11815 	int new_value, old_value, changed = 0;
11816 	int error;
11817 
11818 	error = nspace_materialization_get_proc_state(p, &old_value);
11819 	if (error) {
11820 		return error;
11821 	}
11822 
11823 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11824 	    &changed);
11825 	if (error == 0 && changed) {
11826 		error = nspace_materialization_set_proc_state(p, new_value);
11827 	}
11828 	return error;
11829 }
11830 
/* vfs.nspace.prevent_materialization: decorate this process as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_prevent_materialization, "I", "");
11835 
11836 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11837 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11838     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11839 {
11840 	int new_value, old_value, changed = 0;
11841 	int error;
11842 
11843 	error = nspace_materialization_get_thread_state(&old_value);
11844 	if (error) {
11845 		return error;
11846 	}
11847 
11848 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11849 	    &changed);
11850 	if (error == 0 && changed) {
11851 		error = nspace_materialization_set_thread_state(new_value);
11852 	}
11853 	return error;
11854 }
11855 
/* vfs.nspace.thread_prevent_materialization: decorate this thread as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11860 
/*
 * vfs.nspace.complete handler: the registered resolver writes a
 * request ID / errno tuple here (optionally followed by a gencount
 * and a sync-root ID) to complete an outstanding materialization
 * request.  Any other caller gets EPERM.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	/* The req_id/errno tuple is mandatory. */
	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed.  Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID.  Also optional.
	 */
	error = sysctl_io_opaque(req, &syncroot, sizeof(syncroot),
	    &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(&cd);
	}
	return error;
}
11923 
/* vfs.nspace.complete: the resolver reports completed requests here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
11928 
11929 #endif /* CONFIG_DATALESS_FILES */
11930 
/*
 * Parameters tagged with __no_dataless_unused are referenced only when
 * CONFIG_DATALESS_FILES is enabled; on other configurations they must
 * be marked __unused to avoid unused-parameter warnings.
 */
#if CONFIG_DATALESS_FILES
#define __no_dataless_unused    /* nothing */
#else
#define __no_dataless_unused    __unused
#endif
11936 
/*
 * Decide whether dataless-file materialization is prevented for 'ctx'.
 *
 * Returns:
 *   0           materialization may proceed
 *   EDEADLK     materialization is prevented (also the kernel-context
 *               answer and the default)
 *   EJUSTRETURN proceed as if the object were not dataless (caller
 *               holds the dataless-manipulation entitlement)
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11993 
/*
 * One-time initialization of the namespace-resolver request hash
 * table (no-op when CONFIG_DATALESS_FILES is disabled).
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
12003 
/*
 * Called when 'p' exits (or voluntarily unregisters via the resolver
 * sysctl).  If 'p' is the registered resolver, fail every outstanding
 * request with ETIMEDOUT and clear the resolver registration.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Sweep every hash bucket and fail each pending request. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				/* Let any in-flight completion finish first. */
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
12030 
/* Entitlements gating resolver registration and dataless manipulation. */
#define DATALESS_RESOLVER_ENTITLEMENT     \
	"com.apple.private.vfs.dataless-resolver"
#define DATALESS_MANIPULATION_ENTITLEMENT \
	"com.apple.private.vfs.dataless-manipulation"
12035 
#if CONFIG_DATALESS_FILES
/*
 * Return TRUE if the vfs context is associated with the dataless
 * resolver, i.e. its task holds the dataless-resolver entitlement.
 */
static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
{
	return IOTaskHasEntitlement(vfs_context_task(ctx),
	           DATALESS_RESOLVER_ENTITLEMENT);
}
#endif /* CONFIG_DATALESS_FILES */
12048 
12049 /*
12050  * Return TRUE if the vfs context is associated with a process entitled
12051  * for dataless manipulation.
12052  *
12053  * XXX Arguably belongs in vfs_subr.c, but is here because of the
12054  * complication around CONFIG_DATALESS_FILES.
12055  */
12056 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)12057 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
12058 {
12059 #if CONFIG_DATALESS_FILES
12060 	task_t task = vfs_context_task(ctx);
12061 	return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
12062 	       IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
12063 #else
12064 	return false;
12065 #endif /* CONFIG_DATALESS_FILES */
12066 }
12067 
12068 #if CONFIG_DATALESS_FILES
12069 static void
log_materialization_prevented(vnode_t vp,uint64_t op)12070 log_materialization_prevented(vnode_t vp, uint64_t op)
12071 {
12072 	char p_name[MAXCOMLEN + 1];
12073 	char *vntype;
12074 	proc_selfname(&p_name[0], sizeof(p_name));
12075 
12076 	if (vp->v_type == VREG) {
12077 		vntype = "File";
12078 	} else if (vp->v_type == VDIR) {
12079 		vntype = "Dir";
12080 	} else if (vp->v_type == VLNK) {
12081 		vntype = "SymLink";
12082 	} else {
12083 		vntype = "Other";
12084 	}
12085 
12086 #if DEVELOPMENT
12087 	char *path = NULL;
12088 	int   len;
12089 
12090 	path = get_pathbuff();
12091 	len = MAXPATHLEN;
12092 	if (path) {
12093 		vn_getpath(vp, path, &len);
12094 	}
12095 
12096 	os_log_debug(OS_LOG_DEFAULT,
12097 	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
12098 	    p_name, proc_selfpid(),
12099 	    op, vntype, path ? path : "<unknown-path>");
12100 	if (path) {
12101 		release_pathbuff(path);
12102 	}
12103 #else
12104 	os_log_debug(OS_LOG_DEFAULT,
12105 	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
12106 	    p_name, proc_selfpid(),
12107 	    op, vntype);
12108 #endif
12109 }
12110 #endif /* CONFIG_DATALESS_FILES */
12111 
12112 static int
vfs_materialize_item(vnode_t vp __no_dataless_unused,uint32_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused,vnode_t tdvp __no_dataless_unused)12113 vfs_materialize_item(
12114 	vnode_t vp __no_dataless_unused,
12115 	uint32_t op __no_dataless_unused,
12116 	int64_t offset __no_dataless_unused,
12117 	int64_t size __no_dataless_unused,
12118 	char *lookup_name __no_dataless_unused,
12119 	size_t const namelen __no_dataless_unused,
12120 	vnode_t tdvp __no_dataless_unused)
12121 {
12122 #if CONFIG_DATALESS_FILES
12123 	kern_return_t kern_ret;
12124 	mach_port_t mach_port;
12125 	char *path = NULL;
12126 	vfs_context_t context;
12127 	int path_len;
12128 	int error;
12129 	audit_token_t atoken;
12130 	enum vtype vp_vtype;
12131 
12132 	/* Swap files are special; ignore them */
12133 	if (vnode_isswap(vp)) {
12134 		return 0;
12135 	}
12136 
12137 	/*
12138 	 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
12139 	 * are no longer used nor supported.
12140 	 */
12141 	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
12142 		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
12143 		return ENOTSUP;
12144 	}
12145 	if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
12146 		os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
12147 		return ENOTSUP;
12148 	}
12149 
12150 	/* Normalize 'op'. */
12151 	op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;
12152 
12153 	/*
12154 	 * To-directory is only meaningful for rename operations;
12155 	 * ignore it if someone handed one to us unexpectedly.
12156 	 */
12157 	if (op != NAMESPACE_HANDLER_RENAME_OP) {
12158 		tdvp = NULL;
12159 	}
12160 
12161 	context = vfs_context_current();
12162 
12163 	/* Remember this for later. */
12164 	vp_vtype = vnode_vtype(vp);
12165 
12166 	error = vfs_context_dataless_materialization_is_prevented(context);
12167 	if (error) {
12168 		log_materialization_prevented(vp, op);
12169 		goto out_check_errors;
12170 	}
12171 
12172 	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
12173 	    &mach_port);
12174 	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
12175 		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
12176 		/*
12177 		 * Treat this like being unable to access the backing store
12178 		 * server.
12179 		 */
12180 		return ETIMEDOUT;
12181 	}
12182 
12183 	int path_alloc_len = MAXPATHLEN;
12184 	do {
12185 		path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
12186 		if (path == NULL) {
12187 			return ENOMEM;
12188 		}
12189 
12190 		path_len = path_alloc_len;
12191 		error = vn_getpath(vp, path, &path_len);
12192 		if (error == 0) {
12193 			break;
12194 		} else if (error == ENOSPC) {
12195 			kfree_data(path, path_alloc_len);
12196 			path = NULL;
12197 		} else {
12198 			goto out_release_port;
12199 		}
12200 	} while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);
12201 
12202 	error = vfs_context_copy_audit_token(context, &atoken);
12203 	if (error) {
12204 		goto out_release_port;
12205 	}
12206 
12207 	struct nspace_resolver_request req = {
12208 		.r_req_id = next_nspace_req_id(),
12209 		.r_vp = vp,
12210 		.r_tdvp = tdvp,
12211 	};
12212 
12213 	error = nspace_resolver_req_add(&req);
12214 	if (error) {
12215 		goto out_release_port;
12216 	}
12217 
12218 	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12219 
12220 	if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
12221 		char *dest_path = NULL;
12222 		int dest_path_len;
12223 
12224 		dest_path = zalloc(ZV_NAMEI);
12225 		dest_path_len = MAXPATHLEN;
12226 
12227 		error = vn_getpath(tdvp, dest_path, &dest_path_len);
12228 		if (error) {
12229 			zfree(ZV_NAMEI, dest_path);
12230 			goto out_release_port;
12231 		}
12232 
12233 		/*
12234 		 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
12235 		 * compatibility with existing agents in user-space
12236 		 * who get passed this value.
12237 		 */
12238 		kern_ret = send_vfs_resolve_reparent_with_audit_token(mach_port,
12239 		    req.r_req_id,
12240 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12241 		    path, dest_path, atoken);
12242 
12243 		zfree(ZV_NAMEI, dest_path);
12244 	} else if (vp_vtype == VDIR) {
12245 		char *tmpname = NULL;
12246 
12247 		/*
12248 		 * If the caller provided a lookup_name *and* a name length,
12249 		 * then we assume the lookup_name is not NUL-terminated.
12250 		 * Allocate a temporary buffer in this case to provide
12251 		 * a NUL-terminated path name to the IPC call.
12252 		 */
12253 		if (lookup_name != NULL && namelen != 0) {
12254 			if (namelen >= PATH_MAX) {
12255 				error = EINVAL;
12256 				goto out_req_remove;
12257 			}
12258 			tmpname = zalloc(ZV_NAMEI);
12259 			strlcpy(tmpname, lookup_name, namelen + 1);
12260 			lookup_name = tmpname;
12261 		} else if (lookup_name != NULL) {
12262 			/*
12263 			 * If the caller provided a lookup_name with a
12264 			 * zero name length, then we assume it's NUL-
12265 			 * terminated.  Verify it has a valid length.
12266 			 */
12267 			if (strlen(lookup_name) >= PATH_MAX) {
12268 				error = EINVAL;
12269 				goto out_req_remove;
12270 			}
12271 		}
12272 
12273 		/* (See above.) */
12274 		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12275 		    req.r_req_id,
12276 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12277 		    lookup_name == NULL ? "" : lookup_name, path, atoken);
12278 
12279 		if (tmpname != NULL) {
12280 			zfree(ZV_NAMEI, tmpname);
12281 
12282 			/*
12283 			 * Poison lookup_name rather than reference
12284 			 * freed memory.
12285 			 */
12286 			lookup_name = NULL;
12287 		}
12288 	} else {
12289 		/* (See above.) */
12290 		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12291 		    req.r_req_id,
12292 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12293 		    offset, size, path, atoken);
12294 	}
12295 	if (kern_ret != KERN_SUCCESS) {
12296 		/*
12297 		 * Also treat this like being unable to access the backing
12298 		 * store server.
12299 		 */
12300 		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12301 		    kern_ret);
12302 		error = ETIMEDOUT;
12303 		goto out_req_remove;
12304 	}
12305 
12306 	/*
12307 	 * Give back the memory we allocated earlier while we wait; we
12308 	 * no longer need it.
12309 	 */
12310 	kfree_data(path, path_alloc_len);
12311 	path = NULL;
12312 
12313 	/*
12314 	 * Request has been submitted to the resolver. Now (interruptibly)
12315 	 * wait for completion. Upon requrn, the request will have been
12316 	 * removed from the lookup table.
12317 	 */
12318 	error = nspace_resolver_req_wait(&req);
12319 
12320 out_release_port:
12321 	if (path != NULL) {
12322 		kfree_data(path, path_alloc_len);
12323 		path = NULL;
12324 	}
12325 	ipc_port_release_send(mach_port);
12326 
12327 out_check_errors:
12328 	/*
12329 	 * The file resolver owns the logic about what error to return
12330 	 * to the caller.  We only need to handle a couple of special
12331 	 * cases here:
12332 	 */
12333 	if (error == EJUSTRETURN) {
12334 		/*
12335 		 * The requesting process is allowed to interact with
12336 		 * dataless objects.  Make a couple of sanity-checks
12337 		 * here to ensure the action makes sense.
12338 		 */
12339 		switch (op) {
12340 		case NAMESPACE_HANDLER_WRITE_OP:
12341 		case NAMESPACE_HANDLER_TRUNCATE_OP:
12342 		case NAMESPACE_HANDLER_RENAME_OP:
12343 			/*
12344 			 * This handles the case of the resolver itself
12345 			 * writing data to the file (or throwing it
12346 			 * away).
12347 			 */
12348 			error = 0;
12349 			break;
12350 		case NAMESPACE_HANDLER_READ_OP:
12351 		case NAMESPACE_HANDLER_LOOKUP_OP:
12352 			/*
12353 			 * This handles the case of the resolver needing
12354 			 * to look up inside of a dataless directory while
12355 			 * it's in the process of materializing it (for
12356 			 * example, creating files or directories).
12357 			 */
12358 			error = (vp_vtype == VDIR) ? 0 : EBADF;
12359 			break;
12360 		default:
12361 			error = EBADF;
12362 			break;
12363 		}
12364 	}
12365 
12366 	return error;
12367 
12368 out_req_remove:
12369 	nspace_resolver_req_remove(&req);
12370 	goto out_release_port;
12371 #else
12372 	return ENOTSUP;
12373 #endif /* CONFIG_DATALESS_FILES */
12374 }
12375 
12376 /*
12377  * vfs_materialize_file: Materialize a regular file.
12378  *
12379  * Inputs:
12380  * vp		The dataless file to be materialized.
12381  *
12382  * op		What kind of operation is being performed:
12383  *		-> NAMESPACE_HANDLER_READ_OP
12384  *		-> NAMESPACE_HANDLER_WRITE_OP
12385  *		-> NAMESPACE_HANDLER_LINK_CREATE
12386  *		-> NAMESPACE_HANDLER_DELETE_OP
12387  *		-> NAMESPACE_HANDLER_TRUNCATE_OP
12388  *		-> NAMESPACE_HANDLER_RENAME_OP
12389  *
12390  * offset	offset of I/O for READ or WRITE.  Ignored for
12391  *		other ops.
12392  *
 * size		size of I/O for READ or WRITE.  Ignored for
12394  *		other ops.
12395  *
12396  * If offset or size are -1 for a READ or WRITE, then the resolver should
12397  * consider the range to be unknown.
12398  *
12399  * Upon successful return, the caller may proceed with the operation.
12400  * N.B. the file may still be "dataless" in this case.
12401  */
12402 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12403 vfs_materialize_file(
12404 	struct vnode *vp,
12405 	uint64_t op,
12406 	int64_t offset,
12407 	int64_t size)
12408 {
12409 	if (vp->v_type != VREG) {
12410 		return EFTYPE;
12411 	}
12412 	return vfs_materialize_item(vp, (uint32_t)op, offset, size, NULL, 0,
12413 	           NULL);
12414 }
12415 
12416 /*
12417  * vfs_materialize_dir:
12418  *
12419  * Inputs:
12420  * vp		The dataless directory to be materialized.
12421  *
12422  * op		What kind of operation is being performed:
12423  *		-> NAMESPACE_HANDLER_READ_OP
12424  *		-> NAMESPACE_HANDLER_WRITE_OP
12425  *		-> NAMESPACE_HANDLER_DELETE_OP
12426  *		-> NAMESPACE_HANDLER_RENAME_OP
12427  *		-> NAMESPACE_HANDLER_LOOKUP_OP
12428  *
12429  * lookup_name	Name being looked up for a LOOKUP op.  Ignored for
12430  *		other ops.  May or may not be NUL-terminated; see below.
12431  *
12432  * namelen	If non-zero, then lookup_name is assumed to not be NUL-
12433  *		terminated and namelen is the number of valid bytes in
12434  *		lookup_name. If zero, then lookup_name is assumed to be
12435  *		NUL-terminated.
12436  *
12437  * Upon successful return, the caller may proceed with the operation.
12438  * N.B. the directory may still be "dataless" in this case.
12439  */
12440 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12441 vfs_materialize_dir(
12442 	struct vnode *vp,
12443 	uint64_t op,
12444 	char *lookup_name,
12445 	size_t namelen)
12446 {
12447 	if (vp->v_type != VDIR) {
12448 		return EFTYPE;
12449 	}
12450 	if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12451 		return EINVAL;
12452 	}
12453 	return vfs_materialize_item(vp, (uint32_t)op, 0, 0, lookup_name,
12454 	           namelen, NULL);
12455 }
12456 
12457 /*
12458  * vfs_materialize_reparent:
12459  *
12460  * Inputs:
12461  * vp		The dataless file or directory to be materialized.
12462  *
12463  * tdvp		The new parent directory for the dataless file.
12464  *
12465  * Upon successful return, the caller may proceed with the operation.
12466  * N.B. the item may still be "dataless" in this case.
12467  */
12468 int
vfs_materialize_reparent(vnode_t vp,vnode_t tdvp)12469 vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
12470 {
12471 	if (vp->v_type != VDIR && vp->v_type != VREG) {
12472 		return EFTYPE;
12473 	}
12474 	return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
12475 	           0, 0, NULL, 0, tdvp);
12476 }
12477 
12478 #if 0
/*
 * Build a volfs-style path ("/.vol/<fsid>/<fileid>") for the given vnode.
 *
 * On entry *len is the capacity of 'path'; on return it is set to the
 * formatted length plus one for the NUL terminator (snprintf return + 1).
 * Returns 0 on success, or -1 (after writing a placeholder path) when
 * vnode_getattr() fails.
 *
 * NOTE: this helper is currently compiled out (#if 0 in the file).
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	/* Ask the FS for just the fsid and fileid needed for the path. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
12499 #endif
12500 
12501 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12502 fsctl_bogus_command_compat(unsigned long cmd)
12503 {
12504 	switch (cmd) {
12505 	case IOCBASECMD(FSIOC_SYNC_VOLUME):
12506 		return FSIOC_SYNC_VOLUME;
12507 	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12508 		return FSIOC_ROUTEFS_SETROUTEID;
12509 	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12510 		return FSIOC_SET_PACKAGE_EXTS;
12511 	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12512 		return FSIOC_SET_FSTYPENAME_OVERRIDE;
12513 	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12514 		return DISK_CONDITIONER_IOC_GET;
12515 	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12516 		return DISK_CONDITIONER_IOC_SET;
12517 	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12518 		return FSIOC_FIOSEEKHOLE;
12519 	case IOCBASECMD(FSIOC_FIOSEEKDATA):
12520 		return FSIOC_FIOSEEKDATA;
12521 	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12522 		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12523 	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12524 		return SPOTLIGHT_IOC_GET_LAST_MTIME;
12525 	}
12526 
12527 	return cmd;
12528 }
12529 
/*
 * chflags0() setattr callback: performs the compare-and-swap of BSD
 * flags by forwarding FSIOC_CAS_BSDFLAGS to the file system.  'arg'
 * is the struct fsioc_cas_bsdflags passed through from handle_flags().
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
12535 
/*
 * FSIOC_SYNC_VOLUME handler: sync the mount that 'vp' lives on.
 *
 * 'data' points at the caller's uint32_t of FSCTL_SYNC_* flags.  The
 * caller's iocount on vp is dropped here (replaced briefly for the
 * optional F_FULLFSYNC), so *arg_vp is set to NULL on return to tell
 * the caller not to vnode_put() it again.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	vnode_hold(vp);
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): 'arg' holds MNT_* values at this point but is
	 * tested against the FSCTL_SYNC_FULLSYNC bit; the two flag
	 * namespaces are being mixed here — confirm the intended
	 * condition (vs. re-reading *(uint32_t*)data).
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12600 
12601 #if ROUTEFS
12602 static int __attribute__((noinline))
handle_routes(user_addr_t udata)12603 handle_routes(user_addr_t udata)
12604 {
12605 	char routepath[MAXPATHLEN];
12606 	size_t len = 0;
12607 	int error;
12608 
12609 	if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
12610 		return error;
12611 	}
12612 	bzero(routepath, MAXPATHLEN);
12613 	error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
12614 	if (error) {
12615 		return error;
12616 	}
12617 	error = routefs_kernel_mount(routepath);
12618 	return error;
12619 }
12620 #endif
12621 
12622 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12623 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12624 {
12625 	struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12626 	struct vnode_attr va;
12627 	int error;
12628 
12629 	VATTR_INIT(&va);
12630 	VATTR_SET(&va, va_flags, cas->new_flags);
12631 
12632 	error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12633 
12634 #if CONFIG_FSE
12635 	if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12636 		add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12637 	}
12638 #endif
12639 
12640 	return error;
12641 }
12642 
12643 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12644 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12645 {
12646 	struct mount *mp = NULL;
12647 	errno_t rootauth = 0;
12648 
12649 	mp = vp->v_mount;
12650 
12651 	/*
12652 	 * query the underlying FS and see if it reports something
12653 	 * sane for this vnode. If volume is authenticated via
12654 	 * chunklist, leave that for the caller to determine.
12655 	 */
12656 	rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12657 
12658 	return rootauth;
12659 }
12660 
12661 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12662 	"com.apple.private.kernel.set-package-extensions"
12663 
12664 /*
12665  * Make a filesystem-specific control call:
12666  */
12667 /* ARGSUSED */
/*
 * fsctl_internal: common implementation for the fsctl()/ffsctl()
 * system calls.
 *
 * p        calling process (used for 32/64-bit argument shape).
 * arg_vp   in/out vnode pointer; a handler (handle_sync_volume) may
 *          drop the iocount and set *arg_vp to NULL, in which case the
 *          caller must not vnode_put() it.
 * cmd      fsctl selector (normalized via fsctl_bogus_command_compat).
 * udata    user pointer to the command's argument buffer.
 * options  flags passed through to the file system for the default case.
 * ctx      VFS context of the caller.
 *
 * Generic selectors are dispatched to their handlers below; anything
 * else is passed to the file system via VNOP_IOCTL.  Returns an errno.
 */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* fsctl is not for devices; use ioctl on those. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/*
	 * Argument buffer: use the stack buffer when it fits, otherwise
	 * heap-allocate (freed at outdrop).
	 */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	/*
	 * Marshal the user argument: copy it in for IOC_IN commands with
	 * a size; for zero-size IOC_IN and IOC_VOID commands the "arg"
	 * is the user pointer value itself (width per process ABI).
	 */
	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* May drop the iocount and NULL out *arg_vp; see handler. */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t    num_entries;
		uint32_t    max_width;

		/* Entitlement-gated: updates the global package-extension table. */
		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct.  otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width   = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width   = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		mount_t mp;

		/* Superuser only: overrides the reported fs type name. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if ((mp = vp->v_mount) != NULL) {
			mount_lock(mp);
			if (data[0] != 0) {
				/* Require a NUL within the first MFSTYPENAMELEN bytes. */
				for (int i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data
				 * string which has no NULL termination in
				 * its first MFSTYPENAMELEN bytes.  This is
				 * bogus, let's avoid strlcpy-ing the read
				 * data and return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				vfs_setfstypename_locked(mp, data);
				/* "mtmfs" read-only mounts get extended security. */
				if (vfs_isrdonly(mp) &&
				    strcmp(data, "mtmfs") == 0) {
					mp->mnt_kern_flag |=
					    MNTK_EXTENDED_SECURITY;
					mp->mnt_kern_flag &=
					    ~MNTK_AUTH_OPAQUE;
				}
			} else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
				/* Empty name: clear the override (and mtmfs flag). */
				const char *name =
				    vfs_getfstypenameref_locked(mp, NULL);
				if (strcmp(name, "mtmfs") == 0) {
					mp->mnt_kern_flag &=
					    ~MNTK_EXTENDED_SECURITY;
				}
				vfs_setfstypename_locked(mp, NULL);
			}
unlock:
			mount_unlock(mp);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/*
		 * Report EBUSY if anyone besides the caller has the vnode
		 * in use (named streams aside, which get a deeper check).
		 */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
		break;

#if CONFIG_EXCLAVES
	case FSIOC_EXCLAVE_FS_REGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_register(((fsioc_exclave_fs_register_t *)data)->fs_tag, vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_UNREGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_unregister(vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: {
		exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data);
		exclave_fs_base_dir_t *dirs = NULL;
		if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = EPERM;
			break;
		}
		if (get_base_dirs->base_dirs) {
			/* Bound the caller-supplied count before allocating. */
			if ((get_base_dirs->count == 0) || (get_base_dirs->count > EXCLAVE_FS_GET_BASE_DIRS_MAX_COUNT)) {
				error = EINVAL;
				break;
			}
			dirs = kalloc_type(exclave_fs_base_dir_t, get_base_dirs->count, Z_WAITOK | Z_ZERO);
			if (!dirs) {
				error = ENOSPC;
				break;
			}
		}
		error = vfs_exclave_fs_get_base_dirs(dirs, &get_base_dirs->count);
		if (!error && dirs) {
			error = copyout(dirs, (user_addr_t)get_base_dirs->base_dirs,
			    get_base_dirs->count * sizeof(exclave_fs_base_dir_t));
		}
		if (dirs) {
			kfree_type(exclave_fs_base_dir_t, get_base_dirs->count, dirs);
		}
	}
	break;
#endif

	default: {
		/*
		 * Other, known commands shouldn't be passed down here.
		 * (When adding a selector to this list, it may be prudent
		 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
		 */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
12971 
12972 /* ARGSUSED */
12973 int
fsctl(proc_t p,struct fsctl_args * uap,__unused int32_t * retval)12974 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
12975 {
12976 	int error;
12977 	struct nameidata nd;
12978 	uint32_t nameiflags;
12979 	vnode_t vp = NULL;
12980 	vfs_context_t ctx = vfs_context_current();
12981 
12982 	AUDIT_ARG(cmd, (int)uap->cmd);
12983 	AUDIT_ARG(value32, uap->options);
12984 	/* Get the vnode for the file we are getting info on:  */
12985 	nameiflags = 0;
12986 	//
12987 	// if we come through fsctl() then the file is by definition not open.
12988 	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
12989 	// lest the caller mistakenly thinks the only open is their own (but in
12990 	// reality it's someone elses).
12991 	//
12992 	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
12993 		return EINVAL;
12994 	}
12995 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
12996 		nameiflags |= FOLLOW;
12997 	}
12998 	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
12999 		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
13000 	}
13001 	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
13002 	    UIO_USERSPACE, uap->path, ctx);
13003 	if ((error = namei(&nd))) {
13004 		goto done;
13005 	}
13006 	vp = nd.ni_vp;
13007 	nameidone(&nd);
13008 
13009 #if CONFIG_MACF
13010 	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
13011 	if (error) {
13012 		goto done;
13013 	}
13014 #endif
13015 
13016 	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
13017 
13018 done:
13019 	if (vp) {
13020 		vnode_put(vp);
13021 	}
13022 	return error;
13023 }
13024 /* ARGSUSED */
13025 int
ffsctl(proc_t p,struct ffsctl_args * uap,__unused int32_t * retval)13026 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
13027 {
13028 	int error;
13029 	vnode_t vp = NULL;
13030 	vfs_context_t ctx = vfs_context_current();
13031 	int fd = -1;
13032 
13033 	AUDIT_ARG(fd, uap->fd);
13034 	AUDIT_ARG(cmd, (int)uap->cmd);
13035 	AUDIT_ARG(value32, uap->options);
13036 
13037 	/* Get the vnode for the file we are getting info on:  */
13038 	if ((error = file_vnode(uap->fd, &vp))) {
13039 		return error;
13040 	}
13041 	fd = uap->fd;
13042 	if ((error = vnode_getwithref(vp))) {
13043 		file_drop(fd);
13044 		return error;
13045 	}
13046 
13047 #if CONFIG_MACF
13048 	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
13049 		file_drop(fd);
13050 		vnode_put(vp);
13051 		return error;
13052 	}
13053 #endif
13054 
13055 	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
13056 
13057 	file_drop(fd);
13058 
13059 	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
13060 	if (vp) {
13061 		vnode_put(vp);
13062 	}
13063 
13064 	return error;
13065 }
13066 /* end of fsctl system call */
13067 
13068 #define FILESEC_ACCESS_ENTITLEMENT              \
13069 	"com.apple.private.vfs.filesec-access"
13070 
13071 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)13072 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
13073 {
13074 	if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
13075 		/*
13076 		 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
13077 		 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
13078 		 */
13079 		if ((!setting && vfs_context_issuser(ctx)) ||
13080 		    IOTaskHasEntitlement(vfs_context_task(ctx),
13081 		    FILESEC_ACCESS_ENTITLEMENT)) {
13082 			return 0;
13083 		}
13084 	}
13085 
13086 	return EPERM;
13087 }
13088 
13089 /*
13090  *  Retrieve the data of an extended attribute.
13091  */
/*
 * getxattr: retrieve the data of an extended attribute by path.
 *
 * Looks up uap->path (following symlinks unless XATTR_NOFOLLOW),
 * copies in the attribute name, enforces the protected-xattr policy,
 * and reads the value into uap->value via a uio.  With a NULL value
 * buffer (or the size==-1 compat case below) only the attribute's
 * size is returned.  *retval is the number of bytes read, or the
 * attribute size when no uio was used.  Returns an errno.
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* These options are only meaningful for kernel-internal callers. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected xattrs require root or the filesec entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned...  in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app.  we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* Clamp to keep the FS's wired-memory allocation sane. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
13174 
13175 /*
13176  * Retrieve the data of an extended attribute.
13177  */
13178 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)13179 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
13180 {
13181 	vnode_t vp;
13182 	char attrname[XATTR_MAXNAMELEN + 1];
13183 	vfs_context_t ctx = vfs_context_current();
13184 	uio_t auio = NULL;
13185 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13186 	size_t attrsize = 0;
13187 	size_t namelen;
13188 	int error;
13189 	UIO_STACKBUF(uio_buf, 1);
13190 
13191 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13192 		return EINVAL;
13193 	}
13194 
13195 	if ((error = file_vnode(uap->fd, &vp))) {
13196 		return error;
13197 	}
13198 	if ((error = vnode_getwithref(vp))) {
13199 		file_drop(uap->fd);
13200 		return error;
13201 	}
13202 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13203 	if (error != 0) {
13204 		goto out;
13205 	}
13206 	if (xattr_protected(attrname) &&
13207 	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13208 		goto out;
13209 	}
13210 	if (uap->value && uap->size > 0) {
13211 		if (uap->size > (size_t)XATTR_MAXSIZE) {
13212 			uap->size = XATTR_MAXSIZE;
13213 		}
13214 
13215 		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13216 		    &uio_buf[0], sizeof(uio_buf));
13217 		uio_addiov(auio, uap->value, uap->size);
13218 	}
13219 
13220 	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13221 out:
13222 	(void)vnode_put(vp);
13223 	file_drop(uap->fd);
13224 
13225 	if (auio) {
13226 		*retval = uap->size - uio_resid(auio);
13227 	} else {
13228 		*retval = (user_ssize_t)attrsize;
13229 	}
13230 	return error;
13231 }
13232 
/*
 * Heap-allocated state for setxattr(): the nameidata, attribute name
 * and uio buffer together are too large to keep on the kernel stack,
 * so setxattr() kalloc's one of these per call.
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	UIO_STACKBUF(uio_buf, 1);
};
13239 
/*
 * Set the data of an extended attribute on the file named by uap->path.
 *
 * The working state (nameidata, attribute name, uio buffer) is heap
 * allocated via struct setxattr_ctx to keep stack usage down.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	/* These two flags are reserved for kernel-internal use. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* Protected (system) attribute names require a write entitlement. */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	/* A non-zero size with no source buffer is nonsensical. */
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Also look up the parent so its directory lease can be broken. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	/* Notify fseventsd listeners that an xattr changed. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13319 
13320 /*
13321  * Set the data of an extended attribute.
13322  */
13323 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13324 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13325 {
13326 	vnode_t vp;
13327 	char attrname[XATTR_MAXNAMELEN + 1];
13328 	vfs_context_t ctx = vfs_context_current();
13329 	uio_t auio = NULL;
13330 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13331 	size_t namelen;
13332 	int error;
13333 	UIO_STACKBUF(uio_buf, 1);
13334 
13335 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13336 		return EINVAL;
13337 	}
13338 
13339 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13340 	if (error != 0) {
13341 		if (error == EPERM) {
13342 			/* if the string won't fit in attrname, copyinstr emits EPERM */
13343 			return ENAMETOOLONG;
13344 		}
13345 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13346 		return error;
13347 	}
13348 	if (xattr_protected(attrname) &&
13349 	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13350 		return error;
13351 	}
13352 	if (uap->size != 0 && uap->value == 0) {
13353 		return EINVAL;
13354 	}
13355 	if (uap->size > INT_MAX) {
13356 		return E2BIG;
13357 	}
13358 	if ((error = file_vnode(uap->fd, &vp))) {
13359 		return error;
13360 	}
13361 	if ((error = vnode_getwithref(vp))) {
13362 		file_drop(uap->fd);
13363 		return error;
13364 	}
13365 
13366 #if CONFIG_FILE_LEASES
13367 	vnode_breakdirlease(vp, true, O_WRONLY);
13368 #endif
13369 
13370 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13371 	    &uio_buf[0], sizeof(uio_buf));
13372 	uio_addiov(auio, uap->value, uap->size);
13373 
13374 	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13375 #if CONFIG_FSE
13376 	if (error == 0) {
13377 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
13378 		    FSE_ARG_VNODE, vp,
13379 		    FSE_ARG_DONE);
13380 	}
13381 #endif
13382 	vnode_put(vp);
13383 	file_drop(uap->fd);
13384 	*retval = 0;
13385 	return error;
13386 }
13387 
/*
 * Remove an extended attribute from the file named by uap->path.
 * XXX Code duplication here.
 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* These two flags are reserved for kernel-internal use. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/*
	 * Unlike set/get, removal of protected (system) attributes is
	 * refused outright — no entitlement override here.
	 */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Also look up the parent so its directory lease can be broken. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	/* Notify fseventsd listeners that an xattr was removed. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
13442 
/*
 * Remove an extended attribute from the open file referenced by uap->fd.
 * XXX Code duplication here.
 */
int
fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	size_t namelen;
	int error;
#if CONFIG_FSE
	/* Only the fsevent notification needs a context here. */
	vfs_context_t ctx = vfs_context_current();
#endif

	/*
	 * XATTR_NOFOLLOW is meaningless for an fd-based call; the other
	 * two flags are reserved for kernel-internal use.
	 */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/*
	 * Removal of protected (system) attributes is refused outright —
	 * no entitlement override here, unlike fget/fsetxattr.
	 */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	/* Take an iocount on the vnode; dropped after vn_removexattr(). */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
#if CONFIG_FSE
	/* Notify fseventsd listeners that an xattr was removed. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
13494 
/*
 * Retrieve the list of extended attribute names for the file named by
 * uap->path. With no buffer supplied, reports the size needed instead.
 * XXX Code duplication here.
 */
int
listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* These two flags are reserved for kernel-internal use. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);
	/* Only build a uio when the caller supplied a destination buffer. */
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->namebuf, uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);

	vnode_put(vp);
	if (auio) {
		/* Bytes actually written to the caller's buffer. */
		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
	} else {
		/* Size-probe mode: report the space the list would need. */
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13539 
/*
 * Retrieve the list of extended attribute names for the open file
 * referenced by uap->fd. With no buffer supplied, reports the size
 * needed instead.
 * XXX Code duplication here.
 */
int
flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/*
	 * XATTR_NOFOLLOW is meaningless for an fd-based call; the other
	 * two flags are reserved for kernel-internal use.
	 */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	/* Take an iocount on the vnode; dropped after vn_listxattr(). */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	/* Only build a uio when the caller supplied a destination buffer. */
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(1, 0, spacetype,
		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->namebuf, uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());

	vnode_put(vp);
	file_drop(uap->fd);
	if (auio) {
		/* Bytes actually written to the caller's buffer. */
		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
	} else {
		/* Size-probe mode: report the space the list would need. */
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13582 
13583 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13584 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13585     vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13586 {
13587 	int error;
13588 	struct mount *mp = NULL;
13589 	vnode_t vp;
13590 	int length;
13591 	int bpflags;
13592 	/* maximum number of times to retry build_path */
13593 	unsigned int retries = 0x10;
13594 
13595 	if (bufsize > FSGETPATH_MAXBUFLEN) {
13596 		return EINVAL;
13597 	}
13598 
13599 	if (buf == NULL) {
13600 		return ENOMEM;
13601 	}
13602 
13603 retry:
13604 	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13605 		error = ENOTSUP;  /* unexpected failure */
13606 		return ENOTSUP;
13607 	}
13608 
13609 #if CONFIG_UNION_MOUNTS
13610 unionget:
13611 #endif /* CONFIG_UNION_MOUNTS */
13612 	if (objid == 2) {
13613 		struct vfs_attr vfsattr;
13614 		int use_vfs_root = TRUE;
13615 
13616 		VFSATTR_INIT(&vfsattr);
13617 		VFSATTR_WANTED(&vfsattr, f_capabilities);
13618 		if (!(options & FSOPT_ISREALFSID) &&
13619 		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13620 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13621 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13622 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13623 				use_vfs_root = FALSE;
13624 			}
13625 		}
13626 
13627 		if (use_vfs_root) {
13628 			error = VFS_ROOT(mp, &vp, ctx);
13629 		} else {
13630 			error = VFS_VGET(mp, objid, &vp, ctx);
13631 		}
13632 	} else {
13633 		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13634 	}
13635 
13636 #if CONFIG_UNION_MOUNTS
13637 	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13638 		/*
13639 		 * If the fileid isn't found and we're in a union
13640 		 * mount volume, then see if the fileid is in the
13641 		 * mounted-on volume.
13642 		 */
13643 		struct mount *tmp = mp;
13644 		mp = vnode_mount(tmp->mnt_vnodecovered);
13645 		vfs_unbusy(tmp);
13646 		if (vfs_busy(mp, LK_NOWAIT) == 0) {
13647 			goto unionget;
13648 		}
13649 	} else {
13650 		vfs_unbusy(mp);
13651 	}
13652 #else
13653 	vfs_unbusy(mp);
13654 #endif /* CONFIG_UNION_MOUNTS */
13655 
13656 	if (error) {
13657 		return error;
13658 	}
13659 
13660 #if CONFIG_MACF
13661 	error = mac_vnode_check_fsgetpath(ctx, vp);
13662 	if (error) {
13663 		vnode_put(vp);
13664 		return error;
13665 	}
13666 #endif
13667 
13668 	/* Obtain the absolute path to this vnode. */
13669 	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13670 	if (options & FSOPT_NOFIRMLINKPATH) {
13671 		bpflags |= BUILDPATH_NO_FIRMLINK;
13672 	}
13673 	bpflags |= BUILDPATH_CHECK_MOVED;
13674 	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13675 	vnode_put(vp);
13676 
13677 	if (error) {
13678 		/* there was a race building the path, try a few more times */
13679 		if (error == EAGAIN) {
13680 			--retries;
13681 			if (retries > 0) {
13682 				goto retry;
13683 			}
13684 
13685 			error = ENOENT;
13686 		}
13687 		goto out;
13688 	}
13689 
13690 	AUDIT_ARG(text, buf);
13691 
13692 	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13693 		unsigned long path_words[NUMPARMS];
13694 		size_t path_len = sizeof(path_words);
13695 
13696 		if ((size_t)length < path_len) {
13697 			memcpy((char *)path_words, buf, length);
13698 			memset((char *)path_words + length, 0, path_len - length);
13699 
13700 			path_len = length;
13701 		} else {
13702 			memcpy((char *)path_words, buf + (length - path_len), path_len);
13703 		}
13704 
13705 		kdebug_vfs_lookup(path_words, (int)path_len, vp,
13706 		    KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13707 	}
13708 
13709 	*pathlen = length; /* may be superseded by error */
13710 
13711 out:
13712 	return error;
13713 }
13714 
/*
 * Obtain the full pathname of a file system object by id.
 *
 * Copies the fsid in from user space, resolves (fsid, objid) through
 * fsgetpath_internal() into a kernel scratch buffer, then copies the
 * resulting path out to `buf`.
 */
static int
fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
    uint32_t options, user_ssize_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	fsid_t fsid;
	char *realpath;
	int length;
	int error;

	/* Only the two public option bits are accepted. */
	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
		return EINVAL;
	}

	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}
	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);
	/* Restrict output buffer size for now. */

	if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
		return EINVAL;
	}
	/* Kernel scratch buffer for the path; freed at "out". */
	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
	if (realpath == NULL) {
		return ENOMEM;
	}

	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
	    options, &length);

	if (error) {
		goto out;
	}

	error = copyout((caddr_t)realpath, buf, length);

	*retval = (user_ssize_t)length; /* may be superseded by error */
out:
	kfree_data(realpath, bufsize);
	return error;
}
13761 
13762 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)13763 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
13764 {
13765 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13766 	           0, retval);
13767 }
13768 
13769 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)13770 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
13771 {
13772 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13773 	           uap->options, retval);
13774 }
13775 
/*
 * Common routine to handle various flavors of statfs data heading out
 *	to user space.
 *
 * Fills a user32_statfs or user64_statfs image from the in-kernel
 * vfsstatfs and copies it out to bufp. When partial_copy is set, the
 * trailing reserved fields are omitted from the copyout. If sizep is
 * non-NULL it receives the full (non-partial) structure size.
 *
 * Returns:	0			Success
 *		EFAULT
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int             error;
	int             my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Drop the trailing reserved words from the copyout. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case.   We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int             shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large.  At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Drop the trailing reserved words from the copyout. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
13896 
/*
 * copy stat structure into user_stat structure.
 *
 * Flattens a kernel `struct stat` into the 64-bit user-process layout;
 * fields not assigned below are zeroed by the leading bzero.
 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ between POSIX and extended layouts. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13936 
/*
 * Flatten a kernel `struct stat` into the 32-bit user-process layout.
 * Time fields are narrowed to 32-bit types; fields not assigned below
 * are zeroed by the leading bzero.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ between POSIX and extended layouts. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13973 
/*
 * copy stat64 structure into user_stat64 structure.
 *
 * Like munge_user64_stat(), but the stat64 layout also carries the
 * birthtime. Fields not assigned below are zeroed by the leading bzero.
 */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ between POSIX and extended layouts. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14017 
/*
 * Flatten a kernel `struct stat64` (including birthtime) into the
 * 32-bit user-process layout. Time fields are narrowed to 32-bit types;
 * fields not assigned below are zeroed by the leading bzero.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ between POSIX and extended layouts. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14058 
14059 /*
14060  * Purge buffer cache for simulating cold starts
14061  */
14062 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)14063 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
14064 {
14065 	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
14066 
14067 	return VNODE_RETURNED;
14068 }
14069 
14070 static int
vfs_purge_callback(mount_t mp,__unused void * arg)14071 vfs_purge_callback(mount_t mp, __unused void * arg)
14072 {
14073 	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
14074 
14075 	return VFS_RETURNED;
14076 }
14077 
14078 static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
14079 SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
14080 
14081 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)14082 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
14083 {
14084 	if (!kauth_cred_issuser(kauth_cred_get())) {
14085 		return EPERM;
14086 	}
14087 
14088 	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
14089 
14090 	/* also flush any VM pagers backed by files */
14091 	if (vfs_purge_vm_pagers) {
14092 		vm_purge_filebacked_pagers();
14093 	}
14094 
14095 	return 0;
14096 }
14097 
14098 /*
14099  * gets the vnode associated with the (unnamed) snapshot directory
14100  * for a Filesystem. The snapshot directory vnode is returned with
14101  * an iocount on it.
14102  */
14103 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)14104 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
14105 {
14106 	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
14107 }
14108 
/*
 * Get the snapshot vnode.
 *
 * If successful, the call returns with an iocount on *rvpp, *sdvpp and
 * needs nameidone() on ndp.
 *
 * If the snapshot vnode exists it is returned in ndp->ni_vp.
 *
 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
 * not needed.
 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Initialize out-parameters so error paths can hand back NULLVP. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* Resolve dirfd to a vnode; returns with an iocount on *rvpp. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* Snapshot operations must be rooted at a mount point. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Get the snapshot directory (iocount on *sdvpp on success). */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	/* Copy the snapshot name in from user space. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; stopping early means one was found. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC policy check for create/delete; LOOKUP has no hook here. */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On any error, drop both iocounts and NULL the out-parameters. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
14222 
/*
 * create a filesystem snapshot (for supporting filesystems)
 *
 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
 * We get to the (unnamed) snapshot directory vnode and create the vnode
 * for the snapshot in it.
 *
 * Restrictions:
 *
 *    a) Passed in name for snapshot cannot have slashes.
 *    b) name can't be "." or ".."
 *
 * Since this requires superuser privileges, vnode_authorize calls are not
 * made.
 */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* Snapshot already exists: O_EXCL-like semantics. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		/* Create the snapshot as a regular file with mode 0. */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* No authorization/inheritance: caller was already vetted. */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	/* Release the name buffer and the iocounts from vnode_get_snapshot. */
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14284 
/*
 * Delete a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * delete the snapshot.
 */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* Returns with iocounts on rvp, snapdvp and the snapshot (ni_vp). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Remove without generating a namespace event for watchers. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14319 
/*
 * Revert a filesystem to a snapshot
 *
 * Marks the filesystem to revert to the given snapshot on next mount.
 */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* Resolve dirfd to a vnode; returns with an iocount held. */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy in the snapshot name from user space. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Build a componentname describing the snapshot to revert to. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/*
		 * Look up the snapshot vnode itself and issue the APFS
		 * revert ioctl directly on it.
		 */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
14408 
/*
 * rename a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * rename the snapshot. This is a very specialised (and simple) case of
 * rename(2) (which has to deal with a lot more complications). It differs
 * slightly from rename(2) in that EEXIST is returned if the new name exists.
 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/* Look up the existing snapshot (source of the rename). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp  = fromnd->ni_vp;

	/* Copy in the new name from user space. */
	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is suppossed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; stopping early means one was found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Renaming to a new name is treated as creating that name. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Look up the destination name within the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14511 
14512 /*
14513  * Mount a Filesystem snapshot
14514  *
14515  * get the vnode for the unnamed snapshot directory and the snapshot and
14516  * mount the snapshot.
14517  */
14518 static int __attribute__((noinline))
snapshot_mount(int dirfd,user_addr_t name,user_addr_t directory,__unused user_addr_t mnt_data,__unused uint32_t flags,vfs_context_t ctx)14519 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
14520     __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
14521 {
14522 	mount_t mp;
14523 	vnode_t rvp, snapdvp, snapvp, vp, pvp;
14524 	struct fs_snapshot_mount_args smnt_data;
14525 	int error;
14526 	struct nameidata *snapndp, *dirndp;
14527 	/* carving out a chunk for structs that are too big to be on stack. */
14528 	struct {
14529 		struct nameidata snapnd;
14530 		struct nameidata dirnd;
14531 	} * __snapshot_mount_data;
14532 
14533 	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
14534 	snapndp = &__snapshot_mount_data->snapnd;
14535 	dirndp = &__snapshot_mount_data->dirnd;
14536 
14537 	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
14538 	    OP_LOOKUP, ctx);
14539 	if (error) {
14540 		goto out;
14541 	}
14542 
14543 	snapvp  = snapndp->ni_vp;
14544 	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
14545 		error = EIO;
14546 		goto out1;
14547 	}
14548 
14549 	/* Get the vnode to be covered */
14550 	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
14551 	    UIO_USERSPACE, directory, ctx);
14552 	error = namei(dirndp);
14553 	if (error) {
14554 		goto out1;
14555 	}
14556 
14557 	vp = dirndp->ni_vp;
14558 	pvp = dirndp->ni_dvp;
14559 	mp = vnode_mount(rvp);
14560 
14561 	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
14562 		error = EINVAL;
14563 		goto out2;
14564 	}
14565 
14566 #if CONFIG_MACF
14567 	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
14568 	    mp->mnt_vfsstat.f_fstypename);
14569 	if (error) {
14570 		goto out2;
14571 	}
14572 #endif
14573 
14574 	smnt_data.sm_mp  = mp;
14575 	smnt_data.sm_cnp = &snapndp->ni_cnd;
14576 	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
14577 	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & (MNT_DONTBROWSE | MNT_IGNORE_OWNERSHIP),
14578 	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);
14579 
14580 out2:
14581 	vnode_put(vp);
14582 	vnode_put(pvp);
14583 	nameidone(dirndp);
14584 out1:
14585 	vnode_put(snapvp);
14586 	vnode_put(snapdvp);
14587 	vnode_put(rvp);
14588 	nameidone(snapndp);
14589 out:
14590 	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
14591 	return error;
14592 }
14593 
/*
 * Root from a snapshot of the filesystem
 *
 * Marks the filesystem to root from the given snapshot on next boot.
 */
static int __attribute__((noinline))
snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* Resolve dirfd to a vnode; returns with an iocount held. */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy in the snapshot name from user space. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Build a componentname describing the snapshot to root from. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	root_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	return error;
}
14654 
14655 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14656 vfs_context_can_snapshot(vfs_context_t ctx)
14657 {
14658 	static const char * const snapshot_entitlements[] = {
14659 		"com.apple.private.vfs.snapshot",
14660 		"com.apple.developer.vfs.snapshot",
14661 		"com.apple.private.apfs.arv.limited.snapshot",
14662 	};
14663 	static const size_t nentitlements =
14664 	    sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14665 	size_t i;
14666 
14667 	task_t task = vfs_context_task(ctx);
14668 	for (i = 0; i < nentitlements; i++) {
14669 		if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14670 			return TRUE;
14671 		}
14672 	}
14673 	return FALSE;
14674 }
14675 
/*
 * FS snapshot operations dispatcher
 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All snapshot operations require a snapshot entitlement. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Allowed if the caller is superuser, can write the backing
		 * device, or holds the user-snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14767