xref: /xnu-12377.81.4/bsd/vfs/vfs_syscalls.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796)
1 /*
2  * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1989, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  * (c) UNIX System Laboratories, Inc.
32  * All or some portions of this file are derived from material licensed
33  * to the University of California by American Telephone and Telegraph
34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35  * the permission of UNIX System Laboratories, Inc.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.41 (Berkeley) 6/15/95
66  */
67 /*
68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69  * support for mandatory and extensible security protections.  This notice
70  * is included in support of clause 2.2 (b) of the Apple Public License,
71  * Version 2.0.
72  */
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/syslimits.h> /* For MAXLONGPATHLEN */
77 #include <sys/namei.h>
78 #include <sys/filedesc.h>
79 #include <sys/kernel.h>
80 #include <sys/file_internal.h>
81 #include <sys/stat.h>
82 #include <sys/vnode_internal.h>
83 #include <sys/mount_internal.h>
84 #include <sys/proc_internal.h>
85 #include <sys/kauth.h>
86 #include <sys/uio_internal.h>
87 #include <kern/kalloc.h>
88 #include <sys/mman.h>
89 #include <sys/dirent.h>
90 #include <sys/attr.h>
91 #include <sys/sysctl.h>
92 #include <sys/ubc.h>
93 #include <sys/quota.h>
94 #include <sys/kdebug.h>
95 #include <sys/fsevents.h>
96 #include <sys/imgsrc.h>
97 #include <sys/sysproto.h>
98 #include <sys/sysctl.h>
99 #include <sys/xattr.h>
100 #include <sys/fcntl.h>
101 #include <sys/stdio.h>
102 #include <sys/fsctl.h>
103 #include <sys/ubc_internal.h>
104 #include <sys/disk.h>
105 #include <sys/content_protection.h>
106 #include <sys/clonefile.h>
107 #include <sys/snapshot.h>
108 #include <sys/priv.h>
109 #include <sys/fsgetpath.h>
110 #include <machine/cons.h>
111 #include <machine/limits.h>
112 #include <miscfs/specfs/specdev.h>
113 
114 #include <vfs/vfs_disk_conditioner.h>
115 #if CONFIG_EXCLAVES
116 #include <vfs/vfs_exclave_fs.h>
117 #endif
118 
119 #include <security/audit/audit.h>
120 #include <bsm/audit_kevents.h>
121 
122 #include <mach/mach_types.h>
123 #include <kern/exc_guard.h>
124 #include <kern/kern_types.h>
125 #include <kern/kalloc.h>
126 #include <kern/task.h>
127 
128 #include <vm/vm_pageout.h>
129 #include <vm/vm_protos.h>
130 #include <vm/memory_object_xnu.h>
131 
132 #include <libkern/OSAtomic.h>
133 #include <os/atomic_private.h>
134 #include <pexpert/pexpert.h>
135 #include <IOKit/IOBSD.h>
136 
137 // deps for MIG call
138 #include <kern/host.h>
139 #include <kern/ipc_misc.h>
140 #include <mach/host_priv.h>
141 #include <mach/vfs_nspace.h>
142 #include <os/log.h>
143 
144 #include <nfs/nfs_conf.h>
145 
146 #if ROUTEFS
147 #include <miscfs/routefs/routefs.h>
148 #endif /* ROUTEFS */
149 
150 #if CONFIG_MACF
151 #include <security/mac.h>
152 #include <security/mac_framework.h>
153 #endif
154 
155 #if CONFIG_FSE
156 #define GET_PATH(x) \
157 	((x) = get_pathbuff())
158 #define RELEASE_PATH(x) \
159 	release_pathbuff(x)
160 #else
161 #define GET_PATH(x)     \
162 	((x) = zalloc(ZV_NAMEI))
163 #define RELEASE_PATH(x) \
164 	zfree(ZV_NAMEI, x)
165 #endif /* CONFIG_FSE */
166 
167 #ifndef HFS_GET_BOOT_INFO
168 #define HFS_GET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00004)
169 #endif
170 
171 #ifndef HFS_SET_BOOT_INFO
172 #define HFS_SET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00005)
173 #endif
174 
175 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
176 #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
177 #endif
178 
179 extern void disk_conditioner_unmount(mount_t mp);
180 
181 /* struct for checkdirs iteration */
182 struct cdirargs {
183 	vnode_t olddp;
184 	vnode_t newdp;
185 };
186 /* callback  for checkdirs iteration */
187 static int checkdirs_callback(proc_t p, void * arg);
188 
189 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
190 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
191 void enablequotas(struct mount *mp, vfs_context_t ctx);
192 static int getfsstat_callback(mount_t mp, void * arg);
193 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
194 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
195 static int sync_callback(mount_t, void *);
196 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
197     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
198     boolean_t partial_copy);
199 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
200 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
201     struct componentname *cnp, user_addr_t fsmountargs,
202     int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
203 void vfs_notify_mount(vnode_t pdvp);
204 
205 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
206 
207 struct fd_vn_data * fg_vn_data_alloc(void);
208 
209 /*
210  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
211  * Concurrent lookups (or lookups by ids) on hard links can cause the
212  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
213  * does) to return ENOENT as the path cannot be returned from the name cache
214  * alone. We have no option but to retry and hope to get one namei->reverse path
215  * generation done without an intervening lookup, lookup by id on the hard link
216  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
217  * which currently are the MAC hooks for rename, unlink and rmdir.
218  */
219 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
220 
221 /* Max retry limit for rename due to vnode recycling. */
222 #define MAX_RENAME_ERECYCLE_RETRIES 1024
223 
224 #define MAX_LINK_ENOENT_RETRIES 1024
225 
226 /* Max retries for concurrent mounts on the same covered vnode. */
227 #define MAX_MOUNT_RETRIES       10
228 
229 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
230     int unlink_flags);
231 
232 #ifdef CONFIG_IMGSRC_ACCESS
233 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
234 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
235 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
236 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
237 static void mount_end_update(mount_t mp);
238 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
239 #endif /* CONFIG_IMGSRC_ACCESS */
240 
241 //snapshot functions
242 #if CONFIG_MNT_ROOTSNAP
243 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
244 #else
245 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
246 #endif
247 
248 __private_extern__
249 int sync_internal(void);
250 
251 __private_extern__
252 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
253 
254 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
255 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
256 
257 /* vars for sync mutex */
258 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
259 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
260 
261 extern lck_rw_t rootvnode_rw_lock;
262 
263 VFS_SMR_DECLARE;
264 extern uint32_t nc_smr_enabled;
265 
266 /*
267  * incremented each time a mount or unmount operation occurs
268  * used to invalidate the cached value of the rootvp in the
269  * mount structure utilized by cache_lookup_path
270  */
271 uint32_t mount_generation = 0;
272 
273 /* counts number of mount and unmount operations */
274 unsigned int vfs_nummntops = 0;
275 
276 /* system-wide, per-boot unique mount ID */
277 static _Atomic uint64_t mount_unique_id = 1;
278 
279 extern const struct fileops vnops;
280 #if CONFIG_APPLEDOUBLE
281 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
282 #endif /* CONFIG_APPLEDOUBLE */
283 
284 
285 /*
286  * Virtual File System System Calls
287  */
288 
289 /*
290  * Private in-kernel mounting spi (specific use-cases only)
291  */
292 boolean_t
vfs_iskernelmount(mount_t mp)293 vfs_iskernelmount(mount_t mp)
294 {
295 	return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
296 }
297 
/*
 * In-kernel mount entry point (see vfs_mount_at_path() for the restricted
 * public wrapper).  The caller may either supply the covered vnode `vp`
 * and its parent `pvp` (each with an iocount held by the caller), or pass
 * NULLVP for `vp`, in which case `path` is resolved here via namei() to
 * obtain both.
 *
 * Returns 0 on success, or an errno from the lookup or mount_common().
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;	/* TRUE iff we own nd's iocounts/pathbuf */
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
	if (syscall_flags & MNT_NOFOLLOW) {
		/* MNT_NOFOLLOW: refuse to traverse any symlink in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	/* Strip kernel-mount flags that callers are not allowed to set. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the vnodes; mount_common() still needs a
		 * componentname, so point it at the caller's path buffer.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	/* Tag this as an in-kernel mount for mount_common(). */
	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	if (did_namei) {
		/* Release the iocounts and path buffer namei() acquired. */
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
350 
351 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)352 vfs_mount_at_path(const char *fstype, const char *path,
353     vnode_t pvp, vnode_t vp, void *data, size_t datalen,
354     int mnt_flags, int flags)
355 {
356 	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
357 	int error, km_flags = 0;
358 	vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
359 
360 	/*
361 	 * This call is currently restricted to specific use cases.
362 	 */
363 	if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
364 		return ENOTSUP;
365 	}
366 
367 #if !defined(XNU_TARGET_OS_OSX)
368 	if (strcmp(fstype, "lifs") == 0) {
369 		syscall_flags |= MNT_NOEXEC;
370 	}
371 #endif
372 
373 	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
374 		km_flags |= KERNEL_MOUNT_NOAUTH;
375 	}
376 	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
377 		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
378 	}
379 
380 	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
381 	    syscall_flags, km_flags, ctx);
382 	if (error) {
383 		printf("%s: mount on %s failed, error %d\n", __func__, path,
384 		    error);
385 	}
386 
387 	return error;
388 }
389 
390 /*
391  * Mount a file system.
392  */
393 /* ARGSUSED */
394 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)395 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
396 {
397 	struct __mac_mount_args muap;
398 
399 	muap.type = uap->type;
400 	muap.path = uap->path;
401 	muap.flags = uap->flags;
402 	muap.data = uap->data;
403 	muap.mac_p = USER_ADDR_NULL;
404 	return __mac_mount(p, &muap, retval);
405 }
406 
/*
 * fmount(2): mount a file system over the directory referenced by an open
 * file descriptor instead of a path.
 *
 * Indirect:	uap->fd		Open fd for the directory to be covered
 *		uap->type	Filesystem type name (user string)
 *		uap->flags	MNT_* mount flags
 *		uap->data	Filesystem-specific mount arguments
 *
 * Returns:	0		Success
 *		!0		errno
 */
int
fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
{
	struct componentname    cn;
	vfs_context_t           ctx = vfs_context_current();
	size_t                  dummy = 0;
	int                     error;
	int                     flags = uap->flags;
	char                    fstypename[MFSNAMELEN];
	char                    *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
	vnode_t                 pvp;
	vnode_t                 vp;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, flags);
	/* fstypename will get audited by mount_common */

	/* Sanity check the flags: imgsrc/rootfs and union mounts not allowed here. */
	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
		return ENOTSUP;
	}

	if (flags & MNT_UNION) {
		return EPERM;
	}

	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/* Resolve the fd to its vnode (holds a file reference until file_drop). */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}

	/* Take an iocount on the vnode for the duration of the mount. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(uap->fd);
		return error;
	}

	/*
	 * mount_common() needs the parent of the vnode to be covered.  If no
	 * parent is available, distinguish "already a mount point / fs root"
	 * (EBUSY) from an otherwise unusable vnode (EINVAL).
	 */
	pvp = vnode_getparent(vp);
	if (pvp == NULL) {
		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
			error = EBUSY;
		} else {
			error = EINVAL;
		}
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	/* Build a componentname carrying the covered vnode's full path. */
	memset(&cn, 0, sizeof(struct componentname));
	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	cn.cn_pnlen = MAXPATHLEN;

	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
		zfree(ZV_NAMEI, cn.cn_pnbuf);
		vnode_put(pvp);
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);

	/* Release the path buffer, both iocounts, and the fd reference. */
	zfree(ZV_NAMEI, cn.cn_pnbuf);
	vnode_put(pvp);
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
480 
481 #define MAX_GRAFT_METADATA_SIZE             16384 /* bytes */
482 
483 /*
484  * Get the size of a graft file (a manifest or payload file).
485  * The vp should be an iocounted vnode.
486  */
487 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)488 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
489 {
490 	struct stat64 sb = {};
491 	int error;
492 
493 	*size = 0;
494 
495 	error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
496 	if (error) {
497 		return error;
498 	}
499 
500 	if (sb.st_size == 0) {
501 		error = ENODATA;
502 	} else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
503 		error = EFBIG;
504 	} else {
505 		*size = (size_t) sb.st_size;
506 	}
507 
508 	return error;
509 }
510 
511 /*
512  * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
513  * `size` must already be validated.
514  */
515 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)516 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
517 {
518 	return vn_rdwr(UIO_READ, graft_vp,
519 	           (caddr_t) buf, (int) size, /* offset */ 0,
520 	           UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
521 	           vfs_context_ucred(vctx), /* resid */ NULL,
522 	           vfs_context_proc(vctx));
523 }
524 
/*
 * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
 * and read it into `buf`.
 * If `path_prefix` is non-NULL, verify that the file path has that prefix
 * (EINVAL otherwise).
 *
 * `buf` must be at least MAX_GRAFT_METADATA_SIZE bytes; the size check in
 * get_and_verify_graft_metadata_vp_size() caps *size at that limit.
 */
static int
graft_secureboot_read_fd(int fd, vfs_context_t vctx, const char *path_prefix, size_t *size, void *buf)
{
	vnode_t metadata_vp = NULLVP;
	char *path = NULL;
	int error;

	// Convert this graft fd to a vnode (takes an iocount on success).
	if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
		goto out;
	}

	// Verify that the vnode path starts with `path_prefix` if it was passed.
	if (path_prefix) {
		int len = MAXPATHLEN;
		path = zalloc(ZV_NAMEI);
		if ((error = vn_getpath(metadata_vp, path, &len))) {
			goto out;
		}
		if (strncmp(path, path_prefix, strlen(path_prefix))) {
			error = EINVAL;
			goto out;
		}
	}

	// Get (and validate) size information.
	if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
		goto out;
	}

	// Read each file into the provided buffer - we must get the expected amount of bytes.
	if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
		goto out;
	}

out:
	// Release the path buffer and the vnode iocount acquired above.
	if (path) {
		zfree(ZV_NAMEI, path);
	}
	if (metadata_vp) {
		vnode_put(metadata_vp);
		metadata_vp = NULLVP;
	}

	return error;
}
576 
577 #if XNU_TARGET_OS_OSX
578 #define BASESYSTEM_PATH "/System/Library/BaseSystem/"
579 #if defined(__arm64e__)
580 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/manifests/"
581 #define MOBILE_ASSET_DATA_VAULT_RECOVERYOS_PATH "/System/Volumes/Data/System/Library/AssetsV2/manifests/"
582 #else /* x86_64 */
583 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/"
584 #define MOBILE_ASSET_DATA_VAULT_RECOVERYOS_PATH "/System/Volumes/Update/MobileAsset/AssetsV2/"
585 #endif /* x86_64 */
586 #else /* !XNU_TARGET_OS_OSX */
587 #define MOBILE_ASSET_DATA_VAULT_PATH "/private/var/MobileAsset/AssetsV2/manifests/"
588 #endif /* !XNU_TARGET_OS_OSX */
589 
/*
 * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
 * provided in `gfs`, saving the size of data read in `gfs`.
 *
 * For Mobile Asset graft types the authentic manifest is additionally
 * required to reside under the Mobile Asset data-vault path prefix.
 */
static int
graft_secureboot_read_metadata(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, fsioc_graft_fs_t *gfs)
{
	const char *manifest_path_prefix = NULL;	/* NULL = no prefix enforcement */
	int error;

	// For Mobile Asset, make sure that the manifest comes from a data vault.
	if ((graft_type == GRAFTDMG_CRYPTEX_MOBILE_ASSET) ||
	    (graft_type == GRAFTDMG_CRYPTEX_MOBILE_ASSET_WITH_CODE)) {
		manifest_path_prefix = MOBILE_ASSET_DATA_VAULT_PATH;
#if XNU_TARGET_OS_OSX
		// Check if we're in RecoveryOS by checking for BaseSystem path
		// existence, and if so use the Data volume path of the data vault.
		struct nameidata nd;
		NDINIT(&nd, LOOKUP, OP_LOOKUP, NOFOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(BASESYSTEM_PATH), vctx);
		if (!namei(&nd)) {
			vnode_t vp = nd.ni_vp;
			if (vp->v_type == VDIR) {
				manifest_path_prefix = MOBILE_ASSET_DATA_VAULT_RECOVERYOS_PATH;
			}
			// Drop the lookup's iocount and path buffer either way.
			vnode_put(vp);
			nameidone(&nd);
		}
#endif
	}

	// Read the authentic manifest.
	if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
	    manifest_path_prefix, &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
		return error;
	}

	// The user manifest is currently unused, but set its size.
	gfs->user_manifest_size = 0;

	// Read the payload.
	if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
	    NULL, &gfs->payload_size, gfs->payload))) {
		return error;
	}

	return 0;
}
639 
/*
 * Call into the filesystem to verify and graft a cryptex.
 *
 * `cryptex_vp` is the (iocounted) cryptex disk image vnode; `mounton_vp`,
 * if non-NULL, is the (iocounted) directory to graft onto.  The manifest
 * and payload referenced by `sbc_args` are read into kernel buffers and
 * handed to the filesystem via FSIOC_GRAFT_FS, which performs validation.
 */
static int
graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
{
	fsioc_graft_fs_t gfs = {};
	uint64_t graft_dir_ino = 0;
	struct stat64 sb = {};
	int error;

	// Pre-flight arguments.
	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
		// Make sure that this graft version matches what we support.
		return ENOTSUP;
	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
		// For this type, cryptex VP must live on same volume as the target of graft.
		return EXDEV;
	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
		// We cannot graft upon non-directories.
		return ENOTDIR;
	} else if (cryptex_vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) {
		// We do not allow grafts inside disk images.
		return ENODEV;
	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
	    sbc_args->sbc_payload_fd < 0) {
		// We cannot graft without a manifest and payload.
		return EINVAL;
	}

	if (mounton_vp) {
		// Get the mounton's inode number.
		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
		if (error) {
			return error;
		}
		graft_dir_ino = (uint64_t) sb.st_ino;
	}

	// Create buffers (of our maximum-defined size) to store authentication info.
	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);

	if (!gfs.authentic_manifest || !gfs.payload) {
		// Either allocation may have succeeded; `out` frees whichever did.
		error = ENOMEM;
		goto out;
	}

	// Read our fd's into our buffers.
	// (Note that this will set the buffer size fields in `gfs`.)
	error = graft_secureboot_read_metadata(graft_type, sbc_args, vctx, &gfs);
	if (error) {
		goto out;
	}

	// Translate the caller-supplied SBC_* flags into FSCTL_GRAFT_* flags.
	gfs.graft_version = FSIOC_GRAFT_VERSION;
	gfs.graft_type = graft_type;
	gfs.graft_4cc = sbc_args->sbc_4cc;
	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
	}
	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
	}
	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
	}
	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
	}
	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
	}
	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
	}
	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)

	// Call into the FS to perform the graft (and validation).
	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);

out:
	if (gfs.authentic_manifest) {
		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
		gfs.authentic_manifest = NULL;
	}
	if (gfs.payload) {
		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
		gfs.payload = NULL;
	}

	return error;
}
734 
735 #define GRAFTDMG_ENTITLEMENT  "com.apple.private.vfs.graftdmg"
736 
/*
 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
 *
 * If `mountdir` is NULL, the graft targets the parent directory of the
 * cryptex image itself.  Requires the com.apple.private.vfs.graftdmg
 * entitlement.
 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;
	/* No mount-on dir supplied: graft onto the image's parent directory. */
	bool graft_on_parent = (ua_mountdir == USER_ADDR_NULL);

	vnode_t cryptex_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();
#if CONFIG_MACF
	vnode_t parent_vp = NULLVP;
#endif

	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	// Copy graftargs in, if provided.
	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Convert fd to vnode (takes an iocount on success).
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		return error;
	}

	// The cryptex image must be a regular file, not a directory.
	if (vnode_isdir(cryptex_vp)) {
		error = EISDIR;
		goto graftout;
	}

#if CONFIG_MACF
	if (graft_on_parent) {
		// Grafting on Cryptex file parent directory, need to get its vp for MAC check.
		parent_vp = vnode_getparent(cryptex_vp);
		if (parent_vp == NULLVP) {
			error = ENOENT;
			goto graftout;
		}
	}
#endif

	// Resolve the explicit mount-on directory, if one was given.
	if (!graft_on_parent) {
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(&nd);
		if (error) {
			goto graftout;
		}
	}

#if CONFIG_MACF
	// MAC check against whichever vnode is actually being grafted upon.
	vnode_t macf_vp = graft_on_parent ? parent_vp : nd.ni_vp;
	error = mac_graft_check_graft(ctx, macf_vp);
	if (error) {
		goto graftout;
	}
#endif

	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx,
		    cryptex_vp, graft_on_parent ? NULLVP : nd.ni_vp);
	}

#if CONFIG_MACF
	if (!error) {
		mac_graft_notify_graft(ctx, macf_vp);
	}
#endif

graftout:
	// Unwind: parent iocount (MACF only), cryptex iocount, then the lookup.
#if CONFIG_MACF
	if (parent_vp != NULLVP) {
		vnode_put(parent_vp);
		parent_vp = NULLVP;
	}
#endif
	if (cryptex_vp != NULLVP) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (nd.ni_vp != NULLVP) {
		vnode_put(nd.ni_vp);
		nameidone(&nd);
	}

	return error;
}
843 
844 /*
845  * Ungraft a cryptex disk image (via mount dir FD)
846  * { int ungraftdmg(const char *mountdir, uint64_t flags); }
847  */
848 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)849 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
850 {
851 	int error = 0;
852 	user_addr_t ua_mountdir = uap->mountdir;
853 	fsioc_ungraft_fs_t ugfs = {};
854 	struct nameidata nd = {};
855 	vfs_context_t ctx = vfs_context_current();
856 
857 	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
858 		return EPERM;
859 	}
860 
861 	if (ua_mountdir == USER_ADDR_NULL) {
862 		return EINVAL;
863 	}
864 
865 	if (uap->flags & UNGRAFTDMG_NOFORCE) {
866 		ugfs.ungraft_flags |= FSCTL_UNGRAFT_NOFORCE;
867 	}
868 
869 	// Acquire vnode for mount-on path
870 	NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
871 	    UIO_USERSPACE, ua_mountdir, ctx);
872 
873 	error = namei(&nd);
874 	if (error) {
875 		return error;
876 	}
877 
878 	if (!vnode_isdir(nd.ni_vp)) {
879 		error = ENOTDIR;
880 		goto ungraftout;
881 	}
882 
883 #if CONFIG_MACF
884 	error = mac_graft_check_ungraft(ctx, nd.ni_vp);
885 	if (error) {
886 		goto ungraftout;
887 	}
888 #endif
889 
890 	// Call into the FS to perform the ungraft
891 	error = VNOP_IOCTL(nd.ni_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
892 
893 #if CONFIG_MACF
894 	if (!error) {
895 		mac_graft_notify_ungraft(ctx, nd.ni_vp);
896 	}
897 #endif
898 
899 ungraftout:
900 	vnode_put(nd.ni_vp);
901 	nameidone(&nd);
902 
903 	return error;
904 }
905 
906 
/*
 * Announce a completed mount: broadcast a VQ_MOUNT vfs event and post a
 * NOTE_WRITE knote on `pdvp`, the parent directory of the covered vnode,
 * so directory watchers observe the change.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
913 
914 /*
915  * __mac_mount:
916  *	Mount a file system taking into account MAC label behavior.
917  *	See mount(2) man page for more information
918  *
919  * Parameters:    p                        Process requesting the mount
920  *                uap                      User argument descriptor (see below)
921  *                retval                   (ignored)
922  *
923  * Indirect:      uap->type                Filesystem type
924  *                uap->path                Path to mount
925  *                uap->data                Mount arguments
926  *                uap->mac_p               MAC info
927  *                uap->flags               Mount flags
928  *
929  *
930  * Returns:        0                       Success
931  *                !0                       Not success
932  */
933 boolean_t root_fs_upgrade_try = FALSE;
934 
935 #define MAX_NESTED_UNION_MOUNTS  10
936 
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULLVP;           /* parent of the covered vnode (from WANTPARENT) */
	vnode_t vp = NULLVP;            /* vnode to be covered by the new mount */
	int need_nameidone = 0;         /* nonzero once namei() state must be torn down */
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];    /* filesystem type name copied in from uap->type */
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;          /* optional MAC label string copied in from user space */
	size_t labelsz = 0;
	int flags = uap->flags;         /* local copy; may gain/lose MNT_UPDATE below */
	int error;
	int num_retries = 0;            /* EBUSY retry counter, bounded by MAX_MOUNT_RETRIES */
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

retry:
	/*
	 * Get the vnode to be covered
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		/* caller asked that no symlink anywhere in the path be followed */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		/*
		 * Note: exact equality (not a bit test) — MNT_IMGSRC_BY_INDEX
		 * must be the only flag set, so the final argument is
		 * necessarily TRUE here.
		 */
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* struct mac layout differs between 32- and 64-bit callers */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Reject degenerate or oversized label buffers before allocating */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			/* labelstr is released at 'out' via kfree_data() */
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

	if (flags & MNT_UNION) {
#if CONFIG_UNION_MOUNTS
		mount_t mp = vp->v_mount;
		int nested_union_mounts = 0;

		/* shared lock is enough: we only read the vnodecovered chain */
		name_cache_lock_shared();

		/* Walk up the vnodecovered chain and check for nested union mounts. */
		mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);
		while (mp) {
			if (!(mp->mnt_flag & MNT_UNION)) {
				break;
			}
			mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);

			/*
			 * Limit the max nested union mounts to prevent stack exhaustion
			 * when calling lookup_traverse_union().
			 */
			if (++nested_union_mounts >= MAX_NESTED_UNION_MOUNTS) {
				error = ELOOP;
				break;
			}
		}

		name_cache_unlock();
		/* 'error' is 0 on entry here unless the loop above set ELOOP */
		if (error) {
			goto out;
		}
#else
		/* union mounts not compiled into this kernel */
		error = EPERM;
		goto out;
#endif /* CONFIG_UNION_MOUNTS */
	}

	/* Mounting over the root vnode of the root filesystem? */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mounting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	/* Hand off to the common (final-stage) mount path; internal_flags == 0 for user mounts */
	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/* kfree_data() also resets 'labelstr' so a retry pass starts clean */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	/* Drop the iocounts taken by namei() and release its pathname state */
	if (vp) {
		vnode_put(vp);
		vp = NULLVP;
	}
	if (pvp) {
		vnode_put(pvp);
		pvp = NULLVP;
	}
	if (need_nameidone) {
		nameidone(&nd);
		need_nameidone = 0;
	}

	if (error == EBUSY) {
		/* Retry the lookup and mount again due to concurrent mounts. */
		if (++num_retries < MAX_MOUNT_RETRIES) {
			goto retry;
		}
	}

	return error;
}
1137 
1138 /*
1139  * common mount implementation (final stage of mounting)
1140  *
1141  * Arguments:
1142  *  fstypename	file system type (ie it's vfs name)
1143  *  pvp		parent of covered vnode
1144  *  vp		covered vnode
1145  *  cnp		component name (ie path) of covered vnode
1146  *  flags	generic mount flags
1147  *  fsmountargs	file system specific data
1148  *  labelstr	optional MAC label
1149  *  kernelmount	TRUE for mounts initiated from inside the kernel
1150  *  ctx		caller's context
1151  */
1152 static int
mount_common(const char * fstypename,vnode_t pvp,vnode_t vp,struct componentname * cnp,user_addr_t fsmountargs,int flags,uint32_t internal_flags,char * labelstr,vfs_context_t ctx)1153 mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
1154     struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
1155     char *labelstr, vfs_context_t ctx)
1156 {
1157 #if !CONFIG_MACF
1158 #pragma unused(labelstr)
1159 #endif
1160 	struct vnode *devvp = NULLVP;
1161 	struct vnode *device_vnode = NULLVP;
1162 #if CONFIG_MACF
1163 	struct vnode *rvp;
1164 #endif
1165 	struct mount *mp = NULL;
1166 	struct vfstable *vfsp = (struct vfstable *)0;
1167 	struct proc *p = vfs_context_proc(ctx);
1168 	int error, flag = 0;
1169 	bool flag_set = false;
1170 	user_addr_t devpath = USER_ADDR_NULL;
1171 	int ronly = 0;
1172 	int mntalloc = 0;
1173 	boolean_t vfsp_ref = FALSE;
1174 	boolean_t is_rwlock_locked = FALSE;
1175 	boolean_t did_rele = FALSE;
1176 	boolean_t have_usecount = FALSE;
1177 	boolean_t did_set_lmount = FALSE;
1178 	boolean_t did_set_vmount = FALSE;
1179 	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
1180 
1181 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
1182 	/* Check for mutually-exclusive flag bits */
1183 	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
1184 	int bitcount = 0;
1185 	while (checkflags != 0) {
1186 		checkflags &= (checkflags - 1);
1187 		bitcount++;
1188 	}
1189 
1190 	if (bitcount > 1) {
1191 		//not allowed to request multiple mount-by-role flags
1192 		error = EINVAL;
1193 		goto out1;
1194 	}
1195 #endif
1196 
1197 	/*
1198 	 * Process an update for an existing mount
1199 	 */
1200 	if (flags & MNT_UPDATE) {
1201 		if ((vp->v_flag & VROOT) == 0) {
1202 			error = EINVAL;
1203 			goto out1;
1204 		}
1205 		mp = vp->v_mount;
1206 
1207 		/* if unmount or mount in progress, return error */
1208 		mount_lock_spin(mp);
1209 		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
1210 			mount_unlock(mp);
1211 			error = EBUSY;
1212 			goto out1;
1213 		}
1214 		mp->mnt_lflag |= MNT_LMOUNT;
1215 		did_set_lmount = TRUE;
1216 		mount_unlock(mp);
1217 		lck_rw_lock_exclusive(&mp->mnt_rwlock);
1218 		is_rwlock_locked = TRUE;
1219 		/*
1220 		 * We only allow the filesystem to be reloaded if it
1221 		 * is currently mounted read-only.
1222 		 */
1223 		if ((flags & MNT_RELOAD) &&
1224 		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1225 			error = ENOTSUP;
1226 			goto out1;
1227 		}
1228 
1229 		/*
1230 		 * If content protection is enabled, update mounts are not
1231 		 * allowed to turn it off.
1232 		 */
1233 		if ((mp->mnt_flag & MNT_CPROTECT) &&
1234 		    ((flags & MNT_CPROTECT) == 0)) {
1235 			error = EINVAL;
1236 			goto out1;
1237 		}
1238 
1239 		/*
1240 		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
1241 		 * failure to return an error for this so we'll just silently
1242 		 * add it if it is not passed in.
1243 		 */
1244 		if ((mp->mnt_flag & MNT_REMOVABLE) &&
1245 		    ((flags & MNT_REMOVABLE) == 0)) {
1246 			flags |= MNT_REMOVABLE;
1247 		}
1248 
1249 		/* Can't downgrade the backer of the root FS */
1250 		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
1251 		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
1252 			error = ENOTSUP;
1253 			goto out1;
1254 		}
1255 
1256 		/*
1257 		 * Only root, or the user that did the original mount is
1258 		 * permitted to update it.
1259 		 */
1260 		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1261 		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
1262 			goto out1;
1263 		}
1264 #if CONFIG_MACF
1265 		error = mac_mount_check_remount(ctx, mp, flags);
1266 		if (error != 0) {
1267 			goto out1;
1268 		}
1269 #endif
1270 		/*
1271 		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
1272 		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
1273 		 */
1274 		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1275 			flags |= MNT_NOSUID | MNT_NODEV;
1276 			if (mp->mnt_flag & MNT_NOEXEC) {
1277 				flags |= MNT_NOEXEC;
1278 			}
1279 		}
1280 		flag = mp->mnt_flag;
1281 		flag_set = true;
1282 
1283 
1284 
1285 		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
1286 
1287 		vfsp = mp->mnt_vtable;
1288 		goto update;
1289 	} // MNT_UPDATE
1290 
1291 	/*
1292 	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
1293 	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
1294 	 */
1295 	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1296 		flags |= MNT_NOSUID | MNT_NODEV;
1297 		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
1298 			flags |= MNT_NOEXEC;
1299 		}
1300 	}
1301 
1302 	/* XXXAUDIT: Should we capture the type on the error path as well? */
1303 	/* XXX cast-away const (audit_arg_text() does not modify its input) */
1304 	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
1305 	mount_list_lock();
1306 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1307 		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
1308 			vfsp->vfc_refcount++;
1309 			vfsp_ref = TRUE;
1310 			break;
1311 		}
1312 	}
1313 	mount_list_unlock();
1314 	if (vfsp == NULL) {
1315 		error = ENODEV;
1316 		goto out1;
1317 	}
1318 
1319 	/*
1320 	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
1321 	 * except in ROSV configs and for the initial BaseSystem root.
1322 	 */
1323 	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
1324 	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
1325 	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
1326 		error = EINVAL;  /* unsupported request */
1327 		goto out1;
1328 	}
1329 
1330 	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
1331 	if (error != 0) {
1332 		goto out1;
1333 	}
1334 
1335 	/*
1336 	 * Upon successful of prepare_coveredvp(), VMOUNT is set for the covered vp.
1337 	 */
1338 	did_set_vmount = TRUE;
1339 
1340 	/*
1341 	 * Allocate and initialize the filesystem (mount_t)
1342 	 */
1343 	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
1344 	mntalloc = 1;
1345 
1346 	/* Initialize the default IO constraints */
1347 	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
1348 	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
1349 	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
1350 	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
1351 	mp->mnt_devblocksize = DEV_BSIZE;
1352 	mp->mnt_alignmentmask = PAGE_MASK;
1353 	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
1354 	mp->mnt_ioscale = 1;
1355 	mp->mnt_ioflags = 0;
1356 	mp->mnt_realrootvp = NULLVP;
1357 	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
1358 
1359 	mp->mnt_lflag |= MNT_LMOUNT;
1360 	did_set_lmount = TRUE;
1361 
1362 	TAILQ_INIT(&mp->mnt_vnodelist);
1363 	TAILQ_INIT(&mp->mnt_workerqueue);
1364 	TAILQ_INIT(&mp->mnt_newvnodes);
1365 	mount_lock_init(mp);
1366 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
1367 	is_rwlock_locked = TRUE;
1368 	mp->mnt_op = vfsp->vfc_vfsops;
1369 	mp->mnt_vtable = vfsp;
1370 	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
1371 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
1372 	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
1373 	do {
1374 		size_t pathlen = MAXPATHLEN;
1375 
1376 		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
1377 			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1378 		}
1379 	} while (0);
1380 	mp->mnt_vnodecovered = vp;
1381 	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
1382 	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
1383 	mp->mnt_devbsdunit = 0;
1384 	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
1385 
1386 	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
1387 	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
1388 
1389 	if (kernelmount) {
1390 		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
1391 	}
1392 	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
1393 		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
1394 	}
1395 
1396 	if (KERNEL_MOUNT_DEVFS & internal_flags) {
1397 		// kernel mounted devfs
1398 		mp->mnt_kern_flag |= MNTK_SYSTEM;
1399 	}
1400 
1401 update:
1402 
1403 	/*
1404 	 * Set the mount level flags.
1405 	 */
1406 	if (flags & MNT_RDONLY) {
1407 		mp->mnt_flag |= MNT_RDONLY;
1408 	} else if (mp->mnt_flag & MNT_RDONLY) {
1409 		// disallow read/write upgrades of file systems that
1410 		// had the TYPENAME_OVERRIDE feature set.
1411 		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
1412 			error = EPERM;
1413 			goto out1;
1414 		}
1415 		mp->mnt_kern_flag |= MNTK_WANTRDWR;
1416 	}
1417 	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1418 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1419 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1420 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1421 	    MNT_QUARANTINE | MNT_CPROTECT);
1422 
1423 #if SECURE_KERNEL
1424 #if !CONFIG_MNT_SUID
1425 	/*
1426 	 * On release builds of iOS based platforms, always enforce NOSUID on
1427 	 * all mounts. We do this here because we can catch update mounts as well as
1428 	 * non-update mounts in this case.
1429 	 */
1430 	mp->mnt_flag |= (MNT_NOSUID);
1431 #endif
1432 #endif
1433 
1434 	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1435 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1436 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1437 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1438 	    MNT_QUARANTINE | MNT_CPROTECT);
1439 
1440 #if CONFIG_MACF
1441 	if (flags & MNT_MULTILABEL) {
1442 		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
1443 			error = EINVAL;
1444 			goto out1;
1445 		}
1446 		mp->mnt_flag |= MNT_MULTILABEL;
1447 	}
1448 #endif
1449 	/*
1450 	 * Process device path for local file systems if requested.
1451 	 *
1452 	 * Snapshot and mount-by-role mounts do not use this path; they are
1453 	 * passing other opaque data in the device path field.
1454 	 *
1455 	 * Basesystemroot mounts pass a device path to be resolved here,
1456 	 * but it's just a char * already inside the kernel, which
1457 	 * kernel_mount() shoved into a user_addr_t to call us. So for such
1458 	 * mounts we must skip copyin (both of the address and of the string
1459 	 * (in NDINIT).
1460 	 */
1461 	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
1462 	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
1463 		boolean_t do_copyin_devpath = true;
1464 #if CONFIG_BASESYSTEMROOT
1465 		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1466 			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worh nothing:
1467 			// We have been passed fsmountargs, which is typed as a user_addr_t,
1468 			// but is actually a char ** pointing to a (kernelspace) string.
1469 			// We manually unpack it with a series of casts and dereferences
1470 			// that reverses what was done just above us on the stack in
1471 			// imageboot_pivot_image().
1472 			// After retrieving the path to the dev node (which we will NDINIT
1473 			// in a moment), we pass NULL fsmountargs on to the filesystem.
1474 			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
1475 			char **devnamepp = (char **)fsmountargs;
1476 			char *devnamep = *devnamepp;
1477 			devpath = CAST_USER_ADDR_T(devnamep);
1478 			do_copyin_devpath = false;
1479 			fsmountargs = USER_ADDR_NULL;
1480 
1481 			//Now that we have a mp, denote that this mount is for the basesystem.
1482 			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
1483 		}
1484 #endif // CONFIG_BASESYSTEMROOT
1485 
1486 		if (do_copyin_devpath) {
1487 			if (vfs_context_is64bit(ctx)) {
1488 				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1489 					goto out1;
1490 				}
1491 				fsmountargs += sizeof(devpath);
1492 			} else {
1493 				user32_addr_t tmp;
1494 				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1495 					goto out1;
1496 				}
1497 				/* munge into LP64 addr */
1498 				devpath = CAST_USER_ADDR_T(tmp);
1499 				fsmountargs += sizeof(tmp);
1500 			}
1501 		}
1502 
1503 		/* Lookup device and authorize access to it */
1504 		if ((devpath)) {
1505 			struct nameidata nd;
1506 
1507 			enum uio_seg seg = UIO_USERSPACE;
1508 #if CONFIG_BASESYSTEMROOT
1509 			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1510 				seg = UIO_SYSSPACE;
1511 			}
1512 #endif // CONFIG_BASESYSTEMROOT
1513 
1514 			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
1515 			if (flags & MNT_NOFOLLOW) {
1516 				nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
1517 			}
1518 			if ((error = namei(&nd))) {
1519 				goto out1;
1520 			}
1521 
1522 			devvp = nd.ni_vp;
1523 
1524 			if (devvp->v_type != VBLK) {
1525 				error = ENOTBLK;
1526 				nameidone(&nd);
1527 				goto out2;
1528 			}
1529 			if (major(devvp->v_rdev) >= nblkdev) {
1530 				error = ENXIO;
1531 				nameidone(&nd);
1532 				goto out2;
1533 			}
1534 			/*
1535 			 * If mount by non-root, then verify that user has necessary
1536 			 * permissions on the device.
1537 			 */
1538 			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1539 				kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;
1540 
1541 				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1542 					accessmode |= KAUTH_VNODE_WRITE_DATA;
1543 				}
1544 				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1545 					nameidone(&nd);
1546 					goto out2;
1547 				}
1548 			}
1549 
1550 			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1551 			nameidone(&nd);
1552 		}
1553 		/* On first mount, preflight and open device */
1554 		if (devpath && ((flags & MNT_UPDATE) == 0)) {
1555 			if ((error = vnode_ref(devvp))) {
1556 				goto out2;
1557 			}
1558 			/*
1559 			 * Disallow multiple mounts of the same device.
1560 			 * Disallow mounting of a device that is currently in use
1561 			 * (except for root, which might share swap device for miniroot).
1562 			 * Flush out any old buffers remaining from a previous use.
1563 			 */
1564 			if ((error = vfs_setmounting(devvp))) {
1565 				vnode_rele(devvp);
1566 				goto out2;
1567 			}
1568 
1569 			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1570 				error = EBUSY;
1571 				goto out3;
1572 			}
1573 			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1574 				error = ENOTBLK;
1575 				goto out3;
1576 			}
1577 			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1578 				goto out3;
1579 			}
1580 
1581 			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1582 #if CONFIG_MACF
1583 			error = mac_vnode_check_open(ctx,
1584 			    devvp,
1585 			    ronly ? FREAD : FREAD | FWRITE);
1586 			if (error) {
1587 				goto out3;
1588 			}
1589 #endif /* MAC */
1590 			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1591 				goto out3;
1592 			}
1593 
1594 			mp->mnt_devvp = devvp;
1595 			device_vnode = devvp;
1596 		} else if ((mp->mnt_flag & MNT_RDONLY) &&
1597 		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1598 		    (device_vnode = mp->mnt_devvp)) {
1599 			dev_t dev;
1600 			int maj;
1601 			/*
1602 			 * If upgrade to read-write by non-root, then verify
1603 			 * that user has necessary permissions on the device.
1604 			 */
1605 			vnode_getalways(device_vnode);
1606 
1607 			if (suser(vfs_context_ucred(ctx), NULL) &&
1608 			    (error = vnode_authorize(device_vnode, NULL,
1609 			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1610 			    ctx)) != 0) {
1611 				vnode_put(device_vnode);
1612 				goto out2;
1613 			}
1614 
1615 			/* Tell the device that we're upgrading */
1616 			dev = (dev_t)device_vnode->v_rdev;
1617 			maj = major(dev);
1618 
1619 			if ((u_int)maj >= (u_int)nblkdev) {
1620 				panic("Volume mounted on a device with invalid major number.");
1621 			}
1622 
1623 			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1624 			vnode_put(device_vnode);
1625 			device_vnode = NULLVP;
1626 			if (error != 0) {
1627 				goto out2;
1628 			}
1629 		}
1630 	} // localargs && !(snapshot | data | vm)
1631 
1632 #if CONFIG_MACF
1633 	if ((flags & MNT_UPDATE) == 0) {
1634 		mac_mount_label_init(mp);
1635 		mac_mount_label_associate(ctx, mp);
1636 	}
1637 	if (labelstr) {
1638 		if ((flags & MNT_UPDATE) != 0) {
1639 			error = mac_mount_check_label_update(ctx, mp);
1640 			if (error != 0) {
1641 				goto out3;
1642 			}
1643 		}
1644 	}
1645 #endif
1646 	/*
1647 	 * Mount the filesystem.  We already asserted that internal_flags
1648 	 * cannot have more than one mount-by-role bit set.
1649 	 */
1650 	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1651 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1652 		    (caddr_t)fsmountargs, 0, ctx);
1653 	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1654 #if CONFIG_ROSV_STARTUP
1655 		struct mount *origin_mp = (struct mount*)fsmountargs;
1656 		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1657 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1658 		if (error) {
1659 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1660 		} else {
1661 			/* Mark volume associated with system volume */
1662 			mp->mnt_kern_flag |= MNTK_SYSTEM;
1663 
1664 			/* Attempt to acquire the mnt_devvp and set it up */
1665 			struct vnode *mp_devvp = NULL;
1666 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1667 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1668 				    0, &mp_devvp, vfs_context_kernel());
1669 				if (!lerr) {
1670 					mp->mnt_devvp = mp_devvp;
1671 					//vnode_lookup took an iocount, need to drop it.
1672 					vnode_put(mp_devvp);
1673 					// now set `device_vnode` to the devvp that was acquired.
1674 					// this is needed in order to ensure vfs_init_io_attributes is invoked.
1675 					// note that though the iocount above was dropped, the mount acquires
1676 					// an implicit reference against the device.
1677 					device_vnode = mp_devvp;
1678 				}
1679 			}
1680 		}
1681 #else
1682 		error = EINVAL;
1683 #endif
1684 	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1685 #if CONFIG_MOUNT_VM
1686 		struct mount *origin_mp = (struct mount*)fsmountargs;
1687 		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1688 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1689 		if (error) {
1690 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1691 		} else {
1692 			/* Mark volume associated with system volume and a swap mount */
1693 			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1694 			/* Attempt to acquire the mnt_devvp and set it up */
1695 			struct vnode *mp_devvp = NULL;
1696 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1697 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1698 				    0, &mp_devvp, vfs_context_kernel());
1699 				if (!lerr) {
1700 					mp->mnt_devvp = mp_devvp;
1701 					//vnode_lookup took an iocount, need to drop it.
1702 					vnode_put(mp_devvp);
1703 
1704 					// now set `device_vnode` to the devvp that was acquired.
1705 					// note that though the iocount above was dropped, the mount acquires
1706 					// an implicit reference against the device.
1707 					device_vnode = mp_devvp;
1708 				}
1709 			}
1710 		}
1711 #else
1712 		error = EINVAL;
1713 #endif
1714 	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1715 #if CONFIG_MOUNT_PREBOOTRECOVERY
1716 		struct mount *origin_mp = (struct mount*)fsmountargs;
1717 		uint32_t mount_role = 0;
1718 		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1719 			mount_role = VFS_PREBOOT_ROLE;
1720 		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1721 			mount_role = VFS_RECOVERY_ROLE;
1722 		}
1723 
1724 		if (mount_role != 0) {
1725 			fs_role_mount_args_t frma = {origin_mp, mount_role};
1726 			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1727 			if (error) {
1728 				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1729 			} else {
1730 				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1731 				/* Mark volume associated with system volume */
1732 				//mp->mnt_kern_flag |= MNTK_SYSTEM;
1733 				/* Attempt to acquire the mnt_devvp and set it up */
1734 				struct vnode *mp_devvp = NULL;
1735 				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1736 					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1737 					    0, &mp_devvp, vfs_context_kernel());
1738 					if (!lerr) {
1739 						mp->mnt_devvp = mp_devvp;
1740 						//vnode_lookup took an iocount, need to drop it.
1741 						vnode_put(mp_devvp);
1742 
1743 						// now set `device_vnode` to the devvp that was acquired.
1744 						// note that though the iocount above was dropped, the mount acquires
1745 						// an implicit reference against the device.
1746 						device_vnode = mp_devvp;
1747 					}
1748 				}
1749 			}
1750 		} else {
1751 			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1752 			error = EINVAL;
1753 		}
1754 #else
1755 		error = EINVAL;
1756 #endif
1757 	} else {
1758 		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1759 	}
1760 
1761 	if (flags & MNT_UPDATE) {
1762 		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1763 			mp->mnt_flag &= ~MNT_RDONLY;
1764 		}
1765 		mp->mnt_flag &= ~
1766 		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1767 		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1768 		if (error) {
1769 			mp->mnt_flag = flag;  /* restore flag value */
1770 		}
1771 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1772 		lck_rw_done(&mp->mnt_rwlock);
1773 		is_rwlock_locked = FALSE;
1774 		if (!error) {
1775 			enablequotas(mp, ctx);
1776 		}
1777 		goto exit;
1778 	}
1779 
1780 	/*
1781 	 * Put the new filesystem on the mount list after root.
1782 	 */
1783 	if (error == 0) {
1784 		struct vfs_attr vfsattr;
1785 		if (device_vnode) {
1786 			/*
1787 			 *   cache the IO attributes for the underlying physical media...
1788 			 *   an error return indicates the underlying driver doesn't
1789 			 *   support all the queries necessary... however, reasonable
1790 			 *   defaults will have been set, so no reason to bail or care
1791 			 *
1792 			 *   Need to do this before calling the MAC hook as it needs
1793 			 *   information from this call.
1794 			 */
1795 			vfs_init_io_attributes(device_vnode, mp);
1796 		}
1797 
1798 #if CONFIG_MACF
1799 		error = mac_mount_check_mount_late(ctx, mp);
1800 		if (error != 0) {
1801 			goto out4;
1802 		}
1803 
1804 		if (vfs_flags(mp) & MNT_MULTILABEL) {
1805 			error = VFS_ROOT(mp, &rvp, ctx);
1806 			if (error) {
1807 				printf("%s() VFS_ROOT returned %d\n", __func__, error);
1808 				goto out4;
1809 			}
1810 			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1811 			/*
1812 			 * drop reference provided by VFS_ROOT
1813 			 */
1814 			vnode_put(rvp);
1815 
1816 			if (error) {
1817 				goto out4;
1818 			}
1819 		}
1820 #endif  /* MAC */
1821 
1822 		vnode_lock_spin(vp);
1823 		CLR(vp->v_flag, VMOUNT);
1824 		vp->v_mountedhere = mp;
1825 		SET(vp->v_flag, VMOUNTEDHERE);
1826 
1827 		/*
1828 		 * Wakeup any waiter(s) in prepare_coveredvp() that is waiting for the
1829 		 * 'v_mountedhere' to be planted.
1830 		 */
1831 		wakeup(&vp->v_flag);
1832 		vnode_unlock(vp);
1833 
1834 		/*
1835 		 * taking the name_cache_lock exclusively will
1836 		 * insure that everyone is out of the fast path who
1837 		 * might be trying to use a now stale copy of
1838 		 * vp->v_mountedhere->mnt_realrootvp
1839 		 * bumping mount_generation causes the cached values
1840 		 * to be invalidated
1841 		 */
1842 		name_cache_lock();
1843 		mount_generation++;
1844 		name_cache_unlock();
1845 
1846 		error = vnode_ref(vp);
1847 		if (error != 0) {
1848 			goto out4;
1849 		}
1850 
1851 		have_usecount = TRUE;
1852 
1853 		error = checkdirs(vp, ctx);
1854 		if (error != 0) {
1855 			/* Unmount the filesystem as cdir/rdirs cannot be updated */
1856 			goto out4;
1857 		}
1858 		/*
1859 		 * there is no cleanup code here so I have made it void
1860 		 * we need to revisit this
1861 		 */
1862 		(void)VFS_START(mp, 0, ctx);
1863 
1864 		if (mount_list_add(mp) != 0) {
1865 			/*
1866 			 * The system is shutting down trying to umount
1867 			 * everything, so fail with a plausible errno.
1868 			 */
1869 			error = EBUSY;
1870 			goto out4;
1871 		}
1872 		lck_rw_done(&mp->mnt_rwlock);
1873 		is_rwlock_locked = FALSE;
1874 
1875 		/* Check if this mounted file system supports EAs or named streams. */
1876 		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1877 		VFSATTR_INIT(&vfsattr);
1878 		VFSATTR_WANTED(&vfsattr, f_capabilities);
1879 		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1880 		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1881 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1882 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1883 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1884 				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1885 			}
1886 #if NAMEDSTREAMS
1887 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1888 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1889 				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1890 			}
1891 #endif
1892 			/* Check if this file system supports path from id lookups. */
1893 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1894 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1895 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1896 			} else if (mp->mnt_flag & MNT_DOVOLFS) {
1897 				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1898 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1899 			}
1900 
1901 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1902 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1903 				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1904 			}
1905 		}
1906 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1907 			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1908 		}
1909 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1910 			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1911 		}
1912 		/* Get subtype if supported to cache it */
1913 		VFSATTR_INIT(&vfsattr);
1914 		VFSATTR_WANTED(&vfsattr, f_fssubtype);
1915 		if (vfs_getattr(mp, &vfsattr, ctx) == 0 && VFSATTR_IS_SUPPORTED(&vfsattr, f_fssubtype)) {
1916 			mp->mnt_vfsstat.f_fssubtype = vfsattr.f_fssubtype;
1917 		}
1918 
1919 		/* increment the operations count */
1920 		OSAddAtomic(1, &vfs_nummntops);
1921 		enablequotas(mp, ctx);
1922 
1923 		if (device_vnode) {
1924 			vfs_setmountedon(device_vnode);
1925 		}
1926 
1927 		/* Now that mount is setup, notify the listeners */
1928 		vfs_notify_mount(pvp);
1929 		IOBSDMountChange(mp, kIOMountChangeMount);
1930 #if CONFIG_MACF
1931 		mac_mount_notify_mount(ctx, mp);
1932 #endif /* CONFIG_MACF */
1933 	} else {
1934 		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1935 		if (mp->mnt_vnodelist.tqh_first != NULL) {
1936 			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1937 			    mp->mnt_vtable->vfc_name, error);
1938 		}
1939 
1940 		vnode_lock_spin(vp);
1941 		CLR(vp->v_flag, VMOUNT);
1942 		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
1943 		wakeup(&vp->v_flag);
1944 		vnode_unlock(vp);
1945 		mount_list_lock();
1946 		mp->mnt_vtable->vfc_refcount--;
1947 		mount_list_unlock();
1948 
1949 		if (device_vnode) {
1950 			vnode_rele(device_vnode);
1951 			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1952 			vfs_clearmounting(device_vnode);
1953 		}
1954 		lck_rw_done(&mp->mnt_rwlock);
1955 		is_rwlock_locked = FALSE;
1956 
1957 		if (nc_smr_enabled) {
1958 			vfs_smr_synchronize();
1959 		}
1960 
1961 		/*
1962 		 * if we get here, we have a mount structure that needs to be freed,
1963 		 * but since the coveredvp hasn't yet been updated to point at it,
1964 		 * no need to worry about other threads holding a crossref on this mp
1965 		 * so it's ok to just free it
1966 		 */
1967 		mount_lock_destroy(mp);
1968 #if CONFIG_MACF
1969 		mac_mount_label_destroy(mp);
1970 #endif
1971 		zfree(mount_zone, mp);
1972 		did_set_lmount = false;
1973 	}
1974 exit:
1975 	/*
1976 	 * drop I/O count on the device vp if there was one
1977 	 */
1978 	if (devpath && devvp) {
1979 		vnode_put(devvp);
1980 	}
1981 
1982 	if (did_set_lmount) {
1983 		mount_lock_spin(mp);
1984 		mp->mnt_lflag &= ~MNT_LMOUNT;
1985 		mount_unlock(mp);
1986 	}
1987 
1988 	return error;
1989 
1990 /* Error condition exits */
1991 out4:
1992 	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1993 
1994 	/*
1995 	 * If the mount has been placed on the covered vp,
1996 	 * it may have been discovered by now, so we have
1997 	 * to treat this just like an unmount
1998 	 */
1999 	mount_lock_spin(mp);
2000 	mp->mnt_lflag |= MNT_LDEAD;
2001 	mount_unlock(mp);
2002 
2003 	if (device_vnode != NULLVP) {
2004 		vnode_rele(device_vnode);
2005 		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
2006 		    ctx);
2007 		vfs_clearmounting(device_vnode);
2008 		did_rele = TRUE;
2009 	}
2010 
2011 	vnode_lock_spin(vp);
2012 
2013 	mp->mnt_crossref++;
2014 	CLR(vp->v_flag, VMOUNTEDHERE);
2015 	vp->v_mountedhere = (mount_t) 0;
2016 
2017 	vnode_unlock(vp);
2018 
2019 	if (have_usecount) {
2020 		vnode_rele(vp);
2021 	}
2022 out3:
2023 	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
2024 		vnode_rele(devvp);
2025 		vfs_clearmounting(devvp);
2026 	}
2027 out2:
2028 	if (devpath && devvp) {
2029 		vnode_put(devvp);
2030 	}
2031 out1:
2032 	/* Release mnt_rwlock only when it was taken */
2033 	if (is_rwlock_locked == TRUE) {
2034 		if (flag_set) {
2035 			mp->mnt_flag = flag;  /* restore mnt_flag value */
2036 		}
2037 		lck_rw_done(&mp->mnt_rwlock);
2038 	}
2039 
2040 	if (did_set_lmount) {
2041 		mount_lock_spin(mp);
2042 		mp->mnt_lflag &= ~MNT_LMOUNT;
2043 		mount_unlock(mp);
2044 	}
2045 
2046 	if (did_set_vmount) {
2047 		vnode_lock_spin(vp);
2048 		CLR(vp->v_flag, VMOUNT);
2049 		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
2050 		wakeup(&vp->v_flag);
2051 		vnode_unlock(vp);
2052 	}
2053 
2054 	if (mntalloc) {
2055 		if (mp->mnt_crossref) {
2056 			mount_dropcrossref(mp, vp, 0);
2057 		} else {
2058 			if (nc_smr_enabled) {
2059 				vfs_smr_synchronize();
2060 			}
2061 
2062 			mount_lock_destroy(mp);
2063 #if CONFIG_MACF
2064 			mac_mount_label_destroy(mp);
2065 #endif
2066 			zfree(mount_zone, mp);
2067 		}
2068 	}
2069 	if (vfsp_ref) {
2070 		mount_list_lock();
2071 		vfsp->vfc_refcount--;
2072 		mount_list_unlock();
2073 	}
2074 
2075 	return error;
2076 }
2077 
2078 /*
2079  * Flush in-core data, check for competing mount attempts,
2080  * and set VMOUNT
2081  */
/*
 * Validate and claim 'vp' as the covered vnode for an impending mount:
 * authorize the caller (unless KERNEL_MOUNT_NOAUTH), flush the vnode's
 * in-core data, verify it is a directory, and atomically set VMOUNT to
 * mark a mount-in-progress.  On success the caller owns the VMOUNT
 * claim and must clear it (and wakeup(&vp->v_flag)) on any later
 * failure.  Returns 0 or an errno (EPERM, ENOTDIR, EBUSY, ...).
 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	/* Behavior selectors decoded from internal_flags. */
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_kmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Flush dirty data so the to-be-covered directory is quiescent. */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Only directories can be covered by a mount. */
	if (vp->v_type != VDIR) {
		goto out;
		error = ENOTDIR;
	}

	vnode_lock_spin(vp);

	/*
	 * NOTE: 'error' is 0 here (from the successful buf_invalidateblks
	 * call above); the branches below only set it on a conflict.
	 */
	if (is_fmount && (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL))) {
		/* fmount(): never wait, just report the conflict. */
		error = EBUSY;
	} else if (!is_kmount && (ISSET(vp->v_flag, VMOUNT) ||
	    (vp->v_mountedhere != NULL))) {
		/*
		 * For mount triggered from mount() call, we want to wait for the
		 * current in-progress mount to complete, redo lookup and retry the
		 * mount again. Similarly, we also want to retry if we lost the race
		 * due to concurrent mounts and the 'VMOUNT' flag has been cleared and
		 * 'v_mountedhere' has been planted after initial lookup.
		 */
		if (ISSET(vp->v_flag, VMOUNT)) {
			vnode_lock_convert(vp);
			msleep(&vp->v_flag, &vp->v_lock, PVFS, "vnode_waitformount", NULL);
		}
		error = EBUSY;
	} else if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
		/* Kernel-initiated mount (KMOUNT) racing a fully-planted mount. */
		error = EBUSY;
	}

	if (error) {
		vnode_unlock(vp);
		goto out;
	}
	/* Claim the vnode: a mount is now officially in progress here. */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		/* MAC veto: release the VMOUNT claim taken above. */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
2166 
2167 #if CONFIG_IMGSRC_ACCESS
2168 
2169 #define DEBUG_IMGSRC 0
2170 
2171 #if DEBUG_IMGSRC
2172 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
2173 #else
2174 #define IMGSRC_DEBUG(args...) do { } while(0)
2175 #endif
2176 
/*
 * Resolve the user- (or kernel-) supplied 'devpath', verify it names the
 * same block device that already backs mount 'mp', record the path in
 * mp->mnt_vfsstat.f_mntfromname, and (for non-root callers) authorize
 * read -- and, for read/write mounts, write -- access to the device.
 * On success *devvpp holds the looked-up device vnode with an iocount;
 * the caller must vnode_put() it.  On failure the iocount from namei()
 * is dropped here.
 */
static int
authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
{
	struct nameidata nd;
	vnode_t vp, realdevvp;
	kauth_action_t accessmode;
	int error;
	enum uio_seg uio = UIO_USERSPACE;

	/* A kernel context implies the path lives in kernel address space. */
	if (ctx == vfs_context_kernel()) {
		uio = UIO_SYSSPACE;
	}

	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
	if ((error = namei(&nd))) {
		IMGSRC_DEBUG("namei() failed with %d\n", error);
		return error;
	}

	vp = nd.ni_vp;  /* holds an iocount from namei() */

	if (!vnode_isblk(vp)) {
		IMGSRC_DEBUG("Not block device.\n");
		error = ENOTBLK;
		goto out;
	}

	realdevvp = mp->mnt_devvp;
	if (realdevvp == NULLVP) {
		IMGSRC_DEBUG("No device backs the mount.\n");
		error = ENXIO;
		goto out;
	}

	error = vnode_getwithref(realdevvp);
	if (error != 0) {
		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
		goto out;
	}

	/* The supplied path must refer to the very device backing 'mp'. */
	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
		IMGSRC_DEBUG("Wrong dev_t.\n");
		error = ENXIO;
		goto out1;
	}

	/* Record the (validated) path as the mount's "from" name. */
	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	if (!vfs_context_issuser(ctx)) {
		accessmode = KAUTH_VNODE_READ_DATA;
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			accessmode |= KAUTH_VNODE_WRITE_DATA;
		}
		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
			IMGSRC_DEBUG("Access denied.\n");
			goto out1;
		}
	}

	/* Success: transfer the iocount on vp to the caller. */
	*devvpp = vp;

out1:
	vnode_put(realdevvp);

out:
	nameidone(&nd);

	if (error) {
		/* Drop the iocount we would otherwise have handed out. */
		vnode_put(vp);
	}

	return error;
}
2254 
2255 /*
2256  * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2257  * and call checkdirs()
2258  */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/*
	 * Swap the VMOUNT (mount in progress) claim for an actual planted
	 * mount: publish v_mountedhere and wake anyone who was waiting on
	 * the in-progress mount.
	 */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	SET(vp->v_flag, VMOUNTEDHERE);
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Covered vnode keeps a usecount for the lifetime of the mount. */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		/* Undo the covered-vnode linkage on any failure. */
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2307 
/*
 * Reverse place_mount_and_checkdirs(): drop the usecount it took on the
 * covered vnode, clear the VMOUNT/VMOUNTEDHERE flags and v_mountedhere
 * linkage, wake any waiters, and detach mp from its covered vnode.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2321 
2322 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2323 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2324 {
2325 	int error;
2326 
2327 	/* unmount in progress return error */
2328 	mount_lock_spin(mp);
2329 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2330 		mount_unlock(mp);
2331 		return EBUSY;
2332 	}
2333 	mount_unlock(mp);
2334 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2335 
2336 	/*
2337 	 * We only allow the filesystem to be reloaded if it
2338 	 * is currently mounted read-only.
2339 	 */
2340 	if ((flags & MNT_RELOAD) &&
2341 	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2342 		error = ENOTSUP;
2343 		goto out;
2344 	}
2345 
2346 	/*
2347 	 * Only root, or the user that did the original mount is
2348 	 * permitted to update it.
2349 	 */
2350 	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2351 	    (!vfs_context_issuser(ctx))) {
2352 		error = EPERM;
2353 		goto out;
2354 	}
2355 #if CONFIG_MACF
2356 	error = mac_mount_check_remount(ctx, mp, flags);
2357 	if (error != 0) {
2358 		goto out;
2359 	}
2360 #endif
2361 
2362 out:
2363 	if (error) {
2364 		lck_rw_done(&mp->mnt_rwlock);
2365 	}
2366 
2367 	return error;
2368 }
2369 
/*
 * Release the mount rwlock taken by a successful mount_begin_update().
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2375 
2376 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2377 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2378 {
2379 	vnode_t vp;
2380 
2381 	if (height >= MAX_IMAGEBOOT_NESTING) {
2382 		return EINVAL;
2383 	}
2384 
2385 	vp = imgsrc_rootvnodes[height];
2386 	if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2387 		*rvpp = vp;
2388 		return 0;
2389 	} else {
2390 		return ENOENT;
2391 	}
2392 }
2393 
/*
 * Move the imageboot source filesystem (whose root was stashed in
 * imgsrc_rootvnodes[]) so that it is mounted on 'vp' instead, updating
 * f_mntonname and the mount list.  Stages: copy in the user arguments
 * (by-index or legacy single-level form), find the old root vnode,
 * lock/authorize the mount for update, claim the new covered vnode,
 * validate fsname and (for local filesystems) the device path, plant
 * the mount on 'vp', then mark MNTK_HAS_MOVED and add to the mount
 * list.  Root-only; each image source can be moved at most once.
 * Errors unwind through the out3..out0 labels in reverse stage order.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flag bits are currently defined for this operation. */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* Takes an iocount on rvp; dropped at out0 / on success. */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once.  Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the validation; drop the iocount it returned. */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Keep the old mount-on name so out3 can restore it. */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);
#if CONFIG_MACF
	mac_mount_notify_mount(ctx, mp);
#endif /* CONFIG_MACF */

	return 0;
out3:
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2619 
2620 #endif /* CONFIG_IMGSRC_ACCESS */
2621 
2622 void
enablequotas(struct mount * mp,vfs_context_t ctx)2623 enablequotas(struct mount *mp, vfs_context_t ctx)
2624 {
2625 	struct nameidata qnd;
2626 	int type;
2627 	char qfpath[MAXPATHLEN];
2628 	const char *qfname = QUOTAFILENAME;
2629 	const char *qfopsname = QUOTAOPSNAME;
2630 	const char *qfextension[] = INITQFNAMES;
2631 
2632 	/* XXX Shoulkd be an MNTK_ flag, instead of strncmp()'s */
2633 	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
2634 		return;
2635 	}
2636 	/*
2637 	 * Enable filesystem disk quotas if necessary.
2638 	 * We ignore errors as this should not interfere with final mount
2639 	 */
2640 	for (type = 0; type < MAXQUOTAS; type++) {
2641 		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
2642 		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
2643 		    CAST_USER_ADDR_T(qfpath), ctx);
2644 		if (namei(&qnd) != 0) {
2645 			continue;           /* option file to trigger quotas is not present */
2646 		}
2647 		vnode_put(qnd.ni_vp);
2648 		nameidone(&qnd);
2649 		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
2650 
2651 		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
2652 	}
2653 	return;
2654 }
2655 
2656 
/*
 * Per-process callback for checkdirs(): if the process's current (cdir)
 * or root (rdir) directory is 'olddp' -- the vnode just covered by a new
 * mount -- replace it with 'newdp', the root of the filesystem mounted
 * on top, transferring usecount references accordingly.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* Pre-claim both new refs; NULLed below if actually consumed. */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	/* Old vnodes whose refs must be dropped once swapped out. */
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;  /* this ref was consumed by fd_cdir */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;  /* this ref was consumed by fd_rdir */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2736 
2737 
2738 
2739 /*
2740  * Scan all active processes to see if any of them have a current
2741  * or root directory onto which the new filesystem has just been
2742  * mounted. If so, replace them with the new mount point.
2743  */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/*
	 * Sole usecount means no process has it as cdir/rdir and it is
	 * not the system root: nothing to update.
	 */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* newdp = root of the filesystem just mounted over olddp (iocount held). */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root itself was just covered, swap it too. */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2781 
2782 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT        \
2783 	"com.apple.private.vfs.role-account-unmount"
2784 #define SYSTEM_VOLUME_UNMOUNT_ENTITLEMENT       \
2785 	"com.apple.private.vfs.system-volume-unmount"
2786 
2787 /*
2788  * Unmount a file system.
2789  *
2790  * Note: unmount takes a path to the vnode mounted on as argument,
2791  * not special file (as before).
2792  */
2793 /* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int flags = uap->flags;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* MNT_NOFOLLOW: refuse to traverse symlinks anywhere in the path. */
	if (flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/* Hold the mount across the vnode_put; safedounmount drops it. */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, flags, ctx);
}
2845 
/*
 * Unmount a file system identified by an open file descriptor on its
 * root directory (fd-based variant of unmount()).
 */
int
funmount(__unused proc_t p, struct funmount_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp;
	struct mount *mp;
	vfs_context_t ctx;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	error = vnode_getfromfd(ctx, uap->fd, &vp);
	if (error) {
		return error;
	}

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
	mp = vnode_mount(vp);

#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/* Hold the mount across the vnode_put; safedounmount drops it. */
	mount_ref(mp, 0);
	vnode_put(vp);

	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2892 
/*
 * Unmount the file system identified by 'fsid'.  Returns ENOENT if no
 * such mount exists; otherwise forwards to safedounmount(), which
 * consumes the mount ref taken here.
 */
int
vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
{
	mount_t mp;

	mp = mount_list_lookupby_fsid(fsid, 0, 1);
	if (mp == (mount_t)0) {
		return ENOENT;
	}
	/* Trade the iteration reference for a plain mount ref. */
	mount_ref(mp, 0);
	mount_iterdrop(mp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, flags, ctx);
}
2907 
2908 /*
2909  * The mount struct comes with a mount ref which will be consumed.
2910  * Do the actual file system unmount, prevent some common foot shooting.
2911  */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_lflag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}

	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if (mp->mnt_flag & MNT_ROOTFS) {
		error = EBUSY; /* the root is always busy */
		goto out;
	}
	if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !IOCurrentTaskHasEntitlement(SYSTEM_VOLUME_UNMOUNT_ENTITLEMENT)) {
		printf("attempt to unmount a system mount (%s), will return EBUSY\n",
		    mp->mnt_vfsstat.f_mntonname);
		error = EBUSY; /* root-associated volumes are always busy unless caller is entitled */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* All checks passed: dounmount() takes over the mount ref (withref=1). */
	return dounmount(mp, flags, 1, ctx);

out:
	/* Policy rejection: drop the caller-supplied mount ref ourselves. */
	mount_drop(mp, 0);
	return error;
}
2978 
/*
 * Do the actual file system unmount.
 *
 * Parameters:	mp	the mount to be unmounted
 *		flags	MNT_FORCE forces the unmount; MNT_LNOSUB suppresses
 *			the recursive submount pass; MNT_NOBLOCK influences
 *			trigger/P_NOREMOTEHANG handling
 *		withref	non-zero if the caller holds a mount reference that
 *			this routine must drop
 *		ctx	VFS context of the caller
 *
 * Returns:	0	Success
 *		EBUSY	a mount or unmount is already in progress on 'mp'
 *	VFS_SYNC:???
 *	vflush:???
 *	VFS_UNMOUNT:???
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx);  /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/*
	 * Keep a non-kernel caller from hanging on unreachable remote
	 * filesystems while the unmount proceeds; the previous flag value
	 * is restored on the way out.
	 */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	/* Publish the unmount-in-progress state under the mount lock. */
	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/*
	 * Make sure there are no one in the mount iterations or lookup.
	 * Drain makes 'mnt_iterref' -ve so on error exit we need to ensure that
	 * 'mnt_iterref' is reset back to 0 by calling mount_iterreset().
	 */
	mount_iterdrain(mp);

	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		/*
		 * Non-forced unmounts sync dirty data first and abort the
		 * unmount cleanly if the sync fails.
		 */
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* Undo the unmount-in-progress state and bail. */
				mount_iterreset(mp);
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* vflush failure only aborts a non-forced unmount. */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	/*
	 * Drop and retake mnt_rwlock around the list removal;
	 * mount_list_remove() takes the mount list lock.
	 */
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&coveredvp->v_flag);
		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);     /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* Common exit: reached with the mount lock held on every path. */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the  vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			if (pvp) {
				/* Notify watchers of the parent directory of the change. */
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* Root mounts have no covered vnode; tear down directly. */
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
3276 
/*
 * Unmount any mounts in this filesystem.
 *
 * Collects the fsids of every mount that transitively sits on top of 'mp'
 * (a submount's covered vnode lives on a mount already in the set), then
 * unmounts them in reverse (deepest-first) order.  Errors from the
 * individual unmounts are deliberately ignored.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;	/* m indexes the last fsid collected */
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT: we cannot block while holding the mount list lock. */
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	/* Index 0 is 'mp' itself, which the caller unmounts; skip it. */
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			/* Trade the iteration reference for a real one; dounmount consumes it. */
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
3337 
/*
 * Drop one crossref on 'mp' taken against the covered vnode 'dp'.
 *
 * When the last crossref goes away and 'dp' no longer points at this mount
 * (i.e. the unmount already cleared v_mountedhere), the mount structure is
 * finally destroyed and freed here.
 *
 * Parameters:	mp		mount whose crossref count is dropped
 *		dp		the (formerly) covered vnode, unlocked on entry
 *		need_put	non-zero if an iocount on 'dp' should also be
 *				released before returning
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	/* Hold dp so it stays valid across the unlock/free below. */
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		/* Last reference and no longer mounted here: free the mount. */
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		/* Wait out SMR readers before tearing the structure down. */
		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3371 
3372 
3373 /*
3374  * Sync each mounted filesystem.
3375  */
#if DIAGNOSTIC
/* Debug knob: when non-zero, sync()/sync_thread() dump buffer statistics via vfs_bufstats(). */
int syncprt = 0;
#endif

/* Debug knob: when non-zero, sync()/sync_thread() report dirty-page counts via vm_countdirtypages(). */
int print_vmpage_stat = 0;
3381 
3382 /*
3383  * sync_callback:	simple wrapper that calls VFS_SYNC() on volumes
3384  *			mounted read-write with the passed waitfor value.
3385  *
3386  * Parameters:	mp	mount-point descriptor per mounted file-system instance.
3387  *		arg	user argument (please see below)
3388  *
3389  * User argument is a pointer to 32 bit unsigned integer which describes the
3390  * type of waitfor value to set for calling VFS_SYNC().  If user argument is
3391  * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3392  * waitfor value.
3393  *
3394  * Returns:		VFS_RETURNED
3395  */
3396 static int
sync_callback(mount_t mp,void * arg)3397 sync_callback(mount_t mp, void *arg)
3398 {
3399 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3400 		int asyncflag = mp->mnt_flag & MNT_ASYNC;
3401 		unsigned waitfor = MNT_NOWAIT;
3402 
3403 		if (arg) {
3404 			waitfor = *(uint32_t*)arg;
3405 		}
3406 
3407 		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
3408 		if (waitfor != MNT_WAIT &&
3409 		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
3410 		    waitfor != MNT_NOWAIT &&
3411 		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3412 		    waitfor != MNT_DWAIT &&
3413 		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3414 			panic("Passed inappropriate waitfor %u to "
3415 			    "sync_callback()", waitfor);
3416 		}
3417 
3418 		mp->mnt_flag &= ~MNT_ASYNC;
3419 		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3420 		if (asyncflag) {
3421 			mp->mnt_flag |= MNT_ASYNC;
3422 		}
3423 	}
3424 
3425 	return VFS_RETURNED;
3426 }
3427 
/* ARGSUSED */
/*
 * sync() system call: iterate all mounted filesystems and ask each
 * read-write one to flush via sync_callback().  The NULL argument selects
 * sync_callback()'s default MNT_NOWAIT behavior, so this is a best-effort,
 * non-blocking sweep.  Always returns 0.
 */
int
sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
{
	vfs_iterate(LK_NOWAIT, sync_callback, NULL);

	if (print_vmpage_stat) {
		/* Debug aid: report dirty-page counts after the sweep. */
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
	return 0;
}
3445 
/*
 * Media-selection filter for sync_internal_callback().  "Reliable" means a
 * local, non-virtual device (see the is_reliable test in the callback).
 */
typedef enum {
	SYNC_ALL = 0,                     /* sync every volume */
	SYNC_ONLY_RELIABLE_MEDIA = 1,     /* only local, non-virtual-device volumes */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2    /* only virtual-device or non-local volumes */
} sync_type_t;
3451 
3452 static int
sync_internal_callback(mount_t mp,void * arg)3453 sync_internal_callback(mount_t mp, void *arg)
3454 {
3455 	if (arg) {
3456 		int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3457 		    (mp->mnt_flag & MNT_LOCAL);
3458 		sync_type_t sync_type = *((sync_type_t *)arg);
3459 
3460 		if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3461 			return VFS_RETURNED;
3462 		} else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3463 			return VFS_RETURNED;
3464 		}
3465 	}
3466 
3467 	(void)sync_callback(mp, NULL);
3468 
3469 	return VFS_RETURNED;
3470 }
3471 
/* State word for the on-demand sync thread; manipulated under sync_mtx_lck. */
int sync_thread_state = 0;
/* Seconds sync_internal() will wait for the sync thread before giving up. */
int sync_timeout_seconds = 5;

#define SYNC_THREAD_RUN       0x0001	/* work pending: thread should run (another) sweep */
#define SYNC_THREAD_RUNNING   0x0002	/* a sync thread currently exists */

#if CONFIG_PHYS_WRITE_ACCT
/* Identity of the currently running sync thread (NULL when none). */
thread_t pm_sync_thread;
#endif /* CONFIG_PHYS_WRITE_ACCT */
3481 
/*
 * Body of the on-demand sync thread started by sync_internal().
 *
 * Loops while SYNC_THREAD_RUN is set (re-armed by additional
 * sync_internal() calls), each pass syncing reliable media first and
 * unreliable media second via sync_internal_callback().  On exit it wakes
 * any sync_internal() waiters and clears SYNC_THREAD_RUNNING.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the run request; drop the lock while doing the I/O. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		/* Debug aid: report dirty-page counts after the sweep. */
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3525 
/* Rate-limits the "sync timed out" console message to once per ~2 minutes. */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};

/*
 * An in-kernel sync for power management to call.
 * This function always returns within sync_timeout seconds.
 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* Request (another) sweep; start the worker thread if none exists. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/*
	 * Wait (bounded by 'ts') for the sync thread's completion wakeup;
	 * PDROP releases sync_mtx_lck on return.
	 */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	/* Always reports success, even on timeout (callers only need the bound). */
	return 0;
} /* end of sync_internal call */
3574 
/*
 * Change filesystem quotas.
 *
 * Resolves uap->path to its mount, performs the MAC check, marshals the
 * sub-command's argument (file name, dqblk, or status int) into kernel
 * space, calls VFS_QUOTACTL(), and copies results back out for the
 * commands that produce them.
 */
#if QUOTA
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* Keep the mount alive after the vnode reference is dropped. */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit user layout differs; munge into the kernel dqblk. */
			struct user_dqblk       my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	/* Only issue the VFS call if the argument marshaling succeeded. */
	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* Free the path buffer allocated above (even if copyinstr failed). */
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk       my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
#else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support compiled out. */
	return EOPNOTSUPP;
}
#endif /* QUOTA */
3692 
/*
 * Common guts of statfs() and fstatfs(): MAC-check the mount, refresh its
 * cached vfsstat, and copy the statistics out to the user buffer 'bufp'
 * in the caller process' (32- or 64-bit) statfs layout.
 *
 * Returns:	0	Success
 *	mac_mount_check_stat:???
 *	vfs_update_vfsstat:???
 *	munge_statfs:EFAULT
 */
static int
statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

#if CONFIG_MACF
	error = mac_mount_check_stat(ctx, mp);
	if (error != 0) {
		return error;
	}
#endif

	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
	if (error != 0) {
		return error;
	}

	return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
}
3713 
3714 /*
3715  * Get filesystem statistics.
3716  *
3717  * Returns:	0			Success
3718  *	namei:???
3719  *	vfs_update_vfsstat:???
3720  *	munge_statfs:EFAULT
3721  */
3722 /* ARGSUSED */
3723 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3724 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3725 {
3726 	int error;
3727 	struct mount *mp;
3728 	struct nameidata nd;
3729 	vfs_context_t ctx = vfs_context_current();
3730 	vnode_t vp;
3731 
3732 	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3733 	    UIO_USERSPACE, uap->path, ctx);
3734 	error = namei(&nd);
3735 	if (error != 0) {
3736 		return error;
3737 	}
3738 	vp = nd.ni_vp;
3739 	mp = vp->v_mount;
3740 	nameidone(&nd);
3741 
3742 	error = statfs_internal(p, mp, uap->buf);
3743 	vnode_put(vp);
3744 
3745 	return error;
3746 }
3747 
3748 /*
3749  * Get filesystem statistics.
3750  */
3751 /* ARGSUSED */
3752 int
fstatfs(proc_t p,struct fstatfs_args * uap,__unused int32_t * retval)3753 fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3754 {
3755 	int error;
3756 	vnode_t vp = NULL;
3757 	struct mount *mp;
3758 
3759 	AUDIT_ARG(fd, uap->fd);
3760 
3761 	if ((error = file_vnode(uap->fd, &vp)) ||
3762 	    (error = vnode_getwithref(vp))) {
3763 		goto out;
3764 	}
3765 
3766 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3767 
3768 	mp = vp->v_mount;
3769 	if (!mp) {
3770 		error = EBADF;
3771 		goto out_vnode;
3772 	}
3773 
3774 	error = statfs_internal(p, mp, uap->buf);
3775 
3776 out_vnode:
3777 	vnode_put(vp);
3778 
3779 out:
3780 	if (vp != NULL) {
3781 		file_drop(uap->fd);
3782 	}
3783 
3784 	return error;
3785 }
3786 
3787 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3788 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3789 {
3790 	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3791 
3792 	bzero(sfs, sizeof(*sfs));
3793 
3794 	sfs->f_bsize = vsfs->f_bsize;
3795 	sfs->f_iosize = (int32_t)vsfs->f_iosize;
3796 	sfs->f_blocks = vsfs->f_blocks;
3797 	sfs->f_bfree = vsfs->f_bfree;
3798 	sfs->f_bavail = vsfs->f_bavail;
3799 	sfs->f_files = vsfs->f_files;
3800 	sfs->f_ffree = vsfs->f_ffree;
3801 	sfs->f_fsid = vsfs->f_fsid;
3802 	sfs->f_owner = vsfs->f_owner;
3803 	sfs->f_type = mp->mnt_vtable->vfc_typenum;
3804 	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3805 	sfs->f_fssubtype = vsfs->f_fssubtype;
3806 	sfs->f_flags_ext = vfs_getextflags(mp);
3807 	vfs_getfstypename(mp, sfs->f_fstypename, MFSTYPENAMELEN);
3808 	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3809 	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3810 }
3811 
/*
 * Get file system statistics in 64-bit mode
 *
 * Path-based variant: resolves uap->path, refreshes the mount's vfsstat,
 * and copies a struct statfs64 out to uap->buf.  The nameidata and the
 * statfs64 are heap-allocated together to keep them off the kernel stack.
 *
 * NOTE(review): 'p' is declared __unused but is read below for the
 * P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME check; the attribute merely
 * suppresses the unused warning — consider dropping it.
 */
int
statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error;
	struct nameidata *ndp;
	struct statfs64 *sfsp;
	vfs_context_t ctxp = vfs_context_current();
	vnode_t vp;
	struct {
		struct nameidata nd;
		struct statfs64 sfs;
	} *__nameidata_statfs64;

	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
	    Z_WAITOK);
	ndp = &__nameidata_statfs64->nd;

	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctxp);
	error = namei(ndp);
	if (error != 0) {
		goto out;
	}
	vp = ndp->ni_vp;
	mp = vp->v_mount;
	nameidone(ndp);

#if CONFIG_MACF
	/* MAC hook: may veto statistics access for this mount. */
	error = mac_mount_check_stat(ctxp, mp);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}
#endif

	/* Refresh the cached vfsstat before reporting it. */
	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}

	sfsp = &__nameidata_statfs64->sfs;
	vfs_get_statfs64(mp, sfsp);
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a seperate data volume mountpoint */
		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
	vnode_put(vp);

out:
	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);

	return error;
}
3872 
/*
 * Get file system statistics in 64-bit mode
 *
 * Descriptor-based variant: reports a struct statfs64 for the mount of the
 * vnode backing uap->fd.
 *
 * NOTE(review): 'p' is declared __unused but is read below for the
 * P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME check; the attribute merely
 * suppresses the unused warning — consider dropping it.
 */
int
fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
{
	struct vnode *vp;
	struct mount *mp;
	struct statfs64 sfs;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	/* MAC hook: may veto statistics access for this mount. */
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0) {
		goto out;
	}
#endif

	/* Refresh the cached vfsstat before reporting it. */
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	vfs_get_statfs64(mp, &sfs);
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a seperate data volume mountpoint */
		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(&sfs, uap->buf, sizeof(sfs));

out:
	file_drop(uap->fd);
	vnode_put(vp);

	return error;
}
3929 
/* Shared iteration state for getfsstat_callback()/getfsstat64_callback(). */
struct getfsstat_struct {
	user_addr_t     sfsp;      /* user buffer cursor; advanced past each statfs written */
	user_addr_t     *mp;       /* optional per-mount user MAC-label buffers, or NULL */
	int             count;     /* mounts visited so far (may exceed maxcount) */
	int             maxcount;  /* capacity of the user buffer, in statfs entries */
	int             flags;     /* caller's MNT_WAIT/MNT_NOWAIT/MNT_DWAIT flags */
	int             error;     /* first error hit by the callback, 0 if none */
};
3938 
3939 
/*
 * vfs_iterate() callback for __mac_getfsstat(): copy one mount's statistics
 * (and optionally its MAC label) out to the user buffers in 'arg'
 * (a struct getfsstat_struct).  Mounts are still counted once the user
 * buffer is full, so the caller can report the total.
 *
 * Returns:	VFS_RETURNED		continue iterating
 *		VFS_RETURNED_DONE	stop; fstp->error holds the failure
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* Dead mounts and refresh failures are skipped, not fatal. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance the user cursor by the size munge_statfs() wrote. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	fstp->count++;
	return VFS_RETURNED;
}
3993 
3994 /*
3995  * Get statistics on all filesystems.
3996  */
3997 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3998 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3999 {
4000 	struct __mac_getfsstat_args muap;
4001 
4002 	muap.buf = uap->buf;
4003 	muap.bufsize = uap->bufsize;
4004 	muap.mac = USER_ADDR_NULL;
4005 	muap.macsize = 0;
4006 	muap.flags = uap->flags;
4007 
4008 	return __mac_getfsstat(p, &muap, retval);
4009 }
4010 
/*
 * __mac_getfsstat: Get MAC-related file system statistics
 *
 * Parameters:    p                        (ignored)
 *                uap                      User argument descriptor (see below)
 *                retval                   Count of file system statistics (N stats)
 *
 * Indirect:      uap->bufsize             Buffer size
 *                uap->macsize             MAC info size
 *                uap->buf                 Buffer where information will be returned
 *                uap->mac                 MAC info
 *                uap->flags               File system flags
 *
 *
 * Returns:        0                       Success
 *                !0                       Not success
 *
 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject sizes that would overflow the int fields used below. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Capacity in entries depends on the caller's statfs layout. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* The MAC pointer array must hold exactly one entry per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	/* NOSKIP_UNMOUNT: mounts being unmounted are still visited (and counted). */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* Report the number of entries written, or the total count if they all fit. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
4122 
4123 static int
getfsstat64_callback(mount_t mp,void * arg)4124 getfsstat64_callback(mount_t mp, void * arg)
4125 {
4126 	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
4127 	struct vfsstatfs *sp;
4128 	struct statfs64 sfs;
4129 	int error;
4130 
4131 	if (fstp->sfsp && fstp->count < fstp->maxcount) {
4132 #if CONFIG_MACF
4133 		error = mac_mount_check_stat(vfs_context_current(), mp);
4134 		if (error != 0) {
4135 			fstp->error = error;
4136 			return VFS_RETURNED_DONE;
4137 		}
4138 #endif
4139 		sp = &mp->mnt_vfsstat;
4140 		/*
4141 		 * If MNT_NOWAIT is specified, do not refresh the fsstat
4142 		 * cache. MNT_WAIT overrides MNT_NOWAIT.
4143 		 *
4144 		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
4145 		 * getfsstat, since the constants are out of the same
4146 		 * namespace.
4147 		 */
4148 		if ((mp->mnt_lflag & MNT_LDEAD) ||
4149 		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
4150 		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
4151 		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
4152 			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
4153 			return VFS_RETURNED;
4154 		}
4155 
4156 		vfs_get_statfs64(mp, &sfs);
4157 		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
4158 		if (error) {
4159 			fstp->error = error;
4160 			return VFS_RETURNED_DONE;
4161 		}
4162 		fstp->sfsp += sizeof(sfs);
4163 	}
4164 	fstp->count++;
4165 	return VFS_RETURNED;
4166 }
4167 
4168 /*
4169  * Get statistics on all file systems in 64 bit mode.
4170  */
4171 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)4172 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
4173 {
4174 	user_addr_t sfsp;
4175 	int count, maxcount;
4176 	struct getfsstat_struct fst;
4177 
4178 	maxcount = uap->bufsize / sizeof(struct statfs64);
4179 
4180 	sfsp = uap->buf;
4181 	count = 0;
4182 
4183 	fst.sfsp = sfsp;
4184 	fst.flags = uap->flags;
4185 	fst.count = 0;
4186 	fst.error = 0;
4187 	fst.maxcount = maxcount;
4188 
4189 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
4190 
4191 	if (fst.error) {
4192 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
4193 		return fst.error;
4194 	}
4195 
4196 	if (fst.sfsp && fst.count > fst.maxcount) {
4197 		*retval = fst.maxcount;
4198 	} else {
4199 		*retval = fst.count;
4200 	}
4201 
4202 	return 0;
4203 }
4204 
4205 /*
 * Gets the vnode associated with the file descriptor passed as input.
4208  *
4209  * INPUT
4210  * ctx - vfs context of caller
4211  * fd - file descriptor for which vnode is required.
4212  * vpp - Pointer to pointer to vnode to be returned.
4213  *
4214  * The vnode is returned with an iocount so any vnode obtained
4215  * by this call needs a vnode_put
4216  *
4217  */
4218 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)4219 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
4220 {
4221 	int error;
4222 	vnode_t vp;
4223 	struct fileproc *fp;
4224 	proc_t p = vfs_context_proc(ctx);
4225 
4226 	*vpp =  NULLVP;
4227 
4228 	error = fp_getfvp(p, fd, &fp, &vp);
4229 	if (error) {
4230 		return error;
4231 	}
4232 
4233 	error = vnode_getwithref(vp);
4234 	if (error) {
4235 		(void)fp_drop(p, fd, fp, 0);
4236 		return error;
4237 	}
4238 
4239 	(void)fp_drop(p, fd, fp, 0);
4240 	*vpp = vp;
4241 	return error;
4242 }
4243 
4244 int
vnode_getfromid(int volfs_id,uint64_t objid,vfs_context_t ctx,int realfsid,vnode_t * vpp)4245 vnode_getfromid(int volfs_id, uint64_t objid, vfs_context_t ctx, int realfsid, vnode_t *vpp)
4246 {
4247 	int error = 0;
4248 	vnode_t vp = NULLVP;
4249 	struct mount *mp = NULL;
4250 
4251 	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
4252 		error = ENOTSUP; /* unexpected failure */
4253 		return ENOTSUP;
4254 	}
4255 
4256 #if CONFIG_UNION_MOUNTS
4257 unionget:
4258 #endif /* CONFIG_UNION_MOUNTS */
4259 	if (objid == 2) {
4260 		struct vfs_attr vfsattr;
4261 		int use_vfs_root = TRUE;
4262 
4263 		VFSATTR_INIT(&vfsattr);
4264 		VFSATTR_WANTED(&vfsattr, f_capabilities);
4265 		if (!realfsid &&
4266 		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
4267 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
4268 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
4269 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
4270 				use_vfs_root = FALSE;
4271 			}
4272 		}
4273 
4274 		if (use_vfs_root) {
4275 			error = VFS_ROOT(mp, &vp, ctx);
4276 		} else {
4277 			error = VFS_VGET(mp, objid, &vp, ctx);
4278 		}
4279 	} else {
4280 		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
4281 	}
4282 
4283 #if CONFIG_UNION_MOUNTS
4284 	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
4285 		/*
4286 		 * If the fileid isn't found and we're in a union
4287 		 * mount volume, then see if the fileid is in the
4288 		 * mounted-on volume.
4289 		 */
4290 		struct mount *tmp = mp;
4291 		mp = vnode_mount(tmp->mnt_vnodecovered);
4292 		vfs_unbusy(tmp);
4293 		if (vfs_busy(mp, LK_NOWAIT) == 0) {
4294 			goto unionget;
4295 		}
4296 	} else {
4297 		vfs_unbusy(mp);
4298 	}
4299 #else
4300 	vfs_unbusy(mp);
4301 #endif /* CONFIG_UNION_MOUNTS */
4302 
4303 	if (!error) {
4304 		*vpp = vp;
4305 	}
4306 
4307 	return error;
4308 }
4309 
4310 /*
4311  * Wrapper function around namei to start lookup from a directory
4312  * specified by a file descriptor ni_dirfd.
4313  *
4314  * In addition to all the errors returned by namei, this call can
4315  * return ENOTDIR if the file descriptor does not refer to a directory.
4316  * and EBADF if the file descriptor is not valid.
4317  */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only consult dirfd for a fresh lookup of a relative path; a
	 * continued lookup or a caller-supplied dvp takes precedence.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first path byte to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			/* dirfd must refer to a directory. */
			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Anchor the lookup at dirfd's vnode for this call only. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path or AT_FDCWD: ordinary namei from cwd/root. */
	return namei(ndp);
}
4361 
4362 /*
4363  * Change current working directory to a given file descriptor.
4364  */
4365 /* ARGSUSED */
int
fchdir(proc_t p, vfs_context_t ctx, int fd, bool per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;

	AUDIT_ARG(fd, fd);
	if (per_thread && fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding.  The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(fd, &vp))) {
		return error;
	}
	/* Take an iocount; dropped via 'out' unless converted to a usecount. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* Caller must have search permission on the directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If file systems are mounted on this directory, descend through
	 * the (possibly stacked) mounts to the topmost root vnode.
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Convert our iocount into a long-term usecount for the cwd. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		/* __pthread_fchdir: stash the cwd in the uthread. */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Lock order: proc dirs lock before fdlock (matches lookup). */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(fd);

	return error;
}
4477 
4478 int
sys_fchdir(proc_t p,struct fchdir_args * uap,__unused int32_t * retval)4479 sys_fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
4480 {
4481 	return fchdir(p, vfs_context_current(), uap->fd, false);
4482 }
4483 
4484 int
__pthread_fchdir(proc_t p,struct __pthread_fchdir_args * uap,__unused int32_t * retval)4485 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
4486 {
4487 	return fchdir(p, vfs_context_current(), uap->fd, true);
4488 }
4489 
4490 
4491 /*
4492  * Change current working directory (".").
4493  *
4494  * Returns:	0			Success
4495  *	change_dir:ENOTDIR
4496  *	change_dir:???
4497  *	vnode_ref:ENOENT		No such file or directory
4498  */
4499 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* Resolve the path; on success ndp->ni_vp carries an iocount. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Take a long-term usecount on the new cwd. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		/* __pthread_chdir: cwd lives in the uthread, flagged on the proc. */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Process-wide cwd: swap under the dirs + fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous cwd, if there was one. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4545 
4546 
4547 /*
4548  * Change current working directory (".").
4549  *
4550  * Returns:	0			Success
4551  *	chdir_internal:ENOTDIR
4552  *	chdir_internal:ENOENT		No such file or directory
4553  *	chdir_internal:???
4554  */
4555 /* ARGSUSED */
4556 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4557 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4558 {
4559 	struct nameidata nd;
4560 	vfs_context_t ctx = vfs_context_current();
4561 
4562 	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4563 	    UIO_USERSPACE, uap->path, ctx);
4564 
4565 	return chdir_internal(p, ctx, &nd, per_thread);
4566 }
4567 
4568 
4569 /*
4570  * chdir
4571  *
4572  * Change current working directory (".") for the entire process
4573  *
4574  * Parameters:  p       Process requesting the call
4575  *              uap     User argument descriptor (see below)
4576  *              retval  (ignored)
4577  *
4578  * Indirect parameters:	uap->path	Directory path
4579  *
4580  * Returns:	0			Success
4581  *              common_chdir: ENOTDIR
4582  *              common_chdir: ENOENT	No such file or directory
4583  *              common_chdir: ???
4584  *
4585  */
4586 int
sys_chdir(proc_t p,struct chdir_args * uap,__unused int32_t * retval)4587 sys_chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
4588 {
4589 	return common_chdir(p, (void *)uap, 0);
4590 }
4591 
4592 /*
4593  * __pthread_chdir
4594  *
4595  * Change current working directory (".") for a single thread
4596  *
4597  * Parameters:  p       Process requesting the call
4598  *              uap     User argument descriptor (see below)
4599  *              retval  (ignored)
4600  *
4601  * Indirect parameters:	uap->path	Directory path
4602  *
4603  * Returns:	0			Success
4604  *              common_chdir: ENOTDIR
4605  *		common_chdir: ENOENT	No such file or directory
4606  *		common_chdir: ???
4607  *
4608  */
4609 int
__pthread_chdir(proc_t p,struct __pthread_chdir_args * uap,__unused int32_t * retval)4610 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
4611 {
4612 	return common_chdir(p, (void *)uap, 1);
4613 }
4614 
4615 #define CHROOT_ENTITLEMENT    "com.apple.private.vfs.chroot"
4616 
4617 /*
4618  * Change notion of root (``/'') directory.
4619  */
4620 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Superuser only. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

#if XNU_TARGET_OS_IOS && (DEVELOPMENT || DEBUG)
	/* On iOS dev/debug builds, log and report unentitled callers (non-fatal). */
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), CHROOT_ENTITLEMENT)) {
		mach_exception_code_t code = 0;

		os_log_error(OS_LOG_DEFAULT,
		    "%s: proc %s[%d] calls chroot(2) without entitlement\n",
		    __func__, proc_best_name(p), proc_getpid(p));

		/*
		 * Generate a simulated EXC_GUARD crash report so we know about the
		 * violation.
		 */
		EXC_GUARD_ENCODE_TYPE(code, GUARD_TYPE_REJECTED_SC);
		task_violated_guard(code, 61 /* SYS_chroot */, NULL, true);
	}
#endif

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* Resolve the new root; must be a searchable directory. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Hold a usecount for fd_rdir, then drop the lookup's iocount. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Release the previous root directory, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4695 
4696 #define PATHSTATICBUFLEN 256
4697 #define PIVOT_ROOT_ENTITLEMENT              \
4698        "com.apple.private.vfs.pivot-root"
4699 
4700 #if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Try the small stack buffer first; fall back to a MAXPATHLEN heap buffer. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same two-step copyin for the path the old root moves to. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	/* Perform the actual root switch; virtual devices are not eligible. */
	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4792 #else
4793 int
pivot_root(proc_t p,__unused struct pivot_root_args * uap,int * retval)4794 pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
4795 {
4796 	return nosys(p, NULL, retval);
4797 }
4798 #endif /* XNU_TARGET_OS_OSX */
4799 
4800 /*
4801  * Common routine for chroot and chdir.
4802  *
4803  * Returns:	0			Success
4804  *		ENOTDIR			Not a directory
4805  *		namei:???		[anything namei can return]
4806  *		vnode_authorize:???	[anything vnode_authorize can return]
4807  */
4808 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4809 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4810 {
4811 	vnode_t vp;
4812 	int error;
4813 
4814 	if ((error = namei(ndp))) {
4815 		return error;
4816 	}
4817 	nameidone(ndp);
4818 	vp = ndp->ni_vp;
4819 
4820 	if (vp->v_type != VDIR) {
4821 		vnode_put(vp);
4822 		return ENOTDIR;
4823 	}
4824 
4825 #if CONFIG_MACF
4826 	error = mac_vnode_check_chdir(ctx, vp);
4827 	if (error) {
4828 		vnode_put(vp);
4829 		return error;
4830 	}
4831 #endif
4832 
4833 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4834 	if (error) {
4835 		vnode_put(vp);
4836 		return error;
4837 	}
4838 
4839 	return error;
4840 }
4841 
/*
 * Allocate the per-descriptor vnode data (used for directories) for a
 * file glob.
 */
4845 struct fd_vn_data *
fg_vn_data_alloc(void)4846 fg_vn_data_alloc(void)
4847 {
4848 	struct fd_vn_data *fvdata;
4849 
4850 	/* Allocate per fd vnode data */
4851 	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4852 	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4853 	return fvdata;
4854 }
4855 
4856 /*
4857  * Free the vnode data (for directories) associated with the file glob.
4858  */
4859 void
fg_vn_data_free(void * fgvndata)4860 fg_vn_data_free(void *fgvndata)
4861 {
4862 	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4863 
4864 	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4865 	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4866 	kfree_type(struct fd_vn_data, fvdata);
4867 }
4868 
4869 /*
4870  * Check permissions, allocate an open file structure,
4871  * and call the device open routine if any.
4872  *
4873  * Returns:	0			Success
4874  *		EINVAL
4875  *		EINTR
4876  *	falloc:ENFILE
4877  *	falloc:EMFILE
4878  *	falloc:ENOMEM
4879  *	vn_open_auth:???
4880  *	dupfdopen:???
4881  *	VNOP_ADVLOCK:???
4882  *	vnode_setsize:???
4883  *
4884  * XXX Need to implement uid, gid
4885  */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	kauth_cred_t p_cred = current_cached_proc_cred(PROC_NULL);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	/* Convert open(2) flags to kernel f-flags; encryption bits come from vn_open_auth. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve the descriptor slot and fileproc before attempting the open. */
	if ((error = falloc_withinit(p, p_cred, ctx, &fp, &indx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Optional authorization vnode supplied via a file descriptor. */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/*
		 * A /dev/fd-style open reports itself through uu_dupfd; turn
		 * it into a dup of that descriptor rather than a failure.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the opened vnode into the fileglob. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* O_EXLOCK/O_SHLOCK: acquire an advisory flock as part of the open. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if (flags & O_TRUNC) {
		if ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASWRITTEN;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	/* Decide whether this file's pages may live in the secluded pool. */
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer  eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len) ||
			    !strncmp(v_name, "cameracaptured", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "audiomxd", len) ||
			    !strncmp(v_name, "mediaplaybackd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	/*
	 * NOTE(review): vp's iocount was dropped just above; the fileglob
	 * still references the vnode via fp_set_data() — confirm vnode_istty
	 * is safe without an iocount here.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor and release our transient fileproc hold. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Undo the partially-completed open: unlock, close, free the slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
5192 
5193 /*
5194  * While most of the *at syscall handlers can call nameiat() which
5195  * is a wrapper around namei, the use of namei and initialisation
5196  * of nameidata are far removed and in different functions  - namei
5197  * gets called in vn_open_auth for open1. So we'll just do here what
5198  * nameiat() does.
5199  */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
    int dirfd, int authfd)
{
	/*
	 * If the open is relative to a directory fd (and the caller has not
	 * already supplied a starting directory via USEDVP), peek at the
	 * first byte of the path: an absolute path ignores dirfd entirely.
	 */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Fetch the first path character from user or kernel space. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			/* Relative path: resolve dirfd to the directory vnode. */
			vnode_t dvp_at;

			/*
			 * NOTE(review): uses the context embedded in the
			 * nameidata rather than the 'ctx' parameter —
			 * presumably identical for all callers; confirm.
			 */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			/* The starting point of an *at lookup must be a directory. */
			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/*
			 * Hand the starting directory to namei via USEDVP;
			 * keep our iocount on dvp_at until open1() returns.
			 */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
			    retval, authfd);
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, AT_FDCWD, or caller-supplied dvp: plain open1(). */
	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
}
5243 
5244 /*
5245  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
5246  *
5247  * Parameters:	p			Process requesting the open
5248  *		uap			User argument descriptor (see below)
5249  *		retval			Pointer to an area to receive the
 *					return value from the system call
5251  *
5252  * Indirect:	uap->path		Path to open (same as 'open')
 *		uap->flags		Flags to open (same as 'open')
5254  *		uap->uid		UID to set, if creating
5255  *		uap->gid		GID to set, if creating
5256  *		uap->mode		File mode, if creating (same as 'open')
5257  *		uap->xsecurity		ACL to set, if creating
5258  *
5259  * Returns:	0			Success
5260  *		!0			errno value
5261  *
5262  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5263  *
 * XXX:		We should enumerate the possible errno values here, and where
5265  *		in the code they originated.
5266  */
5267 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)5268 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
5269 {
5270 	int ciferror;
5271 	kauth_filesec_t xsecdst;
5272 	struct vnode_attr va;
5273 	struct nameidata nd;
5274 	int cmode;
5275 
5276 	AUDIT_ARG(owner, uap->uid, uap->gid);
5277 
5278 	xsecdst = NULL;
5279 	if ((uap->xsecurity != USER_ADDR_NULL) &&
5280 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
5281 		return ciferror;
5282 	}
5283 
5284 	VATTR_INIT(&va);
5285 	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
5286 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
5287 	if (uap->uid != KAUTH_UID_NONE) {
5288 		VATTR_SET(&va, va_uid, uap->uid);
5289 	}
5290 	if (uap->gid != KAUTH_GID_NONE) {
5291 		VATTR_SET(&va, va_gid, uap->gid);
5292 	}
5293 	if (xsecdst != NULL) {
5294 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5295 		va.va_vaflags |= VA_FILESEC_ACL;
5296 	}
5297 
5298 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
5299 	    uap->path, vfs_context_current());
5300 
5301 	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
5302 	    NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
5303 	if (xsecdst != NULL) {
5304 		kauth_filesec_free(xsecdst);
5305 	}
5306 
5307 	return ciferror;
5308 }
5309 
5310 /*
5311  * Go through the data-protected atomically controlled open (2)
5312  *
5313  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
5314  */
5315 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)5316 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5317     int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
5318 {
5319 	/*
5320 	 * Follow the same path as normal open(2)
5321 	 * Look up the item if it exists, and acquire the vnode.
5322 	 */
5323 	struct vnode_attr va;
5324 	struct nameidata nd;
5325 	int cmode;
5326 	int error;
5327 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5328 
5329 	VATTR_INIT(&va);
5330 	/* Mask off all but regular access permissions */
5331 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5332 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
5333 
5334 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
5335 	    path, ctx);
5336 
5337 	/*
5338 	 * Initialize the extra fields in vnode_attr to pass down our
5339 	 * extra fields.
5340 	 * 1. target cprotect class.
5341 	 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
5342 	 */
5343 	if (flags & O_CREAT) {
5344 		/* lower level kernel code validates that the class is valid before applying it. */
5345 		if (class != PROTECTION_CLASS_DEFAULT) {
5346 			/*
5347 			 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
5348 			 * file behave the same as open (2)
5349 			 */
5350 			VATTR_SET(&va, va_dataprotect_class, class);
5351 		}
5352 	}
5353 
5354 	if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
5355 		if (flags & (O_RDWR | O_WRONLY)) {
5356 			/*
5357 			 * Not allowed to write raw encrypted bytes or when opening authenticated.
5358 			 */
5359 			return EINVAL;
5360 		}
5361 		if (dpflags & O_DP_GETRAWENCRYPTED) {
5362 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
5363 		}
5364 		if (dpflags & O_DP_GETRAWUNENCRYPTED) {
5365 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
5366 		}
5367 		if (dpflags & O_DP_AUTHENTICATE) {
5368 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
5369 		}
5370 	}
5371 
5372 	error = open1at(vfs_context_current(), &nd, flags, &va,
5373 	    NULL, NULL, retval, fd, authfd);
5374 
5375 	return error;
5376 }
5377 
5378 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)5379 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5380 {
5381 	if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5382 		return EINVAL;
5383 	}
5384 
5385 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5386 	           uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
5387 }
5388 
5389 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)5390 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5391 {
5392 	if (uap->dpflags & O_DP_AUTHENTICATE) {
5393 		return EINVAL;
5394 	}
5395 
5396 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5397 	           uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5398 }
5399 
static int
openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
    int fd, enum uio_seg segflg, int *retval, uint64_t *objidp, fsid_t *fsidp)
{
	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
	/*
	 * Heap-allocate the bulky nameidata/vnode_attr pair in one
	 * allocation to keep this frame small on the kernel stack.
	 */
	struct {
		struct vnode_attr va;
		struct nameidata nd;
	} *__open_data;
	struct vnode_attr *vap;
	struct nameidata *ndp;
	int cmode;
	int error;

	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
	vap = &__open_data->va;
	ndp = &__open_data->nd;

	VATTR_INIT(vap);
	/* Mask off all but regular access permissions */
	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);

	/* Check for fileid and fsid authentication (must be supplied together) */
	if (objidp || fsidp) {
		if (!objidp || !fsidp) {
			error = EINVAL;
			goto out;
		}
		/* Ask open1 to verify the opened vnode matches this id/fsid pair. */
		VATTR_SET(vap, va_flags, VA_VAFILEID);
		VATTR_SET(vap, va_fileid, *objidp);
		VATTR_SET(vap, va_fsid64, *fsidp);
	}

	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
	    segflg, path, ctx);

	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);

out:
	kfree_type(typeof(*__open_data), __open_data);

	return error;
}
5444 
5445 int
open(proc_t p,struct open_args * uap,int32_t * retval)5446 open(proc_t p, struct open_args *uap, int32_t *retval)
5447 {
5448 	__pthread_testcancel(1);
5449 	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
5450 }
5451 
5452 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)5453 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
5454     int32_t *retval)
5455 {
5456 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5457 	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval, NULL, NULL);
5458 }
5459 
5460 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)5461 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
5462     int32_t *retval)
5463 {
5464 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5465 	           uap->mode, uap->fd, UIO_USERSPACE, retval, NULL, NULL);
5466 }
5467 
5468 int
openat(proc_t p,struct openat_args * uap,int32_t * retval)5469 openat(proc_t p, struct openat_args *uap, int32_t *retval)
5470 {
5471 	__pthread_testcancel(1);
5472 	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
5473 }
5474 
5475 #define OPEN_BY_ID_ENTITLEMENT  "com.apple.private.vfs.open-by-id"
5476 
5477 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5478 vfs_context_can_open_by_id(vfs_context_t ctx)
5479 {
5480 	if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5481 		return TRUE;
5482 	}
5483 
5484 	return IOTaskHasEntitlement(vfs_context_task(ctx),
5485 	           OPEN_BY_ID_ENTITLEMENT);
5486 }
5487 
5488 #define MAX_OPENBYID_NP_RETRIES 10
5489 
5490 /*
5491  * openbyid_np: open a file given a file system id and a file system object id
5492  *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
5493  *	file systems that don't support object ids it is a node id (uint64_t).
5494  *
5495  * Parameters:	p			Process requesting the open
5496  *		uap			User argument descriptor (see below)
5497  *		retval			Pointer to an area to receive the
 *					return value from the system call
5499  *
5500  * Indirect:	uap->path		Path to open (same as 'open')
5501  *
5502  *		uap->fsid		id of target file system
5503  *		uap->objid		id of target file system object
5504  *		uap->flags		Flags to open (same as 'open')
5505  *
5506  * Returns:	0			Success
5507  *		!0			errno value
5508  *
5509  *
 * XXX:		We should enumerate the possible errno values here, and where
5511  *		in the code they originated.
5512  */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int fd;
	int error;
	int retry_count = 0;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Restricted: platform binaries or holders of the open-by-id entitlement. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

retry:
	/* Reset per-attempt state; we may loop back on ERECYCLE (see below). */
	fd = -1;
	error = 0;
	buf = NULL;
	pathlen = 0;
	buflen = MAXPATHLEN;

	/* resolve path from fsid, objid; grow the buffer until it fits */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	buf[pathlen] = 0;

	/*
	 * Open by the resolved path; pass objid/fsid down so open verifies
	 * that the vnode it ends up with is still the requested object.
	 */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, &fd, &objid, &fsid);

	kfree_data(buf, buflen + 1);

	/*
	 * ERECYCLE means the vnode was recycled between path resolution and
	 * open (it no longer matches objid/fsid): re-resolve and retry.
	 */
	if (error == ERECYCLE) {
		if (retry_count < MAX_OPENBYID_NP_RETRIES) {
			retry_count += 1;
			goto retry;
		} else {
			printf("openbyid_np() retry limit due to ERECYCLE reached\n");
			error = ENOENT;
		}
	}

	if (!error) {
		*retval = fd;
	}

	return error;
}
5593 
5594 
5595 /*
5596  * Create a special file.
5597  */
5598 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5599     int fd);
5600 
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device special files requires superuser privileges. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block special files are handled here. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	/* Authorize adding an entry to the parent directory. */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry in dvp invalidates any directory lease on it. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int     update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5703 
5704 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5705 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5706 {
5707 	struct vnode_attr va;
5708 
5709 	VATTR_INIT(&va);
5710 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5711 	VATTR_SET(&va, va_rdev, uap->dev);
5712 
5713 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5714 }
5715 
5716 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5717 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5718 {
5719 	struct vnode_attr va;
5720 
5721 	VATTR_INIT(&va);
5722 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5723 	VATTR_SET(&va, va_rdev, uap->dev);
5724 
5725 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5726 }
5727 
5728 /*
5729  * Create a named pipe.
5730  *
5731  * Returns:	0			Success
5732  *		EEXIST
5733  *	namei:???
5734  *	vnode_authorize:???
5735  *	vn_create:???
5736  */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	/* MAC + kauth authorization for creating the fifo in dvp. */
	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5779 
5780 
5781 /*
5782  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5783  *
5784  * Parameters:	p			Process requesting the open
5785  *		uap			User argument descriptor (see below)
5786  *		retval			(Ignored)
5787  *
5788  * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
5789  *		uap->uid		UID to set
5790  *		uap->gid		GID to set
5791  *		uap->mode		File mode to set (same as 'mkfifo')
5792  *		uap->xsecurity		ACL to set, if creating
5793  *
5794  * Returns:	0			Success
5795  *		!0			errno value
5796  *
5797  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5798  *
 * XXX:		We should enumerate the possible errno values here, and where
5800  *		in the code they originated.
5801  */
5802 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5803 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5804 {
5805 	int ciferror;
5806 	kauth_filesec_t xsecdst;
5807 	struct vnode_attr va;
5808 
5809 	AUDIT_ARG(owner, uap->uid, uap->gid);
5810 
5811 	xsecdst = KAUTH_FILESEC_NONE;
5812 	if (uap->xsecurity != USER_ADDR_NULL) {
5813 		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5814 			return ciferror;
5815 		}
5816 	}
5817 
5818 	VATTR_INIT(&va);
5819 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5820 	if (uap->uid != KAUTH_UID_NONE) {
5821 		VATTR_SET(&va, va_uid, uap->uid);
5822 	}
5823 	if (uap->gid != KAUTH_GID_NONE) {
5824 		VATTR_SET(&va, va_gid, uap->gid);
5825 	}
5826 	if (xsecdst != KAUTH_FILESEC_NONE) {
5827 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5828 		va.va_vaflags |= VA_FILESEC_ACL;
5829 	}
5830 
5831 	ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5832 
5833 	if (xsecdst != KAUTH_FILESEC_NONE) {
5834 		kauth_filesec_free(xsecdst);
5835 	}
5836 	return ciferror;
5837 }
5838 
5839 /* ARGSUSED */
5840 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5841 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5842 {
5843 	struct vnode_attr va;
5844 
5845 	VATTR_INIT(&va);
5846 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5847 
5848 	return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5849 }
5850 
5851 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5852 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5853 {
5854 	struct vnode_attr va;
5855 
5856 	VATTR_INIT(&va);
5857 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5858 
5859 	return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5860 }
5861 
5862 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5863 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5864 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5865 
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	/*
	 * Build a best-effort path for dvp (plus optional leafname) into
	 * 'path'.  Never fails: on lookup failure it walks up the parent
	 * chain (or falls back to the mount point or "/") and reports
	 * truncation via *truncated_path.  Returns the length including the
	 * NUL terminator.
	 *
	 * NOTE(review): the leafname-append logic below bounds with
	 * MAXPATHLEN rather than _len — assumes callers pass a
	 * MAXPATHLEN-sized buffer; confirm against callers.
	 */
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the trailing NUL with '/' and append the leaf. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path fit but was at the size limit: treat as truncated. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/* Walk up the ancestry until some prefix of the path fits. */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5933 
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Firmlink-following variant of the best-effort path builder. */
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1);
}
5939 
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Variant that resolves the path without traversing firmlinks. */
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0);
}
5945 
5946 /*
5947  * Make a hard file link.
5948  *
5949  * Returns:	0			Success
5950  *		EPERM
5951  *		EEXIST
5952  *		EXDEV
5953  *	namei:???
5954  *	vnode_authorize:???
5955  *	VNOP_LINK:???
5956  */
5957 /* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	char *target_path = NULL;
	char  *no_firmlink_path = NULL;
	vnode_t locked_vp = NULLVP;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;
	int num_retries = 0;
	int need_event, has_listeners, need_kpath2;
	bool do_retry;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;

retry:
	do_retry = false;
	vp = dvp = lvp = NULLVP;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (flag & AT_RESOLVE_BENEATH) {
		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}
	if (flag & AT_UNIQUE) {
		nd.ni_flag |= NAMEI_UNIQUE;
	}

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node: reuse nd, now as a CREATE in the link's parent */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_flag &= ~NAMEI_UNIQUE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

	/* Serialize against concurrent link/unlink of vp. */
	assert(locked_vp == NULLVP);
	vnode_link_lock(vp);
	locked_vp = vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target note */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry in dvp invalidates any directory lease on it. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		/*
		 * ENOENT from the filesystem may mean the source was removed
		 * out from under us; retry the whole lookup a bounded number
		 * of times.
		 */
		if (error == ENOENT && num_retries < MAX_LINK_ENOENT_RETRIES) {
			do_retry = true;
			num_retries += 1;
		}
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

	/* vp now has more than one name: clear the not-a-hardlink hint. */
	os_atomic_andnot(&vp->v_ext_flag, VE_NOT_HARDLINK, relaxed);

	assert(locked_vp == vp);
	vnode_link_unlock(locked_vp);
	locked_vp = NULLVP;

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Post-link notifications: fsevents, kauth listeners, audit path. */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int  len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
		target_path = NULL;
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	/* Drop the link lock if an error path skipped the unlock above. */
	if (locked_vp) {
		assert(locked_vp == vp);
		vnode_link_unlock(locked_vp);
		locked_vp = NULLVP;
	}
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);

	if (do_retry) {
		goto retry;
	}

	return error;
}
6211 
6212 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)6213 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
6214 {
6215 	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6216 	           AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
6217 }
6218 
6219 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)6220 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
6221 {
6222 	if (uap->flag & ~(AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH | AT_UNIQUE)) {
6223 		return EINVAL;
6224 	}
6225 
6226 	return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
6227 	           uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
6228 }
6229 
6230 /*
6231  * Make a symbolic link.
6232  *
6233  * We could add support for ACLs here too...
6234  */
6235 /* ARGSUSED */
/*
 * Common implementation for symlink() and symlinkat().
 *
 * Creates a symbolic link at 'link' (resolved relative to 'fd' when the
 * path is relative) whose contents are the string at 'path_data'.
 *
 * Parameters:	ctx		vfs context of the caller
 *		path_data	link target string; user or kernel address
 *				depending on segflg
 *		fd		directory fd for relative 'link' lookups
 *		link		pathname at which to create the symlink
 *		segflg		address space of path_data and link
 *
 * Returns:	0		Success
 *		EEXIST		'link' already exists
 *		copyinstr:???	Bad target string address
 *		nameiat:???	Lookup failure
 *		vnode_authorize:???
 *		VNOP_SYMLINK:???
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	/* Copy the link-target string into a kernel buffer if needed. */
	error = 0;
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;		/* non-NULL if 'link' already exists */

	/* New symlink inherits mode from umask; type is fixed at VLNK. */
	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry writes the directory; break any dir lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling for what VNOP_SYMLINK didn't set */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int     update_flags = 0;

		/*
		 * Some filesystems do not return the new vnode from
		 * VNOP_SYMLINK; re-lookup the name to obtain it.
		 */
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int             len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Free the copied-in target string (not owned when kernel-space). */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
6399 
6400 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)6401 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
6402 {
6403 	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
6404 	           uap->link, UIO_USERSPACE);
6405 }
6406 
6407 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)6408 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
6409     __unused int32_t *retval)
6410 {
6411 	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
6412 	           uap->path2, UIO_USERSPACE);
6413 }
6414 
6415 /*
6416  * Delete a whiteout from the filesystem.
6417  * No longer supported.
6418  */
6419 int
undelete(__unused proc_t p,__unused struct undelete_args * uap,__unused int32_t * retval)6420 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
6421 {
6422 	return ENOTSUP;
6423 }
6424 
6425 /*
6426  * Delete a name from the filesystem.
6427  */
6428 /* ARGSUSED */
/*
 * Common implementation for unlink(), unlinkat(), delete() and unlink1().
 *
 * Removes the name at 'path_arg', resolved relative to 'start_dvp' when
 * one is supplied, otherwise relative to 'fd'.  'unlink_flags' carries
 * VNODE_REMOVE_* modifiers; lookup-related bits (NOFOLLOW_ANY,
 * RESOLVE_BENEATH, UNIQUE) are translated into namei flags and stripped
 * before the remove proper.
 *
 * Supports both the classic two-step path (lookup, authorize, then
 * VNOP_REMOVE) and compound remove ("batched") filesystems, where the
 * lookup and remove happen in a single VNOP and may ask us to continue
 * the lookup (EKEEPLOOKING) or redrive it entirely (ENOENT retry).
 *
 * Returns:	0		Success
 *		EBUSY		Attempt to remove the root of a mount
 *		EPERM		Attempt to remove a swap file (release builds)
 *		EISDIR		Compound lookup resolved to a directory
 *		nameiat:???
 *		vn_authorize_unlink:???
 *		vn_remove:???
 */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated scratch state; too large for the kernel stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char  *path = NULL;
	char  *no_firmlink_path = NULL;
	int  len_path = 0;
	int  len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	vnode_t locked_vp = NULLVP;
	int do_retry;
	int retry_count = 0;
	int cn_flags;
	int namei_flags = 0;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/* Translate lookup-policy bits into namei flags and consume them. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		namei_flags |= NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}
	if (unlink_flags & VNODE_REMOVE_RESOLVE_BENEATH) {
		namei_flags |= NAMEI_RESOLVE_BENEATH;
		unlink_flags &= ~VNODE_REMOVE_RESOLVE_BENEATH;
	}
	if (unlink_flags & VNODE_REMOVE_UNIQUE) {
		namei_flags |= NAMEI_UNIQUE;
		unlink_flags &= ~VNODE_REMOVE_UNIQUE;
	}

	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* Reset per-attempt state; we come back here on ENOENT redrives. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | namei_flags;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	/* Update speculative telemetry with system discarded use state */
	if (unlink_flags & VNODE_REMOVE_SYSTEM_DISCARDED) {
		flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			/* Non-compound path: authorize now, under the link lock. */
			vnode_link_lock(vp);
			locked_vp = vp;
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				if (error == ENOENT) {
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				vnode_link_unlock(vp);
				locked_vp = NULLVP;
				goto out;
			}
		}
	} else {
		/* No vp: filesystem will do lookup + remove in one VNOP. */
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		/* Capture the victim's path before the entry disappears. */
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		/* Removing an entry writes the directory; break any dir lease. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			/* Compound remove asked us to feed it more path. */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid.  this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	if (locked_vp) {
		assert(locked_vp == vp);
		vnode_link_unlock(locked_vp);
		locked_vp = NULLVP;
	}

	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6740 
6741 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6742 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6743     enum uio_seg segflg, int unlink_flags)
6744 {
6745 	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6746 	           unlink_flags);
6747 }
6748 
6749 /*
6750  * Delete a name from the filesystem using Carbon semantics.
6751  */
6752 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6753 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6754 {
6755 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6756 	           uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6757 }
6758 
6759 /*
6760  * Delete a name from the filesystem using POSIX semantics.
6761  */
6762 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6763 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6764 {
6765 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6766 	           uap->path, UIO_USERSPACE, 0);
6767 }
6768 
6769 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6770 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6771 {
6772 	int unlink_flags = 0;
6773 
6774 	if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY | AT_SYSTEM_DISCARDED | AT_RESOLVE_BENEATH | AT_NODELETEBUSY | AT_UNIQUE)) {
6775 		return EINVAL;
6776 	}
6777 
6778 	if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6779 		unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6780 	}
6781 	if (uap->flag & AT_RESOLVE_BENEATH) {
6782 		unlink_flags |= VNODE_REMOVE_RESOLVE_BENEATH;
6783 	}
6784 	if (uap->flag & AT_SYSTEM_DISCARDED) {
6785 		unlink_flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
6786 	}
6787 	if (uap->flag & AT_NODELETEBUSY) {
6788 		unlink_flags |= VNODE_REMOVE_NODELETEBUSY;
6789 	}
6790 	if (uap->flag & AT_UNIQUE) {
6791 		unlink_flags |= VNODE_REMOVE_UNIQUE;
6792 	}
6793 
6794 	if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6795 		if (uap->flag & AT_REMOVEDIR_DATALESS) {
6796 			unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6797 		}
6798 		return rmdirat_internal(vfs_context_current(), uap->fd,
6799 		           uap->path, UIO_USERSPACE, unlink_flags);
6800 	} else {
6801 		return unlinkat_internal(vfs_context_current(), uap->fd,
6802 		           NULLVP, uap->path, UIO_USERSPACE, unlink_flags);
6803 	}
6804 }
6805 
6806 /*
6807  * Reposition read/write file offset.
6808  */
/*
 * lseek() system call: reposition the read/write offset of an open file.
 *
 * 'whence' selects the base: L_SET (absolute), L_INCR (relative to the
 * current offset), L_XTND (relative to EOF), or SEEK_HOLE/SEEK_DATA,
 * which are forwarded to the filesystem via FSIOC_FIOSEEK{HOLE,DATA}.
 *
 * Returns:	0		Success; new offset is returned in *retval
 *		ESPIPE		fd refers to a pipe, FIFO, or TTY
 *		EINVAL		Bad whence, or resulting offset is negative
 *				on anything but a character device
 *		EOVERFLOW	Relative move wrapped past the maximum offset
 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/* ENOTSUP here means the fd is not a vnode (e.g. a socket). */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (
		// rdar://3837316: Seeking a pipe is disallowed by POSIX.
		vnode_isfifo(vp)
		// rdar://120750171: Seeking a TTY is undefined and should be denied.
		|| vnode_istty(vp)
		) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* lseek(fd, 0, SEEK_CUR) only queries the offset; check accordingly. */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* Compute the candidate absolute offset per 'whence'. */
	switch (uap->whence) {
	case L_INCR:
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001.  Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6905 
6906 
6907 /*
6908  * Check access permissions.
6909  *
6910  * Returns:	0			Success
6911  *		vnode_authorize:???
6912  */
6913 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6914 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6915 {
6916 	kauth_action_t action;
6917 	int error;
6918 
6919 	/*
6920 	 * If just the regular access bits, convert them to something
6921 	 * that vnode_authorize will understand.
6922 	 */
6923 	if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6924 		action = 0;
6925 		if (uflags & R_OK) {
6926 			action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
6927 		}
6928 		if (uflags & W_OK) {
6929 			if (vnode_isdir(vp)) {
6930 				action |= KAUTH_VNODE_ADD_FILE |
6931 				    KAUTH_VNODE_ADD_SUBDIRECTORY;
6932 				/* might want delete rights here too */
6933 			} else {
6934 				action |= KAUTH_VNODE_WRITE_DATA;
6935 			}
6936 		}
6937 		if (uflags & X_OK) {
6938 			if (vnode_isdir(vp)) {
6939 				action |= KAUTH_VNODE_SEARCH;
6940 			} else {
6941 				action |= KAUTH_VNODE_EXECUTE;
6942 			}
6943 		}
6944 	} else {
6945 		/* take advantage of definition of uflags */
6946 		action = uflags >> 8;
6947 	}
6948 
6949 #if CONFIG_MACF
6950 	error = mac_vnode_check_access(ctx, vp, uflags);
6951 	if (error) {
6952 		return error;
6953 	}
6954 #endif /* MAC */
6955 
6956 	/* action == 0 means only check for existence */
6957 	if (action != 0) {
6958 		error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6959 	} else {
6960 		error = 0;
6961 	}
6962 
6963 	return error;
6964 }
6965 
6966 
6967 
6968 /*
6969  * access_extended: Check access permissions in bulk.
6970  *
6971  * Description:	uap->entries		Pointer to an array of accessx
6972  *                                      descriptor structs, plus one or
6973  *                                      more NULL terminated strings (see
6974  *                                      "Notes" section below).
6975  *		uap->size		Size of the area pointed to by
6976  *					uap->entries.
6977  *		uap->results		Pointer to the results array.
6978  *
6979  * Returns:	0			Success
6980  *		ENOMEM			Insufficient memory
6981  *		EINVAL			Invalid arguments
6982  *		namei:EFAULT		Bad address
6983  *		namei:ENAMETOOLONG	Filename too long
6984  *		namei:ENOENT		No such file or directory
6985  *		namei:ELOOP		Too many levels of symbolic links
6986  *		namei:EBADF		Bad file descriptor
6987  *		namei:ENOTDIR		Not a directory
6988  *		namei:???
6989  *		access1:
6990  *
6991  * Implicit returns:
6992  *		uap->results		Array contents modified
6993  *
6994  * Notes:	The uap->entries are structured as an arbitrary length array
6995  *		of accessx descriptors, followed by one or more NULL terminated
6996  *		strings
6997  *
6998  *			struct accessx_descriptor[0]
6999  *			...
7000  *			struct accessx_descriptor[n]
7001  *			char name_data[0];
7002  *
7003  *		We determine the entry count by walking the buffer containing
7004  *		the uap->entries argument descriptor.  For each descriptor we
7005  *		see, the valid values for the offset ad_name_offset will be
7006  *		in the byte range:
7007  *
7008  *			[ uap->entries + sizeof(struct accessx_descriptor) ]
7009  *						to
7010  *				[ uap->entries + uap->size - 2 ]
7011  *
7012  *		since we must have at least one string, and the string must
7013  *		be at least one character plus the NULL terminator in length.
7014  *
7015  * XXX:		Need to support the check-as uid argument
7016  */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	/* Small requests are staged on the stack to avoid an allocation. */
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o	There must be sufficient room in the request for at least one
	 *	descriptor and a one byte NUL terminated string.
	 * o	The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  "Soft" failures are recorded in the
		 * per-descriptor result; anything else aborts the whole call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
7258 
7259 
7260 /*
7261  * Returns:	0			Success
7262  *		namei:EFAULT		Bad address
7263  *		namei:ENAMETOOLONG	Filename too long
7264  *		namei:ENOENT		No such file or directory
7265  *		namei:ELOOP		Too many levels of symbolic links
7266  *		namei:EBADF		Bad file descriptor
7267  *		namei:ENOTDIR		Not a directory
7268  *		namei:???
7269  *		access1:
7270  */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;	/* local context; may carry a substituted (real-identity) credential */
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	/* follow symlinks unless either no-follow flag was requested */
	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (flag & AT_RESOLVE_BENEATH) {
		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}
	if (flag & AT_UNIQUE) {
		nd.ni_flag |= NAMEI_UNIQUE;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* perform the actual access check with (possibly substituted) credential */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		/* parent was only held because _DELETE_OK set WANTPARENT above */
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		/* drop the real-identity credential reference taken above */
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
7358 
7359 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)7360 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
7361 {
7362 	return faccessat_internal(vfs_context_current(), AT_FDCWD,
7363 	           uap->path, uap->flags, 0, UIO_USERSPACE);
7364 }
7365 
7366 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)7367 faccessat(__unused proc_t p, struct faccessat_args *uap,
7368     __unused int32_t *retval)
7369 {
7370 	if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH | AT_UNIQUE)) {
7371 		return EINVAL;
7372 	}
7373 
7374 	return faccessat_internal(vfs_context_current(), uap->fd,
7375 	           uap->path, uap->amode, uap->flag, UIO_USERSPACE);
7376 }
7377 
7378 /*
7379  * Returns:	0			Success
7380  *		EFAULT
7381  *	copyout:EFAULT
7382  *	namei:???
7383  *	vn_stat:???
7384  */
7385 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)7386 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
7387     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
7388     enum uio_seg segflg, int fd, int flag)
7389 {
7390 	struct nameidata *ndp = NULL;
7391 	int follow;
7392 	union {
7393 		struct stat sb;
7394 		struct stat64 sb64;
7395 	} source = {};
7396 	union {
7397 		struct user64_stat user64_sb;
7398 		struct user32_stat user32_sb;
7399 		struct user64_stat64 user64_sb64;
7400 		struct user32_stat64 user32_sb64;
7401 	} dest = {};
7402 	caddr_t sbp;
7403 	int error, my_size;
7404 	kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
7405 	size_t xsecurity_bufsize;
7406 	void * statptr;
7407 	struct fileproc *fp = NULL;
7408 	int needsrealdev = 0;
7409 
7410 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7411 	ndp = kalloc_type(struct nameidata, Z_WAITOK);
7412 	NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
7413 	    segflg, path, ctx);
7414 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7415 		ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
7416 	}
7417 	if (flag & AT_RESOLVE_BENEATH) {
7418 		ndp->ni_flag |= NAMEI_RESOLVE_BENEATH;
7419 	}
7420 	if (flag & AT_UNIQUE) {
7421 		ndp->ni_flag |= NAMEI_UNIQUE;
7422 	}
7423 
7424 #if NAMEDRSRCFORK
7425 	int is_namedstream = 0;
7426 	/* stat calls are allowed for resource forks. */
7427 	ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
7428 #endif
7429 
7430 	if (flag & AT_FDONLY) {
7431 		vnode_t fvp;
7432 
7433 		error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
7434 		if (error) {
7435 			goto out;
7436 		}
7437 		if ((error = vnode_getwithref(fvp))) {
7438 			file_drop(fd);
7439 			goto out;
7440 		}
7441 		ndp->ni_vp = fvp;
7442 	} else {
7443 		error = nameiat(ndp, fd);
7444 		if (error) {
7445 			goto out;
7446 		}
7447 	}
7448 
7449 	statptr = (void *)&source;
7450 
7451 #if NAMEDRSRCFORK
7452 	/* Grab reference on the shadow stream file vnode to
7453 	 * force an inactive on release which will mark it
7454 	 * for recycle.
7455 	 */
7456 	if (vnode_isnamedstream(ndp->ni_vp) &&
7457 	    (ndp->ni_vp->v_parent != NULLVP) &&
7458 	    vnode_isshadow(ndp->ni_vp)) {
7459 		is_namedstream = 1;
7460 		vnode_ref(ndp->ni_vp);
7461 	}
7462 #endif
7463 
7464 	needsrealdev = flag & AT_REALDEV ? 1 : 0;
7465 	if (fp && (xsecurity == USER_ADDR_NULL)) {
7466 		/*
7467 		 * If the caller has the file open, and is not
7468 		 * requesting extended security information, we are
7469 		 * going to let them get the basic stat information.
7470 		 */
7471 		error = vn_stat_noauth(ndp->ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
7472 		    fp->fp_glob->fg_cred);
7473 	} else {
7474 		error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
7475 		    isstat64, needsrealdev, ctx);
7476 	}
7477 
7478 #if NAMEDRSRCFORK
7479 	if (is_namedstream) {
7480 		vnode_rele(ndp->ni_vp);
7481 	}
7482 #endif
7483 	vnode_put(ndp->ni_vp);
7484 	nameidone(ndp);
7485 
7486 	if (fp) {
7487 		file_drop(fd);
7488 		fp = NULL;
7489 	}
7490 
7491 	if (error) {
7492 		goto out;
7493 	}
7494 	/* Zap spare fields */
7495 	if (isstat64 != 0) {
7496 		source.sb64.st_lspare = 0;
7497 		source.sb64.st_qspare[0] = 0LL;
7498 		source.sb64.st_qspare[1] = 0LL;
7499 		if (vfs_context_is64bit(ctx)) {
7500 			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
7501 			my_size = sizeof(dest.user64_sb64);
7502 			sbp = (caddr_t)&dest.user64_sb64;
7503 		} else {
7504 			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
7505 			my_size = sizeof(dest.user32_sb64);
7506 			sbp = (caddr_t)&dest.user32_sb64;
7507 		}
7508 		/*
7509 		 * Check if we raced (post lookup) against the last unlink of a file.
7510 		 */
7511 		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7512 			source.sb64.st_nlink = 1;
7513 		}
7514 	} else {
7515 		source.sb.st_lspare = 0;
7516 		source.sb.st_qspare[0] = 0LL;
7517 		source.sb.st_qspare[1] = 0LL;
7518 		if (vfs_context_is64bit(ctx)) {
7519 			munge_user64_stat(&source.sb, &dest.user64_sb);
7520 			my_size = sizeof(dest.user64_sb);
7521 			sbp = (caddr_t)&dest.user64_sb;
7522 		} else {
7523 			munge_user32_stat(&source.sb, &dest.user32_sb);
7524 			my_size = sizeof(dest.user32_sb);
7525 			sbp = (caddr_t)&dest.user32_sb;
7526 		}
7527 
7528 		/*
7529 		 * Check if we raced (post lookup) against the last unlink of a file.
7530 		 */
7531 		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7532 			source.sb.st_nlink = 1;
7533 		}
7534 	}
7535 	if ((error = copyout(sbp, ub, my_size)) != 0) {
7536 		goto out;
7537 	}
7538 
7539 	/* caller wants extended security information? */
7540 	if (xsecurity != USER_ADDR_NULL) {
7541 		/* did we get any? */
7542 		if (fsec == KAUTH_FILESEC_NONE) {
7543 			if (susize(xsecurity_size, 0) != 0) {
7544 				error = EFAULT;
7545 				goto out;
7546 			}
7547 		} else {
7548 			/* find the user buffer size */
7549 			xsecurity_bufsize = fusize(xsecurity_size);
7550 
7551 			/* copy out the actual data size */
7552 			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7553 				error = EFAULT;
7554 				goto out;
7555 			}
7556 
7557 			/* if the caller supplied enough room, copy out to it */
7558 			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7559 				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7560 			}
7561 		}
7562 	}
7563 out:
7564 	if (ndp) {
7565 		kfree_type(struct nameidata, ndp);
7566 	}
7567 	if (fsec != KAUTH_FILESEC_NONE) {
7568 		kauth_filesec_free(fsec);
7569 	}
7570 	return error;
7571 }
7572 
7573 /*
7574  * stat_extended: Get file status; with extended security (ACL).
7575  *
7576  * Parameters:    p                       (ignored)
7577  *                uap                     User argument descriptor (see below)
7578  *                retval                  (ignored)
7579  *
7580  * Indirect:      uap->path               Path of file to get status from
7581  *                uap->ub                 User buffer (holds file status info)
7582  *                uap->xsecurity          ACL to get (extended security)
7583  *                uap->xsecurity_size     Size of ACL
7584  *
7585  * Returns:        0                      Success
7586  *                !0                      errno value
7587  *
7588  */
7589 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7590 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7591     __unused int32_t *retval)
7592 {
7593 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7594 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7595 	           0);
7596 }
7597 
7598 /*
7599  * Returns:	0			Success
7600  *	fstatat_internal:???		[see fstatat_internal() in this file]
7601  */
7602 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7603 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7604 {
7605 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7606 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7607 }
7608 
7609 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7610 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7611 {
7612 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7613 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7614 }
7615 
7616 /*
7617  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7618  *
7619  * Parameters:    p                       (ignored)
7620  *                uap                     User argument descriptor (see below)
7621  *                retval                  (ignored)
7622  *
7623  * Indirect:      uap->path               Path of file to get status from
7624  *                uap->ub                 User buffer (holds file status info)
7625  *                uap->xsecurity          ACL to get (extended security)
7626  *                uap->xsecurity_size     Size of ACL
7627  *
7628  * Returns:        0                      Success
7629  *                !0                      errno value
7630  *
7631  */
7632 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7633 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7634 {
7635 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7636 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7637 	           0);
7638 }
7639 
7640 /*
7641  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7642  *
7643  * Parameters:    p                       (ignored)
7644  *                uap                     User argument descriptor (see below)
7645  *                retval                  (ignored)
7646  *
7647  * Indirect:      uap->path               Path of file to get status from
7648  *                uap->ub                 User buffer (holds file status info)
7649  *                uap->xsecurity          ACL to get (extended security)
7650  *                uap->xsecurity_size     Size of ACL
7651  *
7652  * Returns:        0                      Success
7653  *                !0                      errno value
7654  *
7655  */
7656 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7657 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7658 {
7659 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7660 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7661 	           AT_SYMLINK_NOFOLLOW);
7662 }
7663 
7664 /*
7665  * Get file status; this version does not follow links.
7666  */
7667 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7668 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7669 {
7670 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7671 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7672 }
7673 
7674 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7675 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7676 {
7677 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7678 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7679 }
7680 
7681 /*
7682  * lstat64_extended: Get file status; can handle large inode numbers; does not
7683  * follow links; with extended security (ACL).
7684  *
7685  * Parameters:    p                       (ignored)
7686  *                uap                     User argument descriptor (see below)
7687  *                retval                  (ignored)
7688  *
7689  * Indirect:      uap->path               Path of file to get status from
7690  *                uap->ub                 User buffer (holds file status info)
7691  *                uap->xsecurity          ACL to get (extended security)
7692  *                uap->xsecurity_size     Size of ACL
7693  *
7694  * Returns:        0                      Success
7695  *                !0                      errno value
7696  *
7697  */
7698 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7699 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7700 {
7701 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7702 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7703 	           AT_SYMLINK_NOFOLLOW);
7704 }
7705 
7706 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7707 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7708 {
7709 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH | AT_UNIQUE)) {
7710 		return EINVAL;
7711 	}
7712 
7713 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7714 	           0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7715 }
7716 
7717 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7718 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7719     __unused int32_t *retval)
7720 {
7721 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH | AT_UNIQUE)) {
7722 		return EINVAL;
7723 	}
7724 
7725 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7726 	           0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7727 }
7728 
7729 /*
7730  * Get configurable pathname variables.
7731  *
7732  * Returns:	0			Success
7733  *	namei:???
7734  *	vn_pathconf:???
7735  *
7736  * Notes:	Global implementation  constants are intended to be
7737  *		implemented in this function directly; all other constants
7738  *		are per-FS implementation, and therefore must be handled in
7739  *		each respective FS, instead.
7740  *
7741  * XXX We implement some things globally right now that should actually be
7742  * XXX per-FS; we will need to deal with this at some point.
7743  */
7744 /* ARGSUSED */
int
pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
{
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/* the queried value is returned via *retval; error only signals failure */
	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);

	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7765 
7766 /*
7767  * Return target name of a symbolic link.
7768  */
7769 /* ARGSUSED */
7770 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7771 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7772     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7773     int *retval)
7774 {
7775 	vnode_t vp;
7776 	uio_t auio;
7777 	int error;
7778 	struct nameidata nd;
7779 	UIO_STACKBUF(uio_buf, 1);
7780 	bool put_vnode;
7781 
7782 	if (bufsize > INT32_MAX) {
7783 		return EINVAL;
7784 	}
7785 
7786 	if (lnk_vp) {
7787 		vp = lnk_vp;
7788 		put_vnode = false;
7789 	} else {
7790 		NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7791 		    seg, path, ctx);
7792 
7793 		error = nameiat(&nd, fd);
7794 		if (error) {
7795 			return error;
7796 		}
7797 		vp = nd.ni_vp;
7798 		put_vnode = true;
7799 		nameidone(&nd);
7800 	}
7801 
7802 	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7803 	    &uio_buf[0], sizeof(uio_buf));
7804 	uio_addiov(auio, buf, bufsize);
7805 	if (vp->v_type != VLNK) {
7806 		error = EINVAL;
7807 	} else {
7808 #if CONFIG_MACF
7809 		error = mac_vnode_check_readlink(ctx, vp);
7810 #endif
7811 		if (error == 0) {
7812 			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7813 			    ctx);
7814 		}
7815 		if (error == 0) {
7816 			error = VNOP_READLINK(vp, auio, ctx);
7817 		}
7818 	}
7819 
7820 	if (put_vnode) {
7821 		vnode_put(vp);
7822 	}
7823 
7824 	*retval = (int)(bufsize - uio_resid(auio));
7825 	return error;
7826 }
7827 
int
freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
{
	enum uio_seg procseg;
	vnode_t vp;
	int error;

	/* user buffer addressing depends on the caller's pointer size */
	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;

	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* pass the vnode directly; fd (-1) and path (0) are unused in this mode */
	error = readlinkat_internal(vfs_context_current(), -1,
	    vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
	    uap->bufsize, procseg, retval);

	/* drop the iocount and the fd reference taken above */
	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7855 
7856 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7857 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7858 {
7859 	enum uio_seg procseg;
7860 
7861 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7862 	return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7863 	           CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7864 	           uap->count, procseg, retval);
7865 }
7866 
7867 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7868 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7869 {
7870 	enum uio_seg procseg;
7871 
7872 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7873 	return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7874 	           CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7875 	           retval);
7876 }
7877 
7878 /*
7879  * Change file flags, the deep inner layer.
7880  */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	/* MAC policy gets first say over the proposed flag set */
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	/* apply the change via the caller-supplied setter */
	error = (*setattr)(vp, arg, ctx);

	if (error == 0) {
		/* keep the cached append-only hint in v_ext_flag in sync */
		if (va->va_flags & APPEND) {
			os_atomic_or(&vp->v_ext_flag, VE_APPENDONLY, relaxed);
		} else {
			os_atomic_andnot(&vp->v_ext_flag, VE_APPENDONLY, relaxed);
		}
#if CONFIG_MACF
		/* notify MAC policies only after a successful change */
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
#endif
	}

out:
	return error;
}
7924 
7925 /*
7926  * Change file flags.
7927  *
7928  * NOTE: this will vnode_put() `vp'
7929  */
static int
chflags1(vnode_t vp, int flags, vfs_context_t ctx)
{
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_flags, flags);

	/* chflags0() takes a generic setter; pass vnode_setattr with &va as its arg */
	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
	/* consume the caller's iocount (see NOTE above this function) */
	vnode_put(vp);

	/* the filesystem silently ignored va_flags entirely */
	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
		error = ENOTSUP;
	}

	return error;
}
7948 
7949 /*
7950  * Change flags of a file given a path name.
7951  */
7952 /* ARGSUSED */
int
chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* need the parent vnode so its directory lease can be broken below */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(fflags, uap->flags);
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

#if CONFIG_FILE_LEASES
	/* break any write lease on the parent directory, then release it */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	nameidone(&nd);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, ctx);

	return error;
}
7987 
7988 /*
7989  * Change flags of a file given a file descriptor.
7990  */
7991 /* ARGSUSED */
int
fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* break any write lease on the containing directory (second arg: by vnode) */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, vfs_context_current());

	file_drop(uap->fd);
	return error;
}
8021 
8022 /*
8023  * Change security information on a filesystem object.
8024  *
8025  * Returns:	0			Success
8026  *		EPERM			Operation not permitted
8027  *		vnode_authattr:???	[anything vnode_authattr can return]
8028  *		vnode_authorize:???	[anything vnode_authorize can return]
8029  *		vnode_setattr:???	[anything vnode_setattr can return]
8030  *
8031  * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
8032  *		translated to EPERM before being returned.
8033  */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC checks: one per attribute class being changed (mode/owner/ACL) */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* see function header: EACCES is reported as EPERM here */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* mirror the pre-checks above with post-change notifications */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
8101 
8102 
8103 /*
8104  * Change mode of a file given a path name.
8105  *
8106  * Returns:	0			Success
8107  *		namei:???		[anything namei can return]
8108  *		chmod_vnode:???		[anything chmod_vnode can return]
8109  */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* need the parent vnode so its directory lease can be broken below */
	wantparent = WANTPARENT;
#endif

	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (flag & AT_RESOLVE_BENEATH) {
		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}
	if (flag & AT_UNIQUE) {
		nd.ni_flag |= NAMEI_UNIQUE;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	/* break any write lease on the parent directory, then release it */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
8148 
/*
 * Populate *pva (and possibly *pxsecdst) from the chmod_extended-style
 * arguments.  On success, if *pxsecdst is non-NULL the caller owns it and
 * must kauth_filesec_free() it after use.
 */
static int
chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
    gid_t gid, user_addr_t xsecurity)
{
	int error;

	VATTR_INIT(pva);

	/* mode of -1 means "leave the mode unchanged" */
	if (mode != -1) {
		VATTR_SET(pva, va_mode, mode & ALLPERMS);
	} else {
		pva->va_mode = 0;
	}

	if (uid != KAUTH_UID_NONE) {
		VATTR_SET(pva, va_uid, uid);
	}

	if (gid != KAUTH_GID_NONE) {
		VATTR_SET(pva, va_gid, gid);
	}

	*pxsecdst = NULL;
	switch (xsecurity) {
	case USER_ADDR_NULL:
		/* no ACL change requested */
		break;

	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
		/* sentinel address 1: delete any existing ACL */
		VATTR_SET(pva, va_acl, NULL);
		break;

	default:
		/* copy the caller-supplied filesec (ACL) into the kernel */
		if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
			return error;
		}

		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
		pva->va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
		break;
	}

	return 0;
}
8193 
8194 /*
8195  * chmod_extended: Change the mode of a file given a path name; with extended
8196  * argument list (including extended security (ACL)).
8197  *
8198  * Parameters:	p			Process requesting the open
8199  *		uap			User argument descriptor (see below)
8200  *		retval			(ignored)
8201  *
8202  * Indirect:	uap->path		Path to object (same as 'chmod')
8203  *		uap->uid		UID to set
8204  *		uap->gid		GID to set
8205  *		uap->mode		File mode to set (same as 'chmod')
8206  *		uap->xsecurity		ACL to set (or delete)
8207  *
8208  * Returns:	0			Success
8209  *		!0			errno value
8210  *
8211  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
8212  *
8213  * XXX:		We should enummerate the possible errno values here, and where
8214  *		in the code they originated.
8215  */
int
chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst = NULL;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* build the attribute set (mode/uid/gid/ACL) from the user arguments */
	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
	    uap->gid, uap->xsecurity);

	if (error) {
		return error;
	}

	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
	    UIO_USERSPACE);

	/* free the copied-in filesec, if chmod_extended_init allocated one */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
8240 
8241 /*
8242  * Returns:	0			Success
8243  *		chmodat:???		[anything chmodat can return]
8244  */
8245 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)8246 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
8247     int flag, enum uio_seg segflg)
8248 {
8249 	struct vnode_attr va;
8250 
8251 	VATTR_INIT(&va);
8252 	VATTR_SET(&va, va_mode, mode & ALLPERMS);
8253 
8254 	return chmodat(ctx, path, &va, fd, flag, segflg);
8255 }
8256 
8257 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)8258 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
8259 {
8260 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
8261 	           AT_FDCWD, 0, UIO_USERSPACE);
8262 }
8263 
8264 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)8265 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
8266 {
8267 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH | AT_UNIQUE)) {
8268 		return EINVAL;
8269 	}
8270 
8271 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
8272 	           uap->fd, uap->flag, UIO_USERSPACE);
8273 }
8274 
8275 /*
8276  * Change mode of a file given a file descriptor.
8277  */
8278 static int
fchmod1(__unused proc_t p,int fd,struct vnode_attr * vap)8279 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
8280 {
8281 	vnode_t vp;
8282 	int error;
8283 
8284 	AUDIT_ARG(fd, fd);
8285 
8286 	if ((error = file_vnode(fd, &vp)) != 0) {
8287 		return error;
8288 	}
8289 	if ((error = vnode_getwithref(vp)) != 0) {
8290 		file_drop(fd);
8291 		return error;
8292 	}
8293 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8294 
8295 #if CONFIG_FILE_LEASES
8296 	vnode_breakdirlease(vp, true, O_WRONLY);
8297 #endif
8298 
8299 	error = chmod_vnode(vfs_context_current(), vp, vap);
8300 	(void)vnode_put(vp);
8301 	file_drop(fd);
8302 
8303 	return error;
8304 }
8305 
8306 /*
8307  * fchmod_extended: Change mode of a file given a file descriptor; with
8308  * extended argument list (including extended security (ACL)).
8309  *
8310  * Parameters:    p                       Process requesting to change file mode
8311  *                uap                     User argument descriptor (see below)
8312  *                retval                  (ignored)
8313  *
8314  * Indirect:      uap->mode               File mode to set (same as 'chmod')
8315  *                uap->uid                UID to set
8316  *                uap->gid                GID to set
8317  *                uap->xsecurity          ACL to set (or delete)
8318  *                uap->fd                 File descriptor of file to change mode
8319  *
8320  * Returns:        0                      Success
8321  *                !0                      errno value
8322  *
8323  */
8324 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)8325 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
8326 {
8327 	int error;
8328 	struct vnode_attr va;
8329 	kauth_filesec_t xsecdst = NULL;
8330 
8331 	AUDIT_ARG(owner, uap->uid, uap->gid);
8332 
8333 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
8334 	    uap->gid, uap->xsecurity);
8335 
8336 	if (error) {
8337 		return error;
8338 	}
8339 
8340 	error = fchmod1(p, uap->fd, &va);
8341 
8342 	if (xsecdst != NULL) {
8343 		kauth_filesec_free(xsecdst);
8344 	}
8345 	return error;
8346 }
8347 
8348 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)8349 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
8350 {
8351 	struct vnode_attr va;
8352 
8353 	VATTR_INIT(&va);
8354 	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
8355 
8356 	return fchmod1(p, uap->fd, &va);
8357 }
8358 
/*
 * Common chown path: apply uid/gid changes to an iocounted vnode after
 * MAC and kauth authorization.  The caller holds (and releases) the
 * iocount on vp.
 */
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	/*
	 * Build the attribute set; VNOVAL for either id means "leave it
	 * unchanged", matching chown(2) semantics for -1.
	 */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* An ownership change may break a lease on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only after a successful ownership change. */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
8420 
8421 /*
8422  * Set ownership given a path name.
8423  */
8424 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	int error;
	struct nameidata nd;
	int follow;

	AUDIT_ARG(owner, uid, gid);

	/* Either "don't follow the final symlink" flag suppresses FOLLOW. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse to traverse symlinks anywhere in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (flag & AT_RESOLVE_BENEATH) {
		/* Lookup must not escape the directory identified by fd. */
		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}
	if (flag & AT_UNIQUE) {
		nd.ni_flag |= NAMEI_UNIQUE;
	}

	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}

	/* nameiat() returned ni_vp with an iocount; drop it when done. */
	vp = nd.ni_vp;
	error = vn_chown_internal(ctx, vp, uid, gid);

	nameidone(&nd);
	vnode_put(vp);
	return error;
}
8460 
8461 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)8462 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
8463 {
8464 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8465 	           uap->uid, uap->gid, 0, UIO_USERSPACE);
8466 }
8467 
8468 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)8469 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
8470 {
8471 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8472 	           uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
8473 }
8474 
8475 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)8476 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
8477 {
8478 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH | AT_UNIQUE)) {
8479 		return EINVAL;
8480 	}
8481 
8482 	return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
8483 	           uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
8484 }
8485 
8486 /*
8487  * Set ownership given a file descriptor.
8488  */
8489 /* ARGSUSED */
8490 int
fchown(__unused proc_t p,struct fchown_args * uap,__unused int32_t * retval)8491 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
8492 {
8493 	vfs_context_t ctx = vfs_context_current();
8494 	vnode_t vp;
8495 	int error;
8496 
8497 	AUDIT_ARG(owner, uap->uid, uap->gid);
8498 	AUDIT_ARG(fd, uap->fd);
8499 
8500 	if ((error = file_vnode(uap->fd, &vp))) {
8501 		return error;
8502 	}
8503 
8504 	if ((error = vnode_getwithref(vp))) {
8505 		file_drop(uap->fd);
8506 		return error;
8507 	}
8508 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8509 
8510 	error = vn_chown_internal(ctx, vp, uap->uid, uap->gid);
8511 
8512 	(void)vnode_put(vp);
8513 	file_drop(uap->fd);
8514 	return error;
8515 }
8516 
/*
 * Fetch the access/modification times for the *utimes() family: copy them
 * in from user space, or synthesize the current time when usrtvp is
 * USER_ADDR_NULL.  tsp points to two timespecs ([0]=access, [1]=modify).
 */
static int
getutimes(user_addr_t usrtvp, struct timespec *tsp)
{
	int error;

	if (usrtvp == USER_ADDR_NULL) {
		/* No times supplied: use "now" for both entries. */
		struct timeval old_tv;
		/* XXX Y2038 bug because of microtime argument */
		microtime(&old_tv);
		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
		tsp[1] = tsp[0];
	} else {
		/*
		 * Copy in a two-element timeval array using the layout that
		 * matches the calling process's ABI (64- vs 32-bit).
		 */
		if (IS_64BIT_PROCESS(current_proc())) {
			struct user64_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
		} else {
			struct user32_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		}
	}
	return 0;
}
8549 
/*
 * Apply access/modification times to a vnode, with MAC and kauth checks.
 * 'nullflag' is set when the caller passed no explicit times (the times
 * are "now"); VA_UTIMES_NULL relaxes the permission required in that case.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* Explicit times require ownership: report EPERM, not EACCES. */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only on success. */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8606 
8607 /*
8608  * Set the access and modification times of a file.
8609  */
8610 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* We need the parent directory vnode to break its lease, if any. */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
#if CONFIG_FILE_LEASES
	/* Drop the parent iocount taken via WANTPARENT. */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8659 
8660 /*
8661  * Set the access and modification times of a file.
8662  */
8663 /* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve the requested times before touching the descriptor. */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* A timestamp change may break a lease on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);

	/* Release in reverse order of acquisition. */
	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8695 
8696 static int
truncate_validate_common(proc_t p,off_t length)8697 truncate_validate_common(proc_t p, off_t length)
8698 {
8699 	rlim_t fsize_limit;
8700 
8701 	if (length < 0) {
8702 		return EINVAL;
8703 	}
8704 
8705 	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8706 	if ((rlim_t)length > fsize_limit) {
8707 		psignal(p, SIGXFSZ);
8708 		return EFBIG;
8709 	}
8710 
8711 	return 0;
8712 }
8713 
/*
 * Common truncate path: set va_data_size on 'vp' after MAC and (optionally)
 * kauth authorization.  'need_auth' is false on the ftruncate(2) path, where
 * write permission was already established at open time.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open.  We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only on success. */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8764 
8765 /*
8766  * Truncate a file given its path name.
8767  */
8768 /* ARGSUSED */
8769 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8770 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8771 {
8772 	vfs_context_t ctx = vfs_context_current();
8773 	vnode_t vp;
8774 	int error;
8775 	struct nameidata nd;
8776 
8777 	if ((error = truncate_validate_common(p, uap->length))) {
8778 		return error;
8779 	}
8780 
8781 	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8782 	    UIO_USERSPACE, uap->path, ctx);
8783 
8784 	if ((error = namei(&nd))) {
8785 		return error;
8786 	}
8787 
8788 	vp = nd.ni_vp;
8789 	nameidone(&nd);
8790 
8791 	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8792 	vnode_put(vp);
8793 
8794 	return error;
8795 }
8796 
8797 /*
8798  * Truncate a file given a file descriptor.
8799  */
8800 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp = NULLVP;
	struct fileproc *fp;
	bool need_vnode_put = false;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Validate length and enforce RLIMIT_FSIZE up front. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	/* POSIX shared memory objects are truncated via their own path. */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}
	need_vnode_put = true;

	/* Don't allow ftruncate if the file has append-only flag set. */
	if (vnode_isappendonly(vp)) {
		error = EPERM;
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* need_auth is false: write access was authorized at open time. */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	if (!error) {
		/* Mark the file as written so later fsync/close act on it. */
		fp->fp_glob->fg_flag |= FWASWRITTEN;
	}

out:
	if (vp && need_vnode_put) {
		vnode_put(vp);
	}

	file_drop(uap->fd);
	return error;
}
8865 
8866 
8867 /*
8868  * Sync an open file with synchronized I/O _file_ integrity completion
8869  */
8870 /* ARGSUSED */
8871 int
fsync(proc_t p,struct fsync_args * uap,__unused int32_t * retval)8872 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8873 {
8874 	__pthread_testcancel(1);
8875 	return fsync_common(p, uap, MNT_WAIT);
8876 }
8877 
8878 
8879 /*
8880  * Sync an open file with synchronized I/O _file_ integrity completion
8881  *
8882  * Notes:	This is a legacy support function that does not test for
8883  *		thread cancellation points.
8884  */
8885 /* ARGSUSED */
8886 int
fsync_nocancel(proc_t p,struct fsync_nocancel_args * uap,__unused int32_t * retval)8887 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8888 {
8889 	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
8890 }
8891 
8892 
8893 /*
8894  * Sync an open file with synchronized I/O _data_ integrity completion
8895  */
8896 /* ARGSUSED */
8897 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)8898 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8899 {
8900 	__pthread_testcancel(1);
8901 	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
8902 }
8903 
8904 
8905 /*
8906  * fsync_common
8907  *
8908  * Common fsync code to support both synchronized I/O file integrity completion
8909  * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8910  *
8911  * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8912  * will only guarantee that the file data contents are retrievable.  If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8914  * includes additional metadata unnecessary for retrieving the file data
8915  * contents, such as atime, mtime, ctime, etc., also be committed to stable
8916  * storage.
8917  *
8918  * Parameters:	p				The process
8919  *		uap->fd				The descriptor to synchronize
8920  *		flags				The data integrity flags
8921  *
8922  * Returns:	int				Success
8923  *	fp_getfvp:EBADF				Bad file descriptor
8924  *	fp_getfvp:ENOTSUP			fd does not refer to a vnode
8925  *	VNOP_FSYNC:???				unspecified
8926  *
8927  * Notes:	We use struct fsync_args because it is a short name, and all
8928  *		caller argument structures are otherwise identical.
8929  */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve the descriptor to a vnode and take an iocount. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* 'flags' is MNT_WAIT (fsync) or MNT_DWAIT (fdatasync). */
	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	/* Release in reverse order of acquisition. */
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8967 
8968 /*
8969  * Duplicate files.  Source must be a file, target must be a file or
8970  * must not exist.
8971  *
8972  * XXX Copyfile authorisation checking is woefully inadequate, and will not
8973  *     perform inheritance correctly.
8974  */
8975 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source; on success we hold an iocount on fvp. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/*
	 * Look up the target for creation; SAVESTART keeps ni_startdir so it
	 * can be released below along with tdvp/tvp.
	 */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only acceptable with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Directories cannot be copied or overwritten. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* Sockets (other than fdesc nodes) are not supported. */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Need read access on the source ... */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	/* ... delete access on an overwritten target ... */
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	/* ... and the ability to add a file to the target directory. */
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* Copying a file over its own parent directory is not meaningful. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		error = -1;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* -1 is the internal "source == target, nothing to do" marker. */
	if (error == -1) {
		return 0;
	}
	return error;
}
9082 
9083 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
9084 
9085 /*
9086  * Helper function for doing clones. The caller is expected to provide an
9087  * iocounted source vnode and release it.
9088  */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata *tondp = NULL;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted = 0;
	/* Both vnode_attr structs come from one heap allocation (va2p). */
	struct {
		struct vnode_attr va[2];
	} *va2p = NULL;
	struct vnode_attr *vap = NULL;
	struct vnode_attr *nvap = NULL;
	uint32_t vnop_flags;

	/* Only regular files, symlinks and directories may be cloned. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
	/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		/* Volume roots and mount points cannot be cloned. */
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Look up the destination; it must not already exist. */
	tondp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(tondp, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if (flags & CLONE_NOFOLLOW_ANY) {
		tondp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (flags & CLONE_RESOLVE_BENEATH) {
		tondp->ni_flag |= NAMEI_RESOLVE_BENEATH;
	}

	if ((error = nameiat(tondp, dst_dirfd))) {
		kfree_type(struct nameidata, tondp);
		return error;
	}
	cnp = &tondp->ni_cnd;
	tdvp = tondp->ni_dvp;
	tvp = tondp->ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Cloning only works within a single file system. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	/* Must be allowed to add an entry to the destination directory. */
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/* Need read rights on the source; skip READ_DATA if already done. */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	va2p = kalloc_type(typeof(*va2p), Z_WAITOK | Z_NOFAIL);
	vap = &va2p->va[0];
	nvap = &va2p->va[1];

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACLs unless the CLONE_ACL
	 * flag is specified. By default, the clone file will inherit the target
	 * directory's ACLs unless the CLONE_ACL flag is specified then it
	 * will inherit the source file's ACLs instead.
	 */
	VATTR_INIT(vap);
	VATTR_WANTED(vap, va_uid);
	VATTR_WANTED(vap, va_gid);
	VATTR_WANTED(vap, va_mode);
	VATTR_WANTED(vap, va_flags);
	if (flags & CLONE_ACL) {
		VATTR_WANTED(vap, va_acl);
	}

	if ((error = vnode_getattr(fvp, vap, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(nvap);
	VATTR_SET(nvap, va_type, v_type);
	if (VATTR_IS_SUPPORTED(vap, va_acl) && vap->va_acl != NULL) {
		/* The returned ACL is allocated; remember to free it below. */
		VATTR_SET(nvap, va_acl, vap->va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, nvap, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, nvap, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(vap, va_uid)) {
			VATTR_SET(nvap, va_uid, vap->va_uid);
		}
		if (VATTR_IS_SUPPORTED(vap, va_gid)) {
			VATTR_SET(nvap, va_gid, vap->va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(vap, va_mode)) {
		VATTR_SET(nvap, va_mode, vap->va_mode);
	}
	if (VATTR_IS_SUPPORTED(vap, va_flags)) {
		VATTR_SET(nvap, va_flags,
		    ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, nvap, vnop_flags, ctx);

	if (!error && tvp) {
		int     update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(nvap)) {
			(void)vnode_setattr_fallback(tvp, nvap, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
		/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(nvap, defaulted);
	}
	if (free_src_acl && vap->va_acl) {
		kauth_acl_free(vap->va_acl);
	}
	if (va2p) {
		kfree_type(typeof(*va2p), va2p);
	}
	nameidone(tondp);
	kfree_type(struct nameidata, tondp);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
9338 
9339 /*
9340  * clone files or directories, target must not exist.
9341  */
9342 /* ARGSUSED */
int
clonefileat(__unused proc_t p, struct clonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct nameidata *ndp = NULL;
	int follow;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
	    CLONE_NOFOLLOW_ANY | CLONE_RESOLVE_BENEATH)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_dirfd);

	/* Look up the source relative to src_dirfd; takes an iocount on fvp. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(ndp, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
	    UIO_USERSPACE, uap->src, ctx);
	if (uap->flags & CLONE_NOFOLLOW_ANY) {
		ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (uap->flags & CLONE_RESOLVE_BENEATH) {
		ndp->ni_flag |= NAMEI_RESOLVE_BENEATH;
	}

	if ((error = nameiat(ndp, uap->src_dirfd))) {
		kfree_type(struct nameidata, ndp);
		return error;
	}

	fvp = ndp->ni_vp;
	nameidone(ndp);
	kfree_type(struct nameidata, ndp);

	/* Source data read is not pre-authorized on this path (FALSE). */
	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
	return error;
}
9388 
9389 int
fclonefileat(__unused proc_t p,struct fclonefileat_args * uap,__unused int32_t * retval)9390 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
9391     __unused int32_t *retval)
9392 {
9393 	vnode_t fvp;
9394 	struct fileproc *fp;
9395 	int error;
9396 	vfs_context_t ctx = vfs_context_current();
9397 
9398 	/* Check that the flags are valid. */
9399 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
9400 	    CLONE_NOFOLLOW_ANY | CLONE_RESOLVE_BENEATH)) {
9401 		return EINVAL;
9402 	}
9403 
9404 	AUDIT_ARG(fd, uap->src_fd);
9405 	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
9406 	if (error) {
9407 		return error;
9408 	}
9409 
9410 	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
9411 		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
9412 		error = EBADF;
9413 		goto out;
9414 	}
9415 
9416 	if ((error = vnode_getwithref(fvp))) {
9417 		goto out;
9418 	}
9419 
9420 	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
9421 
9422 	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
9423 	    uap->flags, ctx);
9424 
9425 	vnode_put(fvp);
9426 out:
9427 	file_drop(uap->src_fd);
9428 	return error;
9429 }
9430 
9431 static int
rename_submounts_callback(mount_t mp,void * arg)9432 rename_submounts_callback(mount_t mp, void *arg)
9433 {
9434 	char *prefix = (char *)arg;
9435 	int prefix_len = (int)strlen(prefix);
9436 	int error = 0;
9437 
9438 	if (strncmp(mp->mnt_vfsstat.f_mntonname, prefix, prefix_len) != 0) {
9439 		return 0;
9440 	}
9441 
9442 	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
9443 		return 0;
9444 	}
9445 
9446 	if ((error = vfs_busy(mp, LK_NOWAIT))) {
9447 		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
9448 		return -1;
9449 	}
9450 
9451 	size_t pathlen = MAXPATHLEN;
9452 	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
9453 		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
9454 	}
9455 
9456 	vfs_unbusy(mp);
9457 
9458 	return error;
9459 }
9460 
9461 /*
9462  * Rename files.  Source and destination must either both be directories,
9463  * or both not be directories.  If target is a directory, it must be empty.
9464  */
9465 /* ARGSUSED */
static int
renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
    int tofd, user_addr_t to, int segflg, u_int uflags)
{
	vnode_t tvp, tdvp;
	vnode_t fvp, fdvp;
	vnode_t mnt_fvp;
	struct nameidata *fromnd, *tond;
	int error = 0;
	int do_retry;
	int retry_count;
	int mntrename;
	int dirrename;
	int need_event;
	int need_kpath2;
	int has_listeners;
	const char *oname = NULL;
	char *old_dirpath = NULL, *from_name = NULL, *to_name = NULL;
	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
	int from_len = 0, to_len = 0;
	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
	int holding_mntlock;
	int vn_authorize_skipped;
	mount_t locked_mp = NULL;
	vnode_t oparent = NULLVP;
	vnode_t locked_vp = NULLVP;
#if CONFIG_FSE
	fse_info from_finfo = {}, to_finfo;
#endif
	int from_truncated = 0, to_truncated = 0;
	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
	int batched = 0;
	struct vnode_attr *fvap, *tvap;
	int continuing = 0;
	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
	int32_t nofollow_any = 0;
	int32_t resolve_beneath = 0;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node, to_node;
		struct vnode_attr fv_attr, tv_attr;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	holding_mntlock = 0;
	do_retry = 0;
	retry_count = 0;
	/*
	 * Re-entry point: the whole operation is re-driven from scratch after
	 * dropping iocounts to take the mount rename lock, after an
	 * authorization/VNOP race (ENOENT/ERECYCLE), or after dataless-file
	 * materialization.  All per-attempt state is reset here.
	 */
retry:
	fvp = tvp = NULL;
	fdvp = tdvp = NULL;
	fvap = tvap = NULL;
	mnt_fvp = NULLVP;
	mntrename = dirrename = FALSE;
	vn_authorize_skipped = FALSE;

	if (uflags & RENAME_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
	}
	if (uflags & RENAME_RESOLVE_BENEATH) {
		resolve_beneath = NAMEI_RESOLVE_BENEATH;
	}
	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
	    segflg, from, ctx);
	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any | resolve_beneath;

	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    segflg, to, ctx);
	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any | resolve_beneath;

	/*
	 * Re-entry point for compound-VNOP continuation: when the filesystem
	 * returns EKEEPLOOKING, the lookups marked NAMEI_CONTLOOKUP are resumed.
	 */
continue_lookup:
	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(fromnd, fromfd))) {
			goto out1;
		}
		fdvp = fromnd->ni_dvp;
		fvp  = fromnd->ni_vp;

		if (fvp && fvp->v_type == VDIR) {
			tond->ni_cnd.cn_flags |= WILLBEDIR;
#if defined(XNU_TARGET_OS_OSX)
			dirrename = TRUE;
#endif
		}
	}

	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(tond, tofd))) {
			/*
			 * Translate error code for rename("dir1", "dir2/.").
			 */
			if (error == EISDIR && fvp->v_type == VDIR) {
				error = EINVAL;
			}
			goto out1;
		}
		tdvp = tond->ni_dvp;
		tvp  = tond->ni_vp;
	}

#if DEVELOPMENT || DEBUG
	/*
	 * XXX VSWAP: Check for entitlements or special flag here
	 * so we can restrict access appropriately.
	 */
#else /* DEVELOPMENT || DEBUG */

	/* Only the kernel context may rename swap files on release builds. */
	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}

	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* RENAME_SWAP requires both endpoints to exist. */
	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
		error = ENOENT;
		goto out1;
	}

	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
		int32_t pval = 0;
		int err = 0;

		/*
		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
		 * has the same name as target iff the following conditions are met:
		 * 1. the target file system is case insensitive
		 * 2. source and target directories are the same
		 * 3. source and target files are the same
		 * 4. name only differs in case (determined by underlying filesystem)
		 */
		if (fvp != tvp || fdvp != tdvp) {
			error = EEXIST;
			goto out1;
		}

		/*
		 * Assume that the target file system is case sensitive if
		 * _PC_CASE_SENSITIVE selector isn't supported.
		 */
		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
		if (err != 0 || pval != 0) {
			error = EEXIST;
			goto out1;
		}
	}

	batched = vnode_compound_rename_available(fdvp);

#if CONFIG_FSE
	need_event = need_fsevent(FSE_RENAME, fdvp);
	if (need_event) {
		if (fvp) {
			get_fse_info(fvp, &from_finfo, ctx);
		} else {
			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
			if (error) {
				goto out1;
			}

			fvap = &__rename_data->fv_attr;
		}

		if (tvp) {
			get_fse_info(tvp, &to_finfo, ctx);
		} else if (batched) {
			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
			if (error) {
				goto out1;
			}

			tvap = &__rename_data->tv_attr;
		}
	}
#else
	need_event = 0;
#endif /* CONFIG_FSE */

	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/*
	 * Build full source/target paths only if someone will consume them
	 * (fsevents, kauth fileop listeners, or audit).
	 */
	if (need_event || has_listeners) {
		if (from_name == NULL) {
			GET_PATH(from_name);
		}

		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);

		if (from_name_no_firmlink == NULL) {
			GET_PATH(from_name_no_firmlink);
		}

		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
	}

	if (need_event || need_kpath2 || has_listeners) {
		if (to_name == NULL) {
			GET_PATH(to_name);
		}

		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);

		if (to_name_no_firmlink == NULL) {
			GET_PATH(to_name_no_firmlink);
		}

		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
		if (to_name && need_kpath2) {
			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
		}
	}
	if (!fvp) {
		/*
		 * Claim: this check will never reject a valid rename.
		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
		 * Suppose fdvp and tdvp are not on the same mount.
		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
		 *      then you can't move it to within another dir on the same mountpoint.
		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
		 *
		 * If this check passes, then we are safe to pass these vnodes to the same FS.
		 */
		if (fdvp->v_mount != tdvp->v_mount) {
			error = EXDEV;
			goto out1;
		}
		goto skipped_lookup;
	}

	/*
	 * If the source and destination are the same (i.e. they're
	 * links to the same vnode) and the target file system is
	 * case sensitive, then there is nothing to do.
	 *
	 * XXX Come back to this.
	 */
	if (fvp == tvp) {
		int pathconf_val;

		/*
		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
		 * then assume that this file system is case sensitive.
		 */
		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
		    pathconf_val != 0) {
			/* Authorization is deferred to out1 in this no-op case. */
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	/*
	 * Allow the renaming of mount points.
	 * - target must not exist
	 * - target must reside in the same directory as source
	 * - union mounts cannot be renamed
	 * - the root fs, and tightly-linked system volumes, cannot be renamed
	 *
	 * XXX Handle this in VFS after a continued lookup (if we missed
	 * in the cache to start off)
	 *
	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
	 * we'll skip past here.  The file system is responsible for
	 * checking that @tvp is not a descendent of @fvp and vice versa
	 * so it should always return EINVAL if either @tvp or @fvp is the
	 * root of a volume.
	 */
	if ((fvp->v_flag & VROOT) &&
	    (fvp->v_type == VDIR) &&
	    (tvp == NULL) &&
	    (fvp->v_mountedhere == NULL) &&
	    (fdvp == tdvp) &&
	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
		vnode_t coveredvp;

		/* switch fvp to the covered vnode */
		coveredvp = fvp->v_mount->mnt_vnodecovered;
		if ((vnode_getwithref(coveredvp))) {
			error = ENOENT;
			goto out1;
		}
		/*
		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
		 * later.
		 */
		mnt_fvp = fvp;

		fvp = coveredvp;
		mntrename = TRUE;
	}
	/*
	 * Check for cross-device rename.
	 * For rename on mountpoint, we want to also check the source and its parent
	 * belong to the same mountpoint.
	 */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (fvp->v_mount != fdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		error = EXDEV;
		goto out1;
	}

	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do...
	 * EXCEPT if the underlying file system supports case
	 * insensitivity and is case preserving.  In this case
	 * the file system needs to handle the special case of
	 * getting the same vnode as target (fvp) and source (tvp).
	 *
	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
	 * and _PC_CASE_PRESERVING can have this exception, and they need to
	 * handle the special case of getting the same vnode as target and
	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
	 * so not to cause locking problems. There is a single reference on tvp.
	 *
	 * NOTE - that fvp == tvp also occurs if they are hard linked and
	 * that correct behaviour then is just to return success without doing
	 * anything.
	 *
	 * XXX filesystem should take care of this itself, perhaps...
	 */
	if (fvp == tvp && fdvp == tdvp) {
		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
		    fromnd->ni_cnd.cn_namelen)) {
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	if (holding_mntlock && fvp->v_mount != locked_mp) {
		/*
		 * we're holding a reference and lock
		 * on locked_mp, but it no longer matches
		 * what we want to do... so drop our hold
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp != fdvp && fvp->v_type == VDIR) {
		/*
		 * serialize renames that re-shape
		 * the tree... if holding_mntlock is
		 * set, then we're ready to go...
		 * otherwise we
		 * first need to drop the iocounts
		 * we picked up, second take the
		 * lock to serialize the access,
		 * then finally start the lookup
		 * process over with the lock held
		 */
		if (!holding_mntlock) {
			/*
			 * need to grab a reference on
			 * the mount point before we
			 * drop all the iocounts... once
			 * the iocounts are gone, the mount
			 * could follow
			 */
			locked_mp = fvp->v_mount;
			mount_ref(locked_mp, 0);

			/*
			 * nameidone has to happen before we vnode_put(tvp)
			 * since it may need to release the fs_nodelock on the tvp
			 */
			nameidone(tond);

			if (tvp) {
				vnode_put(tvp);
			}
			vnode_put(tdvp);

			/*
			 * nameidone has to happen before we vnode_put(fdvp)
			 * since it may need to release the fs_nodelock on the fvp
			 */
			nameidone(fromnd);

			vnode_put(fvp);
			vnode_put(fdvp);

			if (mnt_fvp != NULLVP) {
				vnode_put(mnt_fvp);
			}

			mount_lock_renames(locked_mp);
			holding_mntlock = 1;

			goto retry;
		}
	} else {
		/*
		 * when we dropped the iocounts to take
		 * the lock, we allowed the identity of
		 * the various vnodes to change... if they did,
		 * we may no longer be dealing with a rename
		 * that reshapes the tree... once we're holding
		 * the iocounts, the vnodes can't change type
		 * so we're free to drop the lock at this point
		 * and continue on
		 */
		if (holding_mntlock) {
			mount_unlock_renames(locked_mp);
			mount_drop(locked_mp, 0);
			holding_mntlock = 0;
		}
	}

	if (!batched) {
		/* Non-compound path: authorize here (compound VNOPs authorize in-FS). */
		assert(locked_vp == NULLVP);
		vnode_link_lock(fvp);
		locked_vp = fvp;
		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error) {
			if (error == ENOENT) {
				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
					/*
					 * We encountered a race where after doing the namei,
					 * tvp stops being valid. If so, simply re-drive the rename
					 * call from the top.
					 */
					do_retry = 1;
					retry_count += 1;
				}
			}
			goto out1;
		}
	}

	/* Release the 'mnt_fvp' now that it is no longer needed. */
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
		mnt_fvp = NULLVP;
	}

	// save these off so we can later verify that fvp is the same
	oname   = fvp->v_name;
	oparent = fvp->v_parent;

	/*
	 * If renaming a directory, stash its path which we need later when
	 * updating the 'f_mntonname' of sub mounts.
	 */
	if (dirrename) {
		int pathlen = MAXPATHLEN;

		old_dirpath = zalloc(ZV_NAMEI);
		error = vn_getpath_fsenter(fvp, old_dirpath, &pathlen);
		if (error) {
			/*
			 * Process that supports long path (opt-in to IO policy
			 * IOPOL_TYPE_VFS_SUPPORT_LONG_PATHS) can have directory with path
			 * length up to MAXLONGPATHLEN (8192). Since max path length in
			 * mount's 'f_mntonname' is MAXPATHLEN (1024), this means the
			 * directory can't be the parent of the sub mounts so we can just
			 * silently drop the error and skip the check to update the
			 * 'f_mntonname' of sub mounts.
			 */
			if (error == ENOSPC) {
				dirrename = false;
				error = 0;
				if (old_dirpath) {
					zfree(ZV_NAMEI, old_dirpath);
					old_dirpath = NULL;
				}
			} else {
				goto out1;
			}
		}
	}

skipped_lookup:
#if CONFIG_FILE_LEASES
	/* Lease break needed for source's parent dir? */
	vnode_breakdirlease(fdvp, false, O_WRONLY);

	/* Lease break needed for target's parent dir? */
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	/* Perform the actual rename (compound or classic VNOP, decided inside). */
	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
	    tdvp, &tvp, &tond->ni_cnd, tvap,
	    flags, ctx);

	if (locked_vp) {
		vnode_link_unlock(fvp);
		locked_vp = NULLVP;
	}

	if (holding_mntlock) {
		/*
		 * we can drop our serialization
		 * lock now
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (error) {
		if (error == EDATALESS) {
			/*
			 * If we've been here before, something has gone
			 * horribly wrong and we should just get out lest
			 * we spiral around the drain forever.
			 */
			if (flags & VFS_RENAME_DATALESS) {
				error = EIO;
				goto out1;
			}

			/*
			 * The object we're renaming is dataless (or has a
			 * dataless descendent) and requires materialization
			 * before the rename occurs.  But we're holding the
			 * mount point's rename lock, so it's not safe to
			 * make the upcall.
			 *
			 * In this case, we release the lock (above), perform
			 * the materialization, and start the whole thing over.
			 */
			error = vfs_materialize_reparent(fvp, tdvp);
			if (error == 0) {
				/*
				 * The next time around we need to tell the
				 * file system that the materializtaion has
				 * been performed.
				 */
				flags |= VFS_RENAME_DATALESS;
				do_retry = 1;
			}
			goto out1;
		}
		if (error == EKEEPLOOKING) {
			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
				}
			}

			fromnd->ni_vp = fvp;
			tond->ni_vp = tvp;

			goto continue_lookup;
		}

		/*
		 * We may encounter a race in the VNOP where the destination didn't
		 * exist when we did the namei, but it does by the time we go and
		 * try to create the entry. In this case, we should re-drive this rename
		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
		 * but other filesystems susceptible to this race could return it, too.
		 */
		if (error == ERECYCLE) {
			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			} else {
				printf("rename retry limit due to ERECYCLE reached\n");
				error = ENOENT;
			}
		}

		/*
		 * For compound VNOPs, the authorization callback may return
		 * ENOENT in case of racing hardlink lookups hitting the name
		 * cache, redrive the lookup.
		 */
		if (batched && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}

		goto out1;
	}

	/* call out to allow 3rd party notification of rename.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	kauth_authorize_fileop(vfs_context_ucred(ctx),
	    KAUTH_FILEOP_RENAME,
	    (uintptr_t)from_name, (uintptr_t)to_name);
	if (flags & VFS_RENAME_SWAP) {
		kauth_authorize_fileop(vfs_context_ucred(ctx),
		    KAUTH_FILEOP_RENAME,
		    (uintptr_t)to_name, (uintptr_t)from_name);
	}

#if CONFIG_FSE
	if (from_name != NULL && to_name != NULL) {
		if (from_truncated || to_truncated) {
			// set it here since only the from_finfo gets reported up to user space
			from_finfo.mode |= FSE_TRUNCATED_PATH;
		}

		if (tvap && tvp) {
			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
		}
		if (fvap) {
			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
		}

		if (tvp) {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_FINFO, &to_finfo,
			    FSE_ARG_DONE);
			if (flags & VFS_RENAME_SWAP) {
				/*
				 * Strictly speaking, swap is the equivalent of
				 * *three* renames.  FSEvents clients should only take
				 * the events as a hint, so we only bother reporting
				 * two.
				 */
				add_fsevent(FSE_RENAME, ctx,
				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
				    FSE_ARG_FINFO, &to_finfo,
				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
				    FSE_ARG_FINFO, &from_finfo,
				    FSE_ARG_DONE);
			}
		} else {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_DONE);
		}
	}
#endif /* CONFIG_FSE */

	/*
	 * update filesystem's mount point data
	 */
	if (mntrename) {
		char *cp, *pathend, *mpname;
		char * tobuf;
		struct mount *mp;
		int maxlen;
		size_t len = 0;

		mp = fvp->v_mountedhere;

		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EBUSY;
			goto out1;
		}
		tobuf = zalloc(ZV_NAMEI);

		if (UIO_SEG_IS_USER_SPACE(segflg)) {
			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
		} else {
			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
		}
		if (!error) {
			/* find current mount point prefix */
			pathend = &mp->mnt_vfsstat.f_mntonname[0];
			for (cp = pathend; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					pathend = cp + 1;
				}
			}
			/* find last component of target name */
			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					mpname = cp + 1;
				}
			}

			/* Update f_mntonname of sub mounts */
			vfs_iterate(0, rename_submounts_callback,
			    (void *)mp->mnt_vfsstat.f_mntonname);

			/* append name to prefix */
			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
			bzero(pathend, maxlen);

			strlcpy(pathend, mpname, maxlen);
		}
		zfree(ZV_NAMEI, tobuf);

		vfs_unbusy(mp);

		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
	} else if (dirrename) {
		/*
		 * If we renamed a directory, we need to check if there is any sub
		 * mount(s) mounted under the directory. If so, then we need to update
		 * the sub mount's f_mntonname path.
		 */
		vfs_iterate(0, rename_submounts_callback, (void *)old_dirpath);
	}

	/*
	 * fix up name & parent pointers.  note that we first
	 * check that fvp has the same name/parent pointers it
	 * had before the rename call... this is a 'weak' check
	 * at best...
	 *
	 * XXX oparent and oname may not be set in the compound vnop case
	 */
	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
		int update_flags;

		update_flags = VNODE_UPDATE_NAME;

		if (fdvp != tdvp) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
	}
	/* Common exit: drop paths, locks, lookup state, and iocounts. */
out1:
	/*
	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
	 * skipped earlier as no actual rename was performed.
	 */
	if (vn_authorize_skipped && error == 0) {
		error = vn_authorize_renamex_with_paths(fdvp, fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}
	}
	if (locked_vp) {
		assert(locked_vp == fvp);
		vnode_link_unlock(locked_vp);
		locked_vp = NULLVP;
	}
	if (to_name != NULL) {
		RELEASE_PATH(to_name);
		to_name = NULL;
	}
	if (to_name_no_firmlink != NULL) {
		RELEASE_PATH(to_name_no_firmlink);
		to_name_no_firmlink = NULL;
	}
	if (from_name != NULL) {
		RELEASE_PATH(from_name);
		from_name = NULL;
	}
	if (from_name_no_firmlink != NULL) {
		RELEASE_PATH(from_name_no_firmlink);
		from_name_no_firmlink = NULL;
	}
	if (old_dirpath != NULL) {
		zfree(ZV_NAMEI, old_dirpath);
		old_dirpath = NULL;
	}
	if (holding_mntlock) {
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp) {
		/*
		 * nameidone has to happen before we vnode_put(tdvp)
		 * since it may need to release the fs_nodelock on the tdvp
		 */
		nameidone(tond);

		if (tvp) {
			vnode_put(tvp);
		}
		vnode_put(tdvp);
	}
	if (fdvp) {
		/*
		 * nameidone has to happen before we vnode_put(fdvp)
		 * since it may need to release the fs_nodelock on the fdvp
		 */
		nameidone(fromnd);

		if (fvp) {
			vnode_put(fvp);
		}
		vnode_put(fdvp);
	}
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
	}
	/*
	 * If things changed after we did the namei, then we will re-drive
	 * this rename call from the top.
	 */
	if (do_retry) {
		do_retry = 0;
		goto retry;
	}

	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
10284 
10285 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)10286 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
10287 {
10288 	return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
10289 	           AT_FDCWD, uap->to, UIO_USERSPACE, 0);
10290 }
10291 
10292 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)10293 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
10294 {
10295 	if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY | RENAME_RESOLVE_BENEATH)) {
10296 		return EINVAL;
10297 	}
10298 
10299 	if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
10300 		return EINVAL;
10301 	}
10302 
10303 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
10304 	           uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
10305 }
10306 
10307 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)10308 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
10309 {
10310 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
10311 	           uap->tofd, uap->to, UIO_USERSPACE, 0);
10312 }
10313 
10314 /*
10315  * Make a directory file.
10316  *
10317  * Returns:	0			Success
10318  *		EEXIST
10319  *	namei:???
10320  *	vnode_authorize:???
10321  *	vn_create:???
10322  */
10323 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;	/* VNODE_UPDATE_* bits for vnode_update_identity() */
	int batched;		/* nonzero if fs supports compound (lookup+mkdir) VNOP */
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/* CREATE lookup; keep the parent locked so we can create under it. */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A vnode for the last component already exists: can't create it. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop the original lookup state before re-looking-up. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				/* Target doesn't exist: keep original EACCES/EPERM. */
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry writes the parent dir: break any dir lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup redriven from here. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
10439 
10440 /*
10441  * mkdir_extended: Create a directory; with extended security (ACL).
10442  *
10443  * Parameters:    p                       Process requesting to create the directory
10444  *                uap                     User argument descriptor (see below)
10445  *                retval                  (ignored)
10446  *
10447  * Indirect:      uap->path               Path of directory to create
10448  *                uap->mode               Access permissions to set
10449  *                uap->xsecurity          ACL to set
10450  *
10451  * Returns:        0                      Success
10452  *                !0                      Not success
10453  *
10454  */
10455 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)10456 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
10457 {
10458 	int ciferror;
10459 	kauth_filesec_t xsecdst;
10460 	struct vnode_attr va;
10461 
10462 	AUDIT_ARG(owner, uap->uid, uap->gid);
10463 
10464 	xsecdst = NULL;
10465 	if ((uap->xsecurity != USER_ADDR_NULL) &&
10466 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
10467 		return ciferror;
10468 	}
10469 
10470 	VATTR_INIT(&va);
10471 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10472 	if (xsecdst != NULL) {
10473 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
10474 		va.va_vaflags |= VA_FILESEC_ACL;
10475 	}
10476 
10477 	ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10478 	    UIO_USERSPACE);
10479 	if (xsecdst != NULL) {
10480 		kauth_filesec_free(xsecdst);
10481 	}
10482 	return ciferror;
10483 }
10484 
10485 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)10486 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
10487 {
10488 	struct vnode_attr va;
10489 
10490 	VATTR_INIT(&va);
10491 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10492 
10493 	return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10494 	           UIO_USERSPACE);
10495 }
10496 
10497 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)10498 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
10499 {
10500 	struct vnode_attr va;
10501 
10502 	VATTR_INIT(&va);
10503 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10504 
10505 	return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
10506 	           UIO_USERSPACE);
10507 }
10508 
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated scratch: nameidata (and vnode_attr for fsevents)
	 * are too large to live on the kernel stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char     *path = NULL;			/* pathname for kauth listeners */
	char     *no_firmlink_path = NULL;	/* firmlink-free path for fsevents */
	int       len_path = 0;
	int       len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;	/* bounds ENOENT-driven redrives */
	int batched;		/* nonzero if fs supports compound rmdir VNOP */

	int restart_flag;
	int namei_flags = 0;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/* Translate VNODE_REMOVE_* lookup modifiers into namei flags and
	 * strip them so only removal semantics remain in unlink_flags. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		namei_flags |= NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}
	if (unlink_flags & VNODE_REMOVE_RESOLVE_BENEATH) {
		namei_flags |= NAMEI_RESOLVE_BENEATH;
		unlink_flags &= ~VNODE_REMOVE_RESOLVE_BENEATH;
	}
	if (unlink_flags & VNODE_REMOVE_UNIQUE) {
		namei_flags |= NAMEI_UNIQUE;
		unlink_flags &= ~VNODE_REMOVE_UNIQUE;
	}

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR | namei_flags;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete.  Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					/* Racing lookups may yield ENOENT; redrive a
					 * bounded number of times. */
					if (error == ENOENT) {
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: only legal when the fs will do a compound rmdir
			 * (lookup + remove in one VNOP). */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info  finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Compound path: ask the fs to fill in the attrs
				 * needed for the event during the VNOP itself. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			/* Build the pathnames before removal while they still resolve. */
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Removing an entry writes the parent dir: break any dir lease. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup redriven from here. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * Only do this if the directory is actually
			 * marked as DATALESS.
			 */
			struct vnode_attr *lvap =
			    kalloc_type(struct vnode_attr, Z_WAITOK);

			VATTR_INIT(lvap);
			VATTR_WANTED(lvap, va_flags);
			if (vnode_getattr(vp, lvap, ctx) == 0 &&
			    VATTR_IS_SUPPORTED(lvap, va_flags) &&
			    (lvap->va_flags & SF_DATALESS) != 0) {
				/*
				 * If this fails, we want to keep the original
				 * error.
				 */
				if (vn_remove(dvp, &vp, ndp,
				    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
					error = 0;
				}
			}
			kfree_type(struct vnode_attr, lvap);
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		if (restart_flag == 0) {
			/* Wake any peer sleeping in the AppleDouble retry below. */
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		/* Brief sleep before retrying after AppleDouble cleanup. */
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10818 
10819 /*
10820  * Remove a directory file.
10821  */
10822 /* ARGSUSED */
10823 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10824 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10825 {
10826 	return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10827 	           CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10828 }
10829 
10830 /* Get direntry length padded to 8 byte alignment */
10831 #define DIRENT64_LEN(namlen) \
10832 	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)
10833 
10834 /* Get dirent length padded to 4 byte alignment */
10835 #define DIRENT_LEN(namelen) \
10836 	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)
10837 
10838 /* Get the end of this dirent */
10839 #define DIRENT_END(dep) \
10840 	(((char *)(dep)) + (dep)->d_reclen - 1)
10841 
10842 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10843 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10844     int *numdirent, vfs_context_t ctxp)
10845 {
10846 	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
10847 	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10848 	    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10849 		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10850 	} else {
10851 		size_t bufsize;
10852 		void * bufptr;
10853 		uio_t auio;
10854 		struct direntry *entry64;
10855 		struct dirent *dep;
10856 		size_t bytesread;
10857 		int error;
10858 
10859 		/*
10860 		 * We're here because the underlying file system does not
10861 		 * support direnties or we mounted denying support so we must
10862 		 * fall back to dirents and convert them to direntries.
10863 		 *
10864 		 * Our kernel buffer needs to be smaller since re-packing will
10865 		 * expand each dirent.  The worse case (when the name length
10866 		 * is 3 or less) corresponds to a struct direntry size of 32
10867 		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10868 		 * (4-byte aligned).  So having a buffer that is 3/8 the size
10869 		 * will prevent us from reading more than we can pack.
10870 		 *
10871 		 * Since this buffer is wired memory, we will limit the
10872 		 * buffer size to a maximum of 32K. We would really like to
10873 		 * use 32K in the MIN(), but we use magic number 87371 to
10874 		 * prevent uio_resid() * 3 / 8 from overflowing.
10875 		 */
10876 		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10877 		bufptr = kalloc_data(bufsize, Z_WAITOK);
10878 		if (bufptr == NULL) {
10879 			return ENOMEM;
10880 		}
10881 
10882 		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10883 		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10884 		auio->uio_offset = uio->uio_offset;
10885 
10886 		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10887 
10888 		dep = (struct dirent *)bufptr;
10889 		bytesread = bufsize - uio_resid(auio);
10890 
10891 		entry64 = kalloc_type(struct direntry, Z_WAITOK);
10892 		/*
10893 		 * Convert all the entries and copy them out to user's buffer.
10894 		 */
10895 		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10896 			/* First check that the dirent struct up to d_name is within the buffer */
10897 			if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10898 			    /* Check that the length of the entire dirent is within the buffer */
10899 			    DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10900 			    /* Check that the actual length including the name doesn't exceed d_reclen */
10901 			    DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10902 				printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10903 				    vp->v_mount->mnt_vfsstat.f_mntonname,
10904 				    vp->v_name ? vp->v_name : "<unknown>");
10905 				error = EIO;
10906 				break;
10907 			}
10908 
10909 			size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
10910 
10911 			bzero(entry64, enbufsize);
10912 			/* Convert a dirent to a dirent64. */
10913 			entry64->d_ino = dep->d_ino;
10914 			entry64->d_seekoff = 0;
10915 			entry64->d_reclen = (uint16_t)enbufsize;
10916 			entry64->d_namlen = dep->d_namlen;
10917 			entry64->d_type = dep->d_type;
10918 			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10919 
10920 			/* Move to next entry. */
10921 			dep = (struct dirent *)((char *)dep + dep->d_reclen);
10922 
10923 			/* Copy entry64 to user's buffer. */
10924 			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10925 		}
10926 
10927 		/* Update the real offset using the offset we got from VNOP_READDIR. */
10928 		if (error == 0) {
10929 			uio->uio_offset = auio->uio_offset;
10930 		}
10931 		uio_free(auio);
10932 		kfree_data(bufptr, bufsize);
10933 		kfree_type(struct direntry, entry64);
10934 		return error;
10935 	}
10936 }
10937 
10938 #define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)
10939 
10940 /*
10941  * Read a block of directory entries in a file system independent format.
10942  */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	UIO_STACKBUF(uio_buf, 1);

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Serialize against other users of this fd's offset.  If the fd's
	 * vnode changed between fp_getfvp() and taking the lock (e.g. a
	 * union-mount swap below), drop everything and retry.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Clamp the request; the buffer is read in one uio pass. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read from the fd's current offset, then persist the new one. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: swap the fd over to
	 * the covered (lower) directory and read from there instead.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				if ((error = VNOP_OPEN(uvp, fp->fp_glob->fg_flag, &context)) == 0) {
					fp_set_data(fp, uvp);
					/* Close the old vnode to maintain proper lifecycle */
					VNOP_CLOSE(vp, fp->fp_glob->fg_flag, &context);
					fp->fp_glob->fg_offset = 0;
					vnode_rele(vp);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					vnode_rele(uvp);
					vnode_put(uvp);
				}
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
11063 
11064 
11065 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)11066 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
11067 {
11068 	off_t offset;
11069 	ssize_t bytesread;
11070 	int error, eofflag;
11071 
11072 	AUDIT_ARG(fd, uap->fd);
11073 	error = getdirentries_common(uap->fd, uap->buf, uap->count,
11074 	    &bytesread, &offset, &eofflag, 0);
11075 
11076 	if (error == 0) {
11077 		if (proc_is64bit(p)) {
11078 			user64_long_t base = (user64_long_t)offset;
11079 			error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
11080 		} else {
11081 			user32_long_t base = (user32_long_t)offset;
11082 			error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
11083 		}
11084 		*retval = (int)bytesread;
11085 	}
11086 	return error;
11087 }
11088 
int
getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
{
	/*
	 * getdirentries64(2): read directory entries in the extended
	 * (struct direntry) format; the new offset is copied out to
	 * uap->position and the bytes read are returned via retval.
	 */
	off_t offset;
	ssize_t bytesread;
	int error, eofflag;
	user_size_t bufsize;

	AUDIT_ARG(fd, uap->fd);

	/*
	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
	 * then the kernel carves out the last 4 bytes to return extended
	 * information to userspace (namely whether we reached EOF with this call).
	 */
	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
	} else {
		bufsize = uap->bufsize;
	}

	error = getdirentries_common(uap->fd, uap->buf, bufsize,
	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);

	if (error == 0) {
		*retval = bytesread;
		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));

		/* Extended mode: report EOF in the carved-out trailing word. */
		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
			getdirentries64_flags_t flags = 0;
			if (eofflag) {
				flags |= GETDIRENTRIES64_EOF;
			}
			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
			    sizeof(flags));
		}
	}
	return error;
}
11128 
11129 
11130 /*
11131  * Set the mode mask for creation of filesystem nodes.
11132  * XXX implement xsecurity
11133  */
11134 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
11135 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)11136 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
11137 {
11138 	AUDIT_ARG(mask, newmask);
11139 	proc_fdlock(p);
11140 	*retval = p->p_fd.fd_cmask;
11141 	p->p_fd.fd_cmask = newmask & ALLPERMS;
11142 	proc_fdunlock(p);
11143 	return 0;
11144 }
11145 
11146 /*
11147  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
11148  *
11149  * Parameters:    p                       Process requesting to set the umask
11150  *                uap                     User argument descriptor (see below)
11151  *                retval                  umask of the process (parameter p)
11152  *
11153  * Indirect:      uap->newmask            umask to set
11154  *                uap->xsecurity          ACL to set
11155  *
11156  * Returns:        0                      Success
11157  *                !0                      Not success
11158  *
11159  */
11160 int
umask_extended(proc_t p,struct umask_extended_args * uap,int32_t * retval)11161 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
11162 {
11163 	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
11164 }
11165 
11166 int
umask(proc_t p,struct umask_args * uap,int32_t * retval)11167 umask(proc_t p, struct umask_args *uap, int32_t *retval)
11168 {
11169 	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
11170 }
11171 
11172 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT                               \
11173 	"com.apple.private.vfs.revoke-mounted-device"
11174 
11175 /*
11176  * Void all references to file by ripping underlying filesystem
11177  * away from vnode.
11178  */
11179 /* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	/* Resolve the path (following symlinks) to the target vnode. */
	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Only character and block special files may be revoked. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* A block device with a filesystem mounted on it is busy. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Caller must own the node or be superuser. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only bother revoking if someone actually holds the vnode open. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
11232 
11233 
11234 /*
11235  *  HFS/HFS PlUS SPECIFIC SYSTEM CALLS
11236  *  The following system calls are designed to support features
11237  *  which are specific to the HFS & HFS Plus volume formats
11238  */
11239 
11240 
11241 /*
11242  * Obtain attribute information on objects in a directory while enumerating
11243  * the directory.
11244  */
11245 /* ARGSUSED */
11246 int
getdirentriesattr(proc_t p,struct getdirentriesattr_args * uap,int32_t * retval)11247 getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
11248 {
11249 	vnode_t vp;
11250 	struct fileproc *fp;
11251 	uio_t auio = NULL;
11252 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11253 	uint32_t count = 0, savecount = 0;
11254 	uint32_t newstate = 0;
11255 	int error, eofflag = 0;
11256 	off_t loff = 0;
11257 	struct attrlist attributelist;
11258 	vfs_context_t ctx = vfs_context_current();
11259 	int fd = uap->fd;
11260 	UIO_STACKBUF(uio_buf, 1);
11261 	kauth_action_t action;
11262 
11263 	AUDIT_ARG(fd, fd);
11264 
11265 	/* Get the attributes into kernel space */
11266 	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
11267 		return error;
11268 	}
11269 	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
11270 		return error;
11271 	}
11272 	savecount = count;
11273 
11274 get_from_fd:
11275 	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
11276 		return error;
11277 	}
11278 
11279 	vn_offset_lock(fp->fp_glob);
11280 	if (((vnode_t)fp_get_data(fp)) != vp) {
11281 		vn_offset_unlock(fp->fp_glob);
11282 		file_drop(fd);
11283 		goto get_from_fd;
11284 	}
11285 
11286 	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
11287 		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
11288 		error = EBADF;
11289 		goto out;
11290 	}
11291 
11292 
11293 #if CONFIG_MACF
11294 	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
11295 	    fp->fp_glob);
11296 	if (error) {
11297 		goto out;
11298 	}
11299 #endif
11300 
11301 
11302 	if ((error = vnode_getwithref(vp))) {
11303 		goto out;
11304 	}
11305 
11306 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
11307 
11308 #if CONFIG_UNION_MOUNTS
11309 unionread:
11310 #endif /* CONFIG_UNION_MOUNTS */
11311 	if (vp->v_type != VDIR) {
11312 		(void)vnode_put(vp);
11313 		error = EINVAL;
11314 		goto out;
11315 	}
11316 
11317 #if CONFIG_MACF
11318 	error = mac_vnode_check_readdir(ctx, vp);
11319 	if (error != 0) {
11320 		(void)vnode_put(vp);
11321 		goto out;
11322 	}
11323 #endif /* MAC */
11324 
11325 	/* set up the uio structure which will contain the users return buffer */
11326 	loff = fp->fp_glob->fg_offset;
11327 	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
11328 	uio_addiov(auio, uap->buffer, uap->buffersize);
11329 
11330 	/*
11331 	 * If the only item requested is file names, we can let that past with
11332 	 * just LIST_DIRECTORY.  If they want any other attributes, that means
11333 	 * they need SEARCH as well.
11334 	 */
11335 	action = KAUTH_VNODE_LIST_DIRECTORY;
11336 	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
11337 	    attributelist.fileattr || attributelist.dirattr) {
11338 		action |= KAUTH_VNODE_SEARCH;
11339 	}
11340 
11341 	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
11342 		/* Believe it or not, uap->options only has 32-bits of valid
11343 		 * info, so truncate before extending again */
11344 
11345 		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
11346 		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
11347 	}
11348 
11349 	if (error) {
11350 		(void) vnode_put(vp);
11351 		goto out;
11352 	}
11353 
11354 #if CONFIG_UNION_MOUNTS
11355 	/*
11356 	 * If we've got the last entry of a directory in a union mount
11357 	 * then reset the eofflag and pretend there's still more to come.
11358 	 * The next call will again set eofflag and the buffer will be empty,
11359 	 * so traverse to the underlying directory and do the directory
11360 	 * read there.
11361 	 */
11362 	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
11363 		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
11364 			eofflag = 0;
11365 		} else {                                                // Empty buffer
11366 			vnode_t uvp;
11367 			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
11368 				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
11369 					if ((error = VNOP_OPEN(uvp, fp->fp_glob->fg_flag, ctx)) == 0) {
11370 						fp_set_data(fp, uvp);
11371 						/* Close the old vnode to maintain proper lifecycle */
11372 						VNOP_CLOSE(vp, fp->fp_glob->fg_flag, ctx);
11373 						fp->fp_glob->fg_offset = 0; // reset index for new dir
11374 						count = savecount;
11375 						vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
11376 						vnode_put(vp);
11377 						vp = uvp;
11378 						goto unionread;
11379 					} else {
11380 						vnode_rele_internal(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
11381 						vnode_put(uvp);
11382 					}
11383 				} else {
11384 					/* could not get a ref, can't replace in fd */
11385 					vnode_put(uvp);
11386 				}
11387 			}
11388 		}
11389 	}
11390 #endif /* CONFIG_UNION_MOUNTS */
11391 
11392 	(void)vnode_put(vp);
11393 
11394 	if (error) {
11395 		goto out;
11396 	}
11397 	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
11398 
11399 	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
11400 		goto out;
11401 	}
11402 	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
11403 		goto out;
11404 	}
11405 	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
11406 		goto out;
11407 	}
11408 
11409 	*retval = eofflag;  /* similar to getdirentries */
11410 	error = 0;
11411 out:
11412 	vn_offset_unlock(fp->fp_glob);
11413 	file_drop(fd);
11414 	return error; /* return error earlier, an retval of 0 or 1 now */
11415 } /* end of getdirentriesattr system call */
11416 
11417 /*
11418  * Exchange data between two files
11419  */
11420 
11421 /* ARGSUSED */
11422 int
exchangedata(__unused proc_t p,struct exchangedata_args * uap,__unused int32_t * retval)11423 exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
11424 {
11425 	struct nameidata fnd, snd;
11426 	vfs_context_t ctx = vfs_context_current();
11427 	vnode_t fvp;
11428 	vnode_t svp;
11429 	int error;
11430 	u_int32_t nameiflags;
11431 	char *fpath = NULL;
11432 	char *spath = NULL;
11433 	int   flen = 0, slen = 0;
11434 	int from_truncated = 0, to_truncated = 0;
11435 #if CONFIG_FSE
11436 	fse_info f_finfo, s_finfo;
11437 #endif
11438 
11439 	nameiflags = 0;
11440 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11441 		nameiflags |= FOLLOW;
11442 	}
11443 
11444 	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
11445 	    UIO_USERSPACE, uap->path1, ctx);
11446 
11447 	error = namei(&fnd);
11448 	if (error) {
11449 		goto out2;
11450 	}
11451 
11452 	nameidone(&fnd);
11453 	fvp = fnd.ni_vp;
11454 
11455 	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
11456 	    UIO_USERSPACE, uap->path2, ctx);
11457 
11458 	error = namei(&snd);
11459 	if (error) {
11460 		vnode_put(fvp);
11461 		goto out2;
11462 	}
11463 	nameidone(&snd);
11464 	svp = snd.ni_vp;
11465 
11466 	/*
11467 	 * if the files are the same, return an inval error
11468 	 */
11469 	if (svp == fvp) {
11470 		error = EINVAL;
11471 		goto out;
11472 	}
11473 
11474 	/*
11475 	 * if the files are on different volumes, return an error
11476 	 */
11477 	if (svp->v_mount != fvp->v_mount) {
11478 		error = EXDEV;
11479 		goto out;
11480 	}
11481 
11482 	/* If they're not files, return an error */
11483 	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
11484 		error = EINVAL;
11485 		goto out;
11486 	}
11487 
11488 #if CONFIG_MACF
11489 	error = mac_vnode_check_exchangedata(ctx,
11490 	    fvp, svp);
11491 	if (error) {
11492 		goto out;
11493 	}
11494 #endif
11495 	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
11496 	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
11497 		goto out;
11498 	}
11499 
11500 	if (
11501 #if CONFIG_FSE
11502 		need_fsevent(FSE_EXCHANGE, fvp) ||
11503 #endif
11504 		kauth_authorize_fileop_has_listeners()) {
11505 		GET_PATH(fpath);
11506 		GET_PATH(spath);
11507 
11508 		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
11509 		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
11510 
11511 #if CONFIG_FSE
11512 		get_fse_info(fvp, &f_finfo, ctx);
11513 		get_fse_info(svp, &s_finfo, ctx);
11514 		if (from_truncated || to_truncated) {
11515 			// set it here since only the f_finfo gets reported up to user space
11516 			f_finfo.mode |= FSE_TRUNCATED_PATH;
11517 		}
11518 #endif
11519 	}
11520 	/* Ok, make the call */
11521 	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
11522 
11523 	if (error == 0) {
11524 		const char *tmpname;
11525 
11526 		if (fpath != NULL && spath != NULL) {
11527 			/* call out to allow 3rd party notification of exchangedata.
11528 			 * Ignore result of kauth_authorize_fileop call.
11529 			 */
11530 			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
11531 			    (uintptr_t)fpath, (uintptr_t)spath);
11532 		}
11533 		name_cache_lock();
11534 
11535 		tmpname     = fvp->v_name;
11536 		fvp->v_name = svp->v_name;
11537 		svp->v_name = tmpname;
11538 
11539 		if (fvp->v_parent != svp->v_parent) {
11540 			vnode_t tmp;
11541 
11542 			tmp           = fvp->v_parent;
11543 			fvp->v_parent = svp->v_parent;
11544 			svp->v_parent = tmp;
11545 		}
11546 		name_cache_unlock();
11547 
11548 #if CONFIG_FSE
11549 		if (fpath != NULL && spath != NULL) {
11550 			add_fsevent(FSE_EXCHANGE, ctx,
11551 			    FSE_ARG_STRING, flen, fpath,
11552 			    FSE_ARG_FINFO, &f_finfo,
11553 			    FSE_ARG_STRING, slen, spath,
11554 			    FSE_ARG_FINFO, &s_finfo,
11555 			    FSE_ARG_DONE);
11556 		}
11557 #endif
11558 	}
11559 
11560 out:
11561 	if (fpath != NULL) {
11562 		RELEASE_PATH(fpath);
11563 	}
11564 	if (spath != NULL) {
11565 		RELEASE_PATH(spath);
11566 	}
11567 	vnode_put(svp);
11568 	vnode_put(fvp);
11569 out2:
11570 	return error;
11571 }
11572 
11573 /*
11574  * Return (in MB) the amount of freespace on the given vnode's volume.
11575  */
11576 uint32_t freespace_mb(vnode_t vp);
11577 
11578 uint32_t
freespace_mb(vnode_t vp)11579 freespace_mb(vnode_t vp)
11580 {
11581 	vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
11582 	return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
11583 	       vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
11584 }
11585 
11586 #if CONFIG_SEARCHFS
11587 
/* ARGSUSED */

/*
 * searchfs() system call: hand a search request (searchfs(2)) down to the
 * file system hosting uap->path via VNOP_SEARCHFS.  The user supplies a
 * fssearchblock describing the search criteria, a return-attribute list,
 * and an opaque searchstate that lets the search be resumed across calls.
 * Matches are written to the user's return buffer; the match count and the
 * updated searchstate are copied back out.  EAGAIN from the file system
 * means "more to come, call again" and still copies out partial results.
 */
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	size_t mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	UIO_STACKBUF(uio_buf, 1);

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to go into local memory and it's not that big so we might as well put it all together.  */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block.                                                                                             */
	/*												      */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
	/*       assumes the size is still 556 bytes it will continue to work				      */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		/* name data must start and end within the searchparams1 buffer */
		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * All right, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (uint32_t)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (uint32_t)uap->scriptcode,
	    (uint32_t)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;        // search next layer down
		fserror = EAGAIN;
	}
#endif /* CONFIG_UNION_MOUNTS */

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 *  search state.  Everything was already put into the return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	/* report the file system's verdict (0, EAGAIN, or a real error) */
	error = fserror;

freeandexit:

	kfree_data(searchparams1, mallocsize);

	return error;
} /* end of searchfs system call */
11872 
11873 #else /* CONFIG_SEARCHFS */
11874 
/*
 * searchfs() stub used when the kernel is built without CONFIG_SEARCHFS:
 * the call is simply unsupported.
 */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11880 
11881 #endif /* CONFIG_SEARCHFS */
11882 
11883 
11884 #if CONFIG_DATALESS_FILES
11885 
11886 /*
11887  * === Namespace Resolver Up-call Mechanism ===
11888  *
11889  * When I/O is performed to a dataless file or directory (read, write,
11890  * lookup-in, etc.), the file system performs an upcall to the namespace
11891  * resolver (filecoordinationd) to materialize the object.
11892  *
11893  * We need multiple up-calls to be in flight at once, and we need these
11894  * up-calls to be interruptible, thus the following implementation:
11895  *
11896  * => The nspace_resolver_request represents the in-kernel request state.
11897  *    It contains a request ID, storage space for the errno code returned
11898  *    by filecoordinationd, and flags.
11899  *
11900  * => The request ID is simply a global monotonically incrementing 32-bit
11901  *    number.  Outstanding requests are stored in a hash table, and the
11902  *    hash function is extremely simple.
11903  *
11904  * => When an upcall is to be made to filecoordinationd, a request structure
11905  *    is allocated on the stack (it is small, and needs to live only during
11906  *    the duration of the call to resolve_nspace_item_ext()).  It is
11907  *    initialized and inserted into the table.  Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11909  *    can be inserted into the table (and thus limiting the number of
11910  *    outstanding requests issued to filecoordinationd); waiting for an
11911  *    available slot is interruptible.
11912  *
11913  * => Once the request has been inserted into the table, the up-call is made
11914  *    to filecoordinationd via a MiG-generated stub.  The up-call returns
11915  *    immediately and filecoordinationd processes the request asynchronously.
11916  *
 * => The caller now waits for the request to complete.  This is achieved by
11918  *    sleeping on the address of the request structure and waiting for
11919  *    filecoordinationd to mark the request structure as complete.  This
11920  *    is an interruptible sleep call; if interrupted, the request structure
11921  *    is removed from the table and EINTR is returned to the caller.  If
11922  *    this occurs, an advisory up-call is made to filecoordinationd with
11923  *    the request ID to indicate that the request can be aborted or
11924  *    de-prioritized at the discretion of filecoordinationd.
11925  *
11926  * => When filecoordinationd has completed the request, it signals completion
11927  *    by writing to the vfs.nspace.complete sysctl node.  Only a process
11928  *    decorated as a namespace resolver can write to this sysctl node.  The
11929  *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11930  *    The request ID is looked up in the table, and if the request is found,
11931  *    the error code is stored in the request structure and a wakeup()
11932  *    issued on the address of the request structure.  If the request is not
11933  *    found, we simply drop the completion notification, assuming that the
11934  *    caller was interrupted.
11935  *
11936  * => When the waiting thread wakes up, it extracts the error code from the
11937  *    request structure, removes the request from the table, and returns the
11938  *    error code to the calling function.  Fini!
11939  */
11940 
/*
 * In-kernel state for one outstanding up-call to the namespace resolver
 * (filecoordinationd).  See the block comment above: instances live on
 * the requesting thread's stack and are linked into the request hash
 * table, keyed by r_req_id, while the up-call is in flight.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink;	/* hash-bucket linkage */
	vnode_t         r_vp;		/* vnode the request is for */
	vnode_t         r_tdvp;		/* destination directory, or NULL if none */
	uint32_t        r_req_id;	/* unique request ID (hash key) */
	int             r_resolver_error;	/* errno reported by the resolver */
	int             r_flags;	/* RRF_* flags below */
};

#define RRF_COMPLETE    0x0001	/* request completed; r_resolver_error valid */
#define RRF_COMPLETING  0x0002	/* completion handler still using this req */
11952 
/*
 * Completion tuple written by the resolver via the vfs.nspace.complete
 * sysctl; consumed by nspace_resolver_req_completed().
 */
struct nspace_resolver_completion_data {
	uint32_t req_id;	/* ID of the request being completed */
	int32_t  resolver_error;	/* errno result from the resolver */
	uint64_t orig_gencount;	/* expected recursive gencount, 0 = don't check */
	uint64_t orig_syncroot;	/* expected sync-root ID, 0 = don't check */
};
11959 
/*
 * Return the next namespace-resolver request ID from a global,
 * monotonically incrementing 32-bit counter.  Note that OSAddAtomic
 * returns the counter's value *before* the increment.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11967 
/*
 * Sizing for the outstanding-request table: the number of hash buckets
 * and the cap on simultaneously outstanding resolver up-calls.
 */
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table. */
static u_int nspace_resolver_request_count;
/* True when some thread is sleeping, waiting for a free table slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Mutex protecting the hash table and all of the state above. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket. */
#define NSPACE_RESOLVER_HASH(req_id)    \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	 nspace_resolver_request_hashmask])
11988 
11989 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id,bool skip_completing)11990 nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11991 {
11992 	struct nspace_resolver_requesthead *bucket;
11993 	struct nspace_resolver_request *req;
11994 
11995 	bucket = NSPACE_RESOLVER_HASH(req_id);
11996 	LIST_FOREACH(req, bucket, r_hashlink) {
11997 		if (req->r_req_id == req_id) {
11998 			/*
11999 			 * If this request already has a completion
12000 			 * pending, don't return it again.
12001 			 */
12002 			if ((req->r_flags & RRF_COMPLETING) != 0 &&
12003 			    skip_completing) {
12004 				req = NULL;
12005 			}
12006 			return req;
12007 		}
12008 	}
12009 
12010 	return NULL;
12011 }
12012 
/*
 * Insert a resolver request into the outstanding-request table,
 * sleeping (interruptibly) while the table is at its outstanding-
 * request limit.  Returns 0 on success, or the msleep error (e.g.
 * EINTR) if the wait for a free slot was interrupted.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	NSPACE_REQ_LOCK();

	/* Apply backpressure: wait for a slot if too many are outstanding. */
	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			NSPACE_REQ_UNLOCK();
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	NSPACE_REQ_UNLOCK();

	return 0;
}
12044 
/*
 * Block (uninterruptibly) until any in-progress completion handler is
 * finished with 'req'.  Caller must hold NSPACE_REQ_LOCK; msleep drops
 * and re-acquires it across each sleep.
 */
static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
{
	/*
	 * If a completion is in-progress, we have to wait for the
	 * completion handler to finish because it's still using 'req',
	 * which is allocated on our stack a couple of frames up.
	 */
	while ((req->r_flags & RRF_COMPLETING) != 0) {
		(void) msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS, "nspacecmplt", NULL);
	}
}
12058 
/*
 * Remove a request from the outstanding-request table, wake any thread
 * waiting for a free slot, wait out any in-progress completion that is
 * still using 'req', and drop NSPACE_REQ_LOCK.  After this returns the
 * caller's stack-allocated request is safe to reuse or unwind.
 */
static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;

	/* We're called with NSPACE_REQ_LOCK held. */

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert((req->r_flags & RRF_COMPLETING) == 0);
	assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
#endif /* DIAGNOSTIC */
	LIST_REMOVE(req, r_hashlink);
	nspace_resolver_request_count--;

	/* A slot just opened up; wake anyone throttled in req_add(). */
	if (nspace_resolver_request_wait_slot) {
		nspace_resolver_request_wait_slot = false;
		wakeup(&nspace_resolver_request_count);
	}

	nspace_resolver_req_wait_pending_completion(req);

	NSPACE_REQ_UNLOCK();
}
12083 
/*
 * Convenience wrapper: take NSPACE_REQ_LOCK and remove 'req' from the
 * table (the helper drops the lock before returning).
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
12090 
/*
 * Send an advisory cancellation for the given request ID to
 * filecoordinationd, so it can abort or de-prioritize the work.
 * Failures are logged but otherwise ignored.
 */
static void
nspace_resolver_req_cancel(uint32_t req_id)
{
	kern_return_t kr;
	mach_port_t mp;

	// Failures here aren't fatal -- the cancellation message
	// sent to the resolver is merely advisory.

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		return;
	}

	kr = send_nspace_resolve_cancel(mp, req_id);
	if (kr != KERN_SUCCESS) {
		os_log_error(OS_LOG_DEFAULT,
		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
	}

	ipc_port_release_send(mp);
}
12113 
/*
 * Sleep until the resolver completes 'req' (RRF_COMPLETE set by the
 * completion path), then remove it from the table and return the
 * resolver's errno.  If the interruptible sleep fails with anything
 * other than ERESTART, the request is abandoned: the error is forced
 * to EINTR/ETIMEDOUT and an advisory cancel is sent to the resolver.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
12146 
/*
 * Record the resolver's result on 'req', move it from the COMPLETING
 * state to COMPLETE, and wake the thread sleeping in
 * nspace_resolver_req_wait().  Caller must hold NSPACE_REQ_LOCK.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
	wakeup(req);
}
12156 
/*
 * Mark 'req' as having a completion in progress so nobody else touches
 * it once NSPACE_REQ_LOCK is dropped.  Caller must hold the lock.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
12162 
/*
 * Handle a completion notification from the resolver (delivered via
 * the vfs.nspace.complete sysctl).  Looks up the outstanding request
 * by ID and, if the resolver supplied namespace-shape criteria
 * (orig_gencount / orig_syncroot), verifies under the mount rename
 * lock that the tree has not changed shape since the request was made;
 * EBUSY is reported to the waiter if it has.  Finally marks the
 * request complete and wakes the waiting thread.  Unknown request IDs
 * are silently dropped (the waiter was likely interrupted).
 */
static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
{
	struct nspace_resolver_request *req;
	int error;
	struct vnode_attr va;
	vnode_t vp;

	NSPACE_REQ_LOCK();

	req = nspace_resolver_req_lookup(c->req_id, true);
	if (req == NULL) {
		/*
		 * If we don't find the request corresponding to our req_id,
		 * just drop the completion on the floor; it's likely that
		 * the requester interrupted with a signal, or it may already
		 * be completing.
		 */
		NSPACE_REQ_UNLOCK();
		return;
	}

	/*
	 * Get out now if the resolver reported an error.
	 */
	if ((error = c->resolver_error) != 0) {
		goto out;
	}

	/*
	 * If the resolver did not specify any namespace shape criteria
	 * for letting the operation proceed, then get out now.
	 */
	if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
		goto out;
	}

	/*
	 * We're going to have to acquire the mount rename lock and do
	 * some I/O in order to verify the criteria.  Mark the request
	 * as pending so no one else messes with it after we drop the
	 * NSPACE_REQ_LOCK.
	 */
	nspace_resolver_req_mark_completion_pending(req);
	NSPACE_REQ_UNLOCK();

	/*
	 * Lock out renames from changing the shape of the tree while
	 * validate the criteria.
	 */
	mount_t locked_mp = req->r_vp->v_mount;
	mount_ref(locked_mp, 0);
	mount_lock_renames(locked_mp);

	if (c->orig_gencount != 0) {
		vp = req->r_vp;
		/* NOTE(review): error is known 0 here; defensive check only. */
		if (error) {
			goto out_dropmount;
		}

		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_recursive_gencount);
		error = vnode_getattr(vp, &va, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		/* A gencount change means the subtree was modified: fail EBUSY. */
		if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
		    va.va_recursive_gencount != c->orig_gencount) {
			printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
			    c->orig_gencount, va.va_recursive_gencount);
			error = EBUSY;
			goto out_dropmount;
		}
	}

	/*
	 * Ignore orig_syncroot if a destination directory wasn't specified
	 * in the request.
	 */
	if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
		uint64_t syncroot_id;

		/* NOTE(review): defensive; error can only be 0 here as well. */
		if (error) {
			goto out_dropmount;
		}

#ifndef APFSIOC_GET_SYNC_ROOT
#define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
#endif

		error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
		    (caddr_t)&syncroot_id, 0, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		if (syncroot_id != c->orig_syncroot) {
			printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
			    c->orig_syncroot, syncroot_id);
			error = EBUSY;
			goto out_dropmount;
		}
	}

out_dropmount:
	mount_unlock_renames(locked_mp);
	mount_drop(locked_mp, 0);
	NSPACE_REQ_LOCK();

out:
	nspace_resolver_req_mark_complete(req, error);
	NSPACE_REQ_UNLOCK();
}
12275 
12276 static struct proc *nspace_resolver_proc;
12277 
12278 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)12279 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
12280 {
12281 	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
12282 	    p == nspace_resolver_proc) ? 1 : 0;
12283 	return 0;
12284 }
12285 
12286 static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
12287 
12288 static int
nspace_resolver_set_proc_state(struct proc * p,int is_resolver)12289 nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
12290 {
12291 	vfs_context_t ctx = vfs_context_current();
12292 	int error = 0;
12293 
12294 	//
12295 	// The system filecoordinationd runs as uid == 0.  This also
12296 	// has the nice side-effect of filtering out filecoordinationd
12297 	// running in the simulator.
12298 	//
12299 	if (!vfs_context_issuser(ctx) ||
12300 	    !vfs_context_is_dataless_resolver(ctx)) {
12301 		return EPERM;
12302 	}
12303 
12304 	if (is_resolver) {
12305 		NSPACE_REQ_LOCK();
12306 
12307 		if (nspace_resolver_proc == NULL) {
12308 			proc_lock(p);
12309 			p->p_lflag |= P_LNSPACE_RESOLVER;
12310 			proc_unlock(p);
12311 			nspace_resolver_proc = p;
12312 		} else {
12313 			error = EBUSY;
12314 		}
12315 
12316 		NSPACE_REQ_UNLOCK();
12317 	} else {
12318 		// This is basically just like the exit case.
12319 		// nspace_resolver_exited() will verify that the
12320 		// process is the resolver, and will clear the
12321 		// global.
12322 		nspace_resolver_exited(p);
12323 	}
12324 
12325 	return error;
12326 }
12327 
12328 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)12329 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
12330 {
12331 	if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
12332 	    (p->p_vfs_iopolicy &
12333 	    P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
12334 		*is_prevented = 1;
12335 	} else {
12336 		*is_prevented = 0;
12337 	}
12338 	return 0;
12339 }
12340 
12341 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)12342 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
12343 {
12344 	if (p->p_lflag & P_LNSPACE_RESOLVER) {
12345 		return is_prevented ? 0 : EBUSY;
12346 	}
12347 
12348 	if (is_prevented) {
12349 		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
12350 	} else {
12351 		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
12352 	}
12353 	return 0;
12354 }
12355 
12356 static int
nspace_materialization_get_thread_state(int * is_prevented)12357 nspace_materialization_get_thread_state(int *is_prevented)
12358 {
12359 	uthread_t ut = current_uthread();
12360 
12361 	*is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
12362 	return 0;
12363 }
12364 
12365 static int
nspace_materialization_set_thread_state(int is_prevented)12366 nspace_materialization_set_thread_state(int is_prevented)
12367 {
12368 	uthread_t ut = current_uthread();
12369 
12370 	if (is_prevented) {
12371 		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
12372 	} else {
12373 		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
12374 	}
12375 	return 0;
12376 }
12377 
12378 /* the vfs.nspace branch */
12379 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
12380 
12381 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12382 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
12383     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12384 {
12385 	struct proc *p = req->p;
12386 	int new_value, old_value, changed = 0;
12387 	int error;
12388 
12389 	error = nspace_resolver_get_proc_state(p, &old_value);
12390 	if (error) {
12391 		return error;
12392 	}
12393 
12394 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12395 	    &changed);
12396 	if (error == 0 && changed) {
12397 		error = nspace_resolver_set_proc_state(p, new_value);
12398 	}
12399 	return error;
12400 }
12401 
12402 /* decorate this process as the dataless file resolver */
12403 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
12404     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12405     0, 0, sysctl_nspace_resolver, "I", "");
12406 
12407 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12408 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
12409     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12410 {
12411 	struct proc *p = req->p;
12412 	int new_value, old_value, changed = 0;
12413 	int error;
12414 
12415 	error = nspace_materialization_get_proc_state(p, &old_value);
12416 	if (error) {
12417 		return error;
12418 	}
12419 
12420 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12421 	    &changed);
12422 	if (error == 0 && changed) {
12423 		error = nspace_materialization_set_proc_state(p, new_value);
12424 	}
12425 	return error;
12426 }
12427 
12428 /* decorate this process as not wanting to materialize dataless files */
12429 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
12430     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12431     0, 0, sysctl_nspace_prevent_materialization, "I", "");
12432 
12433 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12434 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
12435     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12436 {
12437 	int new_value, old_value, changed = 0;
12438 	int error;
12439 
12440 	error = nspace_materialization_get_thread_state(&old_value);
12441 	if (error) {
12442 		return error;
12443 	}
12444 
12445 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12446 	    &changed);
12447 	if (error == 0 && changed) {
12448 		error = nspace_materialization_set_thread_state(new_value);
12449 	}
12450 	return error;
12451 }
12452 
12453 /* decorate this thread as not wanting to materialize dataless files */
12454 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
12455     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12456     0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
12457 
/*
 * sysctl handler by which the resolver reports that a request has
 * completed.  Only the registered resolver process may call this.
 *
 * The write payload is read as up to three consecutive opaque values:
 *   1. req_status[2]      -- required: { request ID, resolver errno }
 *   2. gencount (uint64)  -- optional namespace-shape criterion
 *   3. syncroot (uint64)  -- optional namespace-shape criterion
 * The optional values default to 0 (i.e. "no criterion") if absent.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	/* Required: the { req_id, errno } pair. */
	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed.  Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID.
	 */
	error = sysctl_io_opaque(req, &syncroot, sizeof(syncroot),
	    &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(&cd);
	}
	return error;
}

/* Resolver reports completed reqs here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
12525 
12526 #endif /* CONFIG_DATALESS_FILES */
12527 
12528 #if CONFIG_DATALESS_FILES
12529 #define __no_dataless_unused    /* nothing */
12530 #else
12531 #define __no_dataless_unused    __unused
12532 #endif
12533 
/*
 * Core policy decision for whether a dataless materialization may
 * proceed in the given context.
 *
 * Returns:
 *   0           -- materialization may proceed
 *   EDEADLK     -- materialization is prevented
 *   EJUSTRETURN -- caller is an entitled dataless manipulator; the
 *                  operation proceeds as if the object were not
 *                  dataless (callers special-case this value)
 *
 * is_original_materialization selects which process-wide iopolicy bit
 * is consulted; the per-thread decorations apply only to the
 * non-original path.
 */
static int
vfs_context_dataless_materialization_is_prevented_internal(
	vfs_context_t const ctx __no_dataless_unused, bool is_original_materialization __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (is_original_materialization) {
		return (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES_ORIG) ? 0 : EDEADLK;
	} else {
		thread_t const t = vfs_context_thread(ctx);
		uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

		/*
		 * Per-thread decorations override any process-wide decorations.
		 * (Foundation uses this, and this overrides even the dataless-
		 * manipulation entitlement so as to make API contracts consistent.)
		 */
		if (ut != NULL) {
			if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
				return EDEADLK;
			}
			if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
				return 0;
			}
		}

		if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
			return 0;
		}
	}

#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
12596 
/*
 * Policy check for an ordinary (non-"original") materialization;
 * per-thread and per-process decorations are both consulted.
 * See vfs_context_dataless_materialization_is_prevented_internal()
 * for return values.
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
	return vfs_context_dataless_materialization_is_prevented_internal(ctx, false);
}
12603 
/*
 * Policy check for an "original" materialization; consults the
 * ..._MATERIALIZE_DATALESS_FILES_ORIG iopolicy bit rather than the
 * per-thread decorations.  See the internal helper for return values.
 */
int
vfs_context_orig_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
	return vfs_context_dataless_materialization_is_prevented_internal(ctx, true);
}
12610 
/*
 * One-time initialization: allocate the hash table used to look up
 * in-flight resolver requests by request ID.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
12620 
/*
 * Called when a process exits (and when the resolver voluntarily
 * unregisters).  If the exiting process is the registered resolver,
 * fail every outstanding request with ETIMEDOUT so waiters are not
 * stranded, then clear the global resolver pointer.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Sweep every hash bucket and complete each request. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				/*
				 * Wait out any in-progress completion first
				 * (see nspace_resolver_req_completed()).
				 */
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
12647 
12648 #define DATALESS_RESOLVER_ENTITLEMENT     \
12649 	"com.apple.private.vfs.dataless-resolver"
12650 #define DATALESS_MANIPULATION_ENTITLEMENT \
12651 	"com.apple.private.vfs.dataless-manipulation"
12652 
12653 #if CONFIG_DATALESS_FILES
12654 /*
12655  * Return TRUE if the vfs context is associated with the dataless
12656  * resolver.
12657  */
static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
{
	/* The resolver is identified solely by its task entitlement. */
	return IOTaskHasEntitlement(vfs_context_task(ctx),
	           DATALESS_RESOLVER_ENTITLEMENT);
}
12664 #endif /* CONFIG_DATALESS_FILES */
12665 
12666 /*
12667  * Return TRUE if the vfs context is associated with a process entitled
12668  * for dataless manipulation.
12669  *
12670  * XXX Arguably belongs in vfs_subr.c, but is here because of the
12671  * complication around CONFIG_DATALESS_FILES.
12672  */
12673 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)12674 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
12675 {
12676 #if CONFIG_DATALESS_FILES
12677 	task_t task = vfs_context_task(ctx);
12678 	return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
12679 	       IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
12680 #else
12681 	return false;
12682 #endif /* CONFIG_DATALESS_FILES */
12683 }
12684 
12685 #if CONFIG_DATALESS_FILES
12686 static void
log_materialization_prevented(vnode_t vp,uint64_t op)12687 log_materialization_prevented(vnode_t vp, uint64_t op)
12688 {
12689 	char p_name[MAXCOMLEN + 1];
12690 	char *vntype;
12691 	proc_selfname(&p_name[0], sizeof(p_name));
12692 
12693 	if (vp->v_type == VREG) {
12694 		vntype = "File";
12695 	} else if (vp->v_type == VDIR) {
12696 		vntype = "Dir";
12697 	} else if (vp->v_type == VLNK) {
12698 		vntype = "SymLink";
12699 	} else {
12700 		vntype = "Other";
12701 	}
12702 
12703 #if DEVELOPMENT
12704 	struct vnode_attr *vap = kalloc_type(struct vnode_attr, Z_WAITOK);
12705 
12706 	VATTR_INIT(vap);
12707 	VATTR_WANTED(vap, va_fsid);
12708 	VATTR_WANTED(vap, va_fileid);
12709 	if (vnode_getattr(vp, vap, vfs_context_current()) == 0) {
12710 		os_log_debug(OS_LOG_DEFAULT,
12711 		    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) fsid 0x%08x/%u fileid=%llu",
12712 		    p_name, proc_selfpid(), op, vntype,
12713 		    vap->va_fsid, vap->va_fsid, vap->va_fileid);
12714 	} else
12715 #endif
12716 	{
12717 		os_log_debug(OS_LOG_DEFAULT,
12718 		    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
12719 		    p_name, proc_selfpid(), op, vntype);
12720 	}
12721 #if DEVELOPMENT
12722 	kfree_type(struct vnode_attr, vap);
12723 #endif
12724 }
12725 #endif /* CONFIG_DATALESS_FILES */
12726 
/*
 * vfs_materialize_item:
 *
 * Common implementation for materializing a dataless file, directory,
 * or reparent target.  Checks policy, builds a path for the vnode,
 * registers an in-flight request keyed by a fresh request ID, sends
 * the appropriate resolve message to the file-coordination daemon via
 * its Mach port, and then waits (interruptibly) for the resolver to
 * report completion via the vfs.nspace.complete sysctl.
 *
 * Returns 0 when the caller may proceed with the operation (the object
 * may still be dataless -- see the EJUSTRETURN handling below),
 * ETIMEDOUT when the resolver service is unreachable, or another errno.
 */
static int
vfs_materialize_item(
	vnode_t vp __no_dataless_unused,
	uint32_t op __no_dataless_unused,
	int64_t offset __no_dataless_unused,
	int64_t size __no_dataless_unused,
	char *lookup_name __no_dataless_unused,
	size_t const namelen __no_dataless_unused,
	vnode_t tdvp __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	kern_return_t kern_ret;
	mach_port_t mach_port;
	char *path = NULL;
	vfs_context_t context;
	int path_len;
	int error;
	audit_token_t atoken;
	enum vtype vp_vtype;

	/* Swap files are special; ignore them */
	if (vnode_isswap(vp)) {
		return 0;
	}

	/*
	 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
	 * are no longer used nor supported.
	 */
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}
	if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
		return ENOTSUP;
	}

	/* Normalize 'op'. */
	op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;

	/*
	 * To-directory is only meaningful for rename operations;
	 * ignore it if someone handed one to us unexpectedly.
	 */
	if (op != NAMESPACE_HANDLER_RENAME_OP) {
		tdvp = NULL;
	}

	context = vfs_context_current();

	/* Remember this for later. */
	vp_vtype = vnode_vtype(vp);

	/* Policy gate; EJUSTRETURN is special-cased in out_check_errors. */
	error = vfs_context_dataless_materialization_is_prevented(context);
	if (error) {
		log_materialization_prevented(vp, op);
		goto out_check_errors;
	}

	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
	    &mach_port);
	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		/*
		 * Treat this like being unable to access the backing store
		 * server.
		 */
		return ETIMEDOUT;
	}

	/*
	 * Build a path for vp, growing the buffer by MAXPATHLEN on
	 * ENOSPC up to MAXLONGPATHLEN.
	 *
	 * NOTE(review): the ENOMEM return below does not release
	 * mach_port, and if the path exceeds MAXLONGPATHLEN the loop
	 * exits with path == NULL and error == ENOSPC but execution
	 * continues and error is overwritten -- confirm both are
	 * intended.
	 */
	int path_alloc_len = MAXPATHLEN;
	do {
		path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
		if (path == NULL) {
			return ENOMEM;
		}

		path_len = path_alloc_len;
		error = vn_getpath(vp, path, &path_len);
		if (error == 0) {
			break;
		} else if (error == ENOSPC) {
			/* Buffer too small; free and retry larger. */
			kfree_data(path, path_alloc_len);
			path = NULL;
		} else {
			goto out_release_port;
		}
	} while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) &&
	    path_alloc_len <= MAXLONGPATHLEN);

	/* Identify the requesting process to the resolver. */
	error = vfs_context_copy_audit_token(context, &atoken);
	if (error) {
		goto out_release_port;
	}

	/* Register the request so the completion path can find it. */
	struct nspace_resolver_request req = {
		.r_req_id = next_nspace_req_id(),
		.r_vp = vp,
		.r_tdvp = tdvp,
	};

	error = nspace_resolver_req_add(&req);
	if (error) {
		goto out_release_port;
	}

	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");

	/* Dispatch the resolve message appropriate to the operation. */
	if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
		char *dest_path = NULL;
		int dest_path_len;

		dest_path = zalloc(ZV_NAMEI);
		dest_path_len = MAXPATHLEN;

		error = vn_getpath(tdvp, dest_path, &dest_path_len);
		if (error) {
			zfree(ZV_NAMEI, dest_path);
			goto out_release_port;
		}

		/*
		 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
		 * compatibility with existing agents in user-space
		 * who get passed this value.
		 */
		kern_ret = send_vfs_resolve_reparent_with_audit_token(mach_port,
		    req.r_req_id,
		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
		    path, dest_path, atoken);

		zfree(ZV_NAMEI, dest_path);
	} else if (vp_vtype == VDIR) {
		char *tmpname = NULL;

		/*
		 * If the caller provided a lookup_name *and* a name length,
		 * then we assume the lookup_name is not NUL-terminated.
		 * Allocate a temporary buffer in this case to provide
		 * a NUL-terminated path name to the IPC call.
		 */
		if (lookup_name != NULL && namelen != 0) {
			if (namelen >= PATH_MAX) {
				error = EINVAL;
				goto out_req_remove;
			}
			tmpname = zalloc(ZV_NAMEI);
			strlcpy(tmpname, lookup_name, namelen + 1);
			lookup_name = tmpname;
		} else if (lookup_name != NULL) {
			/*
			 * If the caller provided a lookup_name with a
			 * zero name length, then we assume it's NUL-
			 * terminated.  Verify it has a valid length.
			 */
			if (strlen(lookup_name) >= PATH_MAX) {
				error = EINVAL;
				goto out_req_remove;
			}
		}

		/* (See above.) */
		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
		    req.r_req_id,
		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
		    lookup_name == NULL ? "" : lookup_name, path, atoken);

		if (tmpname != NULL) {
			zfree(ZV_NAMEI, tmpname);

			/*
			 * Poison lookup_name rather than reference
			 * freed memory.
			 */
			lookup_name = NULL;
		}
	} else {
		/* (See above.) */
		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
		    req.r_req_id,
		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
		    offset, size, path, atoken);
	}
	if (kern_ret != KERN_SUCCESS) {
		/*
		 * Also treat this like being unable to access the backing
		 * store server.
		 */
		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
		    kern_ret);
		error = ETIMEDOUT;
		goto out_req_remove;
	}

	/*
	 * Give back the memory we allocated earlier while we wait; we
	 * no longer need it.
	 */
	kfree_data(path, path_alloc_len);
	path = NULL;

	/*
	 * Request has been submitted to the resolver. Now (interruptibly)
	 * wait for completion. Upon return, the request will have been
	 * removed from the lookup table.
	 */
	error = nspace_resolver_req_wait(&req);

out_release_port:
	if (path != NULL) {
		kfree_data(path, path_alloc_len);
		path = NULL;
	}
	ipc_port_release_send(mach_port);

out_check_errors:
	/*
	 * The file resolver owns the logic about what error to return
	 * to the caller.  We only need to handle a couple of special
	 * cases here:
	 */
	if (error == EJUSTRETURN) {
		/*
		 * The requesting process is allowed to interact with
		 * dataless objects.  Make a couple of sanity-checks
		 * here to ensure the action makes sense.
		 */
		switch (op) {
		case NAMESPACE_HANDLER_WRITE_OP:
		case NAMESPACE_HANDLER_TRUNCATE_OP:
		case NAMESPACE_HANDLER_RENAME_OP:
			/*
			 * This handles the case of the resolver itself
			 * writing data to the file (or throwing it
			 * away).
			 */
			error = 0;
			break;
		case NAMESPACE_HANDLER_READ_OP:
		case NAMESPACE_HANDLER_LOOKUP_OP:
			/*
			 * This handles the case of the resolver needing
			 * to look up inside of a dataless directory while
			 * it's in the process of materializing it (for
			 * example, creating files or directories).
			 */
			error = (vp_vtype == VDIR) ? 0 : EBADF;
			break;
		default:
			error = EBADF;
			break;
		}
	}

	return error;

out_req_remove:
	nspace_resolver_req_remove(&req);
	goto out_release_port;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
12991 
12992 /*
12993  * vfs_materialize_file: Materialize a regular file.
12994  *
12995  * Inputs:
12996  * vp		The dataless file to be materialized.
12997  *
12998  * op		What kind of operation is being performed:
12999  *		-> NAMESPACE_HANDLER_READ_OP
13000  *		-> NAMESPACE_HANDLER_WRITE_OP
13001  *		-> NAMESPACE_HANDLER_LINK_CREATE
13002  *		-> NAMESPACE_HANDLER_DELETE_OP
13003  *		-> NAMESPACE_HANDLER_TRUNCATE_OP
13004  *		-> NAMESPACE_HANDLER_RENAME_OP
13005  *
13006  * offset	offset of I/O for READ or WRITE.  Ignored for
13007  *		other ops.
13008  *
13009  * size		size of I/O for READ or WRITE  Ignored for
13010  *		other ops.
13011  *
13012  * If offset or size are -1 for a READ or WRITE, then the resolver should
13013  * consider the range to be unknown.
13014  *
13015  * Upon successful return, the caller may proceed with the operation.
13016  * N.B. the file may still be "dataless" in this case.
13017  */
13018 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)13019 vfs_materialize_file(
13020 	struct vnode *vp,
13021 	uint64_t op,
13022 	int64_t offset,
13023 	int64_t size)
13024 {
13025 	if (vp->v_type != VREG) {
13026 		return EFTYPE;
13027 	}
13028 	return vfs_materialize_item(vp, (uint32_t)op, offset, size, NULL, 0,
13029 	           NULL);
13030 }
13031 
13032 /*
13033  * vfs_materialize_dir:
13034  *
13035  * Inputs:
13036  * vp		The dataless directory to be materialized.
13037  *
13038  * op		What kind of operation is being performed:
13039  *		-> NAMESPACE_HANDLER_READ_OP
13040  *		-> NAMESPACE_HANDLER_WRITE_OP
13041  *		-> NAMESPACE_HANDLER_DELETE_OP
13042  *		-> NAMESPACE_HANDLER_RENAME_OP
13043  *		-> NAMESPACE_HANDLER_LOOKUP_OP
13044  *
13045  * lookup_name	Name being looked up for a LOOKUP op.  Ignored for
13046  *		other ops.  May or may not be NUL-terminated; see below.
13047  *
13048  * namelen	If non-zero, then lookup_name is assumed to not be NUL-
13049  *		terminated and namelen is the number of valid bytes in
13050  *		lookup_name. If zero, then lookup_name is assumed to be
13051  *		NUL-terminated.
13052  *
13053  * Upon successful return, the caller may proceed with the operation.
13054  * N.B. the directory may still be "dataless" in this case.
13055  */
13056 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)13057 vfs_materialize_dir(
13058 	struct vnode *vp,
13059 	uint64_t op,
13060 	char *lookup_name,
13061 	size_t namelen)
13062 {
13063 	if (vp->v_type != VDIR) {
13064 		return EFTYPE;
13065 	}
13066 	if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
13067 		return EINVAL;
13068 	}
13069 	return vfs_materialize_item(vp, (uint32_t)op, 0, 0, lookup_name,
13070 	           namelen, NULL);
13071 }
13072 
13073 /*
13074  * vfs_materialize_reparent:
13075  *
13076  * Inputs:
13077  * vp		The dataless file or directory to be materialized.
13078  *
13079  * tdvp		The new parent directory for the dataless file.
13080  *
13081  * Upon successful return, the caller may proceed with the operation.
13082  * N.B. the item may still be "dataless" in this case.
13083  */
13084 int
vfs_materialize_reparent(vnode_t vp,vnode_t tdvp)13085 vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
13086 {
13087 	if (vp->v_type != VDIR && vp->v_type != VREG) {
13088 		return EFTYPE;
13089 	}
13090 	return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
13091 	           0, 0, NULL, 0, tdvp);
13092 }
13093 
13094 #if 0
13095 static int
13096 build_volfs_path(struct vnode *vp, char *path, int *len)
13097 {
13098 	struct vnode_attr va;
13099 	int ret;
13100 
13101 	VATTR_INIT(&va);
13102 	VATTR_WANTED(&va, va_fsid);
13103 	VATTR_WANTED(&va, va_fileid);
13104 
13105 	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
13106 		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
13107 		ret = -1;
13108 	} else {
13109 		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
13110 		ret = 0;
13111 	}
13112 
13113 	return ret;
13114 }
13115 #endif
13116 
13117 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)13118 fsctl_bogus_command_compat(unsigned long cmd)
13119 {
13120 	switch (cmd) {
13121 	case IOCBASECMD(FSIOC_SYNC_VOLUME):
13122 		return FSIOC_SYNC_VOLUME;
13123 	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
13124 		return FSIOC_ROUTEFS_SETROUTEID;
13125 	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
13126 		return FSIOC_SET_PACKAGE_EXTS;
13127 	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
13128 		return FSIOC_SET_FSTYPENAME_OVERRIDE;
13129 	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
13130 		return DISK_CONDITIONER_IOC_GET;
13131 	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
13132 		return DISK_CONDITIONER_IOC_SET;
13133 	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
13134 		return FSIOC_FIOSEEKHOLE;
13135 	case IOCBASECMD(FSIOC_FIOSEEKDATA):
13136 		return FSIOC_FIOSEEKDATA;
13137 	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
13138 		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
13139 	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
13140 		return SPOTLIGHT_IOC_GET_LAST_MTIME;
13141 	}
13142 
13143 	return cmd;
13144 }
13145 
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	/* Forward a compare-and-swap of BSD flags to the file system. */
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
13151 
13152 static int __attribute__((noinline))
handle_sync_volume(vnode_t vp,vnode_t * arg_vp,caddr_t data,vfs_context_t ctx)13153 handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
13154 {
13155 	struct vfs_attr vfa;
13156 	mount_t mp = vp->v_mount;
13157 	unsigned arg;
13158 	int error;
13159 
13160 	/* record vid of vp so we can drop it below. */
13161 	uint32_t vvid = vp->v_id;
13162 
13163 	/*
13164 	 * Then grab mount_iterref so that we can release the vnode.
13165 	 * Without this, a thread may call vnode_iterate_prepare then
13166 	 * get into a deadlock because we've never released the root vp
13167 	 */
13168 	error = mount_iterref(mp, 0);
13169 	if (error) {
13170 		return error;
13171 	}
13172 	vnode_hold(vp);
13173 	vnode_put(vp);
13174 
13175 	arg = MNT_NOWAIT;
13176 	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
13177 		arg = MNT_WAIT;
13178 	}
13179 
13180 	/*
13181 	 * If the filessytem supports multiple filesytems in a
13182 	 * partition (For eg APFS volumes in a container, it knows
13183 	 * that the waitfor argument to VFS_SYNC are flags.
13184 	 */
13185 	VFSATTR_INIT(&vfa);
13186 	VFSATTR_WANTED(&vfa, f_capabilities);
13187 	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
13188 	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
13189 	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
13190 	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
13191 		arg |= MNT_VOLUME;
13192 	}
13193 
13194 	/* issue the sync for this volume */
13195 	(void)sync_callback(mp, &arg);
13196 
13197 	/*
13198 	 * Then release the mount_iterref once we're done syncing; it's not
13199 	 * needed for the VNOP_IOCTL below
13200 	 */
13201 	mount_iterdrop(mp);
13202 
13203 	if (arg & FSCTL_SYNC_FULLSYNC) {
13204 		/* re-obtain vnode iocount on the root vp, if possible */
13205 		error = vnode_getwithvid(vp, vvid);
13206 		if (error == 0) {
13207 			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
13208 			vnode_put(vp);
13209 		}
13210 	}
13211 	vnode_drop(vp);
13212 	/* mark the argument VP as having been released */
13213 	*arg_vp = NULL;
13214 	return error;
13215 }
13216 
13217 #if ROUTEFS
13218 static int __attribute__((noinline))
handle_routes(user_addr_t udata)13219 handle_routes(user_addr_t udata)
13220 {
13221 	char routepath[MAXPATHLEN];
13222 	size_t len = 0;
13223 	int error;
13224 
13225 	if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
13226 		return error;
13227 	}
13228 	bzero(routepath, MAXPATHLEN);
13229 	error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
13230 	if (error) {
13231 		return error;
13232 	}
13233 	error = routefs_kernel_mount(routepath);
13234 	return error;
13235 }
13236 #endif
13237 
13238 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)13239 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
13240 {
13241 	struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
13242 	struct vnode_attr va;
13243 	int error;
13244 
13245 	VATTR_INIT(&va);
13246 	VATTR_SET(&va, va_flags, cas->new_flags);
13247 
13248 	error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
13249 
13250 #if CONFIG_FSE
13251 	if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
13252 		add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
13253 	}
13254 #endif
13255 
13256 	return error;
13257 }
13258 
13259 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)13260 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
13261 {
13262 	struct mount *mp = NULL;
13263 	errno_t rootauth = 0;
13264 
13265 	mp = vp->v_mount;
13266 
13267 	/*
13268 	 * query the underlying FS and see if it reports something
13269 	 * sane for this vnode. If volume is authenticated via
13270 	 * chunklist, leave that for the caller to determine.
13271 	 */
13272 	rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
13273 
13274 	return rootauth;
13275 }
13276 
13277 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
13278 	"com.apple.private.kernel.set-package-extensions"
13279 
13280 /*
13281  * Make a filesystem-specific control call:
13282  */
13283 /* ARGSUSED */
/*
 * fsctl_internal: common implementation behind the fsctl() and ffsctl()
 * system calls.
 *
 * Decodes the ioctl-style selector "cmd" (IOC_IN/IOC_OUT/IOC_VOID plus
 * IOCPARM_LEN) to copy the argument described by "udata" in or out,
 * dispatches the generic FSIOC_* selectors locally, and hands any
 * remaining filesystem-specific commands down via VNOP_IOCTL().
 *
 * "*arg_vp" carries an iocount on entry.  Most paths leave it alone, but
 * FSIOC_SYNC_VOLUME releases the vnode and NULLs out *arg_vp so the
 * caller knows not to drop it again.
 *
 * Returns 0 or an errno.
 */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* fsctl does not apply to device special files */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* remap historical, mis-encoded selectors onto their modern values */
	cmd = fsctl_bogus_command_compat(cmd);

	/* argument size is encoded in the selector itself */
	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* small arguments live in stkbuf; larger ones are heap-allocated */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			/* structure argument: copy it in from user space */
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			/* no size: the argument is the user pointer itself */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* NOTE: may release the vnode and NULL out *arg_vp */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
		/* silently ignored (error stays 0) when ROUTEFS is not built */
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		/* install the global package-extension table (entitled callers only) */
		user_addr_t ext_strings;
		uint32_t    num_entries;
		uint32_t    max_width;

		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct.  otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width   = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width   = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		/* superuser-only: override (or clear) the mount's fstype name */
		mount_t mp;

		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if ((mp = vp->v_mount) != NULL) {
			mount_lock(mp);
			if (data[0] != 0) {
				/* require a NUL within the first MFSTYPENAMELEN bytes */
				for (int i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data
				 * string which has no NULL termination in
				 * its first MFSTYPENAMELEN bytes.  This is
				 * bogus, let's avoid strlcpy-ing the read
				 * data and return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				vfs_setfstypename_locked(mp, data);
				/* read-only mtmfs overrides get extended-security semantics */
				if (vfs_isrdonly(mp) &&
				    strcmp(data, "mtmfs") == 0) {
					mp->mnt_kern_flag |=
					    MNTK_EXTENDED_SECURITY;
					mp->mnt_kern_flag &=
					    ~MNTK_AUTH_OPAQUE;
				}
			} else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
				/* empty string: clear an existing override */
				const char *name =
				    vfs_getfstypenameref_locked(mp, NULL);
				if (strcmp(name, "mtmfs") == 0) {
					mp->mnt_kern_flag &=
					    ~MNTK_EXTENDED_SECURITY;
				}
				vfs_setfstypename_locked(mp, NULL);
			}
unlock:
			mount_unlock(mp);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/* succeed only if the caller holds the sole open of this vnode */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
		break;

#if CONFIG_EXCLAVES
	case FSIOC_EXCLAVE_FS_REGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_register(((fsioc_exclave_fs_register_t *)data)->fs_tag, vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_UNREGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_unregister(vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: {
		/* copy out the exclave FS base-dir table (or just its count) */
		exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data);
		exclave_fs_base_dir_t *dirs = NULL;
		if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT) &&
		    !IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_LIST_ENTITLEMENT)) {
			error = EPERM;
			break;
		}
		if (get_base_dirs->base_dirs) {
			if ((get_base_dirs->count == 0) || (get_base_dirs->count > EXCLAVE_FS_GET_BASE_DIRS_MAX_COUNT)) {
				error = EINVAL;
				break;
			}
			dirs = kalloc_type(exclave_fs_base_dir_t, get_base_dirs->count, Z_WAITOK | Z_ZERO);
			if (!dirs) {
				error = ENOSPC;
				break;
			}
		}
		error = vfs_exclave_fs_get_base_dirs(dirs, &get_base_dirs->count);
		if (!error && dirs) {
			error = copyout(dirs, (user_addr_t)get_base_dirs->base_dirs,
			    get_base_dirs->count * sizeof(exclave_fs_base_dir_t));
		}
		if (dirs) {
			kfree_type(exclave_fs_base_dir_t, get_base_dirs->count, dirs);
		}
	}
	break;
#endif

	default: {
		/*
		 * Other, known commands shouldn't be passed down here.
		 * (When adding a selector to this list, it may be prudent
		 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
		 */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
		case F_SPECULATIVE_READ:
		case F_ATTRIBUTION_TAG:
		case F_TRANSFEREXTENTS:
		case F_ASSERT_BG_ACCESS:
		case F_RELEASE_BG_ACCESS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
13593 
13594 /* ARGSUSED */
13595 int
fsctl(proc_t p,struct fsctl_args * uap,__unused int32_t * retval)13596 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
13597 {
13598 	int error;
13599 	struct nameidata nd;
13600 	uint32_t nameiflags;
13601 	vnode_t vp = NULL;
13602 	vfs_context_t ctx = vfs_context_current();
13603 
13604 	AUDIT_ARG(cmd, (int)uap->cmd);
13605 	AUDIT_ARG(value32, uap->options);
13606 	/* Get the vnode for the file we are getting info on:  */
13607 	nameiflags = 0;
13608 	//
13609 	// if we come through fsctl() then the file is by definition not open.
13610 	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
13611 	// lest the caller mistakenly thinks the only open is their own (but in
13612 	// reality it's someone elses).
13613 	//
13614 	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
13615 		return EINVAL;
13616 	}
13617 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
13618 		nameiflags |= FOLLOW;
13619 	}
13620 	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
13621 		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
13622 	}
13623 	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
13624 	    UIO_USERSPACE, uap->path, ctx);
13625 	if ((error = namei(&nd))) {
13626 		goto done;
13627 	}
13628 	vp = nd.ni_vp;
13629 	nameidone(&nd);
13630 
13631 #if CONFIG_MACF
13632 	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
13633 	if (error) {
13634 		goto done;
13635 	}
13636 #endif
13637 
13638 	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
13639 
13640 done:
13641 	if (vp) {
13642 		vnode_put(vp);
13643 	}
13644 	return error;
13645 }
13646 /* ARGSUSED */
13647 int
ffsctl(proc_t p,struct ffsctl_args * uap,__unused int32_t * retval)13648 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
13649 {
13650 	int error;
13651 	vnode_t vp = NULL;
13652 	vfs_context_t ctx = vfs_context_current();
13653 	int fd = -1;
13654 
13655 	AUDIT_ARG(fd, uap->fd);
13656 	AUDIT_ARG(cmd, (int)uap->cmd);
13657 	AUDIT_ARG(value32, uap->options);
13658 
13659 	/* Get the vnode for the file we are getting info on:  */
13660 	if ((error = file_vnode(uap->fd, &vp))) {
13661 		return error;
13662 	}
13663 	fd = uap->fd;
13664 	if ((error = vnode_getwithref(vp))) {
13665 		file_drop(fd);
13666 		return error;
13667 	}
13668 
13669 #if CONFIG_MACF
13670 	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
13671 		file_drop(fd);
13672 		vnode_put(vp);
13673 		return error;
13674 	}
13675 #endif
13676 
13677 	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
13678 
13679 	file_drop(fd);
13680 
13681 	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
13682 	if (vp) {
13683 		vnode_put(vp);
13684 	}
13685 
13686 	return error;
13687 }
13688 /* end of fsctl system call */
13689 
13690 #define FILESEC_ACCESS_ENTITLEMENT              \
13691 	"com.apple.private.vfs.filesec-access"
13692 
13693 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)13694 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
13695 {
13696 	if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
13697 		/*
13698 		 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
13699 		 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
13700 		 */
13701 		if ((!setting && vfs_context_issuser(ctx)) ||
13702 		    IOTaskHasEntitlement(vfs_context_task(ctx),
13703 		    FILESEC_ACCESS_ENTITLEMENT)) {
13704 			return 0;
13705 		}
13706 	}
13707 
13708 	return EPERM;
13709 }
13710 
13711 /*
13712  *  Retrieve the data of an extended attribute.
13713  */
/*
 * getxattr: retrieve the value of the named extended attribute of the
 * file at uap->path.
 *
 * With a destination buffer (uap->value), up to uap->size bytes are
 * copied out and *retval is the number of bytes transferred; without
 * one, *retval is the attribute's full size.  Returns 0 or an errno.
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* XATTR_NOSECURITY / XATTR_NODEFAULT are kernel-internal options */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (uap->options & XATTR_RESOLVE_BENEATH) {
		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}

	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* protected attributes require root or an entitlement to read */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned...  in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app.  we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* clamp the request to the sane per-call maximum */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* with a buffer: bytes copied; without: the attribute's size */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
13803 
13804 /*
13805  * Retrieve the data of an extended attribute.
13806  */
13807 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)13808 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
13809 {
13810 	vnode_t vp;
13811 	char attrname[XATTR_MAXNAMELEN + 1];
13812 	vfs_context_t ctx = vfs_context_current();
13813 	uio_t auio = NULL;
13814 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13815 	size_t attrsize = 0;
13816 	size_t namelen;
13817 	int error;
13818 	UIO_STACKBUF(uio_buf, 1);
13819 
13820 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13821 	    XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) {
13822 		return EINVAL;
13823 	}
13824 
13825 	if ((error = file_vnode(uap->fd, &vp))) {
13826 		return error;
13827 	}
13828 	if ((error = vnode_getwithref(vp))) {
13829 		file_drop(uap->fd);
13830 		return error;
13831 	}
13832 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13833 	if (error != 0) {
13834 		goto out;
13835 	}
13836 	if (xattr_protected(attrname) &&
13837 	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13838 		goto out;
13839 	}
13840 	if (uap->value && uap->size > 0) {
13841 		if (uap->size > (size_t)XATTR_MAXSIZE) {
13842 			uap->size = XATTR_MAXSIZE;
13843 		}
13844 
13845 		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13846 		    &uio_buf[0], sizeof(uio_buf));
13847 		uio_addiov(auio, uap->value, uap->size);
13848 	}
13849 
13850 	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13851 out:
13852 	(void)vnode_put(vp);
13853 	file_drop(uap->fd);
13854 
13855 	if (auio) {
13856 		*retval = uap->size - uio_resid(auio);
13857 	} else {
13858 		*retval = (user_ssize_t)attrsize;
13859 	}
13860 	return error;
13861 }
13862 
/*
 * Heap-allocated scratch context for setxattr(): bundles the nameidata,
 * the attribute-name buffer, and the uio backing store so they don't
 * have to live in the syscall's kernel stack frame.
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	UIO_STACKBUF(uio_buf, 1);
};
13869 
13870 /*
13871  * Set the data of an extended attribute.
13872  */
/*
 * setxattr: set the named extended attribute of the file at uap->path
 * to the uap->size bytes at uap->value.
 *
 * Scratch state (nameidata, attrname, uio buffer) is heap-allocated in
 * a struct setxattr_ctx to keep this frame off the kernel stack.
 * Returns 0 or an errno; *retval is always 0.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	/* XATTR_NOSECURITY / XATTR_NODEFAULT are kernel-internal options */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* protected attributes require an entitlement to write */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* also look up the parent so its directory lease can be broken */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		sactx->nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (uap->options & XATTR_RESOLVE_BENEATH) {
		sactx->nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}

	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13956 
13957 /*
13958  * Set the data of an extended attribute.
13959  */
13960 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13961 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13962 {
13963 	vnode_t vp;
13964 	char attrname[XATTR_MAXNAMELEN + 1];
13965 	vfs_context_t ctx = vfs_context_current();
13966 	uio_t auio = NULL;
13967 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13968 	size_t namelen;
13969 	int error;
13970 	UIO_STACKBUF(uio_buf, 1);
13971 
13972 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13973 	    XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) {
13974 		return EINVAL;
13975 	}
13976 
13977 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13978 	if (error != 0) {
13979 		if (error == EPERM) {
13980 			/* if the string won't fit in attrname, copyinstr emits EPERM */
13981 			return ENAMETOOLONG;
13982 		}
13983 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13984 		return error;
13985 	}
13986 	if (xattr_protected(attrname) &&
13987 	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13988 		return error;
13989 	}
13990 	if (uap->size != 0 && uap->value == 0) {
13991 		return EINVAL;
13992 	}
13993 	if (uap->size > INT_MAX) {
13994 		return E2BIG;
13995 	}
13996 	if ((error = file_vnode(uap->fd, &vp))) {
13997 		return error;
13998 	}
13999 	if ((error = vnode_getwithref(vp))) {
14000 		file_drop(uap->fd);
14001 		return error;
14002 	}
14003 
14004 #if CONFIG_FILE_LEASES
14005 	vnode_breakdirlease(vp, true, O_WRONLY);
14006 #endif
14007 
14008 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
14009 	    &uio_buf[0], sizeof(uio_buf));
14010 	uio_addiov(auio, uap->value, uap->size);
14011 
14012 	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
14013 #if CONFIG_FSE
14014 	if (error == 0) {
14015 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
14016 		    FSE_ARG_VNODE, vp,
14017 		    FSE_ARG_DONE);
14018 	}
14019 #endif
14020 	vnode_put(vp);
14021 	file_drop(uap->fd);
14022 	*retval = 0;
14023 	return error;
14024 }
14025 
14026 /*
14027  * Remove an extended attribute.
14028  * XXX Code duplication here.
14029  */
/*
 * removexattr: remove the named extended attribute from the file at
 * uap->path.  Protected (system-reserved) attributes may never be
 * removed.  Returns 0 or an errno; *retval is always 0.
 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* XATTR_NOSECURITY / XATTR_NODEFAULT are kernel-internal options */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* system-reserved attributes may not be removed at all */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* also look up the parent so its directory lease can be broken */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (uap->options & XATTR_RESOLVE_BENEATH) {
		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}

	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
14087 
14088 /*
14089  * Remove an extended attribute.
14090  * XXX Code duplication here.
14091  */
14092 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)14093 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
14094 {
14095 	vnode_t vp;
14096 	char attrname[XATTR_MAXNAMELEN + 1];
14097 	size_t namelen;
14098 	int error;
14099 #if CONFIG_FSE
14100 	vfs_context_t ctx = vfs_context_current();
14101 #endif
14102 
14103 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
14104 	    XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) {
14105 		return EINVAL;
14106 	}
14107 
14108 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
14109 	if (error != 0) {
14110 		return error;
14111 	}
14112 	if (xattr_protected(attrname)) {
14113 		return EPERM;
14114 	}
14115 	if ((error = file_vnode(uap->fd, &vp))) {
14116 		return error;
14117 	}
14118 	if ((error = vnode_getwithref(vp))) {
14119 		file_drop(uap->fd);
14120 		return error;
14121 	}
14122 
14123 #if CONFIG_FILE_LEASES
14124 	vnode_breakdirlease(vp, true, O_WRONLY);
14125 #endif
14126 
14127 	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
14128 #if CONFIG_FSE
14129 	if (error == 0) {
14130 		add_fsevent(FSE_XATTR_REMOVED, ctx,
14131 		    FSE_ARG_VNODE, vp,
14132 		    FSE_ARG_DONE);
14133 	}
14134 #endif
14135 	vnode_put(vp);
14136 	file_drop(uap->fd);
14137 	*retval = 0;
14138 	return error;
14139 }
14140 
14141 /*
14142  * Retrieve the list of extended attribute names.
14143  * XXX Code duplication here.
14144  */
14145 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)14146 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
14147 {
14148 	vnode_t vp;
14149 	struct nameidata nd;
14150 	vfs_context_t ctx = vfs_context_current();
14151 	uio_t auio = NULL;
14152 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
14153 	size_t attrsize = 0;
14154 	u_int32_t nameiflags;
14155 	int error;
14156 	UIO_STACKBUF(uio_buf, 1);
14157 
14158 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
14159 		return EINVAL;
14160 	}
14161 
14162 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
14163 	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
14164 	if (uap->options & XATTR_NOFOLLOW_ANY) {
14165 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
14166 	}
14167 	if (uap->options & XATTR_RESOLVE_BENEATH) {
14168 		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
14169 	}
14170 
14171 	if ((error = namei(&nd))) {
14172 		return error;
14173 	}
14174 	vp = nd.ni_vp;
14175 	nameidone(&nd);
14176 	if (uap->namebuf != 0 && uap->bufsize > 0) {
14177 		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
14178 		    &uio_buf[0], sizeof(uio_buf));
14179 		uio_addiov(auio, uap->namebuf, uap->bufsize);
14180 	}
14181 
14182 	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
14183 
14184 	vnode_put(vp);
14185 	if (auio) {
14186 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
14187 	} else {
14188 		*retval = (user_ssize_t)attrsize;
14189 	}
14190 	return error;
14191 }
14192 
14193 /*
14194  * Retrieve the list of extended attribute names.
14195  * XXX Code duplication here.
14196  */
14197 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)14198 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
14199 {
14200 	vnode_t vp;
14201 	uio_t auio = NULL;
14202 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
14203 	size_t attrsize = 0;
14204 	int error;
14205 	UIO_STACKBUF(uio_buf, 1);
14206 
14207 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
14208 	    XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) {
14209 		return EINVAL;
14210 	}
14211 
14212 	if ((error = file_vnode(uap->fd, &vp))) {
14213 		return error;
14214 	}
14215 	if ((error = vnode_getwithref(vp))) {
14216 		file_drop(uap->fd);
14217 		return error;
14218 	}
14219 	if (uap->namebuf != 0 && uap->bufsize > 0) {
14220 		auio = uio_createwithbuffer(1, 0, spacetype,
14221 		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
14222 		uio_addiov(auio, uap->namebuf, uap->bufsize);
14223 	}
14224 
14225 	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
14226 
14227 	vnode_put(vp);
14228 	file_drop(uap->fd);
14229 	if (auio) {
14230 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
14231 	} else {
14232 		*retval = (user_ssize_t)attrsize;
14233 	}
14234 	return error;
14235 }
14236 
int
fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
    vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
{
	int error;
	vnode_t vp;
	int length;
	int bpflags;
	/* maximum number of times to retry build_path */
	unsigned int retries = 0x10;

	if (bufsize > MAXLONGPATHLEN) {
		return EINVAL;
	}

	if (buf == NULL) {
		return ENOMEM;
	}

retry:
	/* Resolve (volfs_id, objid) to a vnode; returned with an iocount. */
	error = vnode_getfromid(volfs_id, objid, ctx, options & FSOPT_ISREALFSID, &vp);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_fsgetpath(ctx, vp);
	if (error) {
		vnode_put(vp);
		return error;
	}
#endif

	/* Obtain the absolute path to this vnode. */
	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
	if (options & FSOPT_NOFIRMLINKPATH) {
		bpflags |= BUILDPATH_NO_FIRMLINK;
	}
	bpflags |= BUILDPATH_CHECK_MOVED;
	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
	vnode_put(vp);

	if (error) {
		/* there was a race building the path, try a few more times */
		if (error == EAGAIN) {
			--retries;
			if (retries > 0) {
				goto retry;
			}

			/* retries exhausted: report the object as unreachable */
			error = ENOENT;
		}
		goto out;
	}

	AUDIT_ARG(text, buf);

	/* Feed the resolved path into the VFS lookup trace stream if active. */
	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
		kdebug_vfs_lookup(buf, length, vp, KDBG_VFSLKUP_LOOKUP);
	}

	*pathlen = length; /* may be superseded by error */

out:
	return error;
}
14303 
14304 /*
14305  * Obtain the full pathname of a file system object by id.
14306  */
static int
fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
    uint32_t options, user_ssize_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	fsid_t fsid;
	char *realpath;
	int length;
	int error;

	/* Only these two options are understood by this path. */
	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
		return EINVAL;
	}

	/* Fetch the caller's fsid from user space. */
	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}
	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);
	/* Restrict output buffer size for now. */

	if (bufsize > MAXLONGPATHLEN || bufsize <= 0) {
		return EINVAL;
	}
	/* Kernel scratch buffer for the path; copied out only on success. */
	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
	if (realpath == NULL) {
		return ENOMEM;
	}

	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
	    options, &length);

	if (error) {
		goto out;
	}

	error = copyout((caddr_t)realpath, buf, length);

	*retval = (user_ssize_t)length; /* may be superseded by error */
out:
	kfree_data(realpath, bufsize);
	return error;
}
14350 
14351 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)14352 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
14353 {
14354 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
14355 	           0, retval);
14356 }
14357 
14358 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)14359 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
14360 {
14361 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
14362 	           uap->options, retval);
14363 }
14364 
14365 /*
14366  * Common routine to handle various flavors of statfs data heading out
14367  *	to user space.
14368  *
14369  * Returns:	0			Success
14370  *		EFAULT
14371  */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int             error;
	int             my_size, copy_size;

	if (is_64_bit) {
		/* 64-bit process: fields copy over without narrowing. */
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* partial_copy: omit the trailing reserved fields from the copyout. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case.   We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int             shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large.  At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* partial_copy: omit the trailing reserved fields from the copyout. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	/* Report the full (not partial) structure size to the caller. */
	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
14485 
14486 /*
14487  * copy stat structure into user_stat structure.
14488  */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	/* Zero first so padding and unset fields never leak kernel stack. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field layout differs with the _POSIX_C_SOURCE ABI variant. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14525 
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	/* Zero first so padding and unset fields never leak kernel stack. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamps are narrowed to the 32-bit user ABI types. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14562 
14563 /*
14564  * copy stat64 structure into user_stat64 structure.
14565  */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	/* Zero first so padding and unset fields never leak kernel stack. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* stat64 additionally carries the birth (creation) time. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14606 
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	/* Zero first so padding and unset fields never leak kernel stack. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamps (including birth time) are narrowed to 32-bit ABI types. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14647 
14648 /*
14649  * Purge buffer cache for simulating cold starts
14650  */
14651 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)14652 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
14653 {
14654 	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
14655 
14656 	return VNODE_RETURNED;
14657 }
14658 
14659 static int
vfs_purge_callback(mount_t mp,__unused void * arg)14660 vfs_purge_callback(mount_t mp, __unused void * arg)
14661 {
14662 	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
14663 
14664 	return VFS_RETURNED;
14665 }
14666 
/*
 * vfs.purge_vm_pagers (boot-arg "vfs_purge_vm_pagers", default TRUE):
 * when non-zero, vfs_purge() also purges file-backed VM pagers after
 * flushing the per-mount vnode caches.
 */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
14669 
14670 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)14671 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
14672 {
14673 	if (!kauth_cred_issuser(kauth_cred_get())) {
14674 		return EPERM;
14675 	}
14676 
14677 	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
14678 
14679 	/* also flush any VM pagers backed by files */
14680 	if (vfs_purge_vm_pagers) {
14681 		vm_purge_filebacked_pagers();
14682 	}
14683 
14684 	return 0;
14685 }
14686 
14687 /*
14688  * gets the vnode associated with the (unnamed) snapshot directory
14689  * for a Filesystem. The snapshot directory vnode is returned with
14690  * an iocount on it.
14691  */
14692 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)14693 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
14694 {
14695 	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
14696 }
14697 
14698 /*
14699  * Get the snapshot vnode.
14700  *
14701  * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
14702  * needs nameidone() on ndp.
14703  *
14704  * If the snapshot vnode exists it is returned in ndp->ni_vp.
14705  *
14706  * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
14707  * not needed.
14708  */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Ensure both outputs are NULL on every error path. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* dirfd must reference the root vnode of a mounted filesystem. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Get the snapshot directory; *sdvpp holds an iocount on success. */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for an embedded '/'; loop stops early if one is found. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC check matches the caller's intent (create vs. delete). */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On error, release both iocounts so the caller gets NULLs only. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
14811 
14812 /*
14813  * create a filesystem snapshot (for supporting filesystems)
14814  *
14815  * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
14816  * We get to the (unnamed) snapshot directory vnode and create the vnode
14817  * for the snapshot in it.
14818  *
14819  * Restrictions:
14820  *
14821  *    a) Passed in name for snapshot cannot have slashes.
14822  *    b) name can't be "." or ".."
14823  *
14824  * Since this requires superuser privileges, vnode_authorize calls are not
14825  * made.
14826  */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* No flags are currently defined */
	if (flags) {
		printf("snapshot_create: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	/* nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* On success: iocounts held on rvp and snapdvp, nameidone() owed. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* Snapshot with this name already exists. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		/* Snapshots are created as mode-0 regular files. */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* Superuser-only operation: skip authorization/inheritance. */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14879 
14880 /*
14881  * Delete a Filesystem snapshot
14882  *
14883  * get the vnode for the unnamed snapshot directory and the snapshot and
14884  * delete the snapshot.
14885  */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* No flags are currently defined */
	if (flags) {
		printf("snapshot_delete: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	/* nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/*
	 * DELETE lookup: on success the snapshot vnode is in ndp->ni_vp and
	 * iocounts are held on rvp and snapdvp.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14920 
14921 /*
14922  * Revert a filesystem to a snapshot
14923  *
14924  * Marks the filesystem to revert to the given snapshot on next mount.
14925  */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* No flags are currently defined */
	if (flags) {
		printf("snapshot_revert: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Build a minimal componentname naming the snapshot to revert to. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/* Re-resolve the snapshot vnode; fresh iocounts on success. */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		/* Fallback: issue the revert directly on the snapshot vnode. */
		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
15015 
15016 /*
15017  * rename a Filesystem snapshot
15018  *
15019  * get the vnode for the unnamed snapshot directory and the snapshot and
15020  * rename the snapshot. This is a very specialised (and simple) case of
15021  * rename(2) (which has to deal with a lot more complications). It differs
15022  * slightly from rename(2) in that EEXIST is returned if the new name exists.
15023  */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	/* No flags are currently defined */
	if (flags) {
		printf("snapshot_rename: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/* DELETE lookup of the old name; fvp below holds the snapshot vnode. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp  = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is suppossed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for an embedded '/'; loop stops early if one is found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* The new name is a creation from MAC's point of view. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Look up the destination name inside the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
15124 
15125 /*
15126  * Mount a Filesystem snapshot
15127  *
15128  * get the vnode for the unnamed snapshot directory and the snapshot and
15129  * mount the snapshot.
15130  */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error, mount_flags = 0;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	/* Check for invalid flags */
	if (flags & ~SNAPSHOT_MNT_VALIDMASK) {
		printf("snapshot_mount: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/*
	 * Look up the named snapshot.  On success we hold iocounts on the
	 * filesystem root (rvp), the snapshot directory (snapdvp), and the
	 * snapshot vnode itself (snapndp->ni_vp) — all released at out1.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp  = snapndp->ni_vp;
	/* The source filesystem must still be alive (not forcibly unmounted). */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Convert snapshot_mount flags to mount flags */
	if (flags & SNAPSHOT_MNT_NOEXEC) {
		mount_flags |= MNT_NOEXEC;
	}
	if (flags & SNAPSHOT_MNT_NOSUID) {
		mount_flags |= MNT_NOSUID;
	}
	if (flags & SNAPSHOT_MNT_NODEV) {
		mount_flags |= MNT_NODEV;
	}
	if (flags & SNAPSHOT_MNT_DONTBROWSE) {
		mount_flags |= MNT_DONTBROWSE;
	}
	if (flags & SNAPSHOT_MNT_IGNORE_OWNERSHIP) {
		mount_flags |= MNT_IGNORE_OWNERSHIP;
	}
	if (flags & SNAPSHOT_MNT_NOFOLLOW) {
		mount_flags |= MNT_NOFOLLOW;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	if (mount_flags & MNT_NOFOLLOW) {
		/* Caller requested that no symlinks be followed anywhere in the path. */
		dirndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* Refuse to cover the root vnode of the root filesystem. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/*
	 * Pass the source mount and the snapshot's component name to the
	 * filesystem through mount_common()'s KERNEL_MOUNT_SNAPSHOT path.
	 * smnt_data is a kernel-space struct, hence the CAST_USER_ADDR_T.
	 */
	smnt_data.sm_mp  = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), mount_flags,
	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);

	/* Unwind iocounts and namei state in reverse order of acquisition. */
out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
	return error;
}
15236 
15237 /*
15238  * Root from a snapshot of the filesystem
15239  *
15240  * Marks the filesystem to root from the given snapshot on next boot.
15241  */
15242 static int __attribute__((noinline))
snapshot_root(int dirfd,user_addr_t name,uint32_t flags,vfs_context_t ctx)15243 snapshot_root(int dirfd, user_addr_t name, uint32_t flags,
15244     vfs_context_t ctx)
15245 {
15246 	int error;
15247 	vnode_t rvp;
15248 	mount_t mp;
15249 	struct fs_snapshot_root_args root_data;
15250 	struct componentname cnp;
15251 	caddr_t name_buf;
15252 	size_t name_len;
15253 
15254 	/* No flags are currently defined */
15255 	if (flags) {
15256 		printf("snapshot_root: Invalid flags passed 0x%x\n", flags);
15257 		return EINVAL;
15258 	}
15259 
15260 	error = vnode_getfromfd(ctx, dirfd, &rvp);
15261 	if (error) {
15262 		return error;
15263 	}
15264 	mp = vnode_mount(rvp);
15265 
15266 	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
15267 	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
15268 	if (error) {
15269 		zfree(ZV_NAMEI, name_buf);
15270 		vnode_put(rvp);
15271 		return error;
15272 	}
15273 
15274 	// XXX MAC checks ?
15275 
15276 	/*
15277 	 * Grab mount_iterref so that we can release the vnode,
15278 	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
15279 	 */
15280 	error = mount_iterref(mp, 0);
15281 	vnode_put(rvp);
15282 	if (error) {
15283 		zfree(ZV_NAMEI, name_buf);
15284 		return error;
15285 	}
15286 
15287 	memset(&cnp, 0, sizeof(cnp));
15288 	cnp.cn_pnbuf = (char *)name_buf;
15289 	cnp.cn_nameiop = LOOKUP;
15290 	cnp.cn_flags = ISLASTCN | HASBUF;
15291 	cnp.cn_pnlen = MAXPATHLEN;
15292 	cnp.cn_nameptr = cnp.cn_pnbuf;
15293 	cnp.cn_namelen = (int)name_len;
15294 	root_data.sr_cnp = &cnp;
15295 
15296 	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
15297 
15298 	mount_iterdrop(mp);
15299 	zfree(ZV_NAMEI, name_buf);
15300 
15301 	return error;
15302 }
15303 
15304 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)15305 vfs_context_can_snapshot(vfs_context_t ctx)
15306 {
15307 	static const char * const snapshot_entitlements[] = {
15308 		"com.apple.private.vfs.snapshot",
15309 		"com.apple.developer.vfs.snapshot",
15310 		"com.apple.private.apfs.arv.limited.snapshot",
15311 	};
15312 	static const size_t nentitlements =
15313 	    sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
15314 	size_t i;
15315 
15316 	task_t task = vfs_context_task(ctx);
15317 	for (i = 0; i < nentitlements; i++) {
15318 		if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
15319 			return TRUE;
15320 		}
15321 	}
15322 	return FALSE;
15323 }
15324 
15325 /*
15326  * FS snapshot operations dispatcher
15327  */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* Every snapshot operation requires a snapshot entitlement. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Permit the operation if ANY of: caller is superuser, caller
		 * can write the backing device vnode, or caller holds the
		 * user-snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		/* name2 is the directory to cover; data is fs-specific mount args. */
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
15416