xref: /xnu-11215.41.3/bsd/vfs/vfs_syscalls.c (revision 33de042d024d46de5ff4e89f2471de6608e37fa4)
1 /*
2  * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1989, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  * (c) UNIX System Laboratories, Inc.
32  * All or some portions of this file are derived from material licensed
33  * to the University of California by American Telephone and Telegraph
34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35  * the permission of UNIX System Laboratories, Inc.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.41 (Berkeley) 6/15/95
66  */
67 /*
68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69  * support for mandatory and extensible security protections.  This notice
70  * is included in support of clause 2.2 (b) of the Apple Public License,
71  * Version 2.0.
72  */
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112 
113 #include <vfs/vfs_disk_conditioner.h>
114 #if CONFIG_EXCLAVES
115 #include <vfs/vfs_exclave_fs.h>
116 #endif
117 
118 #include <security/audit/audit.h>
119 #include <bsm/audit_kevents.h>
120 
121 #include <mach/mach_types.h>
122 #include <kern/kern_types.h>
123 #include <kern/kalloc.h>
124 #include <kern/task.h>
125 
126 #include <vm/vm_pageout.h>
127 #include <vm/vm_protos.h>
128 #include <vm/memory_object_xnu.h>
129 
130 #include <libkern/OSAtomic.h>
131 #include <os/atomic_private.h>
132 #include <pexpert/pexpert.h>
133 #include <IOKit/IOBSD.h>
134 
135 // deps for MIG call
136 #include <kern/host.h>
137 #include <kern/ipc_misc.h>
138 #include <mach/host_priv.h>
139 #include <mach/vfs_nspace.h>
140 #include <os/log.h>
141 
142 #include <nfs/nfs_conf.h>
143 
144 #if ROUTEFS
145 #include <miscfs/routefs/routefs.h>
146 #endif /* ROUTEFS */
147 
148 #if CONFIG_MACF
149 #include <security/mac.h>
150 #include <security/mac_framework.h>
151 #endif
152 
153 #if CONFIG_FSE
154 #define GET_PATH(x) \
155 	((x) = get_pathbuff())
156 #define RELEASE_PATH(x) \
157 	release_pathbuff(x)
158 #else
159 #define GET_PATH(x)     \
160 	((x) = zalloc(ZV_NAMEI))
161 #define RELEASE_PATH(x) \
162 	zfree(ZV_NAMEI, x)
163 #endif /* CONFIG_FSE */
164 
165 #ifndef HFS_GET_BOOT_INFO
166 #define HFS_GET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00004)
167 #endif
168 
169 #ifndef HFS_SET_BOOT_INFO
170 #define HFS_SET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00005)
171 #endif
172 
173 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
174 #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
175 #endif
176 
177 extern void disk_conditioner_unmount(mount_t mp);
178 
179 /* struct for checkdirs iteration */
180 struct cdirargs {
181 	vnode_t olddp;
182 	vnode_t newdp;
183 };
/* callback for checkdirs iteration */
185 static int checkdirs_callback(proc_t p, void * arg);
186 
187 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
188 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
189 void enablequotas(struct mount *mp, vfs_context_t ctx);
190 static int getfsstat_callback(mount_t mp, void * arg);
191 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
192 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
193 static int sync_callback(mount_t, void *);
194 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
195     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
196     boolean_t partial_copy);
197 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
198 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
199     struct componentname *cnp, user_addr_t fsmountargs,
200     int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
201 void vfs_notify_mount(vnode_t pdvp);
202 
203 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
204 
205 struct fd_vn_data * fg_vn_data_alloc(void);
206 
207 /*
208  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
209  * Concurrent lookups (or lookups by ids) on hard links can cause the
210  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
211  * does) to return ENOENT as the path cannot be returned from the name cache
212  * alone. We have no option but to retry and hope to get one namei->reverse path
213  * generation done without an intervening lookup, lookup by id on the hard link
214  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
215  * which currently are the MAC hooks for rename, unlink and rmdir.
216  */
217 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
218 
219 /* Max retry limit for rename due to vnode recycling. */
220 #define MAX_RENAME_ERECYCLE_RETRIES 1024
221 
222 #define MAX_LINK_ENOENT_RETRIES 1024
223 
224 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
225     int unlink_flags);
226 
227 #ifdef CONFIG_IMGSRC_ACCESS
228 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
229 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
230 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
231 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
232 static void mount_end_update(mount_t mp);
233 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
234 #endif /* CONFIG_IMGSRC_ACCESS */
235 
// snapshot functions
237 #if CONFIG_MNT_ROOTSNAP
238 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
239 #else
240 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
241 #endif
242 
243 __private_extern__
244 int sync_internal(void);
245 
246 __private_extern__
247 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
248 
249 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
250 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
251 
252 /* vars for sync mutex */
253 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
254 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
255 
256 extern lck_rw_t rootvnode_rw_lock;
257 
258 VFS_SMR_DECLARE;
259 extern uint32_t nc_smr_enabled;
260 
261 /*
262  * incremented each time a mount or unmount operation occurs
263  * used to invalidate the cached value of the rootvp in the
264  * mount structure utilized by cache_lookup_path
265  */
266 uint32_t mount_generation = 0;
267 
268 /* counts number of mount and unmount operations */
269 unsigned int vfs_nummntops = 0;
270 
271 /* system-wide, per-boot unique mount ID */
272 static _Atomic uint64_t mount_unique_id = 1;
273 
274 extern const struct fileops vnops;
275 #if CONFIG_APPLEDOUBLE
276 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
277 #endif /* CONFIG_APPLEDOUBLE */
278 
279 /* Maximum buffer length supported by fsgetpath(2) */
280 #define FSGETPATH_MAXBUFLEN  8192
281 
282 /*
283  * Virtual File System System Calls
284  */
285 
286 /*
287  * Private in-kernel mounting spi (specific use-cases only)
288  */
289 boolean_t
vfs_iskernelmount(mount_t mp)290 vfs_iskernelmount(mount_t mp)
291 {
292 	return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
293 }
294 
295 __private_extern__
296 int
kernel_mount(const char * fstype,vnode_t pvp,vnode_t vp,const char * path,void * data,__unused size_t datalen,int syscall_flags,uint32_t kern_flags,vfs_context_t ctx)297 kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
298     void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
299     vfs_context_t ctx)
300 {
301 	struct nameidata nd;
302 	boolean_t did_namei;
303 	int error;
304 
305 	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
306 	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
307 
308 	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;
309 
310 	/*
311 	 * Get the vnode to be covered if it's not supplied
312 	 */
313 	if (vp == NULLVP) {
314 		error = namei(&nd);
315 		if (error) {
316 			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
317 				printf("failed to locate mount-on path: %s ", path);
318 			}
319 			return error;
320 		}
321 		vp = nd.ni_vp;
322 		pvp = nd.ni_dvp;
323 		did_namei = TRUE;
324 	} else {
325 		char *pnbuf = CAST_DOWN(char *, path);
326 
327 		nd.ni_cnd.cn_pnbuf = pnbuf;
328 		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
329 		did_namei = FALSE;
330 	}
331 
332 	kern_flags |= KERNEL_MOUNT_KMOUNT;
333 	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
334 	    syscall_flags, kern_flags, NULL, ctx);
335 
336 	if (did_namei) {
337 		vnode_put(vp);
338 		vnode_put(pvp);
339 		nameidone(&nd);
340 	}
341 
342 	return error;
343 }
344 
345 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)346 vfs_mount_at_path(const char *fstype, const char *path,
347     vnode_t pvp, vnode_t vp, void *data, size_t datalen,
348     int mnt_flags, int flags)
349 {
350 	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
351 	int error, km_flags = 0;
352 	vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
353 
354 	/*
355 	 * This call is currently restricted to specific use cases.
356 	 */
357 	if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
358 		return ENOTSUP;
359 	}
360 
361 #if !defined(XNU_TARGET_OS_OSX)
362 	if (strcmp(fstype, "lifs") == 0) {
363 		syscall_flags |= MNT_NOEXEC;
364 	}
365 #endif
366 
367 	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
368 		km_flags |= KERNEL_MOUNT_NOAUTH;
369 	}
370 	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
371 		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
372 	}
373 
374 	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
375 	    syscall_flags, km_flags, ctx);
376 	if (error) {
377 		printf("%s: mount on %s failed, error %d\n", __func__, path,
378 		    error);
379 	}
380 
381 	return error;
382 }
383 
384 /*
385  * Mount a file system.
386  */
387 /* ARGSUSED */
388 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)389 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
390 {
391 	struct __mac_mount_args muap;
392 
393 	muap.type = uap->type;
394 	muap.path = uap->path;
395 	muap.flags = uap->flags;
396 	muap.data = uap->data;
397 	muap.mac_p = USER_ADDR_NULL;
398 	return __mac_mount(p, &muap, retval);
399 }
400 
401 int
fmount(__unused proc_t p,struct fmount_args * uap,__unused int32_t * retval)402 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
403 {
404 	struct componentname    cn;
405 	vfs_context_t           ctx = vfs_context_current();
406 	size_t                  dummy = 0;
407 	int                     error;
408 	int                     flags = uap->flags;
409 	char                    fstypename[MFSNAMELEN];
410 	char                    *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
411 	vnode_t                 pvp;
412 	vnode_t                 vp;
413 
414 	AUDIT_ARG(fd, uap->fd);
415 	AUDIT_ARG(fflags, flags);
416 	/* fstypename will get audited by mount_common */
417 
418 	/* Sanity check the flags */
419 	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
420 		return ENOTSUP;
421 	}
422 
423 	if (flags & MNT_UNION) {
424 		return EPERM;
425 	}
426 
427 	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
428 	if (error) {
429 		return error;
430 	}
431 
432 	if ((error = file_vnode(uap->fd, &vp)) != 0) {
433 		return error;
434 	}
435 
436 	if ((error = vnode_getwithref(vp)) != 0) {
437 		file_drop(uap->fd);
438 		return error;
439 	}
440 
441 	pvp = vnode_getparent(vp);
442 	if (pvp == NULL) {
443 		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
444 			error = EBUSY;
445 		} else {
446 			error = EINVAL;
447 		}
448 		vnode_put(vp);
449 		file_drop(uap->fd);
450 		return error;
451 	}
452 
453 	memset(&cn, 0, sizeof(struct componentname));
454 	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
455 	cn.cn_pnlen = MAXPATHLEN;
456 
457 	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
458 		zfree(ZV_NAMEI, cn.cn_pnbuf);
459 		vnode_put(pvp);
460 		vnode_put(vp);
461 		file_drop(uap->fd);
462 		return error;
463 	}
464 
465 	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);
466 
467 	zfree(ZV_NAMEI, cn.cn_pnbuf);
468 	vnode_put(pvp);
469 	vnode_put(vp);
470 	file_drop(uap->fd);
471 
472 	return error;
473 }
474 
475 #define MAX_GRAFT_METADATA_SIZE             16384 /* bytes */
476 
477 /*
478  * Get the size of a graft file (a manifest or payload file).
479  * The vp should be an iocounted vnode.
480  */
481 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)482 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
483 {
484 	struct stat64 sb = {};
485 	int error;
486 
487 	*size = 0;
488 
489 	error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
490 	if (error) {
491 		return error;
492 	}
493 
494 	if (sb.st_size == 0) {
495 		error = ENODATA;
496 	} else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
497 		error = EFBIG;
498 	} else {
499 		*size = (size_t) sb.st_size;
500 	}
501 
502 	return error;
503 }
504 
505 /*
506  * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
507  * `size` must already be validated.
508  */
509 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)510 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
511 {
512 	return vn_rdwr(UIO_READ, graft_vp,
513 	           (caddr_t) buf, (int) size, /* offset */ 0,
514 	           UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
515 	           vfs_context_ucred(vctx), /* resid */ NULL,
516 	           vfs_context_proc(vctx));
517 }
518 
/*
 * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
 * and read it into `buf` (sized MAX_GRAFT_METADATA_SIZE by the caller; the size check
 * in get_and_verify_graft_metadata_vp_size() enforces the bound).
 * If `path_prefix` is non-NULL, verify that the file path has that prefix.
 * Returns 0 on success or an errno value.
 */
static int
graft_secureboot_read_fd(int fd, vfs_context_t vctx, const char *path_prefix, size_t *size, void *buf)
{
	vnode_t metadata_vp = NULLVP;
	char *path = NULL;
	int error;

	// Convert this graft fd to a vnode.
	if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
		goto out;
	}

	// Verify that the vnode path starts with `path_prefix` if it was passed.
	if (path_prefix) {
		int len = MAXPATHLEN;
		path = zalloc(ZV_NAMEI);
		if ((error = vn_getpath(metadata_vp, path, &len))) {
			goto out;
		}
		// A prefix mismatch means the file lives outside the allowed location.
		if (strncmp(path, path_prefix, strlen(path_prefix))) {
			error = EINVAL;
			goto out;
		}
	}

	// Get (and validate) size information.
	if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
		goto out;
	}

	// Read each file into the provided buffer - we must get the expected amount of bytes.
	if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
		goto out;
	}

out:
	if (path) {
		zfree(ZV_NAMEI, path);
	}
	if (metadata_vp) {
		// Drop the iocount paired with vnode_getfromfd() above.
		vnode_put(metadata_vp);
		metadata_vp = NULLVP;
	}

	return error;
}
570 
571 #if XNU_TARGET_OS_OSX
572 #if defined(__arm64e__)
573 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/manifests/"
574 #else /* x86_64 */
575 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/"
576 #endif /* x86_64 */
577 #else /* !XNU_TARGET_OS_OSX */
578 #define MOBILE_ASSET_DATA_VAULT_PATH "/private/var/MobileAsset/AssetsV2/manifests/"
579 #endif /* !XNU_TARGET_OS_OSX */
580 
581 /*
582  * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
583  * provided in `gfs`, saving the size of data read in `gfs`.
584  */
585 static int
graft_secureboot_read_metadata(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)586 graft_secureboot_read_metadata(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
587     vfs_context_t vctx, fsioc_graft_fs_t *gfs)
588 {
589 	const char *manifest_path_prefix = NULL;
590 	int error;
591 
592 	// For Mobile Asset, make sure that the manifest comes from a data vault.
593 	if (graft_type == GRAFTDMG_CRYPTEX_MOBILE_ASSET) {
594 		manifest_path_prefix = MOBILE_ASSET_DATA_VAULT_PATH;
595 	}
596 
597 	// Read the authentic manifest.
598 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
599 	    manifest_path_prefix, &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
600 		return error;
601 	}
602 
603 	// The user manifest is currently unused, but set its size.
604 	gfs->user_manifest_size = 0;
605 
606 	// Read the payload.
607 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
608 	    NULL, &gfs->payload_size, gfs->payload))) {
609 		return error;
610 	}
611 
612 	return 0;
613 }
614 
615 /*
616  * Call into the filesystem to verify and graft a cryptex.
617  */
618 static int
graft_secureboot_cryptex(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,vnode_t cryptex_vp,vnode_t mounton_vp)619 graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
620     vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
621 {
622 	fsioc_graft_fs_t gfs = {};
623 	uint64_t graft_dir_ino = 0;
624 	struct stat64 sb = {};
625 	int error;
626 
627 	// Pre-flight arguments.
628 	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
629 		// Make sure that this graft version matches what we support.
630 		return ENOTSUP;
631 	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
632 		// For this type, cryptex VP must live on same volume as the target of graft.
633 		return EXDEV;
634 	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
635 		// We cannot graft upon non-directories.
636 		return ENOTDIR;
637 	} else if (cryptex_vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) {
638 		// We do not allow grafts inside disk images.
639 		return ENODEV;
640 	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
641 	    sbc_args->sbc_payload_fd < 0) {
642 		// We cannot graft without a manifest and payload.
643 		return EINVAL;
644 	}
645 
646 	if (mounton_vp) {
647 		// Get the mounton's inode number.
648 		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
649 		if (error) {
650 			return error;
651 		}
652 		graft_dir_ino = (uint64_t) sb.st_ino;
653 	}
654 
655 	// Create buffers (of our maximum-defined size) to store authentication info.
656 	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
657 	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
658 
659 	if (!gfs.authentic_manifest || !gfs.payload) {
660 		error = ENOMEM;
661 		goto out;
662 	}
663 
664 	// Read our fd's into our buffers.
665 	// (Note that this will set the buffer size fields in `gfs`.)
666 	error = graft_secureboot_read_metadata(graft_type, sbc_args, vctx, &gfs);
667 	if (error) {
668 		goto out;
669 	}
670 
671 	gfs.graft_version = FSIOC_GRAFT_VERSION;
672 	gfs.graft_type = graft_type;
673 	gfs.graft_4cc = sbc_args->sbc_4cc;
674 	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
675 		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
676 	}
677 	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
678 		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
679 	}
680 	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
681 		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
682 	}
683 	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
684 		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
685 	}
686 	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
687 		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
688 	}
689 	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
690 		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
691 	}
692 	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)
693 
694 	// Call into the FS to perform the graft (and validation).
695 	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);
696 
697 out:
698 	if (gfs.authentic_manifest) {
699 		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
700 		gfs.authentic_manifest = NULL;
701 	}
702 	if (gfs.payload) {
703 		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
704 		gfs.payload = NULL;
705 	}
706 
707 	return error;
708 }
709 
710 #define GRAFTDMG_ENTITLEMENT  "com.apple.private.vfs.graftdmg"
711 
/*
 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
 *
 * Requires the com.apple.private.vfs.graftdmg entitlement.
 * Returns 0 on success or an errno value.
 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	// Grafting is a privileged, entitlement-gated operation.
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(&nd);
		if (error) {
			// No nameidone() here; namei() is expected to clean up on failure.
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		goto graftout;
	}

	// Validate the graft type before dispatching to the filesystem.
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
	}

graftout:
	// Drop any iocounts we acquired above.
	if (cryptex_vp) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(mounton_vp);
		mounton_vp = NULLVP;
	}
	// nameidone() is only required if we initialized `nd` via namei() above.
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
783 
784 /*
785  * Ungraft a cryptex disk image (via mount dir FD)
786  * { int ungraftdmg(const char *mountdir, uint64_t flags); }
787  */
788 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)789 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
790 {
791 	int error = 0;
792 	user_addr_t ua_mountdir = uap->mountdir;
793 	fsioc_ungraft_fs_t ugfs;
794 	vnode_t mounton_vp = NULLVP;
795 	struct nameidata nd = {};
796 	vfs_context_t ctx = vfs_context_current();
797 
798 	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
799 		return EPERM;
800 	}
801 
802 	if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
803 		return EINVAL;
804 	}
805 
806 	ugfs.ungraft_flags = 0;
807 
808 	// Acquire vnode for mount-on path
809 	NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
810 	    UIO_USERSPACE, ua_mountdir, ctx);
811 
812 	error = namei(&nd);
813 	if (error) {
814 		return error;
815 	}
816 	mounton_vp = nd.ni_vp;
817 
818 	// Call into the FS to perform the ungraft
819 	error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
820 
821 	vnode_put(mounton_vp);
822 	nameidone(&nd);
823 
824 	return error;
825 }
826 
827 
/*
 * Post notifications that a mount has occurred: broadcast a VQ_MOUNT
 * vfs event and post a NOTE_WRITE knote on `pdvp`, the parent directory
 * of the covered vnode.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
834 
835 /*
836  * __mac_mount:
837  *	Mount a file system taking into account MAC label behavior.
838  *	See mount(2) man page for more information
839  *
840  * Parameters:    p                        Process requesting the mount
841  *                uap                      User argument descriptor (see below)
842  *                retval                   (ignored)
843  *
844  * Indirect:      uap->type                Filesystem type
845  *                uap->path                Path to mount
846  *                uap->data                Mount arguments
847  *                uap->mac_p               MAC info
848  *                uap->flags               Mount flags
849  *
850  *
851  * Returns:        0                       Success
852  *                !0                       Not success
853  */
854 boolean_t root_fs_upgrade_try = FALSE;
855 
856 #define MAX_NESTED_UNION_MOUNTS  10
857 
858 int
__mac_mount(struct proc * p,register struct __mac_mount_args * uap,__unused int32_t * retval)859 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
860 {
861 	vnode_t pvp = NULL;
862 	vnode_t vp = NULL;
863 	int need_nameidone = 0;
864 	vfs_context_t ctx = vfs_context_current();
865 	char fstypename[MFSNAMELEN];
866 	struct nameidata nd;
867 	size_t dummy = 0;
868 	char *labelstr = NULL;
869 	size_t labelsz = 0;
870 	int flags = uap->flags;
871 	int error;
872 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
873 	boolean_t is_64bit = IS_64BIT_PROCESS(p);
874 #else
875 #pragma unused(p)
876 #endif
877 	/*
878 	 * Get the fs type name from user space
879 	 */
880 	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
881 	if (error) {
882 		return error;
883 	}
884 
885 	/*
886 	 * Get the vnode to be covered
887 	 */
888 	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
889 	    UIO_USERSPACE, uap->path, ctx);
890 	if (flags & MNT_NOFOLLOW) {
891 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
892 	}
893 	error = namei(&nd);
894 	if (error) {
895 		goto out;
896 	}
897 	need_nameidone = 1;
898 	vp = nd.ni_vp;
899 	pvp = nd.ni_dvp;
900 
901 #ifdef CONFIG_IMGSRC_ACCESS
902 	/* Mounting image source cannot be batched with other operations */
903 	if (flags == MNT_IMGSRC_BY_INDEX) {
904 		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
905 		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
906 		goto out;
907 	}
908 #endif /* CONFIG_IMGSRC_ACCESS */
909 
910 #if CONFIG_MACF
911 	/*
912 	 * Get the label string (if any) from user space
913 	 */
914 	if (uap->mac_p != USER_ADDR_NULL) {
915 		struct user_mac mac;
916 		size_t ulen = 0;
917 
918 		if (is_64bit) {
919 			struct user64_mac mac64;
920 			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
921 			mac.m_buflen = (user_size_t)mac64.m_buflen;
922 			mac.m_string = (user_addr_t)mac64.m_string;
923 		} else {
924 			struct user32_mac mac32;
925 			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
926 			mac.m_buflen = mac32.m_buflen;
927 			mac.m_string = mac32.m_string;
928 		}
929 		if (error) {
930 			goto out;
931 		}
932 		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
933 		    (mac.m_buflen < 2)) {
934 			error = EINVAL;
935 			goto out;
936 		}
937 		labelsz = mac.m_buflen;
938 		labelstr = kalloc_data(labelsz, Z_WAITOK);
939 		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
940 		if (error) {
941 			goto out;
942 		}
943 		AUDIT_ARG(mac_string, labelstr);
944 	}
945 #endif /* CONFIG_MACF */
946 
947 	AUDIT_ARG(fflags, flags);
948 
949 	if (flags & MNT_UNION) {
950 #if CONFIG_UNION_MOUNTS
951 		mount_t mp = vp->v_mount;
952 		int nested_union_mounts = 0;
953 
954 		name_cache_lock_shared();
955 
956 		/* Walk up the vnodecovered chain and check for nested union mounts. */
957 		mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);
958 		while (mp) {
959 			if (!(mp->mnt_flag & MNT_UNION)) {
960 				break;
961 			}
962 			mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);
963 
964 			/*
965 			 * Limit the max nested unon mounts to prevent stack exhaustion
966 			 * when calling lookup_traverse_union().
967 			 */
968 			if (++nested_union_mounts >= MAX_NESTED_UNION_MOUNTS) {
969 				error = ELOOP;
970 				break;
971 			}
972 		}
973 
974 		name_cache_unlock();
975 		if (error) {
976 			goto out;
977 		}
978 #else
979 		error = EPERM;
980 		goto out;
981 #endif /* CONFIG_UNION_MOUNTS */
982 	}
983 
984 	if ((vp->v_flag & VROOT) &&
985 	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
986 #if CONFIG_UNION_MOUNTS
987 		if (!(flags & MNT_UNION)) {
988 			flags |= MNT_UPDATE;
989 		} else {
990 			/*
991 			 * For a union mount on '/', treat it as fresh
992 			 * mount instead of update.
993 			 * Otherwise, union mouting on '/' used to panic the
994 			 * system before, since mnt_vnodecovered was found to
995 			 * be NULL for '/' which is required for unionlookup
996 			 * after it gets ENOENT on union mount.
997 			 */
998 			flags = (flags & ~(MNT_UPDATE));
999 		}
1000 #else
1001 		flags |= MNT_UPDATE;
1002 #endif /* CONFIG_UNION_MOUNTS */
1003 
1004 #if SECURE_KERNEL
1005 		if ((flags & MNT_RDONLY) == 0) {
1006 			/* Release kernels are not allowed to mount "/" as rw */
1007 			error = EPERM;
1008 			goto out;
1009 		}
1010 #endif
1011 
1012 		/*
1013 		 * See 7392553 for more details on why this check exists.
1014 		 * Suffice to say: If this check is ON and something tries
1015 		 * to mount the rootFS RW, we'll turn off the codesign
1016 		 * bitmap optimization.
1017 		 */
1018 #if CHECK_CS_VALIDATION_BITMAP
1019 		if ((flags & MNT_RDONLY) == 0) {
1020 			root_fs_upgrade_try = TRUE;
1021 		}
1022 #endif
1023 	}
1024 
1025 	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
1026 	    labelstr, ctx);
1027 
1028 out:
1029 
1030 #if CONFIG_MACF
1031 	kfree_data(labelstr, labelsz);
1032 #endif /* CONFIG_MACF */
1033 
1034 	if (vp) {
1035 		vnode_put(vp);
1036 	}
1037 	if (pvp) {
1038 		vnode_put(pvp);
1039 	}
1040 	if (need_nameidone) {
1041 		nameidone(&nd);
1042 	}
1043 
1044 	return error;
1045 }
1046 
/*
 * common mount implementation (final stage of mounting)
 *
 * Arguments:
 *  fstypename	file system type (i.e. its vfs name)
 *  pvp		parent of covered vnode
 *  vp		covered vnode
 *  cnp		component name (i.e. path) of covered vnode
 *  fsmountargs	file system specific data
 *  flags	generic mount flags
 *  internal_flags	KERNEL_MOUNT_* flags; KERNEL_MOUNT_KMOUNT marks
 *			mounts initiated from inside the kernel
 *  labelstr	optional MAC label
 *  ctx		caller's context
 */
static int
mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
    struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
    char *labelstr, vfs_context_t ctx)
{
#if !CONFIG_MACF
#pragma unused(labelstr)
#endif
	struct vnode *devvp = NULLVP;
	struct vnode *device_vnode = NULLVP;
#if CONFIG_MACF
	struct vnode *rvp;
#endif
	struct mount *mp = NULL;
	struct vfstable *vfsp = (struct vfstable *)0;
	struct proc *p = vfs_context_proc(ctx);
	/* 'flag' holds the pre-update mnt_flag value so a failed MNT_UPDATE can restore it */
	int error, flag = 0;
	bool flag_set = false;
	user_addr_t devpath = USER_ADDR_NULL;
	int ronly = 0;
	int mntalloc = 0;
	/*
	 * The boolean_t state flags below record exactly which resources have
	 * been acquired so far; the out1..out4 error exits consult them to
	 * undo only what was actually done.
	 */
	boolean_t vfsp_ref = FALSE;
	boolean_t is_rwlock_locked = FALSE;
	boolean_t did_rele = FALSE;
	boolean_t have_usecount = FALSE;
	boolean_t did_set_lmount = FALSE;
	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
	/* Check for mutually-exclusive flag bits */
	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
	int bitcount = 0;
	/* Kernighan popcount: each pass clears the lowest set bit */
	while (checkflags != 0) {
		checkflags &= (checkflags - 1);
		bitcount++;
	}

	if (bitcount > 1) {
		//not allowed to request multiple mount-by-role flags
		error = EINVAL;
		goto out1;
	}
#endif

	/*
	 * Process an update for an existing mount
	 */
	if (flags & MNT_UPDATE) {
		/* updates are only meaningful on the root vnode of an existing mount */
		if ((vp->v_flag & VROOT) == 0) {
			error = EINVAL;
			goto out1;
		}
		mp = vp->v_mount;

		/* if unmount or mount in progress, return error */
		mount_lock_spin(mp);
		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
			mount_unlock(mp);
			error = EBUSY;
			goto out1;
		}
		/* claim the mount-in-progress gate; cleared at exit/out1 */
		mp->mnt_lflag |= MNT_LMOUNT;
		did_set_lmount = TRUE;
		mount_unlock(mp);
		lck_rw_lock_exclusive(&mp->mnt_rwlock);
		is_rwlock_locked = TRUE;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * If content protection is enabled, update mounts are not
		 * allowed to turn it off.
		 */
		if ((mp->mnt_flag & MNT_CPROTECT) &&
		    ((flags & MNT_CPROTECT) == 0)) {
			error = EINVAL;
			goto out1;
		}

		/*
		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
		 * failure to return an error for this so we'll just silently
		 * add it if it is not passed in.
		 */
		if ((mp->mnt_flag & MNT_REMOVABLE) &&
		    ((flags & MNT_REMOVABLE) == 0)) {
			flags |= MNT_REMOVABLE;
		}

		/* Can't downgrade the backer of the root FS */
		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
			goto out1;
		}
#if CONFIG_MACF
		error = mac_mount_check_remount(ctx, mp, flags);
		if (error != 0) {
			goto out1;
		}
#endif
		/*
		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
		 */
		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
			flags |= MNT_NOSUID | MNT_NODEV;
			if (mp->mnt_flag & MNT_NOEXEC) {
				flags |= MNT_NOEXEC;
			}
		}
		/* snapshot current flags so a failed update can roll back (see out1) */
		flag = mp->mnt_flag;
		flag_set = true;



		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);

		vfsp = mp->mnt_vtable;
		goto update;
	} // MNT_UPDATE

	/*
	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
	 */
	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
		flags |= MNT_NOSUID | MNT_NODEV;
		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
			flags |= MNT_NOEXEC;
		}
	}

	/* XXXAUDIT: Should we capture the type on the error path as well? */
	/* XXX cast-away const (audit_arg_text() does not modify its input) */
	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
	/* look up the vfs by name and pin it with a refcount (dropped at out1 on error) */
	mount_list_lock();
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
			vfsp->vfc_refcount++;
			vfsp_ref = TRUE;
			break;
		}
	}
	mount_list_unlock();
	if (vfsp == NULL) {
		error = ENODEV;
		goto out1;
	}

	/*
	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
	 * except in ROSV configs and for the initial BaseSystem root.
	 */
	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
		error = EINVAL;  /* unsupported request */
		goto out1;
	}

	/* flush the covered vnode and mark it VMOUNT (cleared on both exit paths below) */
	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
	if (error != 0) {
		goto out1;
	}

	/*
	 * Allocate and initialize the filesystem (mount_t)
	 */
	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	mntalloc = 1;

	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;

	/* fresh mount also holds the mount-in-progress gate until exit/out1 */
	mp->mnt_lflag |= MNT_LMOUNT;
	did_set_lmount = TRUE;

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);
	mount_lock_init(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	is_rwlock_locked = TRUE;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vtable = vfsp;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	do {
		size_t pathlen = MAXPATHLEN;

		/* prefer a resolved path for f_mntonname; fall back to the caller's pathname buffer */
		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
		}
	} while (0);
	mp->mnt_vnodecovered = vp;
	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;
	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);

	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);

	if (kernelmount) {
		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
	}
	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
	}

	if (KERNEL_MOUNT_DEVFS & internal_flags) {
		// kernel mounted devfs
		mp->mnt_kern_flag |= MNTK_SYSTEM;
	}

	/* common path for both fresh mounts and MNT_UPDATE remounts */
update:

	/*
	 * Set the mount level flags.
	 */
	if (flags & MNT_RDONLY) {
		mp->mnt_flag |= MNT_RDONLY;
	} else if (mp->mnt_flag & MNT_RDONLY) {
		// disallow read/write upgrades of file systems that
		// had the TYPENAME_OVERRIDE feature set.
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			error = EPERM;
			goto out1;
		}
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	}
	/* clear then re-apply the caller-controllable visibility flags */
	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if SECURE_KERNEL
#if !CONFIG_MNT_SUID
	/*
	 * On release builds of iOS based platforms, always enforce NOSUID on
	 * all mounts. We do this here because we can catch update mounts as well as
	 * non-update mounts in this case.
	 */
	mp->mnt_flag |= (MNT_NOSUID);
#endif
#endif

	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if CONFIG_MACF
	if (flags & MNT_MULTILABEL) {
		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
			error = EINVAL;
			goto out1;
		}
		mp->mnt_flag |= MNT_MULTILABEL;
	}
#endif
	/*
	 * Process device path for local file systems if requested.
	 *
	 * Snapshot and mount-by-role mounts do not use this path; they are
	 * passing other opaque data in the device path field.
	 *
	 * Basesystemroot mounts pass a device path to be resolved here,
	 * but it's just a char * already inside the kernel, which
	 * kernel_mount() shoved into a user_addr_t to call us. So for such
	 * mounts we must skip copyin (both of the address and of the string
	 * (in NDINIT).
	 */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
		boolean_t do_copyin_devpath = true;
#if CONFIG_BASESYSTEMROOT
		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
			// We have been passed fsmountargs, which is typed as a user_addr_t,
			// but is actually a char ** pointing to a (kernelspace) string.
			// We manually unpack it with a series of casts and dereferences
			// that reverses what was done just above us on the stack in
			// imageboot_pivot_image().
			// After retrieving the path to the dev node (which we will NDINIT
			// in a moment), we pass NULL fsmountargs on to the filesystem.
			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
			char **devnamepp = (char **)fsmountargs;
			char *devnamep = *devnamepp;
			devpath = CAST_USER_ADDR_T(devnamep);
			do_copyin_devpath = false;
			fsmountargs = USER_ADDR_NULL;

			//Now that we have a mp, denote that this mount is for the basesystem.
			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
		}
#endif // CONFIG_BASESYSTEMROOT

		if (do_copyin_devpath) {
			if (vfs_context_is64bit(ctx)) {
				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
					goto out1;
				}
				fsmountargs += sizeof(devpath);
			} else {
				user32_addr_t tmp;
				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
					goto out1;
				}
				/* munge into LP64 addr */
				devpath = CAST_USER_ADDR_T(tmp);
				fsmountargs += sizeof(tmp);
			}
		}

		/* Lookup device and authorize access to it */
		if ((devpath)) {
			struct nameidata nd;

			enum uio_seg seg = UIO_USERSPACE;
#if CONFIG_BASESYSTEMROOT
			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
				seg = UIO_SYSSPACE;
			}
#endif // CONFIG_BASESYSTEMROOT

			/* namei takes an iocount on devvp; dropped at exit/out2 */
			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
			if ((error = namei(&nd))) {
				goto out1;
			}

			devvp = nd.ni_vp;

			if (devvp->v_type != VBLK) {
				error = ENOTBLK;
				nameidone(&nd);
				goto out2;
			}
			if (major(devvp->v_rdev) >= nblkdev) {
				error = ENXIO;
				nameidone(&nd);
				goto out2;
			}
			/*
			 * If mount by non-root, then verify that user has necessary
			 * permissions on the device.
			 */
			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
				kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;

				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
					accessmode |= KAUTH_VNODE_WRITE_DATA;
				}
				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
					nameidone(&nd);
					goto out2;
				}
			}

			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
			nameidone(&nd);
		}
		/* On first mount, preflight and open device */
		if (devpath && ((flags & MNT_UPDATE) == 0)) {
			/* usecount on devvp; released at out3/out4 on error */
			if ((error = vnode_ref(devvp))) {
				goto out2;
			}
			/*
			 * Disallow multiple mounts of the same device.
			 * Disallow mounting of a device that is currently in use
			 * (except for root, which might share swap device for miniroot).
			 * Flush out any old buffers remaining from a previous use.
			 */
			/* vfs_setmounting is undone via vfs_clearmounting on every error path */
			if ((error = vfs_setmounting(devvp))) {
				vnode_rele(devvp);
				goto out2;
			}

			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
				error = EBUSY;
				goto out3;
			}
			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
				error = ENOTBLK;
				goto out3;
			}
			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
				goto out3;
			}

			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
#if CONFIG_MACF
			error = mac_vnode_check_open(ctx,
			    devvp,
			    ronly ? FREAD : FREAD | FWRITE);
			if (error) {
				goto out3;
			}
#endif /* MAC */
			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
				goto out3;
			}

			mp->mnt_devvp = devvp;
			device_vnode = devvp;
		} else if ((mp->mnt_flag & MNT_RDONLY) &&
		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
		    (device_vnode = mp->mnt_devvp)) {
			dev_t dev;
			int maj;
			/*
			 * If upgrade to read-write by non-root, then verify
			 * that user has necessary permissions on the device.
			 */
			vnode_getalways(device_vnode);

			if (suser(vfs_context_ucred(ctx), NULL) &&
			    (error = vnode_authorize(device_vnode, NULL,
			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
			    ctx)) != 0) {
				vnode_put(device_vnode);
				goto out2;
			}

			/* Tell the device that we're upgrading */
			dev = (dev_t)device_vnode->v_rdev;
			maj = major(dev);

			if ((u_int)maj >= (u_int)nblkdev) {
				panic("Volume mounted on a device with invalid major number.");
			}

			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
			vnode_put(device_vnode);
			device_vnode = NULLVP;
			if (error != 0) {
				goto out2;
			}
		}
	} // localargs && !(snapshot | data | vm)

#if CONFIG_MACF
	if ((flags & MNT_UPDATE) == 0) {
		mac_mount_label_init(mp);
		mac_mount_label_associate(ctx, mp);
	}
	if (labelstr) {
		if ((flags & MNT_UPDATE) != 0) {
			error = mac_mount_check_label_update(ctx, mp);
			if (error != 0) {
				goto out3;
			}
		}
	}
#endif
	/*
	 * Mount the filesystem.  We already asserted that internal_flags
	 * cannot have more than one mount-by-role bit set.
	 */
	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
		    (caddr_t)fsmountargs, 0, ctx);
	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
#if CONFIG_ROSV_STARTUP
		/* for role mounts, fsmountargs is (re)interpreted as the origin mount_t */
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
		} else {
			/* Mark volume associated with system volume */
			mp->mnt_kern_flag |= MNTK_SYSTEM;

			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);
					// now set `device_vnode` to the devvp that was acquired.
					// this is needed in order to ensure vfs_init_io_attributes is invoked.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
#if CONFIG_MOUNT_VM
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
		} else {
			/* Mark volume associated with system volume and a swap mount */
			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);

					// now set `device_vnode` to the devvp that was acquired.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
#if CONFIG_MOUNT_PREBOOTRECOVERY
		struct mount *origin_mp = (struct mount*)fsmountargs;
		uint32_t mount_role = 0;
		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
			mount_role = VFS_PREBOOT_ROLE;
		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
			mount_role = VFS_RECOVERY_ROLE;
		}

		if (mount_role != 0) {
			fs_role_mount_args_t frma = {origin_mp, mount_role};
			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
			if (error) {
				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
			} else {
				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
				/* Mark volume associated with system volume */
				//mp->mnt_kern_flag |= MNTK_SYSTEM;
				/* Attempt to acquire the mnt_devvp and set it up */
				struct vnode *mp_devvp = NULL;
				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
					    0, &mp_devvp, vfs_context_kernel());
					if (!lerr) {
						mp->mnt_devvp = mp_devvp;
						//vnode_lookup took an iocount, need to drop it.
						vnode_put(mp_devvp);

						// now set `device_vnode` to the devvp that was acquired.
						// note that though the iocount above was dropped, the mount acquires
						// an implicit reference against the device.
						device_vnode = mp_devvp;
					}
				}
			}
		} else {
			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
			error = EINVAL;
		}
#else
		error = EINVAL;
#endif
	} else {
		/* ordinary mount: hand the (possibly advanced) fsmountargs to the filesystem */
		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
	}

	if (flags & MNT_UPDATE) {
		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
			mp->mnt_flag &= ~MNT_RDONLY;
		}
		mp->mnt_flag &= ~
		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
		if (error) {
			mp->mnt_flag = flag;  /* restore flag value */
		}
		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;
		if (!error) {
			enablequotas(mp, ctx);
		}
		goto exit;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	if (error == 0) {
		struct vfs_attr vfsattr;
		if (device_vnode) {
			/*
			 *   cache the IO attributes for the underlying physical media...
			 *   an error return indicates the underlying driver doesn't
			 *   support all the queries necessary... however, reasonable
			 *   defaults will have been set, so no reason to bail or care
			 *
			 *   Need to do this before calling the MAC hook as it needs
			 *   information from this call.
			 */
			vfs_init_io_attributes(device_vnode, mp);
		}

#if CONFIG_MACF
		error = mac_mount_check_mount_late(ctx, mp);
		if (error != 0) {
			goto out4;
		}

		if (vfs_flags(mp) & MNT_MULTILABEL) {
			error = VFS_ROOT(mp, &rvp, ctx);
			if (error) {
				printf("%s() VFS_ROOT returned %d\n", __func__, error);
				goto out4;
			}
			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
			/*
			 * drop reference provided by VFS_ROOT
			 */
			vnode_put(rvp);

			if (error) {
				goto out4;
			}
		}
#endif  /* MAC */

		/* hook the new mount onto the covered vnode; visible to lookups from here on */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vp->v_mountedhere = mp;
		SET(vp->v_flag, VMOUNTEDHERE);
		vnode_unlock(vp);

		/*
		 * taking the name_cache_lock exclusively will
		 * ensure that everyone is out of the fast path who
		 * might be trying to use a now stale copy of
		 * vp->v_mountedhere->mnt_realrootvp
		 * bumping mount_generation causes the cached values
		 * to be invalidated
		 */
		name_cache_lock();
		mount_generation++;
		name_cache_unlock();

		/* usecount on the covered vnode persists for the lifetime of the mount */
		error = vnode_ref(vp);
		if (error != 0) {
			goto out4;
		}

		have_usecount = TRUE;

		error = checkdirs(vp, ctx);
		if (error != 0) {
			/* Unmount the filesystem as cdir/rdirs cannot be updated */
			goto out4;
		}
		/*
		 * there is no cleanup code here so I have made it void
		 * we need to revisit this
		 */
		(void)VFS_START(mp, 0, ctx);

		if (mount_list_add(mp) != 0) {
			/*
			 * The system is shutting down trying to umount
			 * everything, so fail with a plausible errno.
			 */
			error = EBUSY;
			goto out4;
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/* Check if this mounted file system supports EAs or named streams. */
		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
#if NAMEDSTREAMS
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
			}
#endif
			/* Check if this file system supports path from id lookups. */
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			} else if (mp->mnt_flag & MNT_DOVOLFS) {
				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			}

			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
			}
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
		}
		/* increment the operations count */
		OSAddAtomic(1, &vfs_nummntops);
		enablequotas(mp, ctx);

		if (device_vnode) {
			vfs_setmountedon(device_vnode);
		}

		/* Now that mount is setup, notify the listeners */
		vfs_notify_mount(pvp);
		IOBSDMountChange(mp, kIOMountChangeMount);
	} else {
		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
		if (mp->mnt_vnodelist.tqh_first != NULL) {
			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
			    mp->mnt_vtable->vfc_name, error);
		}

		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
		mount_list_lock();
		mp->mnt_vtable->vfc_refcount--;
		mount_list_unlock();

		if (device_vnode) {
			vnode_rele(device_vnode);
			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
			vfs_clearmounting(device_vnode);
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		/*
		 * if we get here, we have a mount structure that needs to be freed,
		 * but since the coveredvp hasn't yet been updated to point at it,
		 * no need to worry about other threads holding a crossref on this mp
		 * so it's ok to just free it
		 */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		did_set_lmount = false;
	}
exit:
	/*
	 * drop I/O count on the device vp if there was one
	 */
	if (devpath && devvp) {
		vnode_put(devvp);
	}

	/* release the mount-in-progress gate taken earlier */
	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	return error;

/* Error condition exits */
/* out4: mount reached/passed the covered-vp hookup; force-unmount and fully unwind */
out4:
	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);

	/*
	 * If the mount has been placed on the covered vp,
	 * it may have been discovered by now, so we have
	 * to treat this just like an unmount
	 */
	mount_lock_spin(mp);
	mp->mnt_lflag |= MNT_LDEAD;
	mount_unlock(mp);

	if (device_vnode != NULLVP) {
		vnode_rele(device_vnode);
		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vfs_clearmounting(device_vnode);
		/* remember the rele so out3 doesn't drop the device usecount twice */
		did_rele = TRUE;
	}

	vnode_lock_spin(vp);

	mp->mnt_crossref++;
	CLR(vp->v_flag, VMOUNTEDHERE);
	vp->v_mountedhere = (mount_t) 0;

	vnode_unlock(vp);

	if (have_usecount) {
		vnode_rele(vp);
	}
/* out3: undo vnode_ref/vfs_setmounting taken on first-mount device preflight */
out3:
	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
		vnode_rele(devvp);
		vfs_clearmounting(devvp);
	}
/* out2: drop the iocount namei took on the device vnode */
out2:
	if (devpath && devvp) {
		vnode_put(devvp);
	}
/* out1: restore flags, release locks/refs, and free the mount if we allocated it */
out1:
	/* Release mnt_rwlock only when it was taken */
	if (is_rwlock_locked == TRUE) {
		if (flag_set) {
			mp->mnt_flag = flag;  /* restore mnt_flag value */
		}
		lck_rw_done(&mp->mnt_rwlock);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	if (mntalloc) {
		if (mp->mnt_crossref) {
			mount_dropcrossref(mp, vp, 0);
		} else {
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		}
	}
	if (vfsp_ref) {
		mount_list_lock();
		vfsp->vfc_refcount--;
		mount_list_unlock();
	}

	return error;
}
1951 
1952 /*
1953  * Flush in-core data, check for competing mount attempts,
1954  * and set VMOUNT
1955  */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	/* KERNEL_MOUNT_NOAUTH: caller has already been authorized elsewhere */
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	/* fmount(2) path: stricter busy check below */
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_busy;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		/*
		 * Note: any vnode_getattr() failure is deliberately collapsed
		 * into EPERM here, not propagated as-is.
		 */
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Flush dirty data on the covered vnode before it is shadowed */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Can only mount on top of a directory */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	vnode_lock_spin(vp);
	/*
	 * Competing-mount check:
	 *  - fmount: busy if a mount is in progress (VMOUNT) OR something is
	 *    already mounted here.
	 *  - regular mount: busy only if both are true (allows replacing a
	 *    half-placed mount state).
	 */
	is_busy = is_fmount ?
	    (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
	    (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
	if (is_busy) {
		vnode_unlock(vp);
		error = EBUSY;
		goto out;
	}
	/* Claim the vnode: mark mount-in-progress */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	/* MAC veto: undo the VMOUNT claim if policy denies the mount */
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
2021 
2022 #if CONFIG_IMGSRC_ACCESS
2023 
2024 #define DEBUG_IMGSRC 0
2025 
2026 #if DEBUG_IMGSRC
2027 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
2028 #else
2029 #define IMGSRC_DEBUG(args...) do { } while(0)
2030 #endif
2031 
/*
 * Look up the user-supplied device path, verify it names the same block
 * device that backs mp, authorize non-root access, and update
 * f_mntfromname to the supplied path.  On success *devvpp holds the
 * device vnode with an iocount the caller must vnode_put().
 */
static int
authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
{
	struct nameidata nd;
	vnode_t vp, realdevvp;
	kauth_action_t accessmode;
	int error;
	enum uio_seg uio = UIO_USERSPACE;

	/* Kernel-originated requests pass a kernel-space path */
	if (ctx == vfs_context_kernel()) {
		uio = UIO_SYSSPACE;
	}

	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
	if ((error = namei(&nd))) {
		IMGSRC_DEBUG("namei() failed with %d\n", error);
		return error;
	}

	vp = nd.ni_vp;

	if (!vnode_isblk(vp)) {
		IMGSRC_DEBUG("Not block device.\n");
		error = ENOTBLK;
		goto out;
	}

	realdevvp = mp->mnt_devvp;
	if (realdevvp == NULLVP) {
		IMGSRC_DEBUG("No device backs the mount.\n");
		error = ENXIO;
		goto out;
	}

	error = vnode_getwithref(realdevvp);
	if (error != 0) {
		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
		goto out;
	}

	/* The supplied path must resolve to the mount's actual device */
	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
		IMGSRC_DEBUG("Wrong dev_t.\n");
		error = ENXIO;
		goto out1;
	}

	/* Record the caller-visible device path in the mount stats */
	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	if (!vfs_context_issuser(ctx)) {
		accessmode = KAUTH_VNODE_READ_DATA;
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			accessmode |= KAUTH_VNODE_WRITE_DATA;
		}
		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
			IMGSRC_DEBUG("Access denied.\n");
			goto out1;
		}
	}

	/* Success: transfer the iocount on vp to the caller */
	*devvpp = vp;

out1:
	vnode_put(realdevvp);

out:
	nameidone(&nd);

	/* On failure, drop the iocount namei() gave us on vp */
	if (error) {
		vnode_put(vp);
	}

	return error;
}
2109 
2110 /*
2111  * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2112  * and call checkdirs()
2113  */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Swap the in-progress marker (VMOUNT) for the mounted state */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	SET(vp->v_flag, VMOUNTEDHERE);
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Covered vnode holds a usecount for the lifetime of the mount */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	/* On any failure, undo the coveredvp linkage set at entry */
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2160 
/*
 * Reverse place_mount_and_checkdirs(): drop the usecount taken on the
 * covered vnode, clear its mounted-here state, and detach it from mp.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2172 
2173 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2174 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2175 {
2176 	int error;
2177 
2178 	/* unmount in progress return error */
2179 	mount_lock_spin(mp);
2180 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2181 		mount_unlock(mp);
2182 		return EBUSY;
2183 	}
2184 	mount_unlock(mp);
2185 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2186 
2187 	/*
2188 	 * We only allow the filesystem to be reloaded if it
2189 	 * is currently mounted read-only.
2190 	 */
2191 	if ((flags & MNT_RELOAD) &&
2192 	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2193 		error = ENOTSUP;
2194 		goto out;
2195 	}
2196 
2197 	/*
2198 	 * Only root, or the user that did the original mount is
2199 	 * permitted to update it.
2200 	 */
2201 	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2202 	    (!vfs_context_issuser(ctx))) {
2203 		error = EPERM;
2204 		goto out;
2205 	}
2206 #if CONFIG_MACF
2207 	error = mac_mount_check_remount(ctx, mp, flags);
2208 	if (error != 0) {
2209 		goto out;
2210 	}
2211 #endif
2212 
2213 out:
2214 	if (error) {
2215 		lck_rw_done(&mp->mnt_rwlock);
2216 	}
2217 
2218 	return error;
2219 }
2220 
/* Release the exclusive mount rwlock taken by mount_begin_update(). */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2226 
2227 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2228 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2229 {
2230 	vnode_t vp;
2231 
2232 	if (height >= MAX_IMAGEBOOT_NESTING) {
2233 		return EINVAL;
2234 	}
2235 
2236 	vp = imgsrc_rootvnodes[height];
2237 	if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2238 		*rvpp = vp;
2239 		return 0;
2240 	} else {
2241 		return ENOENT;
2242 	}
2243 }
2244 
/*
 * Relocate the imageboot source filesystem: graft the already-mounted
 * image source mount (found via imgsrc_rootvnodes[height]) onto a new
 * covered vnode vp, updating mnt_vnodecovered, f_mntonname and the
 * mount list.  Root only, and each mount can be moved at most once
 * (MNTK_HAS_MOVED).  fsmountargs is interpreted per is64bit/by_index.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		/* Modern interface: struct mnt_imgsrc_args carries height/flags/devpath */
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flags are currently defined for this operation */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* Takes an iocount on rvp; dropped at out0/success */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	/* Scratch buffer to restore f_mntonname if the move fails late */
	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once.  Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the validation; drop the iocount right away */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Keep the old mount-on name so a late failure can restore it */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	/* Restore the previous mount-on name and allow a retry of the move */
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2465 
2466 #endif /* CONFIG_IMGSRC_ACCESS */
2467 
/*
 * Turn on disk quotas for a freshly mounted HFS filesystem, for each
 * quota type whose ".opsname" trigger file exists at the mount root.
 * Errors are ignored: quota setup must not fail the mount itself.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Probe for the per-type trigger file, e.g. ".quota.ops.user" */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue;           /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Trigger exists; enable quotas using the data file, e.g. ".quota.user" */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2501 
2502 
/*
 * Per-process callback for checkdirs(): if this process's current or
 * root directory is the just-covered vnode (olddp), repoint it at the
 * new mount's root (newdp), moving the usecounts accordingly.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* new_* start as "unused spare ref"; NULLed out when consumed */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2582 
2583 
2584 
2585 /*
2586  * Scan all active processes to see if any of them have a current
2587  * or root directory onto which the new filesystem has just been
2588  * mounted. If so, replace them with the new mount point.
2589  */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Only our mount's ref on olddp exists: nobody's cwd/root points here */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Resolve the root of the filesystem now mounted over olddp */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* The system root itself got covered: swap it under the rw lock */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	/* Drop the iocount from VFS_ROOT() */
	vnode_put(newdp);
	return 0;
}
2627 
2628 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT        \
2629 	"com.apple.private.vfs.role-account-unmount"
2630 
2631 /*
2632  * Unmount a file system.
2633  *
2634  * Note: unmount takes a path to the vnode mounted on as argument,
2635  * not special file (as before).
2636  */
2637 /* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/* Hold the mount past dropping vp's iocount; see comment below */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2684 
2685 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2686 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2687 {
2688 	mount_t mp;
2689 
2690 	mp = mount_list_lookupby_fsid(fsid, 0, 1);
2691 	if (mp == (mount_t)0) {
2692 		return ENOENT;
2693 	}
2694 	mount_ref(mp, 0);
2695 	mount_iterdrop(mp);
2696 	/* safedounmount consumes the mount ref */
2697 	return safedounmount(mp, flags, ctx);
2698 }
2699 
2700 /*
2701  * The mount struct comes with a mount ref which will be consumed.
2702  * Do the actual file system unmount, prevent some common foot shooting.
2703  */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_lflag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* Success path: dounmount() takes over (and consumes) the mount ref */
	return dounmount(mp, flags, 1, ctx);

out:
	/* Failure path: we still own the caller's mount ref, so drop it here */
	mount_drop(mp, 0);
	return error;
}
2767 
2768 /*
2769  * Do the actual file system unmount.
2770  */
/*
 * Core unmount: flush vnodes, call the filesystem's VFS_UNMOUNT, detach
 * the mount from its covered vnode and the mount list, and free the
 * mount structure.  'withref' indicates the caller passed in a mount
 * ref to be consumed.  Returns 0 on success or an errno; on failure the
 * mount is restored to a usable state (unless MNT_FORCE).
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx);  /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Keep this process from hanging on unresponsive remote filesystems */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			/* Flush dirty data; a sync failure aborts a non-forced unmount */
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	/* Reclaim all vnodes on this mount (forced: close them out from under users) */
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* Busy vnodes: back out the unmount-in-progress state */
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		/* Filesystem refused: re-open for iteration and back out */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	/* Drop the rwlock around list removal to respect lock ordering */
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		/* Crossref keeps mp alive until the last crossref holder drops it */
		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);     /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the  vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			/* Notify watchers of the parent directory of the change */
			if (pvp) {
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* Root mount has no covered vnode; free the mount here */
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
3058 
3059 /*
3060  * Unmount any mounts in this filesystem.
3061  */
3062 void
dounmount_submounts(struct mount * mp,int flags,vfs_context_t ctx)3063 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
3064 {
3065 	mount_t smp;
3066 	fsid_t *fsids, fsid;
3067 	int fsids_sz;
3068 	int count = 0, i, m = 0;
3069 	vnode_t vp;
3070 
3071 	mount_list_lock();
3072 
3073 	// Get an array to hold the submounts fsids.
3074 	TAILQ_FOREACH(smp, &mountlist, mnt_list)
3075 	count++;
3076 	fsids_sz = count * sizeof(fsid_t);
3077 	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
3078 	if (fsids == NULL) {
3079 		mount_list_unlock();
3080 		goto out;
3081 	}
3082 	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump
3083 
3084 	/*
3085 	 * Fill the array with submount fsids.
3086 	 * Since mounts are always added to the tail of the mount list, the
3087 	 * list is always in mount order.
3088 	 * For each mount check if the mounted-on vnode belongs to a
3089 	 * mount that's already added to our array of mounts to be unmounted.
3090 	 */
3091 	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
3092 		vp = smp->mnt_vnodecovered;
3093 		if (vp == NULL) {
3094 			continue;
3095 		}
3096 		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
3097 		for (i = 0; i <= m; i++) {
3098 			if (fsids[i].val[0] == fsid.val[0] &&
3099 			    fsids[i].val[1] == fsid.val[1]) {
3100 				fsids[++m] = smp->mnt_vfsstat.f_fsid;
3101 				break;
3102 			}
3103 		}
3104 	}
3105 	mount_list_unlock();
3106 
3107 	// Unmount the submounts in reverse order. Ignore errors.
3108 	for (i = m; i > 0; i--) {
3109 		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
3110 		if (smp) {
3111 			mount_ref(smp, 0);
3112 			mount_iterdrop(smp);
3113 			(void) dounmount(smp, flags, 1, ctx);
3114 		}
3115 	}
3116 out:
3117 	kfree_data(fsids, fsids_sz);
3118 }
3119 
/*
 * Drop one reference on mp's crossref count, taken while a covered vnode
 * transition was in progress.  When the count hits zero and mp is no longer
 * the mount covering dp, the mount structure itself is destroyed here.
 * If need_put is set, the caller's iocount on dp is released as well.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		/* Last crossref and mp is detached from dp: free the mount. */
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		if (nc_smr_enabled) {
			/* Wait for SMR readers before tearing the mount down. */
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3153 
3154 
3155 /*
3156  * Sync each mounted filesystem.
3157  */
3158 #if DIAGNOSTIC
3159 int syncprt = 0;
3160 #endif
3161 
3162 int print_vmpage_stat = 0;
3163 
3164 /*
3165  * sync_callback:	simple wrapper that calls VFS_SYNC() on volumes
3166  *			mounted read-write with the passed waitfor value.
3167  *
3168  * Parameters:	mp	mount-point descriptor per mounted file-system instance.
3169  *		arg	user argument (please see below)
3170  *
3171  * User argument is a pointer to 32 bit unsigned integer which describes the
3172  * type of waitfor value to set for calling VFS_SYNC().  If user argument is
3173  * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3174  * waitfor value.
3175  *
3176  * Returns:		VFS_RETURNED
3177  */
3178 static int
sync_callback(mount_t mp,void * arg)3179 sync_callback(mount_t mp, void *arg)
3180 {
3181 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3182 		int asyncflag = mp->mnt_flag & MNT_ASYNC;
3183 		unsigned waitfor = MNT_NOWAIT;
3184 
3185 		if (arg) {
3186 			waitfor = *(uint32_t*)arg;
3187 		}
3188 
3189 		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
3190 		if (waitfor != MNT_WAIT &&
3191 		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
3192 		    waitfor != MNT_NOWAIT &&
3193 		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3194 		    waitfor != MNT_DWAIT &&
3195 		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3196 			panic("Passed inappropriate waitfor %u to "
3197 			    "sync_callback()", waitfor);
3198 		}
3199 
3200 		mp->mnt_flag &= ~MNT_ASYNC;
3201 		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3202 		if (asyncflag) {
3203 			mp->mnt_flag |= MNT_ASYNC;
3204 		}
3205 	}
3206 
3207 	return VFS_RETURNED;
3208 }
3209 
3210 /* ARGSUSED */
3211 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)3212 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
3213 {
3214 	vfs_iterate(LK_NOWAIT, sync_callback, NULL);
3215 
3216 	if (print_vmpage_stat) {
3217 		vm_countdirtypages();
3218 	}
3219 
3220 #if DIAGNOSTIC
3221 	if (syncprt) {
3222 		vfs_bufstats();
3223 	}
3224 #endif /* DIAGNOSTIC */
3225 	return 0;
3226 }
3227 
/*
 * Media-selection argument for sync_internal_callback(): a "reliable" mount
 * is one that is local and not on a virtual device (see the callback).
 */
typedef enum {
	SYNC_ALL = 0,                   /* sync every read-write mount */
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* local, non-virtual devices only */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* everything else */
} sync_type_t;
3233 
3234 static int
sync_internal_callback(mount_t mp,void * arg)3235 sync_internal_callback(mount_t mp, void *arg)
3236 {
3237 	if (arg) {
3238 		int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3239 		    (mp->mnt_flag & MNT_LOCAL);
3240 		sync_type_t sync_type = *((sync_type_t *)arg);
3241 
3242 		if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3243 			return VFS_RETURNED;
3244 		} else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3245 			return VFS_RETURNED;
3246 		}
3247 	}
3248 
3249 	(void)sync_callback(mp, NULL);
3250 
3251 	return VFS_RETURNED;
3252 }
3253 
/* State shared between sync_internal() and sync_thread(); protected by sync_mtx_lck. */
int sync_thread_state = 0;
int sync_timeout_seconds = 5;   /* how long sync_internal() waits before giving up */

#define SYNC_THREAD_RUN       0x0001    /* a sync pass has been requested */
#define SYNC_THREAD_RUNNING   0x0002    /* a sync_thread instance exists */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;        /* identity of the running sync thread, for write accounting */
#endif /* CONFIG_PHYS_WRITE_ACCT */
3263 
/*
 * Worker thread started by sync_internal(): repeatedly performs a full sync
 * pass (reliable media first, then unreliable) as long as new requests
 * (SYNC_THREAD_RUN) keep arriving, then wakes any waiters and exits.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		/* Sync reliable media first, then everything else. */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3307 
/* Last time sync_internal() logged a timeout; rate-limits the message to one per ~2 minutes. */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3309 
3310 /*
3311  * An in-kernel sync for power management to call.
3312  * This function always returns within sync_timeout seconds.
3313  */
3314 __private_extern__ int
sync_internal(void)3315 sync_internal(void)
3316 {
3317 	thread_t thd = NULL;
3318 	int error;
3319 	int thread_created = FALSE;
3320 	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};
3321 
3322 	lck_mtx_lock(&sync_mtx_lck);
3323 	sync_thread_state |= SYNC_THREAD_RUN;
3324 	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
3325 		int kr;
3326 
3327 		sync_thread_state |= SYNC_THREAD_RUNNING;
3328 		kr = kernel_thread_start(sync_thread, NULL, &thd);
3329 		if (kr != KERN_SUCCESS) {
3330 			sync_thread_state &= ~SYNC_THREAD_RUNNING;
3331 			lck_mtx_unlock(&sync_mtx_lck);
3332 			printf("sync_thread failed\n");
3333 			return 0;
3334 		}
3335 		thread_created = TRUE;
3336 	}
3337 
3338 	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
3339 	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
3340 	if (error) {
3341 		struct timeval now;
3342 
3343 		microtime(&now);
3344 		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
3345 			printf("sync timed out: %d sec\n", sync_timeout_seconds);
3346 			sync_timeout_last_print.tv_sec = now.tv_sec;
3347 		}
3348 	}
3349 
3350 	if (thread_created) {
3351 		thread_deallocate(thd);
3352 	}
3353 
3354 	return 0;
3355 } /* end of sync_internal call */
3356 
3357 /*
3358  * Change filesystem quotas.
3359  */
3360 #if QUOTA
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* Hold the mount (not the vnode) across the quota operation. */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit callers pass a user_dqblk; munge to the kernel layout. */
			struct user_dqblk       my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	/* Only invoke the filesystem if the copyin above succeeded. */
	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Second switch: release Q_QUOTAON's buffer / copy results back out. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk       my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3467 #else
/* Quota support compiled out: the syscall exists but always fails. */
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	return EOPNOTSUPP;
}
3473 #endif /* QUOTA */
3474 
3475 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3476 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3477 {
3478 	int error;
3479 	vfs_context_t ctx = vfs_context_current();
3480 
3481 #if CONFIG_MACF
3482 	error = mac_mount_check_stat(ctx, mp);
3483 	if (error != 0) {
3484 		return error;
3485 	}
3486 #endif
3487 
3488 	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3489 	if (error != 0) {
3490 		return error;
3491 	}
3492 
3493 	return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3494 }
3495 
3496 /*
3497  * Get filesystem statistics.
3498  *
3499  * Returns:	0			Success
3500  *	namei:???
3501  *	vfs_update_vfsstat:???
3502  *	munge_statfs:EFAULT
3503  */
3504 /* ARGSUSED */
3505 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3506 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3507 {
3508 	int error;
3509 	struct mount *mp;
3510 	struct nameidata nd;
3511 	vfs_context_t ctx = vfs_context_current();
3512 	vnode_t vp;
3513 
3514 	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3515 	    UIO_USERSPACE, uap->path, ctx);
3516 	error = namei(&nd);
3517 	if (error != 0) {
3518 		return error;
3519 	}
3520 	vp = nd.ni_vp;
3521 	mp = vp->v_mount;
3522 	nameidone(&nd);
3523 
3524 	error = statfs_internal(p, mp, uap->buf);
3525 	vnode_put(vp);
3526 
3527 	return error;
3528 }
3529 
3530 /*
3531  * Get filesystem statistics.
3532  */
3533 /* ARGSUSED */
3534 int
fstatfs(proc_t p,struct fstatfs_args * uap,__unused int32_t * retval)3535 fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3536 {
3537 	int error;
3538 	vnode_t vp = NULL;
3539 	struct mount *mp;
3540 
3541 	AUDIT_ARG(fd, uap->fd);
3542 
3543 	if ((error = file_vnode(uap->fd, &vp)) ||
3544 	    (error = vnode_getwithref(vp))) {
3545 		goto out;
3546 	}
3547 
3548 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3549 
3550 	mp = vp->v_mount;
3551 	if (!mp) {
3552 		error = EBADF;
3553 		goto out_vnode;
3554 	}
3555 
3556 	error = statfs_internal(p, mp, uap->buf);
3557 
3558 out_vnode:
3559 	vnode_put(vp);
3560 
3561 out:
3562 	if (vp != NULL) {
3563 		file_drop(uap->fd);
3564 	}
3565 
3566 	return error;
3567 }
3568 
3569 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3570 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3571 {
3572 	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3573 
3574 	bzero(sfs, sizeof(*sfs));
3575 
3576 	sfs->f_bsize = vsfs->f_bsize;
3577 	sfs->f_iosize = (int32_t)vsfs->f_iosize;
3578 	sfs->f_blocks = vsfs->f_blocks;
3579 	sfs->f_bfree = vsfs->f_bfree;
3580 	sfs->f_bavail = vsfs->f_bavail;
3581 	sfs->f_files = vsfs->f_files;
3582 	sfs->f_ffree = vsfs->f_ffree;
3583 	sfs->f_fsid = vsfs->f_fsid;
3584 	sfs->f_owner = vsfs->f_owner;
3585 	sfs->f_type = mp->mnt_vtable->vfc_typenum;
3586 	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3587 	sfs->f_fssubtype = vsfs->f_fssubtype;
3588 	sfs->f_flags_ext = 0;
3589 	if (mp->mnt_kern_flag & MNTK_SYSTEMDATA) {
3590 		sfs->f_flags_ext |= MNT_EXT_ROOT_DATA_VOL;
3591 	}
3592 	if (mp->mnt_kern_flag & MNTK_FSKIT) {
3593 		sfs->f_flags_ext |= MNT_EXT_FSKIT;
3594 	}
3595 	vfs_getfstypename(mp, sfs->f_fstypename, MFSTYPENAMELEN);
3596 	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3597 	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3598 }
3599 
3600 /*
3601  * Get file system statistics in 64-bit mode
3602  */
3603 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3604 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3605 {
3606 	struct mount *mp;
3607 	int error;
3608 	struct nameidata *ndp;
3609 	struct statfs64 *sfsp;
3610 	vfs_context_t ctxp = vfs_context_current();
3611 	vnode_t vp;
3612 	struct {
3613 		struct nameidata nd;
3614 		struct statfs64 sfs;
3615 	} *__nameidata_statfs64;
3616 
3617 	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3618 	    Z_WAITOK);
3619 	ndp = &__nameidata_statfs64->nd;
3620 
3621 	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3622 	    UIO_USERSPACE, uap->path, ctxp);
3623 	error = namei(ndp);
3624 	if (error != 0) {
3625 		goto out;
3626 	}
3627 	vp = ndp->ni_vp;
3628 	mp = vp->v_mount;
3629 	nameidone(ndp);
3630 
3631 #if CONFIG_MACF
3632 	error = mac_mount_check_stat(ctxp, mp);
3633 	if (error != 0) {
3634 		vnode_put(vp);
3635 		goto out;
3636 	}
3637 #endif
3638 
3639 	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3640 	if (error != 0) {
3641 		vnode_put(vp);
3642 		goto out;
3643 	}
3644 
3645 	sfsp = &__nameidata_statfs64->sfs;
3646 	vfs_get_statfs64(mp, sfsp);
3647 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3648 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3649 		/* This process does not want to see a seperate data volume mountpoint */
3650 		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3651 	}
3652 	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3653 	vnode_put(vp);
3654 
3655 out:
3656 	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3657 
3658 	return error;
3659 }
3660 
3661 /*
3662  * Get file system statistics in 64-bit mode
3663  */
3664 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3665 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3666 {
3667 	struct vnode *vp;
3668 	struct mount *mp;
3669 	struct statfs64 sfs;
3670 	int error;
3671 
3672 	AUDIT_ARG(fd, uap->fd);
3673 
3674 	if ((error = file_vnode(uap->fd, &vp))) {
3675 		return error;
3676 	}
3677 
3678 	error = vnode_getwithref(vp);
3679 	if (error) {
3680 		file_drop(uap->fd);
3681 		return error;
3682 	}
3683 
3684 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3685 
3686 	mp = vp->v_mount;
3687 	if (!mp) {
3688 		error = EBADF;
3689 		goto out;
3690 	}
3691 
3692 #if CONFIG_MACF
3693 	error = mac_mount_check_stat(vfs_context_current(), mp);
3694 	if (error != 0) {
3695 		goto out;
3696 	}
3697 #endif
3698 
3699 	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3700 		goto out;
3701 	}
3702 
3703 	vfs_get_statfs64(mp, &sfs);
3704 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3705 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3706 		/* This process does not want to see a seperate data volume mountpoint */
3707 		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3708 	}
3709 	error = copyout(&sfs, uap->buf, sizeof(sfs));
3710 
3711 out:
3712 	file_drop(uap->fd);
3713 	vnode_put(vp);
3714 
3715 	return error;
3716 }
3717 
/* Shared iteration state for the getfsstat-family vfs_iterate() callbacks. */
struct getfsstat_struct {
	user_addr_t     sfsp;           /* user buffer cursor; advanced per entry */
	user_addr_t     *mp;            /* optional per-mount MAC label buffers (may be NULL) */
	int             count;          /* mounts seen so far (may exceed maxcount) */
	int             maxcount;       /* entries that fit in the user buffer */
	int             flags;          /* caller's MNT_WAIT/MNT_NOWAIT/MNT_DWAIT flags */
	int             error;          /* first error encountered, 0 if none */
};
3726 
3727 
/*
 * vfs_iterate() callback for __mac_getfsstat(): emit one (munged 32/64-bit)
 * statfs entry per mount while buffer space remains; always count the mount
 * so the caller can report the total.  Copyout/MAC failures stop iteration.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* Dead mount or refresh failure: skip this entry, keep iterating. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* my_size is the per-entry size munge_statfs actually wrote. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	fstp->count++;
	return VFS_RETURNED;
}
3781 
3782 /*
3783  * Get statistics on all filesystems.
3784  */
3785 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3786 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3787 {
3788 	struct __mac_getfsstat_args muap;
3789 
3790 	muap.buf = uap->buf;
3791 	muap.bufsize = uap->bufsize;
3792 	muap.mac = USER_ADDR_NULL;
3793 	muap.macsize = 0;
3794 	muap.flags = uap->flags;
3795 
3796 	return __mac_getfsstat(p, &muap, retval);
3797 }
3798 
3799 /*
3800  * __mac_getfsstat: Get MAC-related file system statistics
3801  *
3802  * Parameters:    p                        (ignored)
3803  *                uap                      User argument descriptor (see below)
3804  *                retval                   Count of file system statistics (N stats)
3805  *
3806  * Indirect:      uap->bufsize             Buffer size
3807  *                uap->macsize             MAC info size
3808  *                uap->buf                 Buffer where information will be returned
3809  *                uap->mac                 MAC info
3810  *                uap->flags               File system flags
3811  *
3812  *
3813  * Returns:        0                       Success
3814  *                !0                       Not success
3815  *
3816  */
3817 int
__mac_getfsstat(__unused proc_t p,struct __mac_getfsstat_args * uap,int * retval)3818 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
3819 {
3820 	user_addr_t sfsp;
3821 	user_addr_t *mp;
3822 	size_t count, maxcount, bufsize, macsize;
3823 	struct getfsstat_struct fst;
3824 
3825 	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
3826 		return EINVAL;
3827 	}
3828 
3829 	bufsize = (size_t) uap->bufsize;
3830 	macsize = (size_t) uap->macsize;
3831 
3832 	if (IS_64BIT_PROCESS(p)) {
3833 		maxcount = bufsize / sizeof(struct user64_statfs);
3834 	} else {
3835 		maxcount = bufsize / sizeof(struct user32_statfs);
3836 	}
3837 	sfsp = uap->buf;
3838 	count = 0;
3839 
3840 	mp = NULL;
3841 
3842 #if CONFIG_MACF
3843 	if (uap->mac != USER_ADDR_NULL) {
3844 		u_int32_t *mp0;
3845 		int error;
3846 		unsigned int i;
3847 
3848 		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
3849 		if (count != maxcount) {
3850 			return EINVAL;
3851 		}
3852 
3853 		/* Copy in the array */
3854 		mp0 = kalloc_data(macsize, Z_WAITOK);
3855 		if (mp0 == NULL) {
3856 			return ENOMEM;
3857 		}
3858 
3859 		error = copyin(uap->mac, mp0, macsize);
3860 		if (error) {
3861 			kfree_data(mp0, macsize);
3862 			return error;
3863 		}
3864 
3865 		/* Normalize to an array of user_addr_t */
3866 		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
3867 		if (mp == NULL) {
3868 			kfree_data(mp0, macsize);
3869 			return ENOMEM;
3870 		}
3871 
3872 		for (i = 0; i < count; i++) {
3873 			if (IS_64BIT_PROCESS(p)) {
3874 				mp[i] = ((user_addr_t *)mp0)[i];
3875 			} else {
3876 				mp[i] = (user_addr_t)mp0[i];
3877 			}
3878 		}
3879 		kfree_data(mp0, macsize);
3880 	}
3881 #endif
3882 
3883 
3884 	fst.sfsp = sfsp;
3885 	fst.mp = mp;
3886 	fst.flags = uap->flags;
3887 	fst.count = 0;
3888 	fst.error = 0;
3889 	fst.maxcount = (int)maxcount;
3890 
3891 
3892 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);
3893 
3894 	if (mp) {
3895 		kfree_data(mp, count * sizeof(user_addr_t));
3896 	}
3897 
3898 	if (fst.error) {
3899 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3900 		return fst.error;
3901 	}
3902 
3903 	if (fst.sfsp && fst.count > fst.maxcount) {
3904 		*retval = fst.maxcount;
3905 	} else {
3906 		*retval = fst.count;
3907 	}
3908 	return 0;
3909 }
3910 
/*
 * vfs_iterate() callback for getfsstat64(): copy one struct statfs64 per
 * mount to the user buffer while space remains; always count the mount.
 * MAC-check or copyout failures stop iteration with the error recorded.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			/* Dead mount or refresh failure: skip this entry, keep iterating. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += sizeof(sfs);
	}
	fstp->count++;
	return VFS_RETURNED;
}
3955 
3956 /*
3957  * Get statistics on all file systems in 64 bit mode.
3958  */
3959 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3960 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3961 {
3962 	user_addr_t sfsp;
3963 	int count, maxcount;
3964 	struct getfsstat_struct fst;
3965 
3966 	maxcount = uap->bufsize / sizeof(struct statfs64);
3967 
3968 	sfsp = uap->buf;
3969 	count = 0;
3970 
3971 	fst.sfsp = sfsp;
3972 	fst.flags = uap->flags;
3973 	fst.count = 0;
3974 	fst.error = 0;
3975 	fst.maxcount = maxcount;
3976 
3977 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3978 
3979 	if (fst.error) {
3980 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3981 		return fst.error;
3982 	}
3983 
3984 	if (fst.sfsp && fst.count > fst.maxcount) {
3985 		*retval = fst.maxcount;
3986 	} else {
3987 		*retval = fst.count;
3988 	}
3989 
3990 	return 0;
3991 }
3992 
3993 /*
3994  * gets the associated vnode with the file descriptor passed.
3995  * as input
3996  *
3997  * INPUT
3998  * ctx - vfs context of caller
3999  * fd - file descriptor for which vnode is required.
4000  * vpp - Pointer to pointer to vnode to be returned.
4001  *
4002  * The vnode is returned with an iocount so any vnode obtained
4003  * by this call needs a vnode_put
4004  *
4005  */
4006 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)4007 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
4008 {
4009 	int error;
4010 	vnode_t vp;
4011 	struct fileproc *fp;
4012 	proc_t p = vfs_context_proc(ctx);
4013 
4014 	*vpp =  NULLVP;
4015 
4016 	error = fp_getfvp(p, fd, &fp, &vp);
4017 	if (error) {
4018 		return error;
4019 	}
4020 
4021 	error = vnode_getwithref(vp);
4022 	if (error) {
4023 		(void)fp_drop(p, fd, fp, 0);
4024 		return error;
4025 	}
4026 
4027 	(void)fp_drop(p, fd, fp, 0);
4028 	*vpp = vp;
4029 	return error;
4030 }
4031 
4032 /*
4033  * Wrapper function around namei to start lookup from a directory
4034  * specified by a file descriptor ni_dirfd.
4035  *
4036  * In addition to all the errors returned by namei, this call can
4037  * return ENOTDIR if the file descriptor does not refer to a directory.
4038  * and EBADF if the file descriptor is not valid.
4039  */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * dirfd only matters for a relative path on a fresh lookup; a
	 * continued lookup (NAMEI_CONTLOOKUP) or a caller-supplied dvp
	 * (USEDVP) must not be overridden.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/*
			 * Hand the fd's directory vnode to namei() via
			 * USEDVP, and clear the flag again afterwards so the
			 * caller's nameidata is left as it was found.
			 */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, AT_FDCWD, or continued lookup: plain namei(). */
	return namei(ndp);
}
4083 
4084 /*
4085  * Change current working directory to a given file descriptor.
4086  */
4087 /* ARGSUSED */
/*
 * Change the current working directory (per-process, or per-thread when
 * per_thread is true) to the directory referenced by fd.  If fd's vnode
 * covers a mount, descend to the root of the mounted filesystem first.
 * The new cwd is held with a usecount (vnode_ref); the previous cwd's
 * usecount is released.
 */
int
fchdir(proc_t p, vfs_context_t ctx, int fd, bool per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;

	AUDIT_ARG(fd, fd);
	if (per_thread && fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding.  The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* Caller must be able to search the directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If the directory is a mount point, keep crossing into the root of
	 * whatever is mounted there (mounts can stack).
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the iocount for a long-lived usecount on the new cwd. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Swap the per-process cwd under the fd/dirs locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount held on the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(fd);

	return error;
}
4199 
4200 int
sys_fchdir(proc_t p,struct fchdir_args * uap,__unused int32_t * retval)4201 sys_fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
4202 {
4203 	return fchdir(p, vfs_context_current(), uap->fd, false);
4204 }
4205 
4206 int
__pthread_fchdir(proc_t p,struct __pthread_fchdir_args * uap,__unused int32_t * retval)4207 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
4208 {
4209 	return fchdir(p, vfs_context_current(), uap->fd, true);
4210 }
4211 
4212 
4213 /*
4214  * Change current working directory (".").
4215  *
4216  * Returns:	0			Success
4217  *	change_dir:ENOTDIR
4218  *	change_dir:???
4219  *	vnode_ref:ENOENT		No such file or directory
4220  */
4221 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* Resolve the path; on success ni_vp carries an iocount. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Convert to a long-lived usecount for the CWD reference. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Publish the new CWD under the proc dirs and fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous working directory, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4267 
4268 
4269 /*
4270  * Change current working directory (".").
4271  *
4272  * Returns:	0			Success
4273  *	chdir_internal:ENOTDIR
4274  *	chdir_internal:ENOENT		No such file or directory
4275  *	chdir_internal:???
4276  */
4277 /* ARGSUSED */
4278 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4279 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4280 {
4281 	struct nameidata nd;
4282 	vfs_context_t ctx = vfs_context_current();
4283 
4284 	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4285 	    UIO_USERSPACE, uap->path, ctx);
4286 
4287 	return chdir_internal(p, ctx, &nd, per_thread);
4288 }
4289 
4290 
4291 /*
4292  * chdir
4293  *
4294  * Change current working directory (".") for the entire process
4295  *
4296  * Parameters:  p       Process requesting the call
4297  *              uap     User argument descriptor (see below)
4298  *              retval  (ignored)
4299  *
4300  * Indirect parameters:	uap->path	Directory path
4301  *
4302  * Returns:	0			Success
4303  *              common_chdir: ENOTDIR
4304  *              common_chdir: ENOENT	No such file or directory
4305  *              common_chdir: ???
4306  *
4307  */
4308 int
sys_chdir(proc_t p,struct chdir_args * uap,__unused int32_t * retval)4309 sys_chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
4310 {
4311 	return common_chdir(p, (void *)uap, 0);
4312 }
4313 
4314 /*
4315  * __pthread_chdir
4316  *
4317  * Change current working directory (".") for a single thread
4318  *
4319  * Parameters:  p       Process requesting the call
4320  *              uap     User argument descriptor (see below)
4321  *              retval  (ignored)
4322  *
4323  * Indirect parameters:	uap->path	Directory path
4324  *
4325  * Returns:	0			Success
4326  *              common_chdir: ENOTDIR
4327  *		common_chdir: ENOENT	No such file or directory
4328  *		common_chdir: ???
4329  *
4330  */
4331 int
__pthread_chdir(proc_t p,struct __pthread_chdir_args * uap,__unused int32_t * retval)4332 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
4333 {
4334 	return common_chdir(p, (void *)uap, 1);
4335 }
4336 
4337 
4338 /*
4339  * Change notion of root (``/'') directory.
4340  */
4341 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Only the superuser may change the root directory. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* Resolve the path; on success nd.ni_vp carries an iocount. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Trade the iocount for a long-lived usecount on the new root. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Drop the usecount on the previous root directory, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4399 
4400 #define PATHSTATICBUFLEN 256
4401 #define PIVOT_ROOT_ENTITLEMENT              \
4402        "com.apple.private.vfs.pivot-root"
4403 
4404 #if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Static buffers cover the common case; fall back to heap on long paths. */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the incoming root path; retry with a MAXPATHLEN buffer if too long. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same scheme for the path where the outgoing root will be placed. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Select whichever buffer (static or heap) holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	/* Common cleanup: drop the iocount and any heap path buffers. */
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4496 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root is only supported on macOS; route to nosys() elsewhere. */
	return nosys(p, NULL, retval);
}
4502 #endif /* XNU_TARGET_OS_OSX */
4503 
4504 /*
4505  * Common routine for chroot and chdir.
4506  *
4507  * Returns:	0			Success
4508  *		ENOTDIR			Not a directory
4509  *		namei:???		[anything namei can return]
4510  *		vnode_authorize:???	[anything vnode_authorize can return]
4511  */
static int
change_dir(struct nameidata *ndp, vfs_context_t ctx)
{
	vnode_t vp;
	int error;

	if ((error = namei(ndp))) {
		return error;
	}
	nameidone(ndp);
	vp = ndp->ni_vp;

	/* The target must be a directory; drop the iocount on failure. */
	if (vp->v_type != VDIR) {
		vnode_put(vp);
		return ENOTDIR;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		vnode_put(vp);
		return error;
	}
#endif

	/* The caller must have search permission on the directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		vnode_put(vp);
		return error;
	}

	/* Success: ni_vp is returned with an iocount held for the caller. */
	return error;
}
4545 
/*
 * Allocate the per-fd vnode data (for directories) associated with the file glob.
 */
4549 struct fd_vn_data *
fg_vn_data_alloc(void)4550 fg_vn_data_alloc(void)
4551 {
4552 	struct fd_vn_data *fvdata;
4553 
4554 	/* Allocate per fd vnode data */
4555 	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4556 	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4557 	return fvdata;
4558 }
4559 
4560 /*
4561  * Free the vnode data (for directories) associated with the file glob.
4562  */
4563 void
fg_vn_data_free(void * fgvndata)4564 fg_vn_data_free(void *fgvndata)
4565 {
4566 	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4567 
4568 	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4569 	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4570 	kfree_type(struct fd_vn_data, fvdata);
4571 }
4572 
4573 /*
4574  * Check permissions, allocate an open file structure,
4575  * and call the device open routine if any.
4576  *
4577  * Returns:	0			Success
4578  *		EINVAL
4579  *		EINTR
4580  *	falloc:ENFILE
4581  *	falloc:EMFILE
4582  *	falloc:ENOMEM
4583  *	vn_open_auth:???
4584  *	dupfdopen:???
4585  *	VNOP_ADVLOCK:???
4586  *	vnode_setsize:???
4587  *
4588  * XXX Need to implement uid, gid
4589  */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	kauth_cred_t p_cred = current_cached_proc_cred(PROC_NULL);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	/* Convert open(2) flags to kernel F* flags; strip raw-crypto bits. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a file descriptor slot and fileproc up front. */
	if ((error = falloc_withinit(p, p_cred, ctx, &fp, &indx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Optional authentication vnode for authenticated opens. */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/* fdesc_open signalled a dup via uu_dupfd (ENODEV/ENXIO). */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the opened vnode into the fileglob. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* Honor O_EXLOCK/O_SHLOCK by taking a whole-file flock-style lock. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	/* Decide whether this file's pages are eligible for the secluded pool. */
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer  eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len) ||
			    !strncmp(v_name, "cameracaptured", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "audiomxd", len) ||
			    !strncmp(v_name, "mediaplaybackd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	/*
	 * NOTE(review): the iocount on vp was dropped just above; this tty
	 * check appears to rely on the open file's reference keeping vp
	 * valid — confirm before restructuring.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the fd: clear the reserved table entry and drop our fp ref. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Unwind: release any advisory lock, close the vnode, free the fd slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4893 
4894 /*
4895  * While most of the *at syscall handlers can call nameiat() which
4896  * is a wrapper around namei, the use of namei and initialisation
4897  * of nameidata are far removed and in different functions  - namei
4898  * gets called in vn_open_auth for open1. So we'll just do here what
4899  * nameiat() does.
4900  */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
    int dirfd, int authfd)
{
	/*
	 * Mirror nameiat(): when dirfd is a real fd and the caller has not
	 * already supplied a starting directory via USEDVP, a relative path
	 * must be resolved against dirfd.
	 */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first path byte to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Take an iocount on the vnode backing dirfd. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* namei runs inside vn_open_auth, called from open1. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
			    retval, authfd);
			vnode_put(dvp_at);
			return error;
		}
	}

	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
}
4944 
4945 /*
4946  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4947  *
4948  * Parameters:	p			Process requesting the open
4949  *		uap			User argument descriptor (see below)
4950  *		retval			Pointer to an area to receive the
4951  *					return calue from the system call
4952  *
4953  * Indirect:	uap->path		Path to open (same as 'open')
4954  *		uap->flags		Flags to open (same as 'open'
4955  *		uap->uid		UID to set, if creating
4956  *		uap->gid		GID to set, if creating
4957  *		uap->mode		File mode, if creating (same as 'open')
4958  *		uap->xsecurity		ACL to set, if creating
4959  *
4960  * Returns:	0			Success
4961  *		!0			errno value
4962  *
4963  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
4964  *
4965  * XXX:		We should enummerate the possible errno values here, and where
4966  *		in the code they originated.
4967  */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the optional ACL, if the caller supplied one. */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
		return ciferror;
	}

	/* Build the creation attributes: umask-masked mode, optional uid/gid/ACL. */
	VATTR_INIT(&va);
	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != NULL) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}

	return ciferror;
}
5010 
5011 /*
5012  * Go through the data-protected atomically controlled open (2)
5013  *
5014  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
5015  */
static int
openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
    int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
{
	/*
	 * Follow the same path as normal open(2)
	 * Look up the item if it exists, and acquire the vnode.
	 */
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;
	int error;
	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;

	VATTR_INIT(&va);
	/* Mask off all but regular access permissions */
	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
	    path, ctx);

	/*
	 * Initialize the extra fields in vnode_attr to pass down our
	 * extra fields.
	 * 1. target cprotect class.
	 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
	 */
	if (flags & O_CREAT) {
		/* lower level kernel code validates that the class is valid before applying it. */
		if (class != PROTECTION_CLASS_DEFAULT) {
			/*
			 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
			 * file behave the same as open (2)
			 */
			VATTR_SET(&va, va_dataprotect_class, class);
		}
	}

	if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
		if (flags & (O_RDWR | O_WRONLY)) {
			/*
			 * Not allowed to write raw encrypted bytes or when opening authenticated.
			 */
			return EINVAL;
		}
		if (dpflags & O_DP_GETRAWENCRYPTED) {
			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
		}
		if (dpflags & O_DP_GETRAWUNENCRYPTED) {
			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
		}
		if (dpflags & O_DP_AUTHENTICATE) {
			VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
		}
	}

	/*
	 * NOTE(review): this passes vfs_context_current() rather than the
	 * ctx parameter; both visible callers pass the current context, but
	 * verify before reusing this helper with a different ctx.
	 */
	error = open1at(vfs_context_current(), &nd, flags, &va,
	    NULL, NULL, retval, fd, authfd);

	return error;
}
5078 
5079 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)5080 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5081 {
5082 	if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5083 		return EINVAL;
5084 	}
5085 
5086 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5087 	           uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
5088 }
5089 
5090 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)5091 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5092 {
5093 	if (uap->dpflags & O_DP_AUTHENTICATE) {
5094 		return EINVAL;
5095 	}
5096 
5097 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5098 	           uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5099 }
5100 
static int
openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
    int fd, enum uio_seg segflg, int *retval)
{
	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
	/*
	 * nameidata and vnode_attr are large; they are heap-allocated as a
	 * single unit — presumably to keep them off the kernel stack (TODO
	 * confirm).
	 */
	struct {
		struct vnode_attr va;
		struct nameidata nd;
	} *__open_data;
	struct vnode_attr *vap;
	struct nameidata *ndp;
	int cmode;
	int error;

	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
	vap = &__open_data->va;
	ndp = &__open_data->nd;

	VATTR_INIT(vap);
	/* Mask off all but regular access permissions */
	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);

	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
	    segflg, path, ctx);

	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);

	kfree_type(typeof(*__open_data), __open_data);

	return error;
}
5133 
5134 int
open(proc_t p,struct open_args * uap,int32_t * retval)5135 open(proc_t p, struct open_args *uap, int32_t *retval)
5136 {
5137 	__pthread_testcancel(1);
5138 	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
5139 }
5140 
5141 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)5142 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
5143     int32_t *retval)
5144 {
5145 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5146 	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
5147 }
5148 
5149 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)5150 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
5151     int32_t *retval)
5152 {
5153 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5154 	           uap->mode, uap->fd, UIO_USERSPACE, retval);
5155 }
5156 
5157 int
openat(proc_t p,struct openat_args * uap,int32_t * retval)5158 openat(proc_t p, struct openat_args *uap, int32_t *retval)
5159 {
5160 	__pthread_testcancel(1);
5161 	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
5162 }
5163 
5164 #define OPEN_BY_ID_ENTITLEMENT  "com.apple.private.vfs.open-by-id"
5165 
5166 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5167 vfs_context_can_open_by_id(vfs_context_t ctx)
5168 {
5169 	if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5170 		return TRUE;
5171 	}
5172 
5173 	return IOTaskHasEntitlement(vfs_context_task(ctx),
5174 	           OPEN_BY_ID_ENTITLEMENT);
5175 }
5176 
5177 /*
5178  * openbyid_np: open a file given a file system id and a file system object id
5179  *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
 *	for file systems that don't support object ids, it is a node id (uint64_t).
5181  *
5182  * Parameters:	p			Process requesting the open
5183  *		uap			User argument descriptor (see below)
5184  *		retval			Pointer to an area to receive the
 *					return value from the system call
5186  *
5187  * Indirect:	uap->path		Path to open (same as 'open')
5188  *
5189  *		uap->fsid		id of target file system
5190  *		uap->objid		id of target file system object
5191  *		uap->flags		Flags to open (same as 'open')
5192  *
5193  * Returns:	0			Success
5194  *		!0			errno value
5195  *
5196  *
 * XXX:		We should enumerate the possible errno values here, and where
5198  *		in the code they originated.
5199  */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Restricted: platform binaries or the open-by-id entitlement only. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/* resolve a path from (fsid, objid), growing the buffer on ENOSPC */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate; fsgetpath_internal() returned the length in pathlen */
	buf[pathlen] = 0;

	/* the resolved path lives in kernel memory, hence UIO_SYSSPACE */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5256 
5257 
5258 /*
5259  * Create a special file.
5260  */
5261 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5262     int fd);
5263 
/*
 * Common implementation for mknod(2)/mknodat(2): create a character or
 * block special file at 'upath' (relative to 'fd' unless absolute) with
 * the attributes in 'vap'.  FIFOs are diverted to mkfifo1(); any other
 * non-device type yields EINVAL.  Requires superuser.
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes requires superuser. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	/* both dvp and (if it exists) vp come back with an iocount held */
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* map the S_IFMT bits onto the vnode type to create */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* writers-only lease break: we are adding an entry to dvp */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int     update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5366 
5367 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5368 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5369 {
5370 	struct vnode_attr va;
5371 
5372 	VATTR_INIT(&va);
5373 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5374 	VATTR_SET(&va, va_rdev, uap->dev);
5375 
5376 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5377 }
5378 
5379 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5380 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5381 {
5382 	struct vnode_attr va;
5383 
5384 	VATTR_INIT(&va);
5385 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5386 	VATTR_SET(&va, va_rdev, uap->dev);
5387 
5388 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5389 }
5390 
5391 /*
5392  * Create a named pipe.
5393  *
5394  * Returns:	0			Success
5395  *		EEXIST
5396  *	namei:???
5397  *	vnode_authorize:???
5398  *	vn_create:???
5399  */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	/* both dvp and (if it exists) vp come back with an iocount held */
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5442 
5443 
5444 /*
5445  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5446  *
5447  * Parameters:	p			Process requesting the open
5448  *		uap			User argument descriptor (see below)
5449  *		retval			(Ignored)
5450  *
5451  * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
5452  *		uap->uid		UID to set
5453  *		uap->gid		GID to set
5454  *		uap->mode		File mode to set (same as 'mkfifo')
5455  *		uap->xsecurity		ACL to set, if creating
5456  *
5457  * Returns:	0			Success
5458  *		!0			errno value
5459  *
5460  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5461  *
 * XXX:		We should enumerate the possible errno values here, and where
5463  *		in the code they originated.
5464  */
5465 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5466 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5467 {
5468 	int ciferror;
5469 	kauth_filesec_t xsecdst;
5470 	struct vnode_attr va;
5471 
5472 	AUDIT_ARG(owner, uap->uid, uap->gid);
5473 
5474 	xsecdst = KAUTH_FILESEC_NONE;
5475 	if (uap->xsecurity != USER_ADDR_NULL) {
5476 		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5477 			return ciferror;
5478 		}
5479 	}
5480 
5481 	VATTR_INIT(&va);
5482 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5483 	if (uap->uid != KAUTH_UID_NONE) {
5484 		VATTR_SET(&va, va_uid, uap->uid);
5485 	}
5486 	if (uap->gid != KAUTH_GID_NONE) {
5487 		VATTR_SET(&va, va_gid, uap->gid);
5488 	}
5489 	if (xsecdst != KAUTH_FILESEC_NONE) {
5490 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5491 		va.va_vaflags |= VA_FILESEC_ACL;
5492 	}
5493 
5494 	ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5495 
5496 	if (xsecdst != KAUTH_FILESEC_NONE) {
5497 		kauth_filesec_free(xsecdst);
5498 	}
5499 	return ciferror;
5500 }
5501 
5502 /* ARGSUSED */
5503 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5504 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5505 {
5506 	struct vnode_attr va;
5507 
5508 	VATTR_INIT(&va);
5509 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5510 
5511 	return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5512 }
5513 
5514 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5515 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5516 {
5517 	struct vnode_attr va;
5518 
5519 	VATTR_INIT(&va);
5520 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5521 
5522 	return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5523 }
5524 
5525 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5526 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5527 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5528 
/*
 * Build the full path of 'dvp' (with 'leafname' appended, if non-NULL)
 * into 'path', a buffer of '_len' bytes.  This never returns an error:
 * if the path cannot be fully resolved, *truncated_path is set and a
 * best-effort prefix (an ancestor directory, the mount point, or "/")
 * is produced instead.  When 'firmlink' is non-zero the firmlink path
 * is used; otherwise the firmlink-free parallel path.
 *
 * Returns the length of the resulting string (including the trailing
 * NUL in the normal cases; best effort in the fallback paths).
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* overwrite the NUL with '/' and append the leaf name */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* dvp's own path already (nearly) fills the buffer */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/* walk up the parent chain until some ancestor's path fits */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5596 
/*
 * Firmlink-following convenience wrapper around safe_getpath_new().
 */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int follow_firmlinks = 1;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, follow_firmlinks);
}
5602 
/*
 * Firmlink-free convenience wrapper around safe_getpath_new().
 */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int follow_firmlinks = 0;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, follow_firmlinks);
}
5608 
5609 /*
5610  * Make a hard file link.
5611  *
5612  * Returns:	0			Success
5613  *		EPERM
5614  *		EEXIST
5615  *		EXDEV
5616  *	namei:???
5617  *	vnode_authorize:???
5618  *	VNOP_LINK:???
5619  */
5620 /* ARGSUSED */
/*
 * Common implementation for link(2)/linkat(2): look up the existing
 * object ('path' relative to fd1), then create a new directory entry
 * 'link' (relative to fd2) referring to it.  The whole operation is
 * retried (up to MAX_LINK_ENOENT_RETRIES) if VNOP_LINK() loses a race
 * and returns ENOENT.
 */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char  *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;
	bool do_retry;
	int num_retries = 0;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;

retry:
	do_retry = false;
	vp = dvp = lvp = NULLVP;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node */
	/* (reuse the same nameidata, re-armed as a CREATE lookup of 'link') */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	/* writers-only lease break: we are adding an entry to dvp */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		if (error == ENOENT && num_retries < MAX_LINK_ENOENT_RETRIES) {
			do_retry = true;
		}
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* only pay the cost of building paths if someone will consume them */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int  len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				/* drop the iocount taken by vnode_getparent_if_different */
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
		target_path = NULL;
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);

	if (do_retry) {
		goto retry;
	}

	return error;
}
5847 
5848 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5849 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5850 {
5851 	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5852 	           AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5853 }
5854 
5855 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5856 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5857 {
5858 	if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5859 		return EINVAL;
5860 	}
5861 
5862 	return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5863 	           uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5864 }
5865 
5866 /*
5867  * Make a symbolic link.
5868  *
5869  * We could add support for ACLs here too...
5870  */
5871 /* ARGSUSED */
/*
 * Common implementation for symlink(2)/symlinkat(2): create a symbolic
 * link at 'link' (relative to 'fd' unless absolute) whose contents are
 * the string at 'path_data'.  Errors are threaded through the chain of
 * "if (error == 0)" steps so cleanup happens at a single exit path.
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/* copy the link contents into a kernel buffer if they are in user space */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	/* both dvp and (if it exists) vp come back with an iocount held */
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	/* symlink permissions are ACCESSPERMS filtered through the umask */
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	/* writers-only lease break: we are adding an entry to dvp */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int     update_flags = 0;

		/* check if a new vnode was created, else try to get one */
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int             len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* free the copyin buffer (only allocated for user-space callers) */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
6035 
6036 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)6037 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
6038 {
6039 	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
6040 	           uap->link, UIO_USERSPACE);
6041 }
6042 
6043 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)6044 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
6045     __unused int32_t *retval)
6046 {
6047 	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
6048 	           uap->path2, UIO_USERSPACE);
6049 }
6050 
6051 /*
6052  * Delete a whiteout from the filesystem.
6053  * No longer supported.
6054  */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	/* Whiteout removal is no longer supported; always fail. */
	return ENOTSUP;
}
6060 
6061 /*
6062  * Delete a name from the filesystem.
6063  */
6064 /* ARGSUSED */
6065 static int
unlinkat_internal(vfs_context_t ctx,int fd,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6066 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
6067     user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
6068 {
6069 	struct {
6070 		struct nameidata nd;
6071 #if CONFIG_FSE
6072 		struct vnode_attr va;
6073 		fse_info finfo;
6074 #endif
6075 	} *__unlink_data;
6076 	struct nameidata *ndp;
6077 	vnode_t vp, dvp;
6078 	int error;
6079 	struct componentname *cnp;
6080 	char  *path = NULL;
6081 	char  *no_firmlink_path = NULL;
6082 	int  len_path = 0;
6083 	int  len_no_firmlink_path = 0;
6084 	int flags;
6085 	int need_event;
6086 	int has_listeners;
6087 	int truncated_path;
6088 	int truncated_no_firmlink_path;
6089 	int batched;
6090 	struct vnode_attr *vap;
6091 	int do_retry;
6092 	int retry_count = 0;
6093 	int cn_flags;
6094 	int nofollow_any = 0;
6095 
6096 	cn_flags = LOCKPARENT;
6097 	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
6098 		cn_flags |= AUDITVNPATH1;
6099 	}
6100 	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
6101 		nofollow_any = NAMEI_NOFOLLOW_ANY;
6102 		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
6103 	}
6104 	/* If a starting dvp is passed, it trumps any fd passed. */
6105 	if (start_dvp) {
6106 		cn_flags |= USEDVP;
6107 	}
6108 
6109 #if NAMEDRSRCFORK
6110 	/* unlink or delete is allowed on rsrc forks and named streams */
6111 	cn_flags |= CN_ALLOWRSRCFORK;
6112 #endif
6113 
6114 	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
6115 	ndp = &__unlink_data->nd;
6116 #if CONFIG_FSE
6117 	fse_info *finfop = &__unlink_data->finfo;
6118 #endif
6119 
6120 retry:
6121 	do_retry = 0;
6122 	flags = 0;
6123 	need_event = 0;
6124 	has_listeners = 0;
6125 	truncated_path = 0;
6126 	truncated_no_firmlink_path = 0;
6127 	vap = NULL;
6128 
6129 	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
6130 
6131 	ndp->ni_dvp = start_dvp;
6132 	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any;
6133 	cnp = &ndp->ni_cnd;
6134 
6135 continue_lookup:
6136 	error = nameiat(ndp, fd);
6137 	if (error) {
6138 		goto early_out;
6139 	}
6140 
6141 	dvp = ndp->ni_dvp;
6142 	vp = ndp->ni_vp;
6143 
6144 	/* With Carbon delete semantics, busy files cannot be deleted */
6145 	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
6146 		flags |= VNODE_REMOVE_NODELETEBUSY;
6147 	}
6148 
6149 	/* Skip any potential upcalls if told to. */
6150 	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
6151 		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
6152 	}
6153 
6154 	if (vp) {
6155 		batched = vnode_compound_remove_available(vp);
6156 		/*
6157 		 * The root of a mounted filesystem cannot be deleted.
6158 		 */
6159 		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
6160 			error = EBUSY;
6161 			goto out;
6162 		}
6163 
6164 #if DEVELOPMENT || DEBUG
6165 		/*
6166 		 * XXX VSWAP: Check for entitlements or special flag here
6167 		 * so we can restrict access appropriately.
6168 		 */
6169 #else /* DEVELOPMENT || DEBUG */
6170 
6171 		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
6172 			error = EPERM;
6173 			goto out;
6174 		}
6175 #endif /* DEVELOPMENT || DEBUG */
6176 
6177 		if (!batched) {
6178 			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
6179 			if (error) {
6180 				if (error == ENOENT) {
6181 					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6182 						do_retry = 1;
6183 						retry_count++;
6184 					}
6185 				}
6186 				goto out;
6187 			}
6188 		}
6189 	} else {
6190 		batched = 1;
6191 
6192 		if (!vnode_compound_remove_available(dvp)) {
6193 			panic("No vp, but no compound remove?");
6194 		}
6195 	}
6196 
6197 #if CONFIG_FSE
6198 	need_event = need_fsevent(FSE_DELETE, dvp);
6199 	if (need_event) {
6200 		if (!batched) {
6201 			if ((vp->v_flag & VISHARDLINK) == 0) {
6202 				/* XXX need to get these data in batched VNOP */
6203 				get_fse_info(vp, finfop, ctx);
6204 			}
6205 		} else {
6206 			error =
6207 			    vfs_get_notify_attributes(&__unlink_data->va);
6208 			if (error) {
6209 				goto out;
6210 			}
6211 
6212 			vap = &__unlink_data->va;
6213 		}
6214 	}
6215 #endif
6216 	has_listeners = kauth_authorize_fileop_has_listeners();
6217 	if (need_event || has_listeners) {
6218 		if (path == NULL) {
6219 			GET_PATH(path);
6220 		}
6221 		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
6222 		if (no_firmlink_path == NULL) {
6223 			GET_PATH(no_firmlink_path);
6224 		}
6225 		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
6226 	}
6227 
6228 #if NAMEDRSRCFORK
6229 	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
6230 		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
6231 	} else
6232 #endif
6233 	{
6234 #if CONFIG_FILE_LEASES
6235 		vnode_breakdirlease(dvp, false, O_WRONLY);
6236 #endif
6237 
6238 		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
6239 		vp = ndp->ni_vp;
6240 		if (error == EKEEPLOOKING) {
6241 			if (!batched) {
6242 				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
6243 			}
6244 
6245 			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
6246 				panic("EKEEPLOOKING, but continue flag not set?");
6247 			}
6248 
6249 			if (vnode_isdir(vp)) {
6250 				error = EISDIR;
6251 				goto out;
6252 			}
6253 			goto continue_lookup;
6254 		} else if (error == ENOENT && batched) {
6255 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6256 				/*
6257 				 * For compound VNOPs, the authorization callback may
6258 				 * return ENOENT in case of racing hardlink lookups
6259 				 * hitting the name  cache, redrive the lookup.
6260 				 */
6261 				do_retry = 1;
6262 				retry_count += 1;
6263 				goto out;
6264 			}
6265 		}
6266 	}
6267 
6268 	/*
6269 	 * Call out to allow 3rd party notification of delete.
6270 	 * Ignore result of kauth_authorize_fileop call.
6271 	 */
6272 	if (!error) {
6273 		if (has_listeners) {
6274 			kauth_authorize_fileop(vfs_context_ucred(ctx),
6275 			    KAUTH_FILEOP_DELETE,
6276 			    (uintptr_t)vp,
6277 			    (uintptr_t)path);
6278 		}
6279 
6280 		if (vp->v_flag & VISHARDLINK) {
6281 			//
6282 			// if a hardlink gets deleted we want to blow away the
6283 			// v_parent link because the path that got us to this
6284 			// instance of the link is no longer valid.  this will
6285 			// force the next call to get the path to ask the file
6286 			// system instead of just following the v_parent link.
6287 			//
6288 			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
6289 		}
6290 
6291 #if CONFIG_FSE
6292 		if (need_event) {
6293 			if (vp->v_flag & VISHARDLINK) {
6294 				get_fse_info(vp, finfop, ctx);
6295 			} else if (vap) {
6296 				vnode_get_fse_info_from_vap(vp, finfop, vap);
6297 			}
6298 			if (truncated_path) {
6299 				finfop->mode |= FSE_TRUNCATED_PATH;
6300 			}
6301 			add_fsevent(FSE_DELETE, ctx,
6302 			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
6303 			    FSE_ARG_FINFO, finfop,
6304 			    FSE_ARG_DONE);
6305 		}
6306 #endif
6307 
6308 #if CONFIG_MACF
6309 		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
6310 #endif
6311 	}
6312 
6313 out:
6314 	if (path != NULL) {
6315 		RELEASE_PATH(path);
6316 		path = NULL;
6317 	}
6318 
6319 	if (no_firmlink_path != NULL) {
6320 		RELEASE_PATH(no_firmlink_path);
6321 		no_firmlink_path = NULL;
6322 	}
6323 #if NAMEDRSRCFORK
6324 	/* recycle the deleted rsrc fork vnode to force a reclaim, which
6325 	 * will cause its shadow file to go away if necessary.
6326 	 */
6327 	if (vp && (vnode_isnamedstream(vp)) &&
6328 	    (vp->v_parent != NULLVP) &&
6329 	    vnode_isshadow(vp)) {
6330 		vnode_recycle(vp);
6331 	}
6332 #endif
6333 	/*
6334 	 * nameidone has to happen before we vnode_put(dvp)
6335 	 * since it may need to release the fs_nodelock on the dvp
6336 	 */
6337 	nameidone(ndp);
6338 	vnode_put(dvp);
6339 	if (vp) {
6340 		vnode_put(vp);
6341 	}
6342 
6343 	if (do_retry) {
6344 		goto retry;
6345 	}
6346 
6347 early_out:
6348 	kfree_type(typeof(*__unlink_data), __unlink_data);
6349 	return error;
6350 }
6351 
6352 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6353 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6354     enum uio_seg segflg, int unlink_flags)
6355 {
6356 	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6357 	           unlink_flags);
6358 }
6359 
6360 /*
6361  * Delete a name from the filesystem using Carbon semantics.
6362  */
6363 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6364 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6365 {
6366 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6367 	           uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6368 }
6369 
6370 /*
6371  * Delete a name from the filesystem using POSIX semantics.
6372  */
6373 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6374 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6375 {
6376 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6377 	           uap->path, UIO_USERSPACE, 0);
6378 }
6379 
6380 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6381 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6382 {
6383 	int unlink_flags = 0;
6384 
6385 	if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY)) {
6386 		return EINVAL;
6387 	}
6388 
6389 	if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6390 		unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6391 	}
6392 
6393 	if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6394 		if (uap->flag & AT_REMOVEDIR_DATALESS) {
6395 			unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6396 		}
6397 		return rmdirat_internal(vfs_context_current(), uap->fd,
6398 		           uap->path, UIO_USERSPACE, unlink_flags);
6399 	} else {
6400 		return unlinkat_internal(vfs_context_current(), uap->fd,
6401 		           NULLVP, uap->path, UIO_USERSPACE, unlink_flags);
6402 	}
6403 }
6404 
6405 /*
6406  * Reposition read/write file offset.
6407  */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/*
		 * fp_getfvp returns ENOTSUP for fds that have no backing
		 * vnode (e.g. sockets); POSIX wants ESPIPE for objects
		 * incapable of seeking.
		 */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (vnode_isfifo(vp)) {
		/* FIFOs cannot seek either. */
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/*
	 * lseek(fd, 0, SEEK_CUR) only reads the current offset; every other
	 * combination may change it, so apply the matching MAC check.
	 */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* Convert the (whence, offset) pair into an absolute file offset. */
	switch (uap->whence) {
	case L_INCR:
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		/* Ask the filesystem for the next hole at/after 'offset'. */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		/* Ask the filesystem for the next data region at/after 'offset'. */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001.  Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6499 
6500 
6501 /*
6502  * Check access permissions.
6503  *
6504  * Returns:	0			Success
6505  *		vnode_authorize:???
6506  */
6507 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6508 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6509 {
6510 	kauth_action_t action;
6511 	int error;
6512 
6513 	/*
6514 	 * If just the regular access bits, convert them to something
6515 	 * that vnode_authorize will understand.
6516 	 */
6517 	if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6518 		action = 0;
6519 		if (uflags & R_OK) {
6520 			action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
6521 		}
6522 		if (uflags & W_OK) {
6523 			if (vnode_isdir(vp)) {
6524 				action |= KAUTH_VNODE_ADD_FILE |
6525 				    KAUTH_VNODE_ADD_SUBDIRECTORY;
6526 				/* might want delete rights here too */
6527 			} else {
6528 				action |= KAUTH_VNODE_WRITE_DATA;
6529 			}
6530 		}
6531 		if (uflags & X_OK) {
6532 			if (vnode_isdir(vp)) {
6533 				action |= KAUTH_VNODE_SEARCH;
6534 			} else {
6535 				action |= KAUTH_VNODE_EXECUTE;
6536 			}
6537 		}
6538 	} else {
6539 		/* take advantage of definition of uflags */
6540 		action = uflags >> 8;
6541 	}
6542 
6543 #if CONFIG_MACF
6544 	error = mac_vnode_check_access(ctx, vp, uflags);
6545 	if (error) {
6546 		return error;
6547 	}
6548 #endif /* MAC */
6549 
6550 	/* action == 0 means only check for existence */
6551 	if (action != 0) {
6552 		error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6553 	} else {
6554 		error = 0;
6555 	}
6556 
6557 	return error;
6558 }
6559 
6560 
6561 
6562 /*
6563  * access_extended: Check access permissions in bulk.
6564  *
6565  * Description:	uap->entries		Pointer to an array of accessx
6566  *                                      descriptor structs, plus one or
6567  *                                      more NULL terminated strings (see
6568  *                                      "Notes" section below).
6569  *		uap->size		Size of the area pointed to by
6570  *					uap->entries.
6571  *		uap->results		Pointer to the results array.
6572  *
6573  * Returns:	0			Success
6574  *		ENOMEM			Insufficient memory
6575  *		EINVAL			Invalid arguments
6576  *		namei:EFAULT		Bad address
6577  *		namei:ENAMETOOLONG	Filename too long
6578  *		namei:ENOENT		No such file or directory
6579  *		namei:ELOOP		Too many levels of symbolic links
6580  *		namei:EBADF		Bad file descriptor
6581  *		namei:ENOTDIR		Not a directory
6582  *		namei:???
6583  *		access1:
6584  *
6585  * Implicit returns:
6586  *		uap->results		Array contents modified
6587  *
6588  * Notes:	The uap->entries are structured as an arbitrary length array
6589  *		of accessx descriptors, followed by one or more NULL terminated
6590  *		strings
6591  *
6592  *			struct accessx_descriptor[0]
6593  *			...
6594  *			struct accessx_descriptor[n]
6595  *			char name_data[0];
6596  *
6597  *		We determine the entry count by walking the buffer containing
6598  *		the uap->entries argument descriptor.  For each descriptor we
6599  *		see, the valid values for the offset ad_name_offset will be
6600  *		in the byte range:
6601  *
6602  *			[ uap->entries + sizeof(struct accessx_descriptor) ]
6603  *						to
6604  *				[ uap->entries + uap->size - 2 ]
6605  *
6606  *		since we must have at least one string, and the string must
6607  *		be at least one character plus the NULL terminator in length.
6608  *
6609  * XXX:		Need to support the check-as uid argument
6610  */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* NULL cred marker so the 'out' path knows whether to unref. */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o	There must be sufficient room in the request for at least one
	 *	descriptor and a one byte NUL terminated string.
	 * o	The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* Small requests use the on-stack buffer; larger ones are heap-backed. */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid nami() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup (the name is in our copied-in, NUL-terminated buffer) */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  "Soft" errors are recorded per-entry;
		 * anything else aborts the whole call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6852 
6853 
6854 /*
6855  * Returns:	0			Success
6856  *		namei:EFAULT		Bad address
6857  *		namei:ENAMETOOLONG	Filename too long
6858  *		namei:ENOENT		No such file or directory
6859  *		namei:ELOOP		Too many levels of symbolic links
6860  *		namei:EBADF		Bad file descriptor
6861  *		namei:ENOTDIR		Not a directory
6862  *		namei:???
6863  *		access1:
6864  */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 *
	 * Note: kauth_cred_copy_real takes a reference we own and must drop
	 * at 'out'; the AT_EACCESS path borrows the caller's cred instead.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse to traverse symlinks anywhere in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	/* Drop the lookup iocounts; the parent is only held for _DELETE_OK. */
	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6946 
6947 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6948 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6949 {
6950 	return faccessat_internal(vfs_context_current(), AT_FDCWD,
6951 	           uap->path, uap->flags, 0, UIO_USERSPACE);
6952 }
6953 
6954 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6955 faccessat(__unused proc_t p, struct faccessat_args *uap,
6956     __unused int32_t *retval)
6957 {
6958 	if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6959 		return EINVAL;
6960 	}
6961 
6962 	return faccessat_internal(vfs_context_current(), uap->fd,
6963 	           uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6964 }
6965 
6966 /*
6967  * Returns:	0			Success
6968  *		EFAULT
6969  *	copyout:EFAULT
6970  *	namei:???
6971  *	vn_stat:???
6972  */
6973 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6974 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6975     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6976     enum uio_seg segflg, int fd, int flag)
6977 {
6978 	struct nameidata *ndp = NULL;
6979 	int follow;
6980 	union {
6981 		struct stat sb;
6982 		struct stat64 sb64;
6983 	} source = {};
6984 	union {
6985 		struct user64_stat user64_sb;
6986 		struct user32_stat user32_sb;
6987 		struct user64_stat64 user64_sb64;
6988 		struct user32_stat64 user32_sb64;
6989 	} dest = {};
6990 	caddr_t sbp;
6991 	int error, my_size;
6992 	kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
6993 	size_t xsecurity_bufsize;
6994 	void * statptr;
6995 	struct fileproc *fp = NULL;
6996 	int needsrealdev = 0;
6997 
6998 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6999 	ndp = kalloc_type(struct nameidata, Z_WAITOK);
7000 	NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
7001 	    segflg, path, ctx);
7002 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7003 		ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
7004 	}
7005 
7006 #if NAMEDRSRCFORK
7007 	int is_namedstream = 0;
7008 	/* stat calls are allowed for resource forks. */
7009 	ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
7010 #endif
7011 
7012 	if (flag & AT_FDONLY) {
7013 		vnode_t fvp;
7014 
7015 		error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
7016 		if (error) {
7017 			goto out;
7018 		}
7019 		if ((error = vnode_getwithref(fvp))) {
7020 			file_drop(fd);
7021 			goto out;
7022 		}
7023 		ndp->ni_vp = fvp;
7024 	} else {
7025 		error = nameiat(ndp, fd);
7026 		if (error) {
7027 			goto out;
7028 		}
7029 	}
7030 
7031 	statptr = (void *)&source;
7032 
7033 #if NAMEDRSRCFORK
7034 	/* Grab reference on the shadow stream file vnode to
7035 	 * force an inactive on release which will mark it
7036 	 * for recycle.
7037 	 */
7038 	if (vnode_isnamedstream(ndp->ni_vp) &&
7039 	    (ndp->ni_vp->v_parent != NULLVP) &&
7040 	    vnode_isshadow(ndp->ni_vp)) {
7041 		is_namedstream = 1;
7042 		vnode_ref(ndp->ni_vp);
7043 	}
7044 #endif
7045 
7046 	needsrealdev = flag & AT_REALDEV ? 1 : 0;
7047 	if (fp && (xsecurity == USER_ADDR_NULL)) {
7048 		/*
7049 		 * If the caller has the file open, and is not
7050 		 * requesting extended security information, we are
7051 		 * going to let them get the basic stat information.
7052 		 */
7053 		error = vn_stat_noauth(ndp->ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
7054 		    fp->fp_glob->fg_cred);
7055 	} else {
7056 		error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
7057 		    isstat64, needsrealdev, ctx);
7058 	}
7059 
7060 #if NAMEDRSRCFORK
7061 	if (is_namedstream) {
7062 		vnode_rele(ndp->ni_vp);
7063 	}
7064 #endif
7065 	vnode_put(ndp->ni_vp);
7066 	nameidone(ndp);
7067 
7068 	if (fp) {
7069 		file_drop(fd);
7070 		fp = NULL;
7071 	}
7072 
7073 	if (error) {
7074 		goto out;
7075 	}
7076 	/* Zap spare fields */
7077 	if (isstat64 != 0) {
7078 		source.sb64.st_lspare = 0;
7079 		source.sb64.st_qspare[0] = 0LL;
7080 		source.sb64.st_qspare[1] = 0LL;
7081 		if (vfs_context_is64bit(ctx)) {
7082 			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
7083 			my_size = sizeof(dest.user64_sb64);
7084 			sbp = (caddr_t)&dest.user64_sb64;
7085 		} else {
7086 			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
7087 			my_size = sizeof(dest.user32_sb64);
7088 			sbp = (caddr_t)&dest.user32_sb64;
7089 		}
7090 		/*
7091 		 * Check if we raced (post lookup) against the last unlink of a file.
7092 		 */
7093 		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7094 			source.sb64.st_nlink = 1;
7095 		}
7096 	} else {
7097 		source.sb.st_lspare = 0;
7098 		source.sb.st_qspare[0] = 0LL;
7099 		source.sb.st_qspare[1] = 0LL;
7100 		if (vfs_context_is64bit(ctx)) {
7101 			munge_user64_stat(&source.sb, &dest.user64_sb);
7102 			my_size = sizeof(dest.user64_sb);
7103 			sbp = (caddr_t)&dest.user64_sb;
7104 		} else {
7105 			munge_user32_stat(&source.sb, &dest.user32_sb);
7106 			my_size = sizeof(dest.user32_sb);
7107 			sbp = (caddr_t)&dest.user32_sb;
7108 		}
7109 
7110 		/*
7111 		 * Check if we raced (post lookup) against the last unlink of a file.
7112 		 */
7113 		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7114 			source.sb.st_nlink = 1;
7115 		}
7116 	}
7117 	if ((error = copyout(sbp, ub, my_size)) != 0) {
7118 		goto out;
7119 	}
7120 
7121 	/* caller wants extended security information? */
7122 	if (xsecurity != USER_ADDR_NULL) {
7123 		/* did we get any? */
7124 		if (fsec == KAUTH_FILESEC_NONE) {
7125 			if (susize(xsecurity_size, 0) != 0) {
7126 				error = EFAULT;
7127 				goto out;
7128 			}
7129 		} else {
7130 			/* find the user buffer size */
7131 			xsecurity_bufsize = fusize(xsecurity_size);
7132 
7133 			/* copy out the actual data size */
7134 			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7135 				error = EFAULT;
7136 				goto out;
7137 			}
7138 
7139 			/* if the caller supplied enough room, copy out to it */
7140 			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7141 				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7142 			}
7143 		}
7144 	}
7145 out:
7146 	if (ndp) {
7147 		kfree_type(struct nameidata, ndp);
7148 	}
7149 	if (fsec != KAUTH_FILESEC_NONE) {
7150 		kauth_filesec_free(fsec);
7151 	}
7152 	return error;
7153 }
7154 
7155 /*
7156  * stat_extended: Get file status; with extended security (ACL).
7157  *
7158  * Parameters:    p                       (ignored)
7159  *                uap                     User argument descriptor (see below)
7160  *                retval                  (ignored)
7161  *
7162  * Indirect:      uap->path               Path of file to get status from
7163  *                uap->ub                 User buffer (holds file status info)
7164  *                uap->xsecurity          ACL to get (extended security)
7165  *                uap->xsecurity_size     Size of ACL
7166  *
7167  * Returns:        0                      Success
7168  *                !0                      errno value
7169  *
7170  */
7171 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7172 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7173     __unused int32_t *retval)
7174 {
7175 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7176 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7177 	           0);
7178 }
7179 
7180 /*
7181  * Returns:	0			Success
7182  *	fstatat_internal:???		[see fstatat_internal() in this file]
7183  */
7184 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7185 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7186 {
7187 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7188 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7189 }
7190 
7191 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7192 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7193 {
7194 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7195 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7196 }
7197 
7198 /*
7199  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7200  *
7201  * Parameters:    p                       (ignored)
7202  *                uap                     User argument descriptor (see below)
7203  *                retval                  (ignored)
7204  *
7205  * Indirect:      uap->path               Path of file to get status from
7206  *                uap->ub                 User buffer (holds file status info)
7207  *                uap->xsecurity          ACL to get (extended security)
7208  *                uap->xsecurity_size     Size of ACL
7209  *
7210  * Returns:        0                      Success
7211  *                !0                      errno value
7212  *
7213  */
7214 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7215 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7216 {
7217 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7218 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7219 	           0);
7220 }
7221 
7222 /*
7223  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7224  *
7225  * Parameters:    p                       (ignored)
7226  *                uap                     User argument descriptor (see below)
7227  *                retval                  (ignored)
7228  *
7229  * Indirect:      uap->path               Path of file to get status from
7230  *                uap->ub                 User buffer (holds file status info)
7231  *                uap->xsecurity          ACL to get (extended security)
7232  *                uap->xsecurity_size     Size of ACL
7233  *
7234  * Returns:        0                      Success
7235  *                !0                      errno value
7236  *
7237  */
7238 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7239 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7240 {
7241 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7242 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7243 	           AT_SYMLINK_NOFOLLOW);
7244 }
7245 
7246 /*
7247  * Get file status; this version does not follow links.
7248  */
7249 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7250 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7251 {
7252 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7253 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7254 }
7255 
7256 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7257 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7258 {
7259 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7260 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7261 }
7262 
7263 /*
7264  * lstat64_extended: Get file status; can handle large inode numbers; does not
7265  * follow links; with extended security (ACL).
7266  *
7267  * Parameters:    p                       (ignored)
7268  *                uap                     User argument descriptor (see below)
7269  *                retval                  (ignored)
7270  *
7271  * Indirect:      uap->path               Path of file to get status from
7272  *                uap->ub                 User buffer (holds file status info)
7273  *                uap->xsecurity          ACL to get (extended security)
7274  *                uap->xsecurity_size     Size of ACL
7275  *
7276  * Returns:        0                      Success
7277  *                !0                      errno value
7278  *
7279  */
7280 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7281 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7282 {
7283 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7284 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7285 	           AT_SYMLINK_NOFOLLOW);
7286 }
7287 
7288 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7289 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7290 {
7291 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7292 		return EINVAL;
7293 	}
7294 
7295 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7296 	           0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7297 }
7298 
7299 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7300 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7301     __unused int32_t *retval)
7302 {
7303 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7304 		return EINVAL;
7305 	}
7306 
7307 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7308 	           0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7309 }
7310 
7311 /*
7312  * Get configurable pathname variables.
7313  *
7314  * Returns:	0			Success
7315  *	namei:???
7316  *	vn_pathconf:???
7317  *
7318  * Notes:	Global implementation  constants are intended to be
7319  *		implemented in this function directly; all other constants
7320  *		are per-FS implementation, and therefore must be handled in
7321  *		each respective FS, instead.
7322  *
7323  * XXX We implement some things globally right now that should actually be
7324  * XXX per-FS; we will need to deal with this at some point.
7325  */
7326 /* ARGSUSED */
int
pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
{
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/* Resolve the user path (following symlinks) to a vnode. */
	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/* Let the vnode layer / filesystem answer the uap->name query. */
	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);

	/* Drop the iocount namei took, then release the nameidata. */
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7347 
7348 /*
7349  * Return target name of a symbolic link.
7350  */
7351 /* ARGSUSED */
7352 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7353 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7354     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7355     int *retval)
7356 {
7357 	vnode_t vp;
7358 	uio_t auio;
7359 	int error;
7360 	struct nameidata nd;
7361 	UIO_STACKBUF(uio_buf, 1);
7362 	bool put_vnode;
7363 
7364 	if (bufsize > INT32_MAX) {
7365 		return EINVAL;
7366 	}
7367 
7368 	if (lnk_vp) {
7369 		vp = lnk_vp;
7370 		put_vnode = false;
7371 	} else {
7372 		NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7373 		    seg, path, ctx);
7374 
7375 		error = nameiat(&nd, fd);
7376 		if (error) {
7377 			return error;
7378 		}
7379 		vp = nd.ni_vp;
7380 		put_vnode = true;
7381 		nameidone(&nd);
7382 	}
7383 
7384 	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7385 	    &uio_buf[0], sizeof(uio_buf));
7386 	uio_addiov(auio, buf, bufsize);
7387 	if (vp->v_type != VLNK) {
7388 		error = EINVAL;
7389 	} else {
7390 #if CONFIG_MACF
7391 		error = mac_vnode_check_readlink(ctx, vp);
7392 #endif
7393 		if (error == 0) {
7394 			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7395 			    ctx);
7396 		}
7397 		if (error == 0) {
7398 			error = VNOP_READLINK(vp, auio, ctx);
7399 		}
7400 	}
7401 
7402 	if (put_vnode) {
7403 		vnode_put(vp);
7404 	}
7405 
7406 	*retval = (int)(bufsize - uio_resid(auio));
7407 	return error;
7408 }
7409 
int
freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
{
	enum uio_seg procseg;
	vnode_t vp;
	int error;

	/* Choose the uio segment flavor matching the process ABI width. */
	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;

	AUDIT_ARG(fd, uap->fd);

	/* Map the fd to its vnode; file_drop() must follow on all paths. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	/* Take an iocount on the vnode for the duration of the read. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* fd=-1 / path=0 are unused: the vnode is handed in directly. */
	error = readlinkat_internal(vfs_context_current(), -1,
	    vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
	    uap->bufsize, procseg, retval);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7437 
7438 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7439 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7440 {
7441 	enum uio_seg procseg;
7442 
7443 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7444 	return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7445 	           CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7446 	           uap->count, procseg, retval);
7447 }
7448 
7449 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7450 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7451 {
7452 	enum uio_seg procseg;
7453 
7454 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7455 	return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7456 	           CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7457 	           retval);
7458 }
7459 
7460 /*
7461  * Change file flags, the deep inner layer.
7462  */
/*
 * Authorize and apply a file-flags change on vp via the caller-supplied
 * setter.  va carries the desired va_flags; arg is forwarded to setattr.
 */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	/* MAC policy check happens before any authorization work. */
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	/* Apply the change through the supplied setter (e.g. vnode_setattr). */
	error = (*setattr)(vp, arg, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only after a successful change. */
	if (error == 0) {
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
	}
#endif

out:
	return error;
}
7501 
7502 /*
7503  * Change file flags.
7504  *
7505  * NOTE: this will vnode_put() `vp'
7506  */
static int
chflags1(vnode_t vp, int flags, vfs_context_t ctx)
{
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_flags, flags);

	/* chflags0() authorizes and applies using vnode_setattr as setter. */
	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
	/* This function consumes the caller's iocount on vp. */
	vnode_put(vp);

	/* "Success" without va_flags support means the FS ignored us. */
	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
		error = ENOTSUP;
	}

	return error;
}
7525 
7526 /*
7527  * Change flags of a file given a path name.
7528  */
7529 /* ARGSUSED */
int
chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also obtain the parent so its directory lease can be broken. */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(fflags, uap->flags);
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

#if CONFIG_FILE_LEASES
	/* Break the parent-directory lease before mutating the file. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	nameidone(&nd);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, ctx);

	return error;
}
7564 
7565 /*
7566  * Change flags of a file given a file descriptor.
7567  */
7568 /* ARGSUSED */
int
fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);
	/* Map fd -> vnode; file_drop() must follow on all paths. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* Take an iocount; chflags1() will release it for us. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* true: locate the parent from vp itself to break its lease. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, vfs_context_current());

	file_drop(uap->fd);
	return error;
}
7598 
7599 /*
7600  * Change security information on a filesystem object.
7601  *
7602  * Returns:	0			Success
7603  *		EPERM			Operation not permitted
7604  *		vnode_authattr:???	[anything vnode_authattr can return]
7605  *		vnode_authorize:???	[anything vnode_authorize can return]
7606  *		vnode_setattr:???	[anything vnode_setattr can return]
7607  *
7608  * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
7609  *		translated to EPERM before being returned.
7610  */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC pre-checks: one per attribute class being changed. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* Permission failures report EPERM, not EACCES (see header). */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* MAC post-notifications mirror the pre-checks above. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7678 
7679 
7680 /*
7681  * Change mode of a file given a path name.
7682  *
7683  * Returns:	0			Success
7684  *		namei:???		[anything namei can return]
7685  *		chmod_vnode:???		[anything chmod_vnode can return]
7686  */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also obtain the parent so its directory lease can be broken. */
	wantparent = WANTPARENT;
#endif

	/* Either NOFOLLOW flag suppresses trailing-symlink resolution. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse symlinks anywhere in the path, not just at the end. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break the parent-directory lease before mutating the file. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7719 
/*
 * Populate *pva for a chmod_extended/fchmod_extended request.  Any of
 * mode/uid/gid may be "none" sentinels and are then left unset.  On
 * success *pxsecdst either stays NULL or owns a copied-in filesec that
 * the caller must free with kauth_filesec_free().
 */
static int
chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
    gid_t gid, user_addr_t xsecurity)
{
	int error;

	VATTR_INIT(pva);

	if (mode != -1) {
		VATTR_SET(pva, va_mode, mode & ALLPERMS);
	} else {
		/* mode == -1 means "leave the mode alone". */
		pva->va_mode = 0;
	}

	if (uid != KAUTH_UID_NONE) {
		VATTR_SET(pva, va_uid, uid);
	}

	if (gid != KAUTH_GID_NONE) {
		VATTR_SET(pva, va_gid, gid);
	}

	*pxsecdst = NULL;
	/* xsecurity is a three-way protocol: NULL / remove-ACL / set-ACL. */
	switch (xsecurity) {
	case USER_ADDR_NULL:
		break;

	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
		VATTR_SET(pva, va_acl, NULL);
		break;

	default:
		if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
			return error;
		}

		/* va_acl points into *pxsecdst; caller keeps it alive. */
		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
		pva->va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
		break;
	}

	return 0;
}
7764 
7765 /*
7766  * chmod_extended: Change the mode of a file given a path name; with extended
7767  * argument list (including extended security (ACL)).
7768  *
7769  * Parameters:	p			Process requesting the open
7770  *		uap			User argument descriptor (see below)
7771  *		retval			(ignored)
7772  *
7773  * Indirect:	uap->path		Path to object (same as 'chmod')
7774  *		uap->uid		UID to set
7775  *		uap->gid		GID to set
7776  *		uap->mode		File mode to set (same as 'chmod')
7777  *		uap->xsecurity		ACL to set (or delete)
7778  *
7779  * Returns:	0			Success
7780  *		!0			errno value
7781  *
7782  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
7783  *
7784  * XXX:		We should enummerate the possible errno values here, and where
7785  *		in the code they originated.
7786  */
int
chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst = NULL;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Build the vnode_attr (and possibly copy in an ACL). */
	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
	    uap->gid, uap->xsecurity);

	if (error) {
		return error;
	}

	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
	    UIO_USERSPACE);

	/* va.va_acl may reference xsecdst; free only after chmodat(). */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
7811 
7812 /*
7813  * Returns:	0			Success
7814  *		chmodat:???		[anything chmodat can return]
7815  */
7816 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7817 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7818     int flag, enum uio_seg segflg)
7819 {
7820 	struct vnode_attr va;
7821 
7822 	VATTR_INIT(&va);
7823 	VATTR_SET(&va, va_mode, mode & ALLPERMS);
7824 
7825 	return chmodat(ctx, path, &va, fd, flag, segflg);
7826 }
7827 
7828 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7829 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7830 {
7831 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7832 	           AT_FDCWD, 0, UIO_USERSPACE);
7833 }
7834 
7835 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7836 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7837 {
7838 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7839 		return EINVAL;
7840 	}
7841 
7842 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7843 	           uap->fd, uap->flag, UIO_USERSPACE);
7844 }
7845 
7846 /*
7847  * Change mode of a file given a file descriptor.
7848  */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, fd);

	/* Map fd -> vnode; file_drop() must follow on all paths. */
	if ((error = file_vnode(fd, &vp)) != 0) {
		return error;
	}
	/* Take an iocount for the duration of the attribute change. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* true: locate the parent from vp itself to break its lease. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = chmod_vnode(vfs_context_current(), vp, vap);
	(void)vnode_put(vp);
	file_drop(fd);

	return error;
}
7876 
7877 /*
7878  * fchmod_extended: Change mode of a file given a file descriptor; with
7879  * extended argument list (including extended security (ACL)).
7880  *
7881  * Parameters:    p                       Process requesting to change file mode
7882  *                uap                     User argument descriptor (see below)
7883  *                retval                  (ignored)
7884  *
7885  * Indirect:      uap->mode               File mode to set (same as 'chmod')
7886  *                uap->uid                UID to set
7887  *                uap->gid                GID to set
7888  *                uap->xsecurity          ACL to set (or delete)
7889  *                uap->fd                 File descriptor of file to change mode
7890  *
7891  * Returns:        0                      Success
7892  *                !0                      errno value
7893  *
7894  */
int
fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst = NULL;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Build the vnode_attr (and possibly copy in an ACL). */
	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
	    uap->gid, uap->xsecurity);

	if (error) {
		return error;
	}

	error = fchmod1(p, uap->fd, &va);

	/* va.va_acl may reference xsecdst; free only after fchmod1(). */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
7918 
7919 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7920 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7921 {
7922 	struct vnode_attr va;
7923 
7924 	VATTR_INIT(&va);
7925 	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7926 
7927 	return fchmod1(p, uap->fd, &va);
7928 }
7929 
/*
 * Authorize and apply an ownership change on vp.  A uid/gid equal to
 * VNOVAL means "leave that field alone".
 */
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break the parent-directory lease before mutating the file. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only after a successful change. */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
7991 
7992 /*
7993  * Set ownership given a path name.
7994  */
7995 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	int error;
	struct nameidata nd;
	int follow;

	AUDIT_ARG(owner, uid, gid);

	/* Either NOFOLLOW flag suppresses trailing-symlink resolution. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse symlinks anywhere in the path, not just at the end. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}

	vp = nd.ni_vp;
	/* vn_chown_internal() does the MAC/auth checks and the setattr. */
	error = vn_chown_internal(ctx, vp, uid, gid);

	nameidone(&nd);
	vnode_put(vp);
	return error;
}
8025 
8026 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)8027 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
8028 {
8029 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8030 	           uap->uid, uap->gid, 0, UIO_USERSPACE);
8031 }
8032 
8033 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)8034 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
8035 {
8036 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8037 	           uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
8038 }
8039 
8040 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)8041 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
8042 {
8043 	if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
8044 		return EINVAL;
8045 	}
8046 
8047 	return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
8048 	           uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
8049 }
8050 
8051 /*
8052  * Set ownership given a file descriptor.
8053  */
8054 /* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* Map fd -> vnode; file_drop() must follow on all paths. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* Take an iocount for the duration of the ownership change. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = vn_chown_internal(ctx, vp, uap->uid, uap->gid);

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8081 
/*
 * Copy a utimes(2)-style timeval pair from user space into tsp[2]
 * (access, modify).  A NULL user pointer means "use the current time".
 */
static int
getutimes(user_addr_t usrtvp, struct timespec *tsp)
{
	int error;

	if (usrtvp == USER_ADDR_NULL) {
		struct timeval old_tv;
		/* XXX Y2038 bug because of microtime argument */
		microtime(&old_tv);
		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
		tsp[1] = tsp[0];
	} else {
		/* The user struct layout differs by process ABI width. */
		if (IS_64BIT_PROCESS(current_proc())) {
			struct user64_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
		} else {
			struct user32_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		}
	}
	return 0;
}
8114 
/*
 * Authorize and apply access/modify times on vp.  ts[0] is atime,
 * ts[1] is mtime; nullflag is set when the caller passed a NULL
 * timeval pointer (i.e. "set to now"), which relaxes authorization.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	/* Explicit-times failures report EPERM rather than EACCES. */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only after a successful change. */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8171 
8172 /*
8173  * Set the access and modification times of a file.
8174  */
8175 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also obtain the parent so its directory lease can be broken. */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break the parent-directory lease before mutating the file. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* Release the parent (if requested), then the target vnode. */
#if CONFIG_FILE_LEASES
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8224 
8225 /*
8226  * Set the access and modification times of a file.
8227  */
8228 /* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	/* Copy in (or synthesize) the timestamps before touching the fd. */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	/* Map fd -> vnode; file_drop() must follow on all paths. */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* true: locate the parent from vp itself to break its lease. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8260 
8261 static int
truncate_validate_common(proc_t p,off_t length)8262 truncate_validate_common(proc_t p, off_t length)
8263 {
8264 	rlim_t fsize_limit;
8265 
8266 	if (length < 0) {
8267 		return EINVAL;
8268 	}
8269 
8270 	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8271 	if ((rlim_t)length > fsize_limit) {
8272 		psignal(p, SIGXFSZ);
8273 		return EFBIG;
8274 	}
8275 
8276 	return 0;
8277 }
8278 
/*
 * Authorize (optionally) and apply a size change on vp.  need_auth is
 * false on the ftruncate path, where open-time authorization suffices.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open.  We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only after a successful truncate. */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8329 
8330 /*
8331  * Truncate a file given its path name.
8332  */
8333 /* ARGSUSED */
8334 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8335 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8336 {
8337 	vfs_context_t ctx = vfs_context_current();
8338 	vnode_t vp;
8339 	int error;
8340 	struct nameidata nd;
8341 
8342 	if ((error = truncate_validate_common(p, uap->length))) {
8343 		return error;
8344 	}
8345 
8346 	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8347 	    UIO_USERSPACE, uap->path, ctx);
8348 
8349 	if ((error = namei(&nd))) {
8350 		return error;
8351 	}
8352 
8353 	vp = nd.ni_vp;
8354 	nameidone(&nd);
8355 
8356 	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8357 	vnode_put(vp);
8358 
8359 	return error;
8360 }
8361 
8362 /*
8363  * Truncate a file given a file descriptor.
8364  */
8365 /* ARGSUSED */
/*
 * Truncate the file referenced by an open descriptor (ftruncate(2)).
 * Only vnode-backed descriptors and POSIX shared memory objects are
 * supported; the descriptor must have been opened for writing.
 */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Validate length against RLIMIT_FSIZE (may post SIGXFSZ). */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	/* Take a reference on the fileproc; released at 'out'. */
	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		/* POSIX shared memory has its own truncate path. */
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must be open for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/*
	 * need_auth is false: write access was effectively authorized
	 * when the descriptor was opened (see truncate_internal).
	 */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	vnode_put(vp);

out:
	file_drop(uap->fd);
	return error;
}
8416 
8417 
8418 /*
8419  * Sync an open file with synchronized I/O _file_ integrity completion
8420  */
8421 /* ARGSUSED */
8422 int
fsync(proc_t p,struct fsync_args * uap,__unused int32_t * retval)8423 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8424 {
8425 	__pthread_testcancel(1);
8426 	return fsync_common(p, uap, MNT_WAIT);
8427 }
8428 
8429 
8430 /*
8431  * Sync an open file with synchronized I/O _file_ integrity completion
8432  *
8433  * Notes:	This is a legacy support function that does not test for
8434  *		thread cancellation points.
8435  */
8436 /* ARGSUSED */
8437 int
fsync_nocancel(proc_t p,struct fsync_nocancel_args * uap,__unused int32_t * retval)8438 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8439 {
8440 	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
8441 }
8442 
8443 
8444 /*
8445  * Sync an open file with synchronized I/O _data_ integrity completion
8446  */
8447 /* ARGSUSED */
8448 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)8449 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8450 {
8451 	__pthread_testcancel(1);
8452 	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
8453 }
8454 
8455 
8456 /*
8457  * fsync_common
8458  *
8459  * Common fsync code to support both synchronized I/O file integrity completion
8460  * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8461  *
8462  * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8463  * will only guarantee that the file data contents are retrievable.  If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8465  * includes additional metadata unnecessary for retrieving the file data
8466  * contents, such as atime, mtime, ctime, etc., also be committed to stable
8467  * storage.
8468  *
8469  * Parameters:	p				The process
8470  *		uap->fd				The descriptor to synchronize
8471  *		flags				The data integrity flags
8472  *
8473  * Returns:	int				Success
8474  *	fp_getfvp:EBADF				Bad file descriptor
8475  *	fp_getfvp:ENOTSUP			fd does not refer to a vnode
8476  *	VNOP_FSYNC:???				unspecified
8477  *
8478  * Notes:	We use struct fsync_args because it is a short name, and all
8479  *		caller argument structures are otherwise identical.
8480  */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve the descriptor to a vnode; takes an fd reference. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* flags is MNT_WAIT (fsync) or MNT_DWAIT (fdatasync). */
	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8518 
8519 /*
8520  * Duplicate files.  Source must be a file, target must be a file or
8521  * must not exist.
8522  *
8523  * XXX Copyfile authorisation checking is woefully inadequate, and will not
8524  *     perform inheritance correctly.
8525  */
8526 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source; the iocount on fvp is held until 'out1'. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/*
	 * Look up the target for creation; SAVESTART retains the start
	 * directory, which is released as 'sdvp' at 'out'.
	 */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is acceptable only with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Neither source nor target may be a directory. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* Sockets are not supported, except fdesc pseudo-vnodes. */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/*
	 * Require read on the source, delete on any existing target, and
	 * add-file on the target directory.
	 */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* Copying a file over its own parent directory is invalid. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		error = -1;     /* translated to success at return */
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	if (error == -1) {
		return 0;
	}
	return error;
}
8633 
8634 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8635 
8636 /*
8637  * Helper function for doing clones. The caller is expected to provide an
8638  * iocounted source vnode and release it.
8639  */
/*
 * Helper function for doing clones. The caller is expected to provide an
 * iocounted source vnode and release it.
 *
 * Parameters:	fvp			source vnode (iocount held by caller)
 *		data_read_authorised	TRUE when read access to the source's
 *					data has already been authorized (the
 *					fclonefileat(2) path)
 *		dst_dirfd		directory fd used to resolve 'dst'
 *		dst			user pointer to the destination path
 *		flags			CLONE_* flags
 *		ctx			vfs context for the operation
 *
 * Returns:	0			Success
 *		!0			errno
 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata *tondp = NULL;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted = 0;
	struct {
		struct vnode_attr va[2];
	} *va2p = NULL;
	struct vnode_attr *vap = NULL;
	struct vnode_attr *nvap = NULL;
	uint32_t vnop_flags;

	/* Only regular files, symlinks and (non-root) directories clone. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
	/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		/* Volume roots and mount points cannot be cloned. */
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Look up the destination (must not exist); WANTPARENT gives tdvp. */
	tondp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(tondp, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if (flags & CLONE_NOFOLLOW_ANY) {
		tondp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = nameiat(tondp, dst_dirfd))) {
		kfree_type(struct nameidata, tondp);
		return error;
	}
	cnp = &tondp->ni_cnd;
	tdvp = tondp->ni_dvp;
	tvp = tondp->ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* The destination must not already exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Clones are only possible within a single mount. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/* Skip the read-data check if the caller's open already did it. */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	va2p = kalloc_type(typeof(*va2p), Z_WAITOK | Z_NOFAIL);
	vap = &va2p->va[0];
	nvap = &va2p->va[1];

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACLs unless the CLONE_ACL
	 * flag is specified. By default, the clone file will inherit the target
	 * directory's ACLs unless the CLONE_ACL flag is specified then it
	 * will inherit the source file's ACLs instead.
	 */
	VATTR_INIT(vap);
	VATTR_WANTED(vap, va_uid);
	VATTR_WANTED(vap, va_gid);
	VATTR_WANTED(vap, va_mode);
	VATTR_WANTED(vap, va_flags);
	if (flags & CLONE_ACL) {
		VATTR_WANTED(vap, va_acl);
	}

	if ((error = vnode_getattr(fvp, vap, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(nvap);
	VATTR_SET(nvap, va_type, v_type);
	if (VATTR_IS_SUPPORTED(vap, va_acl) && vap->va_acl != NULL) {
		VATTR_SET(nvap, va_acl, vap->va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, nvap, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, nvap, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(vap, va_uid)) {
			VATTR_SET(nvap, va_uid, vap->va_uid);
		}
		if (VATTR_IS_SUPPORTED(vap, va_gid)) {
			VATTR_SET(nvap, va_gid, vap->va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(vap, va_mode)) {
		VATTR_SET(nvap, va_mode, vap->va_mode);
	}
	if (VATTR_IS_SUPPORTED(vap, va_flags)) {
		VATTR_SET(nvap, va_flags,
		    ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, nvap, vnop_flags, ctx);

	if (!error && tvp) {
		int     update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(nvap)) {
			(void)vnode_setattr_fallback(tvp, nvap, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
		/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(nvap, defaulted);
	}
	if (free_src_acl && vap->va_acl) {
		kauth_acl_free(vap->va_acl);
	}
	if (va2p) {
		kfree_type(typeof(*va2p), va2p);
	}
	nameidone(tondp);
	kfree_type(struct nameidata, tondp);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8886 
8887 /*
8888  * clone files or directories, target must not exist.
8889  */
8890 /* ARGSUSED */
8891 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8892 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8893     __unused int32_t *retval)
8894 {
8895 	vnode_t fvp;
8896 	struct nameidata *ndp = NULL;
8897 	int follow;
8898 	int error;
8899 	vfs_context_t ctx = vfs_context_current();
8900 
8901 	/* Check that the flags are valid. */
8902 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
8903 	    CLONE_NOFOLLOW_ANY)) {
8904 		return EINVAL;
8905 	}
8906 
8907 	AUDIT_ARG(fd, uap->src_dirfd);
8908 
8909 	ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8910 
8911 	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8912 	NDINIT(ndp, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8913 	    UIO_USERSPACE, uap->src, ctx);
8914 	if (uap->flags & CLONE_NOFOLLOW_ANY) {
8915 		ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
8916 	}
8917 
8918 	if ((error = nameiat(ndp, uap->src_dirfd))) {
8919 		kfree_type(struct nameidata, ndp);
8920 		return error;
8921 	}
8922 
8923 	fvp = ndp->ni_vp;
8924 	nameidone(ndp);
8925 	kfree_type(struct nameidata, ndp);
8926 
8927 	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8928 	    uap->flags, ctx);
8929 
8930 	vnode_put(fvp);
8931 	return error;
8932 }
8933 
8934 int
fclonefileat(__unused proc_t p,struct fclonefileat_args * uap,__unused int32_t * retval)8935 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
8936     __unused int32_t *retval)
8937 {
8938 	vnode_t fvp;
8939 	struct fileproc *fp;
8940 	int error;
8941 	vfs_context_t ctx = vfs_context_current();
8942 
8943 	/* Check that the flags are valid. */
8944 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
8945 	    CLONE_NOFOLLOW_ANY)) {
8946 		return EINVAL;
8947 	}
8948 
8949 	AUDIT_ARG(fd, uap->src_fd);
8950 	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
8951 	if (error) {
8952 		return error;
8953 	}
8954 
8955 	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
8956 		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
8957 		error = EBADF;
8958 		goto out;
8959 	}
8960 
8961 	if ((error = vnode_getwithref(fvp))) {
8962 		goto out;
8963 	}
8964 
8965 	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
8966 
8967 	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
8968 	    uap->flags, ctx);
8969 
8970 	vnode_put(fvp);
8971 out:
8972 	file_drop(uap->src_fd);
8973 	return error;
8974 }
8975 
8976 static int
rename_submounts_callback(mount_t mp,void * arg)8977 rename_submounts_callback(mount_t mp, void *arg)
8978 {
8979 	int error = 0;
8980 	mount_t pmp = (mount_t)arg;
8981 	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8982 
8983 	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8984 		return 0;
8985 	}
8986 
8987 	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8988 		return 0;
8989 	}
8990 
8991 	if ((error = vfs_busy(mp, LK_NOWAIT))) {
8992 		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8993 		return -1;
8994 	}
8995 
8996 	size_t pathlen = MAXPATHLEN;
8997 	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8998 		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8999 	}
9000 
9001 	vfs_unbusy(mp);
9002 
9003 	return error;
9004 }
9005 
9006 /*
9007  * Rename files.  Source and destination must either both be directories,
9008  * or both not be directories.  If target is a directory, it must be empty.
9009  */
9010 /* ARGSUSED */
9011 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)9012 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
9013     int tofd, user_addr_t to, int segflg, u_int uflags)
9014 {
9015 	vnode_t tvp, tdvp;
9016 	vnode_t fvp, fdvp;
9017 	vnode_t mnt_fvp;
9018 	struct nameidata *fromnd, *tond;
9019 	int error = 0;
9020 	int do_retry;
9021 	int retry_count;
9022 	int mntrename;
9023 	int need_event;
9024 	int need_kpath2;
9025 	int has_listeners;
9026 	const char *oname = NULL;
9027 	char *from_name = NULL, *to_name = NULL;
9028 	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
9029 	int from_len = 0, to_len = 0;
9030 	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
9031 	int holding_mntlock;
9032 	int vn_authorize_skipped;
9033 	mount_t locked_mp = NULL;
9034 	vnode_t oparent = NULLVP;
9035 #if CONFIG_FSE
9036 	fse_info from_finfo = {}, to_finfo;
9037 #endif
9038 	int from_truncated = 0, to_truncated = 0;
9039 	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
9040 	int batched = 0;
9041 	struct vnode_attr *fvap, *tvap;
9042 	int continuing = 0;
9043 	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
9044 	int32_t nofollow_any = 0;
9045 	/* carving out a chunk for structs that are too big to be on stack. */
9046 	struct {
9047 		struct nameidata from_node, to_node;
9048 		struct vnode_attr fv_attr, tv_attr;
9049 	} * __rename_data;
9050 
9051 	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
9052 	fromnd = &__rename_data->from_node;
9053 	tond = &__rename_data->to_node;
9054 
9055 	holding_mntlock = 0;
9056 	do_retry = 0;
9057 	retry_count = 0;
9058 retry:
9059 	fvp = tvp = NULL;
9060 	fdvp = tdvp = NULL;
9061 	fvap = tvap = NULL;
9062 	mnt_fvp = NULLVP;
9063 	mntrename = FALSE;
9064 	vn_authorize_skipped = FALSE;
9065 
9066 	if (uflags & RENAME_NOFOLLOW_ANY) {
9067 		nofollow_any = NAMEI_NOFOLLOW_ANY;
9068 	}
9069 	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
9070 	    segflg, from, ctx);
9071 	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
9072 
9073 	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
9074 	    segflg, to, ctx);
9075 	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
9076 
9077 continue_lookup:
9078 	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9079 		if ((error = nameiat(fromnd, fromfd))) {
9080 			goto out1;
9081 		}
9082 		fdvp = fromnd->ni_dvp;
9083 		fvp  = fromnd->ni_vp;
9084 
9085 		if (fvp && fvp->v_type == VDIR) {
9086 			tond->ni_cnd.cn_flags |= WILLBEDIR;
9087 		}
9088 	}
9089 
9090 	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9091 		if ((error = nameiat(tond, tofd))) {
9092 			/*
9093 			 * Translate error code for rename("dir1", "dir2/.").
9094 			 */
9095 			if (error == EISDIR && fvp->v_type == VDIR) {
9096 				error = EINVAL;
9097 			}
9098 			goto out1;
9099 		}
9100 		tdvp = tond->ni_dvp;
9101 		tvp  = tond->ni_vp;
9102 	}
9103 
9104 #if DEVELOPMENT || DEBUG
9105 	/*
9106 	 * XXX VSWAP: Check for entitlements or special flag here
9107 	 * so we can restrict access appropriately.
9108 	 */
9109 #else /* DEVELOPMENT || DEBUG */
9110 
9111 	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
9112 		error = EPERM;
9113 		goto out1;
9114 	}
9115 
9116 	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
9117 		error = EPERM;
9118 		goto out1;
9119 	}
9120 #endif /* DEVELOPMENT || DEBUG */
9121 
9122 	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
9123 		error = ENOENT;
9124 		goto out1;
9125 	}
9126 
9127 	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
9128 		int32_t pval = 0;
9129 		int err = 0;
9130 
9131 		/*
9132 		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
9133 		 * has the same name as target iff the following conditions are met:
9134 		 * 1. the target file system is case insensitive
9135 		 * 2. source and target directories are the same
9136 		 * 3. source and target files are the same
9137 		 * 4. name only differs in case (determined by underlying filesystem)
9138 		 */
9139 		if (fvp != tvp || fdvp != tdvp) {
9140 			error = EEXIST;
9141 			goto out1;
9142 		}
9143 
9144 		/*
9145 		 * Assume that the target file system is case sensitive if
9146 		 * _PC_CASE_SENSITIVE selector isn't supported.
9147 		 */
9148 		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
9149 		if (err != 0 || pval != 0) {
9150 			error = EEXIST;
9151 			goto out1;
9152 		}
9153 	}
9154 
9155 	batched = vnode_compound_rename_available(fdvp);
9156 
9157 #if CONFIG_FSE
9158 	need_event = need_fsevent(FSE_RENAME, fdvp);
9159 	if (need_event) {
9160 		if (fvp) {
9161 			get_fse_info(fvp, &from_finfo, ctx);
9162 		} else {
9163 			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
9164 			if (error) {
9165 				goto out1;
9166 			}
9167 
9168 			fvap = &__rename_data->fv_attr;
9169 		}
9170 
9171 		if (tvp) {
9172 			get_fse_info(tvp, &to_finfo, ctx);
9173 		} else if (batched) {
9174 			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
9175 			if (error) {
9176 				goto out1;
9177 			}
9178 
9179 			tvap = &__rename_data->tv_attr;
9180 		}
9181 	}
9182 #else
9183 	need_event = 0;
9184 #endif /* CONFIG_FSE */
9185 
9186 	has_listeners = kauth_authorize_fileop_has_listeners();
9187 
9188 	need_kpath2 = 0;
9189 #if CONFIG_AUDIT
9190 	if (AUDIT_RECORD_EXISTS()) {
9191 		need_kpath2 = 1;
9192 	}
9193 #endif
9194 
9195 	if (need_event || has_listeners) {
9196 		if (from_name == NULL) {
9197 			GET_PATH(from_name);
9198 		}
9199 
9200 		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
9201 
9202 		if (from_name_no_firmlink == NULL) {
9203 			GET_PATH(from_name_no_firmlink);
9204 		}
9205 
9206 		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
9207 	}
9208 
9209 	if (need_event || need_kpath2 || has_listeners) {
9210 		if (to_name == NULL) {
9211 			GET_PATH(to_name);
9212 		}
9213 
9214 		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
9215 
9216 		if (to_name_no_firmlink == NULL) {
9217 			GET_PATH(to_name_no_firmlink);
9218 		}
9219 
9220 		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
9221 		if (to_name && need_kpath2) {
9222 			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9223 		}
9224 	}
9225 	if (!fvp) {
9226 		/*
9227 		 * Claim: this check will never reject a valid rename.
9228 		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9229 		 * Suppose fdvp and tdvp are not on the same mount.
9230 		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
9231 		 *      then you can't move it to within another dir on the same mountpoint.
9232 		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9233 		 *
9234 		 * If this check passes, then we are safe to pass these vnodes to the same FS.
9235 		 */
9236 		if (fdvp->v_mount != tdvp->v_mount) {
9237 			error = EXDEV;
9238 			goto out1;
9239 		}
9240 		goto skipped_lookup;
9241 	}
9242 
9243 	/*
9244 	 * If the source and destination are the same (i.e. they're
9245 	 * links to the same vnode) and the target file system is
9246 	 * case sensitive, then there is nothing to do.
9247 	 *
9248 	 * XXX Come back to this.
9249 	 */
9250 	if (fvp == tvp) {
9251 		int pathconf_val;
9252 
9253 		/*
9254 		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9255 		 * then assume that this file system is case sensitive.
9256 		 */
9257 		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9258 		    pathconf_val != 0) {
9259 			vn_authorize_skipped = TRUE;
9260 			goto out1;
9261 		}
9262 	}
9263 
9264 	/*
9265 	 * Allow the renaming of mount points.
9266 	 * - target must not exist
9267 	 * - target must reside in the same directory as source
9268 	 * - union mounts cannot be renamed
9269 	 * - the root fs, and tightly-linked system volumes, cannot be renamed
9270 	 *
9271 	 * XXX Handle this in VFS after a continued lookup (if we missed
9272 	 * in the cache to start off)
9273 	 *
9274 	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9275 	 * we'll skip past here.  The file system is responsible for
9276 	 * checking that @tvp is not a descendent of @fvp and vice versa
9277 	 * so it should always return EINVAL if either @tvp or @fvp is the
9278 	 * root of a volume.
9279 	 */
9280 	if ((fvp->v_flag & VROOT) &&
9281 	    (fvp->v_type == VDIR) &&
9282 	    (tvp == NULL) &&
9283 	    (fvp->v_mountedhere == NULL) &&
9284 	    (fdvp == tdvp) &&
9285 	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9286 	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9287 	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9288 		vnode_t coveredvp;
9289 
9290 		/* switch fvp to the covered vnode */
9291 		coveredvp = fvp->v_mount->mnt_vnodecovered;
9292 		if ((vnode_getwithref(coveredvp))) {
9293 			error = ENOENT;
9294 			goto out1;
9295 		}
9296 		/*
9297 		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9298 		 * later.
9299 		 */
9300 		mnt_fvp = fvp;
9301 
9302 		fvp = coveredvp;
9303 		mntrename = TRUE;
9304 	}
9305 	/*
9306 	 * Check for cross-device rename.
9307 	 * For rename on mountpoint, we want to also check the source and its parent
9308 	 * belong to the same mountpoint.
9309 	 */
9310 	if ((fvp->v_mount != tdvp->v_mount) ||
9311 	    (fvp->v_mount != fdvp->v_mount) ||
9312 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
9313 		error = EXDEV;
9314 		goto out1;
9315 	}
9316 
9317 	/*
9318 	 * If source is the same as the destination (that is the
9319 	 * same inode number) then there is nothing to do...
9320 	 * EXCEPT if the underlying file system supports case
9321 	 * insensitivity and is case preserving.  In this case
9322 	 * the file system needs to handle the special case of
9323 	 * getting the same vnode as target (fvp) and source (tvp).
9324 	 *
9325 	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9326 	 * and _PC_CASE_PRESERVING can have this exception, and they need to
9327 	 * handle the special case of getting the same vnode as target and
9328 	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
9329 	 * so not to cause locking problems. There is a single reference on tvp.
9330 	 *
9331 	 * NOTE - that fvp == tvp also occurs if they are hard linked and
9332 	 * that correct behaviour then is just to return success without doing
9333 	 * anything.
9334 	 *
9335 	 * XXX filesystem should take care of this itself, perhaps...
9336 	 */
9337 	if (fvp == tvp && fdvp == tdvp) {
9338 		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9339 		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
9340 		    fromnd->ni_cnd.cn_namelen)) {
9341 			vn_authorize_skipped = TRUE;
9342 			goto out1;
9343 		}
9344 	}
9345 
9346 	if (holding_mntlock && fvp->v_mount != locked_mp) {
9347 		/*
9348 		 * we're holding a reference and lock
9349 		 * on locked_mp, but it no longer matches
9350 		 * what we want to do... so drop our hold
9351 		 */
9352 		mount_unlock_renames(locked_mp);
9353 		mount_drop(locked_mp, 0);
9354 		holding_mntlock = 0;
9355 	}
9356 	if (tdvp != fdvp && fvp->v_type == VDIR) {
9357 		/*
9358 		 * serialize renames that re-shape
9359 		 * the tree... if holding_mntlock is
9360 		 * set, then we're ready to go...
9361 		 * otherwise we
9362 		 * first need to drop the iocounts
9363 		 * we picked up, second take the
9364 		 * lock to serialize the access,
9365 		 * then finally start the lookup
9366 		 * process over with the lock held
9367 		 */
9368 		if (!holding_mntlock) {
9369 			/*
9370 			 * need to grab a reference on
9371 			 * the mount point before we
9372 			 * drop all the iocounts... once
9373 			 * the iocounts are gone, the mount
9374 			 * could follow
9375 			 */
9376 			locked_mp = fvp->v_mount;
9377 			mount_ref(locked_mp, 0);
9378 
9379 			/*
9380 			 * nameidone has to happen before we vnode_put(tvp)
9381 			 * since it may need to release the fs_nodelock on the tvp
9382 			 */
9383 			nameidone(tond);
9384 
9385 			if (tvp) {
9386 				vnode_put(tvp);
9387 			}
9388 			vnode_put(tdvp);
9389 
9390 			/*
9391 			 * nameidone has to happen before we vnode_put(fdvp)
9392 			 * since it may need to release the fs_nodelock on the fvp
9393 			 */
9394 			nameidone(fromnd);
9395 
9396 			vnode_put(fvp);
9397 			vnode_put(fdvp);
9398 
9399 			if (mnt_fvp != NULLVP) {
9400 				vnode_put(mnt_fvp);
9401 			}
9402 
9403 			mount_lock_renames(locked_mp);
9404 			holding_mntlock = 1;
9405 
9406 			goto retry;
9407 		}
9408 	} else {
9409 		/*
9410 		 * when we dropped the iocounts to take
9411 		 * the lock, we allowed the identity of
9412 		 * the various vnodes to change... if they did,
9413 		 * we may no longer be dealing with a rename
9414 		 * that reshapes the tree... once we're holding
9415 		 * the iocounts, the vnodes can't change type
9416 		 * so we're free to drop the lock at this point
9417 		 * and continue on
9418 		 */
9419 		if (holding_mntlock) {
9420 			mount_unlock_renames(locked_mp);
9421 			mount_drop(locked_mp, 0);
9422 			holding_mntlock = 0;
9423 		}
9424 	}
9425 
9426 	if (!batched) {
9427 		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
9428 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9429 		    flags, NULL);
9430 		if (error) {
9431 			if (error == ENOENT) {
9432 				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9433 					/*
9434 					 * We encountered a race where after doing the namei,
9435 					 * tvp stops being valid. If so, simply re-drive the rename
9436 					 * call from the top.
9437 					 */
9438 					do_retry = 1;
9439 					retry_count += 1;
9440 				}
9441 			}
9442 			goto out1;
9443 		}
9444 	}
9445 
9446 	/* Release the 'mnt_fvp' now that it is no longer needed. */
9447 	if (mnt_fvp != NULLVP) {
9448 		vnode_put(mnt_fvp);
9449 		mnt_fvp = NULLVP;
9450 	}
9451 
9452 	// save these off so we can later verify that fvp is the same
9453 	oname   = fvp->v_name;
9454 	oparent = fvp->v_parent;
9455 
9456 skipped_lookup:
9457 #if CONFIG_FILE_LEASES
9458 	/* Lease break needed for source's parent dir? */
9459 	vnode_breakdirlease(fdvp, false, O_WRONLY);
9460 
9461 	/* Lease break needed for target's parent dir? */
9462 	vnode_breakdirlease(tdvp, false, O_WRONLY);
9463 #endif
9464 
9465 	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
9466 	    tdvp, &tvp, &tond->ni_cnd, tvap,
9467 	    flags, ctx);
9468 
9469 	if (holding_mntlock) {
9470 		/*
9471 		 * we can drop our serialization
9472 		 * lock now
9473 		 */
9474 		mount_unlock_renames(locked_mp);
9475 		mount_drop(locked_mp, 0);
9476 		holding_mntlock = 0;
9477 	}
9478 	if (error) {
9479 		if (error == EDATALESS) {
9480 			/*
9481 			 * If we've been here before, something has gone
9482 			 * horribly wrong and we should just get out lest
9483 			 * we spiral around the drain forever.
9484 			 */
9485 			if (flags & VFS_RENAME_DATALESS) {
9486 				error = EIO;
9487 				goto out1;
9488 			}
9489 
9490 			/*
9491 			 * The object we're renaming is dataless (or has a
9492 			 * dataless descendent) and requires materialization
9493 			 * before the rename occurs.  But we're holding the
9494 			 * mount point's rename lock, so it's not safe to
9495 			 * make the upcall.
9496 			 *
9497 			 * In this case, we release the lock (above), perform
9498 			 * the materialization, and start the whole thing over.
9499 			 */
9500 			error = vfs_materialize_reparent(fvp, tdvp);
9501 			if (error == 0) {
9502 				/*
9503 				 * The next time around we need to tell the
9504 				 * file system that the materializtaion has
9505 				 * been performed.
9506 				 */
9507 				flags |= VFS_RENAME_DATALESS;
9508 				do_retry = 1;
9509 			}
9510 			goto out1;
9511 		}
9512 		if (error == EKEEPLOOKING) {
9513 			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9514 				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9515 					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
9516 				}
9517 			}
9518 
9519 			fromnd->ni_vp = fvp;
9520 			tond->ni_vp = tvp;
9521 
9522 			goto continue_lookup;
9523 		}
9524 
9525 		/*
9526 		 * We may encounter a race in the VNOP where the destination didn't
9527 		 * exist when we did the namei, but it does by the time we go and
9528 		 * try to create the entry. In this case, we should re-drive this rename
9529 		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
9530 		 * but other filesystems susceptible to this race could return it, too.
9531 		 */
9532 		if (error == ERECYCLE) {
9533 			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
9534 				do_retry = 1;
9535 				retry_count += 1;
9536 			} else {
9537 				printf("rename retry limit due to ERECYCLE reached\n");
9538 				error = ENOENT;
9539 			}
9540 		}
9541 
9542 		/*
9543 		 * For compound VNOPs, the authorization callback may return
9544 		 * ENOENT in case of racing hardlink lookups hitting the name
9545 		 * cache, redrive the lookup.
9546 		 */
9547 		if (batched && error == ENOENT) {
9548 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9549 				do_retry = 1;
9550 				retry_count += 1;
9551 			}
9552 		}
9553 
9554 		goto out1;
9555 	}
9556 
9557 	/* call out to allow 3rd party notification of rename.
9558 	 * Ignore result of kauth_authorize_fileop call.
9559 	 */
9560 	kauth_authorize_fileop(vfs_context_ucred(ctx),
9561 	    KAUTH_FILEOP_RENAME,
9562 	    (uintptr_t)from_name, (uintptr_t)to_name);
9563 	if (flags & VFS_RENAME_SWAP) {
9564 		kauth_authorize_fileop(vfs_context_ucred(ctx),
9565 		    KAUTH_FILEOP_RENAME,
9566 		    (uintptr_t)to_name, (uintptr_t)from_name);
9567 	}
9568 
9569 #if CONFIG_FSE
9570 	if (from_name != NULL && to_name != NULL) {
9571 		if (from_truncated || to_truncated) {
9572 			// set it here since only the from_finfo gets reported up to user space
9573 			from_finfo.mode |= FSE_TRUNCATED_PATH;
9574 		}
9575 
9576 		if (tvap && tvp) {
9577 			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
9578 		}
9579 		if (fvap) {
9580 			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
9581 		}
9582 
9583 		if (tvp) {
9584 			add_fsevent(FSE_RENAME, ctx,
9585 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9586 			    FSE_ARG_FINFO, &from_finfo,
9587 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9588 			    FSE_ARG_FINFO, &to_finfo,
9589 			    FSE_ARG_DONE);
9590 			if (flags & VFS_RENAME_SWAP) {
9591 				/*
9592 				 * Strictly speaking, swap is the equivalent of
9593 				 * *three* renames.  FSEvents clients should only take
9594 				 * the events as a hint, so we only bother reporting
9595 				 * two.
9596 				 */
9597 				add_fsevent(FSE_RENAME, ctx,
9598 				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9599 				    FSE_ARG_FINFO, &to_finfo,
9600 				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9601 				    FSE_ARG_FINFO, &from_finfo,
9602 				    FSE_ARG_DONE);
9603 			}
9604 		} else {
9605 			add_fsevent(FSE_RENAME, ctx,
9606 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9607 			    FSE_ARG_FINFO, &from_finfo,
9608 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9609 			    FSE_ARG_DONE);
9610 		}
9611 	}
9612 #endif /* CONFIG_FSE */
9613 
9614 	/*
9615 	 * update filesystem's mount point data
9616 	 */
9617 	if (mntrename) {
9618 		char *cp, *pathend, *mpname;
9619 		char * tobuf;
9620 		struct mount *mp;
9621 		int maxlen;
9622 		size_t len = 0;
9623 
9624 		mp = fvp->v_mountedhere;
9625 
9626 		if (vfs_busy(mp, LK_NOWAIT)) {
9627 			error = EBUSY;
9628 			goto out1;
9629 		}
9630 		tobuf = zalloc(ZV_NAMEI);
9631 
9632 		if (UIO_SEG_IS_USER_SPACE(segflg)) {
9633 			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
9634 		} else {
9635 			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
9636 		}
9637 		if (!error) {
9638 			/* find current mount point prefix */
9639 			pathend = &mp->mnt_vfsstat.f_mntonname[0];
9640 			for (cp = pathend; *cp != '\0'; ++cp) {
9641 				if (*cp == '/') {
9642 					pathend = cp + 1;
9643 				}
9644 			}
9645 			/* find last component of target name */
9646 			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
9647 				if (*cp == '/') {
9648 					mpname = cp + 1;
9649 				}
9650 			}
9651 
9652 			/* Update f_mntonname of sub mounts */
9653 			vfs_iterate(0, rename_submounts_callback, (void *)mp);
9654 
9655 			/* append name to prefix */
9656 			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
9657 			bzero(pathend, maxlen);
9658 
9659 			strlcpy(pathend, mpname, maxlen);
9660 		}
9661 		zfree(ZV_NAMEI, tobuf);
9662 
9663 		vfs_unbusy(mp);
9664 
9665 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
9666 	}
9667 	/*
9668 	 * fix up name & parent pointers.  note that we first
9669 	 * check that fvp has the same name/parent pointers it
9670 	 * had before the rename call... this is a 'weak' check
9671 	 * at best...
9672 	 *
9673 	 * XXX oparent and oname may not be set in the compound vnop case
9674 	 */
9675 	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
9676 		int update_flags;
9677 
9678 		update_flags = VNODE_UPDATE_NAME;
9679 
9680 		if (fdvp != tdvp) {
9681 			update_flags |= VNODE_UPDATE_PARENT;
9682 		}
9683 
9684 		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
9685 	}
9686 out1:
9687 	/*
9688 	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
9689 	 * skipped earlier as no actual rename was performed.
9690 	 */
9691 	if (vn_authorize_skipped && error == 0) {
9692 		error = vn_authorize_renamex_with_paths(fdvp, fvp,
9693 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9694 		    flags, NULL);
9695 		if (error && error == ENOENT) {
9696 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9697 				do_retry = 1;
9698 				retry_count += 1;
9699 			}
9700 		}
9701 	}
9702 	if (to_name != NULL) {
9703 		RELEASE_PATH(to_name);
9704 		to_name = NULL;
9705 	}
9706 	if (to_name_no_firmlink != NULL) {
9707 		RELEASE_PATH(to_name_no_firmlink);
9708 		to_name_no_firmlink = NULL;
9709 	}
9710 	if (from_name != NULL) {
9711 		RELEASE_PATH(from_name);
9712 		from_name = NULL;
9713 	}
9714 	if (from_name_no_firmlink != NULL) {
9715 		RELEASE_PATH(from_name_no_firmlink);
9716 		from_name_no_firmlink = NULL;
9717 	}
9718 	if (holding_mntlock) {
9719 		mount_unlock_renames(locked_mp);
9720 		mount_drop(locked_mp, 0);
9721 		holding_mntlock = 0;
9722 	}
9723 	if (tdvp) {
9724 		/*
9725 		 * nameidone has to happen before we vnode_put(tdvp)
9726 		 * since it may need to release the fs_nodelock on the tdvp
9727 		 */
9728 		nameidone(tond);
9729 
9730 		if (tvp) {
9731 			vnode_put(tvp);
9732 		}
9733 		vnode_put(tdvp);
9734 	}
9735 	if (fdvp) {
9736 		/*
9737 		 * nameidone has to happen before we vnode_put(fdvp)
9738 		 * since it may need to release the fs_nodelock on the fdvp
9739 		 */
9740 		nameidone(fromnd);
9741 
9742 		if (fvp) {
9743 			vnode_put(fvp);
9744 		}
9745 		vnode_put(fdvp);
9746 	}
9747 	if (mnt_fvp != NULLVP) {
9748 		vnode_put(mnt_fvp);
9749 	}
9750 	/*
9751 	 * If things changed after we did the namei, then we will re-drive
9752 	 * this rename call from the top.
9753 	 */
9754 	if (do_retry) {
9755 		do_retry = 0;
9756 		goto retry;
9757 	}
9758 
9759 	kfree_type(typeof(*__rename_data), __rename_data);
9760 	return error;
9761 }
9762 
9763 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9764 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9765 {
9766 	return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9767 	           AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9768 }
9769 
9770 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9771 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9772 {
9773 	if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9774 		return EINVAL;
9775 	}
9776 
9777 	if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9778 		return EINVAL;
9779 	}
9780 
9781 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9782 	           uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9783 }
9784 
9785 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9786 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9787 {
9788 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9789 	           uap->tofd, uap->to, UIO_USERSPACE, 0);
9790 }
9791 
9792 /*
9793  * Make a directory file.
9794  *
9795  * Returns:	0			Success
9796  *		EEXIST
9797  *	namei:???
9798  *	vnode_authorize:???
9799  *	vn_create:???
9800  */
9801 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/*
	 * Look up the parent directory.  Request a compound mkdir so file
	 * systems that implement the compound VNOP can perform the lookup
	 * and the create in a single operation.
	 */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A non-compound lookup found an existing entry by this name. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop the CREATE lookup state before re-driving. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				/* Target exists: report EEXIST, not the auth error. */
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any directory lease on the parent before modifying it. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		/*
		 * EKEEPLOOKING: the compound VNOP needs the lookup resumed;
		 * 'nd' carries the continuation state.
		 */
		if (error == EKEEPLOOKING) {
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9917 
9918 /*
9919  * mkdir_extended: Create a directory; with extended security (ACL).
9920  *
9921  * Parameters:    p                       Process requesting to create the directory
9922  *                uap                     User argument descriptor (see below)
9923  *                retval                  (ignored)
9924  *
9925  * Indirect:      uap->path               Path of directory to create
9926  *                uap->mode               Access permissions to set
9927  *                uap->xsecurity          ACL to set
9928  *
9929  * Returns:        0                      Success
9930  *                !0                      Not success
9931  *
9932  */
9933 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9934 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9935 {
9936 	int ciferror;
9937 	kauth_filesec_t xsecdst;
9938 	struct vnode_attr va;
9939 
9940 	AUDIT_ARG(owner, uap->uid, uap->gid);
9941 
9942 	xsecdst = NULL;
9943 	if ((uap->xsecurity != USER_ADDR_NULL) &&
9944 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9945 		return ciferror;
9946 	}
9947 
9948 	VATTR_INIT(&va);
9949 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9950 	if (xsecdst != NULL) {
9951 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9952 		va.va_vaflags |= VA_FILESEC_ACL;
9953 	}
9954 
9955 	ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9956 	    UIO_USERSPACE);
9957 	if (xsecdst != NULL) {
9958 		kauth_filesec_free(xsecdst);
9959 	}
9960 	return ciferror;
9961 }
9962 
9963 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9964 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9965 {
9966 	struct vnode_attr va;
9967 
9968 	VATTR_INIT(&va);
9969 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9970 
9971 	return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9972 	           UIO_USERSPACE);
9973 }
9974 
9975 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9976 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9977 {
9978 	struct vnode_attr va;
9979 
9980 	VATTR_INIT(&va);
9981 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9982 
9983 	return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9984 	           UIO_USERSPACE);
9985 }
9986 
/*
 * Remove a directory, resolving 'dirpath' relative to 'fd' (AT_FDCWD for
 * the current working directory).
 *
 * 'unlink_flags' may contain VNODE_REMOVE_NOFOLLOW_ANY (do not traverse
 * symlinks during the lookup) and/or VNODE_REMOVE_DATALESS_DIR (permit
 * removal of a non-empty directory that is marked SF_DATALESS).
 *
 * Returns 0 on success or an errno value.
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated to keep the large nameidata off the kernel stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char     *path = NULL;
	char     *no_firmlink_path = NULL;
	int       len_path = 0;
	int       len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;
	int batched;

	int restart_flag;
	int nofollow_any = 0;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/* Translate the remove flag into its namei equivalent. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete.  Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						/* Racing lookup invalidated vp: re-drive, bounded. */
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: the lookup deferred to a compound rmdir VNOP. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info  finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Compound case: the VNOP fills 'vap' for us later. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Break any directory lease on the parent before modifying it. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup resumed with current nd state. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * Only do this if the directory is actually
			 * marked as DATALESS.
			 */
			struct vnode_attr *lvap =
			    kalloc_type(struct vnode_attr, Z_WAITOK);

			VATTR_INIT(lvap);
			VATTR_WANTED(lvap, va_flags);
			if (vnode_getattr(vp, lvap, ctx) == 0 &&
			    VATTR_IS_SUPPORTED(lvap, va_flags) &&
			    (lvap->va_flags & SF_DATALESS) != 0) {
				/*
				 * If this fails, we want to keep the original
				 * error.
				 */
				if (vn_remove(dvp, &vp, ndp,
				    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
					error = 0;
				}
			}
			kfree_type(struct vnode_attr, lvap);
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		if (restart_flag == 0) {
			/*
			 * NOTE(review): 'vp' appears to be used only as a
			 * wait-channel address for the AppleDouble restart
			 * handshake (paired with the tsleep below); its
			 * iocount was already dropped above — confirm no
			 * dereference occurs via these calls.
			 */
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		/* Let the competing remover make progress, then retry. */
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10288 
10289 /*
10290  * Remove a directory file.
10291  */
10292 /* ARGSUSED */
10293 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10294 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10295 {
10296 	return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10297 	           CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10298 }
10299 
/*
 * Size of a struct direntry (extended, 64-bit-inode layout) holding a
 * name of 'namlen' bytes, padded to 8-byte alignment.  struct direntry
 * embeds a MAXPATHLEN-sized name buffer, hence the subtraction.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Size of a legacy struct dirent holding a name of 'namelen' bytes plus
 * its NUL terminator, padded to 4-byte alignment.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Address of the last byte of this dirent, per its d_reclen field. */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10311 
10312 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10313 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10314     int *numdirent, vfs_context_t ctxp)
10315 {
10316 	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
10317 	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10318 	    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10319 		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10320 	} else {
10321 		size_t bufsize;
10322 		void * bufptr;
10323 		uio_t auio;
10324 		struct direntry *entry64;
10325 		struct dirent *dep;
10326 		size_t bytesread;
10327 		int error;
10328 
10329 		/*
10330 		 * We're here because the underlying file system does not
10331 		 * support direnties or we mounted denying support so we must
10332 		 * fall back to dirents and convert them to direntries.
10333 		 *
10334 		 * Our kernel buffer needs to be smaller since re-packing will
10335 		 * expand each dirent.  The worse case (when the name length
10336 		 * is 3 or less) corresponds to a struct direntry size of 32
10337 		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10338 		 * (4-byte aligned).  So having a buffer that is 3/8 the size
10339 		 * will prevent us from reading more than we can pack.
10340 		 *
10341 		 * Since this buffer is wired memory, we will limit the
10342 		 * buffer size to a maximum of 32K. We would really like to
10343 		 * use 32K in the MIN(), but we use magic number 87371 to
10344 		 * prevent uio_resid() * 3 / 8 from overflowing.
10345 		 */
10346 		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10347 		bufptr = kalloc_data(bufsize, Z_WAITOK);
10348 		if (bufptr == NULL) {
10349 			return ENOMEM;
10350 		}
10351 
10352 		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10353 		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10354 		auio->uio_offset = uio->uio_offset;
10355 
10356 		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10357 
10358 		dep = (struct dirent *)bufptr;
10359 		bytesread = bufsize - uio_resid(auio);
10360 
10361 		entry64 = kalloc_type(struct direntry, Z_WAITOK);
10362 		/*
10363 		 * Convert all the entries and copy them out to user's buffer.
10364 		 */
10365 		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10366 			/* First check that the dirent struct up to d_name is within the buffer */
10367 			if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10368 			    /* Check that the length of the entire dirent is within the buffer */
10369 			    DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10370 			    /* Check that the actual length including the name doesn't exceed d_reclen */
10371 			    DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10372 				printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10373 				    vp->v_mount->mnt_vfsstat.f_mntonname,
10374 				    vp->v_name ? vp->v_name : "<unknown>");
10375 				error = EIO;
10376 				break;
10377 			}
10378 
10379 			size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
10380 
10381 			bzero(entry64, enbufsize);
10382 			/* Convert a dirent to a dirent64. */
10383 			entry64->d_ino = dep->d_ino;
10384 			entry64->d_seekoff = 0;
10385 			entry64->d_reclen = (uint16_t)enbufsize;
10386 			entry64->d_namlen = dep->d_namlen;
10387 			entry64->d_type = dep->d_type;
10388 			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10389 
10390 			/* Move to next entry. */
10391 			dep = (struct dirent *)((char *)dep + dep->d_reclen);
10392 
10393 			/* Copy entry64 to user's buffer. */
10394 			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10395 		}
10396 
10397 		/* Update the real offset using the offset we got from VNOP_READDIR. */
10398 		if (error == 0) {
10399 			uio->uio_offset = auio->uio_offset;
10400 		}
10401 		uio_free(auio);
10402 		kfree_data(bufptr, bufsize);
10403 		kfree_type(struct direntry, entry64);
10404 		return error;
10405 	}
10406 }
10407 
10408 #define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)
10409 
10410 /*
10411  * Read a block of directory entries in a file system independent format.
10412  */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	UIO_STACKBUF(uio_buf, 1);

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Take the per-file offset lock, then re-check that the fd still maps
	 * to the vnode we looked up; the union-mount traversal below can swap
	 * the fd's data vnode, in which case we drop everything and retry.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Clamp the request so an unbounded amount of memory is never wired. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read starting at the fd's current offset, directly into bufp. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If this layer returned nothing and we're on a union mount, descend
	 * to the covered directory, install it as the fd's vnode with a fresh
	 * offset, and read again from the lower layer.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10526 
10527 
10528 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10529 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10530 {
10531 	off_t offset;
10532 	ssize_t bytesread;
10533 	int error, eofflag;
10534 
10535 	AUDIT_ARG(fd, uap->fd);
10536 	error = getdirentries_common(uap->fd, uap->buf, uap->count,
10537 	    &bytesread, &offset, &eofflag, 0);
10538 
10539 	if (error == 0) {
10540 		if (proc_is64bit(p)) {
10541 			user64_long_t base = (user64_long_t)offset;
10542 			error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10543 		} else {
10544 			user32_long_t base = (user32_long_t)offset;
10545 			error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10546 		}
10547 		*retval = (int)bytesread;
10548 	}
10549 	return error;
10550 }
10551 
10552 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10553 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10554 {
10555 	off_t offset;
10556 	ssize_t bytesread;
10557 	int error, eofflag;
10558 	user_size_t bufsize;
10559 
10560 	AUDIT_ARG(fd, uap->fd);
10561 
10562 	/*
10563 	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10564 	 * then the kernel carves out the last 4 bytes to return extended
10565 	 * information to userspace (namely whether we reached EOF with this call).
10566 	 */
10567 	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10568 		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10569 	} else {
10570 		bufsize = uap->bufsize;
10571 	}
10572 
10573 	error = getdirentries_common(uap->fd, uap->buf, bufsize,
10574 	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10575 
10576 	if (error == 0) {
10577 		*retval = bytesread;
10578 		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10579 
10580 		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10581 			getdirentries64_flags_t flags = 0;
10582 			if (eofflag) {
10583 				flags |= GETDIRENTRIES64_EOF;
10584 			}
10585 			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10586 			    sizeof(flags));
10587 		}
10588 	}
10589 	return error;
10590 }
10591 
10592 
10593 /*
10594  * Set the mode mask for creation of filesystem nodes.
10595  * XXX implement xsecurity
10596  */
10597 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
10598 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10599 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10600 {
10601 	AUDIT_ARG(mask, newmask);
10602 	proc_fdlock(p);
10603 	*retval = p->p_fd.fd_cmask;
10604 	p->p_fd.fd_cmask = newmask & ALLPERMS;
10605 	proc_fdunlock(p);
10606 	return 0;
10607 }
10608 
10609 /*
10610  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10611  *
10612  * Parameters:    p                       Process requesting to set the umask
10613  *                uap                     User argument descriptor (see below)
10614  *                retval                  umask of the process (parameter p)
10615  *
10616  * Indirect:      uap->newmask            umask to set
10617  *                uap->xsecurity          ACL to set
10618  *
10619  * Returns:        0                      Success
10620  *                !0                      Not success
10621  *
10622  */
int
umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
{
	/* The xsecurity argument is currently ignored by umask1 (see its
	 * __unused fsec parameter), so this behaves like plain umask(2). */
	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
}
10628 
int
umask(proc_t p, struct umask_args *uap, int32_t *retval)
{
	/* Plain umask(2): set the creation mask, leaving xsecurity alone. */
	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
}
10634 
10635 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT                               \
10636 	"com.apple.private.vfs.revoke-mounted-device"
10637 
10638 /*
10639  * Void all references to file by ripping underlying filesystem
10640  * away from vnode.
10641  */
10642 /* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Only character and block special files can be revoked. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to revoke a block device that currently backs a mount. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* The caller must own the node or pass the superuser check. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only issue the revoke if someone actually holds the node open. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10695 
10696 
/*
 *  HFS/HFS Plus SPECIFIC SYSTEM CALLS
 *  The following system calls are designed to support features
 *  which are specific to the HFS & HFS Plus volume formats
 */
10702 
10703 
10704 /*
10705  * Obtain attribute information on objects in a directory while enumerating
10706  * the directory.
10707  */
10708 /* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	UIO_STACKBUF(uio_buf, 1);
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the requested count so it can be reset per union layer. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Take the per-file offset lock, then re-check that the fd still maps
	 * to the vnode we looked up; the union-mount traversal below can swap
	 * the fd's data vnode, in which case we drop everything and retry.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else {                                                // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* return error earlier, a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10872 
10873 /*
10874  * Exchange data between two files
10875  */
10876 
10877 /* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int   flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* Both files must be readable and writable by the caller. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/* Gather paths and file info up front if anyone is listening for events. */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}
		/* Swap the cached names/parents so they match the exchanged contents. */
		name_cache_lock();

		tmpname     = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp           = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
11028 
11029 /*
11030  * Return (in MB) the amount of freespace on the given vnode's volume.
11031  */
11032 uint32_t freespace_mb(vnode_t vp);
11033 
11034 uint32_t
freespace_mb(vnode_t vp)11035 freespace_mb(vnode_t vp)
11036 {
11037 	vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
11038 	return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
11039 	       vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
11040 }
11041 
11042 #if CONFIG_SEARCHFS
11043 
11044 /* ARGSUSED */
11045 
11046 int
searchfs(proc_t p,struct searchfs_args * uap,__unused int32_t * retval)11047 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
11048 {
11049 	vnode_t vp, tvp;
11050 	int i, error = 0;
11051 	int fserror = 0;
11052 	struct nameidata nd;
11053 	struct user64_fssearchblock searchblock;
11054 	struct searchstate *state;
11055 	struct attrlist *returnattrs;
11056 	struct timeval timelimit;
11057 	void *searchparams1, *searchparams2;
11058 	uio_t auio = NULL;
11059 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11060 	uint32_t nummatches;
11061 	size_t mallocsize;
11062 	uint32_t nameiflags;
11063 	vfs_context_t ctx = vfs_context_current();
11064 	UIO_STACKBUF(uio_buf, 1);
11065 
11066 	/* Start by copying in fsearchblock parameter list */
11067 	if (IS_64BIT_PROCESS(p)) {
11068 		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
11069 		timelimit.tv_sec = searchblock.timelimit.tv_sec;
11070 		timelimit.tv_usec = searchblock.timelimit.tv_usec;
11071 	} else {
11072 		struct user32_fssearchblock tmp_searchblock;
11073 
11074 		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
11075 		// munge into 64-bit version
11076 		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
11077 		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
11078 		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
11079 		searchblock.maxmatches = tmp_searchblock.maxmatches;
11080 		/*
11081 		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
11082 		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
11083 		 */
11084 		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
11085 		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
11086 		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
11087 		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
11088 		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
11089 		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
11090 		searchblock.searchattrs = tmp_searchblock.searchattrs;
11091 	}
11092 	if (error) {
11093 		return error;
11094 	}
11095 
11096 	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
11097 	 */
11098 	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
11099 	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
11100 		return EINVAL;
11101 	}
11102 
11103 	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
11104 	/* It all has to do into local memory and it's not that big so we might as well  put it all together. */
11105 	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
11106 	/* block.                                                                                             */
11107 	/*												      */
11108 	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
11109 	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
11110 	/*       assumes the size is still 556 bytes it will continue to work				      */
11111 
11112 	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
11113 	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
11114 
11115 	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);
11116 
11117 	/* Now set up the various pointers to the correct place in our newly allocated memory */
11118 
11119 	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
11120 	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
11121 	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
11122 
11123 	/* Now copy in the stuff given our local variables. */
11124 
11125 	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
11126 		goto freeandexit;
11127 	}
11128 
11129 	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
11130 		goto freeandexit;
11131 	}
11132 
11133 	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
11134 		goto freeandexit;
11135 	}
11136 
11137 	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
11138 		goto freeandexit;
11139 	}
11140 
11141 	/*
11142 	 * When searching a union mount, need to set the
11143 	 * start flag at the first call on each layer to
11144 	 * reset state for the new volume.
11145 	 */
11146 	if (uap->options & SRCHFS_START) {
11147 		state->ss_union_layer = 0;
11148 	} else {
11149 		uap->options |= state->ss_union_flags;
11150 	}
11151 	state->ss_union_flags = 0;
11152 
11153 	/*
11154 	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
11155 	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
11156 	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
11157 	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
11158 	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
11159 	 */
11160 
11161 	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
11162 		attrreference_t* string_ref;
11163 		u_int32_t* start_length;
11164 		user64_size_t param_length;
11165 
11166 		/* validate searchparams1 */
11167 		param_length = searchblock.sizeofsearchparams1;
11168 		/* skip the word that specifies length of the buffer */
11169 		start_length = (u_int32_t*) searchparams1;
11170 		start_length = start_length + 1;
11171 		string_ref = (attrreference_t*) start_length;
11172 
11173 		/* ensure no negative offsets or too big offsets */
11174 		if (string_ref->attr_dataoffset < 0) {
11175 			error = EINVAL;
11176 			goto freeandexit;
11177 		}
11178 		if (string_ref->attr_length > MAXPATHLEN) {
11179 			error = EINVAL;
11180 			goto freeandexit;
11181 		}
11182 
11183 		/* Check for pointer overflow in the string ref */
11184 		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
11185 			error = EINVAL;
11186 			goto freeandexit;
11187 		}
11188 
11189 		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
11190 			error = EINVAL;
11191 			goto freeandexit;
11192 		}
11193 		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
11194 			error = EINVAL;
11195 			goto freeandexit;
11196 		}
11197 	}
11198 
11199 	/* set up the uio structure which will contain the users return buffer */
11200 	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
11201 	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
11202 
11203 	nameiflags = 0;
11204 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11205 		nameiflags |= FOLLOW;
11206 	}
11207 	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
11208 	    UIO_USERSPACE, uap->path, ctx);
11209 
11210 	error = namei(&nd);
11211 	if (error) {
11212 		goto freeandexit;
11213 	}
11214 	vp = nd.ni_vp;
11215 	nameidone(&nd);
11216 
11217 	/*
11218 	 * Switch to the root vnode for the volume
11219 	 */
11220 	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
11221 	vnode_put(vp);
11222 	if (error) {
11223 		goto freeandexit;
11224 	}
11225 	vp = tvp;
11226 
11227 #if CONFIG_UNION_MOUNTS
11228 	/*
11229 	 * If it's a union mount, the path lookup takes
11230 	 * us to the top layer. But we may need to descend
11231 	 * to a lower layer. For non-union mounts the layer
11232 	 * is always zero.
11233 	 */
11234 	for (i = 0; i < (int) state->ss_union_layer; i++) {
11235 		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
11236 			break;
11237 		}
11238 		tvp = vp;
11239 		vp = vp->v_mount->mnt_vnodecovered;
11240 		if (vp == NULL) {
11241 			vnode_put(tvp);
11242 			error = ENOENT;
11243 			goto freeandexit;
11244 		}
11245 		error = vnode_getwithref(vp);
11246 		vnode_put(tvp);
11247 		if (error) {
11248 			goto freeandexit;
11249 		}
11250 	}
11251 #endif /* CONFIG_UNION_MOUNTS */
11252 
11253 #if CONFIG_MACF
11254 	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
11255 	if (error) {
11256 		vnode_put(vp);
11257 		goto freeandexit;
11258 	}
11259 #endif
11260 
11261 
11262 	/*
11263 	 * If searchblock.maxmatches == 0, then skip the search. This has happened
11264 	 * before and sometimes the underlying code doesnt deal with it well.
11265 	 */
11266 	if (searchblock.maxmatches == 0) {
11267 		nummatches = 0;
11268 		goto saveandexit;
11269 	}
11270 
11271 	/*
11272 	 * Allright, we have everything we need, so lets make that call.
11273 	 *
11274 	 * We keep special track of the return value from the file system:
11275 	 * EAGAIN is an acceptable error condition that shouldn't keep us
11276 	 * from copying out any results...
11277 	 */
11278 
11279 	fserror = VNOP_SEARCHFS(vp,
11280 	    searchparams1,
11281 	    searchparams2,
11282 	    &searchblock.searchattrs,
11283 	    (uint32_t)searchblock.maxmatches,
11284 	    &timelimit,
11285 	    returnattrs,
11286 	    &nummatches,
11287 	    (uint32_t)uap->scriptcode,
11288 	    (uint32_t)uap->options,
11289 	    auio,
11290 	    (struct searchstate *) &state->ss_fsstate,
11291 	    ctx);
11292 
11293 #if CONFIG_UNION_MOUNTS
11294 	/*
11295 	 * If it's a union mount we need to be called again
11296 	 * to search the mounted-on filesystem.
11297 	 */
11298 	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
11299 		state->ss_union_flags = SRCHFS_START;
11300 		state->ss_union_layer++;        // search next layer down
11301 		fserror = EAGAIN;
11302 	}
11303 #endif /* CONFIG_UNION_MOUNTS */
11304 
11305 saveandexit:
11306 
11307 	vnode_put(vp);
11308 
11309 	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 *  search state.  Everything was already put into the return buffer by the vop call. */
11311 
11312 	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
11313 		goto freeandexit;
11314 	}
11315 
11316 	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
11317 		goto freeandexit;
11318 	}
11319 
11320 	error = fserror;
11321 
11322 freeandexit:
11323 
11324 	kfree_data(searchparams1, mallocsize);
11325 
11326 	return error;
11327 } /* end of searchfs system call */
11328 
11329 #else /* CONFIG_SEARCHFS */
11330 
/*
 * searchfs stub for kernels built without CONFIG_SEARCHFS:
 * the system call always fails with ENOTSUP.
 */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11336 
11337 #endif /* CONFIG_SEARCHFS */
11338 
11339 
11340 #if CONFIG_DATALESS_FILES
11341 
11342 /*
11343  * === Namespace Resolver Up-call Mechanism ===
11344  *
11345  * When I/O is performed to a dataless file or directory (read, write,
11346  * lookup-in, etc.), the file system performs an upcall to the namespace
11347  * resolver (filecoordinationd) to materialize the object.
11348  *
11349  * We need multiple up-calls to be in flight at once, and we need these
11350  * up-calls to be interruptible, thus the following implementation:
11351  *
11352  * => The nspace_resolver_request represents the in-kernel request state.
11353  *    It contains a request ID, storage space for the errno code returned
11354  *    by filecoordinationd, and flags.
11355  *
11356  * => The request ID is simply a global monotonically incrementing 32-bit
11357  *    number.  Outstanding requests are stored in a hash table, and the
11358  *    hash function is extremely simple.
11359  *
11360  * => When an upcall is to be made to filecoordinationd, a request structure
11361  *    is allocated on the stack (it is small, and needs to live only during
11362  *    the duration of the call to resolve_nspace_item_ext()).  It is
11363  *    initialized and inserted into the table.  Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11365  *    can be inserted into the table (and thus limiting the number of
11366  *    outstanding requests issued to filecoordinationd); waiting for an
11367  *    available slot is interruptible.
11368  *
11369  * => Once the request has been inserted into the table, the up-call is made
11370  *    to filecoordinationd via a MiG-generated stub.  The up-call returns
11371  *    immediately and filecoordinationd processes the request asynchronously.
11372  *
 * => The caller now waits for the request to complete.  This is achieved by
11374  *    sleeping on the address of the request structure and waiting for
11375  *    filecoordinationd to mark the request structure as complete.  This
11376  *    is an interruptible sleep call; if interrupted, the request structure
11377  *    is removed from the table and EINTR is returned to the caller.  If
11378  *    this occurs, an advisory up-call is made to filecoordinationd with
11379  *    the request ID to indicate that the request can be aborted or
11380  *    de-prioritized at the discretion of filecoordinationd.
11381  *
11382  * => When filecoordinationd has completed the request, it signals completion
11383  *    by writing to the vfs.nspace.complete sysctl node.  Only a process
11384  *    decorated as a namespace resolver can write to this sysctl node.  The
11385  *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11386  *    The request ID is looked up in the table, and if the request is found,
11387  *    the error code is stored in the request structure and a wakeup()
11388  *    issued on the address of the request structure.  If the request is not
11389  *    found, we simply drop the completion notification, assuming that the
11390  *    caller was interrupted.
11391  *
11392  * => When the waiting thread wakes up, it extracts the error code from the
11393  *    request structure, removes the request from the table, and returns the
11394  *    error code to the calling function.  Fini!
11395  */
11396 
/*
 * In-kernel state for one outstanding materialization request to the
 * namespace resolver (filecoordinationd).  Instances live on the
 * requesting thread's stack (see vfs_materialize_item()) and are
 * linked into a global hash table keyed by r_req_id.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash bucket linkage */
	vnode_t         r_vp;                   /* object being materialized */
	vnode_t         r_tdvp;                 /* rename destination dir, or NULL */
	uint32_t        r_req_id;               /* ID echoed back by the resolver */
	int             r_resolver_error;       /* errno reported by the resolver */
	int             r_flags;                /* RRF_* flags below */
};

/* r_flags values */
#define RRF_COMPLETE    0x0001  /* request has been completed */
#define RRF_COMPLETING  0x0002  /* completion handler is still using req */
11408 
/*
 * Completion tuple written by the resolver via the vfs.nspace.complete
 * sysctl: the request ID, the resolver's errno, and optional
 * namespace-shape criteria (recursive gencount / sync-root ID) that
 * must still hold for the operation to proceed.
 */
struct nspace_resolver_completion_data {
	uint32_t req_id;                /* ID of the request being completed */
	int32_t  resolver_error;        /* errno from the resolver; 0 = success */
	uint64_t orig_gencount;         /* expected va_recursive_gencount, or 0 */
	uint64_t orig_syncroot;         /* expected sync-root ID, or 0 */
};
11415 
/*
 * Return the next namespace-resolver request ID from a global,
 * monotonically incrementing 32-bit counter (OSAddAtomic returns the
 * pre-increment value).  Wrap-around is harmless given the bounded
 * number of outstanding requests.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11423 
/* Hash-table sizing and backpressure limit. */
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of outstanding requests; also the sleep channel for slot waiters. */
static u_int nspace_resolver_request_count;
/* True when someone is sleeping for a free request slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

/* Lock protecting the table, the count, and per-request flags. */
#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Trivial hash: the low bits of the request ID select the bucket. */
#define NSPACE_RESOLVER_HASH(req_id)    \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	 nspace_resolver_request_hashmask])
11444 
11445 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id,bool skip_completing)11446 nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11447 {
11448 	struct nspace_resolver_requesthead *bucket;
11449 	struct nspace_resolver_request *req;
11450 
11451 	bucket = NSPACE_RESOLVER_HASH(req_id);
11452 	LIST_FOREACH(req, bucket, r_hashlink) {
11453 		if (req->r_req_id == req_id) {
11454 			/*
11455 			 * If this request already has a completion
11456 			 * pending, don't return it again.
11457 			 */
11458 			if ((req->r_flags & RRF_COMPLETING) != 0 &&
11459 			    skip_completing) {
11460 				req = NULL;
11461 			}
11462 			return req;
11463 		}
11464 	}
11465 
11466 	return NULL;
11467 }
11468 
/*
 * Insert a resolver request into the global hash table, applying
 * backpressure: if NSPACE_RESOLVER_MAX_OUTSTANDING requests are already
 * outstanding, sleep (interruptibly) until a slot frees up.
 *
 * Returns 0 on success, or the errno from an interrupted msleep().
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	NSPACE_REQ_LOCK();

	/* Wait for a free slot; removal wakes us via the count's address. */
	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			NSPACE_REQ_UNLOCK();
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	NSPACE_REQ_UNLOCK();

	return 0;
}
11500 
/*
 * Wait until any in-progress completion handler is done with 'req'.
 * Called (and returns) with NSPACE_REQ_LOCK held; msleep() drops and
 * re-acquires the lock while sleeping.
 */
static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
{
	/*
	 * If a completion is in-progress, we have to wait for the
	 * completion handler to finish because it's still using 'req',
	 * which is allocated on our stack a couple of frames up.
	 */
	while ((req->r_flags & RRF_COMPLETING) != 0) {
		(void) msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS, "nspacecmplt", NULL);
	}
}
11514 
11515 static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request * req)11516 nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
11517 {
11518 	struct nspace_resolver_requesthead *bucket;
11519 
11520 	/* We're called with NSPACE_REQ_LOCK held. */
11521 
11522 	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
11523 #if DIAGNOSTIC
11524 	assert((req->r_flags & RRF_COMPLETING) == 0);
11525 	assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
11526 #endif /* DIAGNOSTIC */
11527 	LIST_REMOVE(req, r_hashlink);
11528 	nspace_resolver_request_count--;
11529 
11530 	if (nspace_resolver_request_wait_slot) {
11531 		nspace_resolver_request_wait_slot = false;
11532 		wakeup(&nspace_resolver_request_count);
11533 	}
11534 
11535 	nspace_resolver_req_wait_pending_completion(req);
11536 
11537 	NSPACE_REQ_UNLOCK();
11538 }
11539 
/*
 * Convenience wrapper: take NSPACE_REQ_LOCK and remove 'req' from the
 * table.  The lock is dropped by nspace_resolver_req_remove_and_unlock().
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
11546 
11547 static void
nspace_resolver_req_cancel(uint32_t req_id)11548 nspace_resolver_req_cancel(uint32_t req_id)
11549 {
11550 	kern_return_t kr;
11551 	mach_port_t mp;
11552 
11553 	// Failures here aren't fatal -- the cancellation message
11554 	// sent to the resolver is merely advisory.
11555 
11556 	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11557 	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11558 		return;
11559 	}
11560 
11561 	kr = send_nspace_resolve_cancel(mp, req_id);
11562 	if (kr != KERN_SUCCESS) {
11563 		os_log_error(OS_LOG_DEFAULT,
11564 		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11565 	}
11566 
11567 	ipc_port_release_send(mp);
11568 }
11569 
/*
 * Wait for the resolver to complete 'req'.  The sleep is interruptible;
 * if interrupted we substitute EINTR (signal) or ETIMEDOUT (other
 * failure) as the result and send an advisory cancel message to the
 * resolver after dropping the lock.
 *
 * Returns the resolver's errno, or the locally substituted one.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted: record our own result code. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11602 
/*
 * Record the resolver's result, clear RRF_COMPLETING, set RRF_COMPLETE,
 * and wake the thread sleeping on 'req' (in nspace_resolver_req_wait()
 * or nspace_resolver_req_wait_pending_completion()).  Caller holds
 * NSPACE_REQ_LOCK.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
	wakeup(req);
}
11612 
/*
 * Flag 'req' so that nspace_resolver_req_remove_and_unlock() will wait
 * for the completion handler before the request's stack frame can go
 * away.  Caller holds NSPACE_REQ_LOCK.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
11618 
/*
 * Handle a completion reported by the resolver for request c->req_id.
 *
 * If the resolver succeeded and supplied namespace-shape criteria
 * (orig_gencount / orig_syncroot), verify that the tree still matches
 * those criteria before letting the original operation proceed; a
 * mismatch turns the completion result into EBUSY.  The verification
 * requires dropping NSPACE_REQ_LOCK to do I/O, so the request is first
 * marked RRF_COMPLETING to keep its (stack-allocated) memory alive.
 */
static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
{
	struct nspace_resolver_request *req;
	int error;
	struct vnode_attr va;
	vnode_t vp;

	NSPACE_REQ_LOCK();

	req = nspace_resolver_req_lookup(c->req_id, true);
	if (req == NULL) {
		/*
		 * If we don't find the request corresponding to our req_id,
		 * just drop the completion on the floor; it's likely that
		 * the requester interrupted with a signal, or it may already
		 * be completing.
		 */
		NSPACE_REQ_UNLOCK();
		return;
	}

	/*
	 * Get out now if the resolver reported an error.
	 */
	if ((error = c->resolver_error) != 0) {
		goto out;
	}

	/*
	 * If the resolver did not specify any namespace shape criteria
	 * for letting the operation proceed, then get out now.
	 */
	if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
		goto out;
	}

	/*
	 * We're going to have to acquire the mount rename lock and do
	 * some I/O in order to verify the criteria.  Mark the request
	 * as pending so no one else messes with it after we drop the
	 * NSPACE_REQ_LOCK.
	 */
	nspace_resolver_req_mark_completion_pending(req);
	NSPACE_REQ_UNLOCK();

	/*
	 * Lock out renames from changing the shape of the tree while
	 * we validate the criteria.
	 */
	mount_t locked_mp = req->r_vp->v_mount;
	mount_ref(locked_mp, 0);
	mount_lock_renames(locked_mp);

	if (c->orig_gencount != 0) {
		vp = req->r_vp;
		/* NOTE(review): 'error' is provably 0 here; this check is dead. */
		if (error) {
			goto out_dropmount;
		}

		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_recursive_gencount);
		error = vnode_getattr(vp, &va, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		/* The directory's recursive gencount must be unchanged. */
		if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
		    va.va_recursive_gencount != c->orig_gencount) {
			printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
			    c->orig_gencount, va.va_recursive_gencount);
			error = EBUSY;
			goto out_dropmount;
		}
	}

	/*
	 * Ignore orig_syncroot if a destination directory wasn't specified
	 * in the request.
	 */
	if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
		uint64_t syncroot_id;

		/* NOTE(review): 'error' is provably 0 here as well; dead check. */
		if (error) {
			goto out_dropmount;
		}

#ifndef APFSIOC_GET_SYNC_ROOT
#define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
#endif

		error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
		    (caddr_t)&syncroot_id, 0, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		/* The destination's sync root must be unchanged. */
		if (syncroot_id != c->orig_syncroot) {
			printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
			    c->orig_syncroot, syncroot_id);
			error = EBUSY;
			goto out_dropmount;
		}
	}

out_dropmount:
	mount_unlock_renames(locked_mp);
	mount_drop(locked_mp, 0);
	NSPACE_REQ_LOCK();

out:
	nspace_resolver_req_mark_complete(req, error);
	NSPACE_REQ_UNLOCK();
}
11731 
11732 static struct proc *nspace_resolver_proc;
11733 
11734 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)11735 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11736 {
11737 	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11738 	    p == nspace_resolver_proc) ? 1 : 0;
11739 	return 0;
11740 }
11741 
11742 static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
11743 
/*
 * Register (is_resolver != 0) or unregister process 'p' as the
 * namespace resolver.  Only a root process holding the
 * dataless-resolver entitlement may do either.  Registration fails
 * with EBUSY if another resolver is already registered.
 *
 * Returns 0, EPERM, or EBUSY.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11783 
11784 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11785 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11786 {
11787 	if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11788 	    (p->p_vfs_iopolicy &
11789 	    P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11790 		*is_prevented = 1;
11791 	} else {
11792 		*is_prevented = 0;
11793 	}
11794 	return 0;
11795 }
11796 
11797 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)11798 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
11799 {
11800 	if (p->p_lflag & P_LNSPACE_RESOLVER) {
11801 		return is_prevented ? 0 : EBUSY;
11802 	}
11803 
11804 	if (is_prevented) {
11805 		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
11806 	} else {
11807 		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
11808 	}
11809 	return 0;
11810 }
11811 
11812 static int
nspace_materialization_get_thread_state(int * is_prevented)11813 nspace_materialization_get_thread_state(int *is_prevented)
11814 {
11815 	uthread_t ut = current_uthread();
11816 
11817 	*is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11818 	return 0;
11819 }
11820 
11821 static int
nspace_materialization_set_thread_state(int is_prevented)11822 nspace_materialization_set_thread_state(int is_prevented)
11823 {
11824 	uthread_t ut = current_uthread();
11825 
11826 	if (is_prevented) {
11827 		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11828 	} else {
11829 		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11830 	}
11831 	return 0;
11832 }
11833 
/* The vfs.nspace sysctl branch: knobs for the dataless-file namespace resolver. */
SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11836 
11837 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11838 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11839     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11840 {
11841 	struct proc *p = req->p;
11842 	int new_value, old_value, changed = 0;
11843 	int error;
11844 
11845 	error = nspace_resolver_get_proc_state(p, &old_value);
11846 	if (error) {
11847 		return error;
11848 	}
11849 
11850 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11851 	    &changed);
11852 	if (error == 0 && changed) {
11853 		error = nspace_resolver_set_proc_state(p, new_value);
11854 	}
11855 	return error;
11856 }
11857 
/* vfs.nspace.resolver: decorate this process as the dataless file resolver */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_resolver, "I", "");
11862 
11863 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11864 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11865     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11866 {
11867 	struct proc *p = req->p;
11868 	int new_value, old_value, changed = 0;
11869 	int error;
11870 
11871 	error = nspace_materialization_get_proc_state(p, &old_value);
11872 	if (error) {
11873 		return error;
11874 	}
11875 
11876 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11877 	    &changed);
11878 	if (error == 0 && changed) {
11879 		error = nspace_materialization_set_proc_state(p, new_value);
11880 	}
11881 	return error;
11882 }
11883 
/* vfs.nspace.prevent_materialization: decorate this process as not wanting
 * to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_prevent_materialization, "I", "");
11888 
11889 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11890 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11891     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11892 {
11893 	int new_value, old_value, changed = 0;
11894 	int error;
11895 
11896 	error = nspace_materialization_get_thread_state(&old_value);
11897 	if (error) {
11898 		return error;
11899 	}
11900 
11901 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11902 	    &changed);
11903 	if (error == 0 && changed) {
11904 		error = nspace_materialization_set_thread_state(new_value);
11905 	}
11906 	return error;
11907 }
11908 
/* vfs.nspace.thread_prevent_materialization: decorate this thread as not
 * wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11913 
/*
 * sysctl handler for vfs.nspace.complete: the resolver reports a
 * finished request here.  Only the registered resolver process may use
 * this node.  The write buffer is consumed in up to three consecutive
 * chunks: a mandatory { req_id, errno } pair of uint32_t's, then an
 * optional uint64_t gencount and an optional uint64_t sync-root ID
 * (errors reading the optional values are deliberately ignored).
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed.  Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID, which is likewise optional.
	 */
	error = sysctl_io_opaque(req, &syncroot, sizeof(syncroot),
	    &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(&cd);
	}
	return error;
}
11976 
/* vfs.nspace.complete: the resolver reports completed requests here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
11981 
11982 #endif /* CONFIG_DATALESS_FILES */
11983 
11984 #if CONFIG_DATALESS_FILES
11985 #define __no_dataless_unused    /* nothing */
11986 #else
11987 #define __no_dataless_unused    __unused
11988 #endif
11989 
/*
 * Decide whether the given vfs context may materialize dataless
 * objects.  Returns:
 *   0           - materialization is allowed
 *   EDEADLK     - materialization is prevented (the default)
 *   EJUSTRETURN - the context holds the dataless-manipulation
 *                 entitlement: skip materialization but let the
 *                 operation proceed as if the object were not dataless
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
12046 
/*
 * One-time initialization: allocate the namespace-resolver request
 * hash table.  No-op on kernels built without CONFIG_DATALESS_FILES.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
12056 
/*
 * Called when process 'p' exits (or resigns via the resolver sysctl).
 * If 'p' is the registered resolver, fail every outstanding request
 * with ETIMEDOUT and clear the resolver registration.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				/*
				 * NOTE(review): waiting here can drop the hash
				 * lock mid-iteration (msleep); this appears to
				 * rely on requesters not unlinking entries
				 * while RRF_COMPLETING is set -- verify.
				 */
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
12083 
12084 #define DATALESS_RESOLVER_ENTITLEMENT     \
12085 	"com.apple.private.vfs.dataless-resolver"
12086 #define DATALESS_MANIPULATION_ENTITLEMENT \
12087 	"com.apple.private.vfs.dataless-manipulation"
12088 
12089 #if CONFIG_DATALESS_FILES
12090 /*
12091  * Return TRUE if the vfs context is associated with the dataless
12092  * resolver.
12093  */
12094 static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)12095 vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
12096 {
12097 	return IOTaskHasEntitlement(vfs_context_task(ctx),
12098 	           DATALESS_RESOLVER_ENTITLEMENT);
12099 }
12100 #endif /* CONFIG_DATALESS_FILES */
12101 
12102 /*
12103  * Return TRUE if the vfs context is associated with a process entitled
12104  * for dataless manipulation.
12105  *
12106  * XXX Arguably belongs in vfs_subr.c, but is here because of the
12107  * complication around CONFIG_DATALESS_FILES.
12108  */
boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	task_t task = vfs_context_task(ctx);
	/* The resolver entitlement implies manipulation rights. */
	return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
	       IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
#else
	/* Without dataless-file support, nobody is a manipulator. */
	return false;
#endif /* CONFIG_DATALESS_FILES */
}
12120 
12121 #if CONFIG_DATALESS_FILES
12122 static void
log_materialization_prevented(vnode_t vp,uint64_t op)12123 log_materialization_prevented(vnode_t vp, uint64_t op)
12124 {
12125 	char p_name[MAXCOMLEN + 1];
12126 	char *vntype;
12127 	proc_selfname(&p_name[0], sizeof(p_name));
12128 
12129 	if (vp->v_type == VREG) {
12130 		vntype = "File";
12131 	} else if (vp->v_type == VDIR) {
12132 		vntype = "Dir";
12133 	} else if (vp->v_type == VLNK) {
12134 		vntype = "SymLink";
12135 	} else {
12136 		vntype = "Other";
12137 	}
12138 
12139 #if DEVELOPMENT
12140 	struct vnode_attr *vap = kalloc_type(struct vnode_attr, Z_WAITOK);
12141 
12142 	VATTR_INIT(vap);
12143 	VATTR_WANTED(vap, va_fsid);
12144 	VATTR_WANTED(vap, va_fileid);
12145 	if (vnode_getattr(vp, vap, vfs_context_current()) == 0) {
12146 		os_log_debug(OS_LOG_DEFAULT,
12147 		    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) fsid 0x%08x/%u fileid=%llu",
12148 		    p_name, proc_selfpid(), op, vntype,
12149 		    vap->va_fsid, vap->va_fsid, vap->va_fileid);
12150 	} else
12151 #endif
12152 	{
12153 		os_log_debug(OS_LOG_DEFAULT,
12154 		    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
12155 		    p_name, proc_selfpid(), op, vntype);
12156 	}
12157 #if DEVELOPMENT
12158 	kfree_type(struct vnode_attr, vap);
12159 #endif
12160 }
12161 #endif /* CONFIG_DATALESS_FILES */
12162 
/*
 * vfs_materialize_item:
 *
 * Common implementation behind vfs_materialize_{file,dir,reparent}.
 * Sends a materialization request for the dataless item 'vp' to the
 * user-space file resolver over Mach IPC (filecoordinationd port) and
 * interruptibly waits for the resolver to complete it.
 *
 * Returns 0 if the caller may proceed with the operation (the item may
 * still be dataless), an errno on failure, or ENOTSUP when
 * CONFIG_DATALESS_FILES is not configured.
 */
static int
vfs_materialize_item(
	vnode_t vp __no_dataless_unused,
	uint32_t op __no_dataless_unused,
	int64_t offset __no_dataless_unused,
	int64_t size __no_dataless_unused,
	char *lookup_name __no_dataless_unused,
	size_t const namelen __no_dataless_unused,
	vnode_t tdvp __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	kern_return_t kern_ret;
	mach_port_t mach_port;
	char *path = NULL;
	vfs_context_t context;
	int path_len;
	int error;
	audit_token_t atoken;
	enum vtype vp_vtype;

	/* Swap files are special; ignore them */
	if (vnode_isswap(vp)) {
		return 0;
	}

	/*
	 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
	 * are no longer used nor supported.
	 */
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}
	if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
		return ENOTSUP;
	}

	/* Normalize 'op': strip the event-type bits, keeping the op code. */
	op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;

	/*
	 * To-directory is only meaningful for rename operations;
	 * ignore it if someone handed one to us unexpectedly.
	 */
	if (op != NAMESPACE_HANDLER_RENAME_OP) {
		tdvp = NULL;
	}

	context = vfs_context_current();

	/* Remember this for later (used after 'vp' work is done). */
	vp_vtype = vnode_vtype(vp);

	/*
	 * If the requesting context is decorated to prevent
	 * materialization, fail early (but still run the EJUSTRETURN
	 * special-casing at out_check_errors).
	 */
	error = vfs_context_dataless_materialization_is_prevented(context);
	if (error) {
		log_materialization_prevented(vp, op);
		goto out_check_errors;
	}

	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
	    &mach_port);
	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		/*
		 * Treat this like being unable to access the backing store
		 * server.
		 */
		return ETIMEDOUT;
	}

	/*
	 * Fetch the item's path, growing the buffer in MAXPATHLEN
	 * increments (up to FSGETPATH_MAXBUFLEN) while vn_getpath
	 * reports ENOSPC.
	 *
	 * NOTE(review): if every attempt up to FSGETPATH_MAXBUFLEN
	 * returns ENOSPC, the loop exits with path == NULL and execution
	 * continues with that NULL path — confirm vn_getpath cannot
	 * exceed FSGETPATH_MAXBUFLEN here.
	 */
	int path_alloc_len = MAXPATHLEN;
	do {
		path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
		if (path == NULL) {
			return ENOMEM;
		}

		path_len = path_alloc_len;
		error = vn_getpath(vp, path, &path_len);
		if (error == 0) {
			break;
		} else if (error == ENOSPC) {
			kfree_data(path, path_alloc_len);
			path = NULL;
		} else {
			goto out_release_port;
		}
	} while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);

	error = vfs_context_copy_audit_token(context, &atoken);
	if (error) {
		goto out_release_port;
	}

	/* Register the request so the resolver's reply can find it. */
	struct nspace_resolver_request req = {
		.r_req_id = next_nspace_req_id(),
		.r_vp = vp,
		.r_tdvp = tdvp,
	};

	error = nspace_resolver_req_add(&req);
	if (error) {
		goto out_release_port;
	}

	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");

	if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
		char *dest_path = NULL;
		int dest_path_len;

		dest_path = zalloc(ZV_NAMEI);
		dest_path_len = MAXPATHLEN;

		/*
		 * NOTE(review): this error path goes to out_release_port
		 * rather than out_req_remove, even though the request was
		 * added to the table above — verify the request is not
		 * leaked in the lookup table on this path.
		 */
		error = vn_getpath(tdvp, dest_path, &dest_path_len);
		if (error) {
			zfree(ZV_NAMEI, dest_path);
			goto out_release_port;
		}

		/*
		 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
		 * compatibility with existing agents in user-space
		 * who get passed this value.
		 */
		kern_ret = send_vfs_resolve_reparent_with_audit_token(mach_port,
		    req.r_req_id,
		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
		    path, dest_path, atoken);

		zfree(ZV_NAMEI, dest_path);
	} else if (vp_vtype == VDIR) {
		char *tmpname = NULL;

		/*
		 * If the caller provided a lookup_name *and* a name length,
		 * then we assume the lookup_name is not NUL-terminated.
		 * Allocate a temporary buffer in this case to provide
		 * a NUL-terminated path name to the IPC call.
		 */
		if (lookup_name != NULL && namelen != 0) {
			if (namelen >= PATH_MAX) {
				error = EINVAL;
				goto out_req_remove;
			}
			tmpname = zalloc(ZV_NAMEI);
			strlcpy(tmpname, lookup_name, namelen + 1);
			lookup_name = tmpname;
		} else if (lookup_name != NULL) {
			/*
			 * If the caller provided a lookup_name with a
			 * zero name length, then we assume it's NUL-
			 * terminated.  Verify it has a valid length.
			 */
			if (strlen(lookup_name) >= PATH_MAX) {
				error = EINVAL;
				goto out_req_remove;
			}
		}

		/* (See above.) */
		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
		    req.r_req_id,
		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
		    lookup_name == NULL ? "" : lookup_name, path, atoken);

		if (tmpname != NULL) {
			zfree(ZV_NAMEI, tmpname);

			/*
			 * Poison lookup_name rather than reference
			 * freed memory.
			 */
			lookup_name = NULL;
		}
	} else {
		/* (See above.) */
		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
		    req.r_req_id,
		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
		    offset, size, path, atoken);
	}
	if (kern_ret != KERN_SUCCESS) {
		/*
		 * Also treat this like being unable to access the backing
		 * store server.
		 */
		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
		    kern_ret);
		error = ETIMEDOUT;
		goto out_req_remove;
	}

	/*
	 * Give back the memory we allocated earlier while we wait; we
	 * no longer need it.
	 */
	kfree_data(path, path_alloc_len);
	path = NULL;

	/*
	 * Request has been submitted to the resolver. Now (interruptibly)
	 * wait for completion. Upon return, the request will have been
	 * removed from the lookup table.
	 */
	error = nspace_resolver_req_wait(&req);

out_release_port:
	if (path != NULL) {
		kfree_data(path, path_alloc_len);
		path = NULL;
	}
	ipc_port_release_send(mach_port);

out_check_errors:
	/*
	 * The file resolver owns the logic about what error to return
	 * to the caller.  We only need to handle a couple of special
	 * cases here:
	 */
	if (error == EJUSTRETURN) {
		/*
		 * The requesting process is allowed to interact with
		 * dataless objects.  Make a couple of sanity-checks
		 * here to ensure the action makes sense.
		 */
		switch (op) {
		case NAMESPACE_HANDLER_WRITE_OP:
		case NAMESPACE_HANDLER_TRUNCATE_OP:
		case NAMESPACE_HANDLER_RENAME_OP:
			/*
			 * This handles the case of the resolver itself
			 * writing data to the file (or throwing it
			 * away).
			 */
			error = 0;
			break;
		case NAMESPACE_HANDLER_READ_OP:
		case NAMESPACE_HANDLER_LOOKUP_OP:
			/*
			 * This handles the case of the resolver needing
			 * to look up inside of a dataless directory while
			 * it's in the process of materializing it (for
			 * example, creating files or directories).
			 */
			error = (vp_vtype == VDIR) ? 0 : EBADF;
			break;
		default:
			error = EBADF;
			break;
		}
	}

	return error;

out_req_remove:
	nspace_resolver_req_remove(&req);
	goto out_release_port;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
12426 
12427 /*
12428  * vfs_materialize_file: Materialize a regular file.
12429  *
12430  * Inputs:
12431  * vp		The dataless file to be materialized.
12432  *
12433  * op		What kind of operation is being performed:
12434  *		-> NAMESPACE_HANDLER_READ_OP
12435  *		-> NAMESPACE_HANDLER_WRITE_OP
12436  *		-> NAMESPACE_HANDLER_LINK_CREATE
12437  *		-> NAMESPACE_HANDLER_DELETE_OP
12438  *		-> NAMESPACE_HANDLER_TRUNCATE_OP
12439  *		-> NAMESPACE_HANDLER_RENAME_OP
12440  *
12441  * offset	offset of I/O for READ or WRITE.  Ignored for
12442  *		other ops.
12443  *
12444  * size		size of I/O for READ or WRITE  Ignored for
12445  *		other ops.
12446  *
12447  * If offset or size are -1 for a READ or WRITE, then the resolver should
12448  * consider the range to be unknown.
12449  *
12450  * Upon successful return, the caller may proceed with the operation.
12451  * N.B. the file may still be "dataless" in this case.
12452  */
12453 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12454 vfs_materialize_file(
12455 	struct vnode *vp,
12456 	uint64_t op,
12457 	int64_t offset,
12458 	int64_t size)
12459 {
12460 	if (vp->v_type != VREG) {
12461 		return EFTYPE;
12462 	}
12463 	return vfs_materialize_item(vp, (uint32_t)op, offset, size, NULL, 0,
12464 	           NULL);
12465 }
12466 
12467 /*
12468  * vfs_materialize_dir:
12469  *
12470  * Inputs:
12471  * vp		The dataless directory to be materialized.
12472  *
12473  * op		What kind of operation is being performed:
12474  *		-> NAMESPACE_HANDLER_READ_OP
12475  *		-> NAMESPACE_HANDLER_WRITE_OP
12476  *		-> NAMESPACE_HANDLER_DELETE_OP
12477  *		-> NAMESPACE_HANDLER_RENAME_OP
12478  *		-> NAMESPACE_HANDLER_LOOKUP_OP
12479  *
12480  * lookup_name	Name being looked up for a LOOKUP op.  Ignored for
12481  *		other ops.  May or may not be NUL-terminated; see below.
12482  *
12483  * namelen	If non-zero, then lookup_name is assumed to not be NUL-
12484  *		terminated and namelen is the number of valid bytes in
12485  *		lookup_name. If zero, then lookup_name is assumed to be
12486  *		NUL-terminated.
12487  *
12488  * Upon successful return, the caller may proceed with the operation.
12489  * N.B. the directory may still be "dataless" in this case.
12490  */
12491 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12492 vfs_materialize_dir(
12493 	struct vnode *vp,
12494 	uint64_t op,
12495 	char *lookup_name,
12496 	size_t namelen)
12497 {
12498 	if (vp->v_type != VDIR) {
12499 		return EFTYPE;
12500 	}
12501 	if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12502 		return EINVAL;
12503 	}
12504 	return vfs_materialize_item(vp, (uint32_t)op, 0, 0, lookup_name,
12505 	           namelen, NULL);
12506 }
12507 
12508 /*
12509  * vfs_materialize_reparent:
12510  *
12511  * Inputs:
12512  * vp		The dataless file or directory to be materialized.
12513  *
12514  * tdvp		The new parent directory for the dataless file.
12515  *
12516  * Upon successful return, the caller may proceed with the operation.
12517  * N.B. the item may still be "dataless" in this case.
12518  */
12519 int
vfs_materialize_reparent(vnode_t vp,vnode_t tdvp)12520 vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
12521 {
12522 	if (vp->v_type != VDIR && vp->v_type != VREG) {
12523 		return EFTYPE;
12524 	}
12525 	return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
12526 	           0, 0, NULL, 0, tdvp);
12527 }
12528 
#if 0
/*
 * build_volfs_path: (currently compiled out)
 *
 * Render a /.vol/<fsid>/<fileid> style path for 'vp' into 'path'.
 * On entry *len is the buffer size; on return it is set to the length
 * of the generated string including the NUL terminator.  Returns 0 on
 * success, -1 if the vnode's attributes could not be fetched (in which
 * case a placeholder path is written instead).
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12551 
12552 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12553 fsctl_bogus_command_compat(unsigned long cmd)
12554 {
12555 	switch (cmd) {
12556 	case IOCBASECMD(FSIOC_SYNC_VOLUME):
12557 		return FSIOC_SYNC_VOLUME;
12558 	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12559 		return FSIOC_ROUTEFS_SETROUTEID;
12560 	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12561 		return FSIOC_SET_PACKAGE_EXTS;
12562 	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12563 		return FSIOC_SET_FSTYPENAME_OVERRIDE;
12564 	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12565 		return DISK_CONDITIONER_IOC_GET;
12566 	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12567 		return DISK_CONDITIONER_IOC_SET;
12568 	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12569 		return FSIOC_FIOSEEKHOLE;
12570 	case IOCBASECMD(FSIOC_FIOSEEKDATA):
12571 		return FSIOC_FIOSEEKDATA;
12572 	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12573 		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12574 	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12575 		return SPOTLIGHT_IOC_GET_LAST_MTIME;
12576 	}
12577 
12578 	return cmd;
12579 }
12580 
12581 static int
cas_bsdflags_setattr(vnode_t vp,void * arg,vfs_context_t ctx)12582 cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
12583 {
12584 	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
12585 }
12586 
/*
 * handle_sync_volume: implement FSIOC_SYNC_VOLUME.
 *
 * Syncs the mount containing 'vp'.  Drops the caller's iocount on 'vp'
 * (keeping only a holdcount) so the sync cannot deadlock against
 * vnode iteration; on return *arg_vp is set to NULL to tell the caller
 * its vnode reference has been consumed.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	/* Keep a holdcount so the vnode can't be reclaimed out from under us. */
	vnode_hold(vp);
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (for e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12651 
#if ROUTEFS
static int __attribute__((noinline))
handle_routes(user_addr_t udata)
{
	/*
	 * FSIOC_ROUTEFS_SETROUTEID: mount routefs at a caller-supplied
	 * path.  Restricted to the superuser.
	 */
	char routepath[MAXPATHLEN] = { 0 };
	size_t len = 0;
	int error;

	error = suser(kauth_cred_get(), &(current_proc()->p_acflag));
	if (error != 0) {
		return error;
	}
	error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
	if (error != 0) {
		return error;
	}
	return routefs_kernel_mount(routepath);
}
#endif
12672 
12673 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12674 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12675 {
12676 	struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12677 	struct vnode_attr va;
12678 	int error;
12679 
12680 	VATTR_INIT(&va);
12681 	VATTR_SET(&va, va_flags, cas->new_flags);
12682 
12683 	error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12684 
12685 #if CONFIG_FSE
12686 	if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12687 		add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12688 	}
12689 #endif
12690 
12691 	return error;
12692 }
12693 
12694 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12695 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12696 {
12697 	struct mount *mp = NULL;
12698 	errno_t rootauth = 0;
12699 
12700 	mp = vp->v_mount;
12701 
12702 	/*
12703 	 * query the underlying FS and see if it reports something
12704 	 * sane for this vnode. If volume is authenticated via
12705 	 * chunklist, leave that for the caller to determine.
12706 	 */
12707 	rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12708 
12709 	return rootauth;
12710 }
12711 
12712 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12713 	"com.apple.private.kernel.set-package-extensions"
12714 
12715 /*
12716  * Make a filesystem-specific control call:
12717  */
12718 /* ARGSUSED */
/*
 * fsctl_internal:
 *
 * Shared backend for fsctl() and ffsctl().  Stages the ioctl argument
 * (copyin for IOC_IN, zeroed buffer for IOC_OUT), dispatches the
 * generic FSIOC_* selectors handled in the kernel, passes everything
 * else down to the filesystem via VNOP_IOCTL, and copies results back
 * out for IOC_OUT commands.
 *
 * *arg_vp may be set to NULL on return (FSIOC_SYNC_VOLUME consumes the
 * caller's iocount); callers must re-check it before vnode_put().
 */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* fsctl is not applicable to device special files. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* Map legacy "base" command encodings to their full values. */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/*
	 * Small arguments are staged in stkbuf; larger ones in a
	 * temporary heap allocation (freed at outdrop).
	 */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			/* Zero-length IOC_IN: the "argument" is udata itself. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* NB: may consume the vnode reference; sets *arg_vp = NULL. */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t    num_entries;
		uint32_t    max_width;

		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct.  otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width   = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width   = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		mount_t mp;

		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if ((mp = vp->v_mount) != NULL) {
			mount_lock(mp);
			if (data[0] != 0) {
				/* Require a NUL within the first MFSTYPENAMELEN bytes. */
				for (int i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data
				 * string which has no NULL termination in
				 * its first MFSTYPENAMELEN bytes.  This is
				 * bogus, let's avoid strlcpy-ing the read
				 * data and return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				vfs_setfstypename_locked(mp, data);
				if (vfs_isrdonly(mp) &&
				    strcmp(data, "mtmfs") == 0) {
					mp->mnt_kern_flag |=
					    MNTK_EXTENDED_SECURITY;
					mp->mnt_kern_flag &=
					    ~MNTK_AUTH_OPAQUE;
				}
			} else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
				/* Empty string clears a previous override. */
				const char *name =
				    vfs_getfstypenameref_locked(mp, NULL);
				if (strcmp(name, "mtmfs") == 0) {
					mp->mnt_kern_flag &=
					    ~MNTK_EXTENDED_SECURITY;
				}
				vfs_setfstypename_locked(mp, NULL);
			}
unlock:
			mount_unlock(mp);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/* EBUSY if anyone else (including named streams) holds the vnode in use. */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
		break;

#if CONFIG_EXCLAVES
	case FSIOC_EXCLAVE_FS_REGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_register(((fsioc_exclave_fs_register_t *)data)->fs_tag, vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_UNREGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_unregister(vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: {
		exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data);
		exclave_fs_base_dir_t *dirs = NULL;
		if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = EPERM;
			break;
		}
		if (get_base_dirs->base_dirs) {
			if ((get_base_dirs->count == 0) || (get_base_dirs->count > EXCLAVE_FS_GET_BASE_DIRS_MAX_COUNT)) {
				error = EINVAL;
				break;
			}
			dirs = kalloc_type(exclave_fs_base_dir_t, get_base_dirs->count, Z_WAITOK | Z_ZERO);
			if (!dirs) {
				error = ENOSPC;
				break;
			}
		}
		error = vfs_exclave_fs_get_base_dirs(dirs, &get_base_dirs->count);
		if (!error && dirs) {
			error = copyout(dirs, (user_addr_t)get_base_dirs->base_dirs,
			    get_base_dirs->count * sizeof(exclave_fs_base_dir_t));
		}
		if (dirs) {
			kfree_type(exclave_fs_base_dir_t, get_base_dirs->count, dirs);
		}
	}
	break;
#endif

	default: {
		/*
		 * Other, known commands shouldn't be passed down here.
		 * (When adding a selector to this list, it may be prudent
		 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
		 */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
		case F_SPECULATIVE_READ:
		case F_ATTRIBUTION_TAG:
		case F_TRANSFEREXTENTS:
		case F_ASSERT_BG_ACCESS:
		case F_RELEASE_BG_ACCESS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
13027 
13028 /* ARGSUSED */
/*
 * fsctl: path-based filesystem control syscall.
 *
 * Looks up uap->path (honoring FSOPT_NOFOLLOW and the firmlink quirk
 * for FSIOC_FIRMLINK_CTL), runs the MAC check, and hands off to
 * fsctl_internal().  fsctl_internal() may consume the vnode reference
 * and NULL out 'vp', so it is re-checked before vnode_put().
 */
int
fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	uint32_t nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on:  */
	nameiflags = 0;
	//
	// if we come through fsctl() then the file is by definition not open.
	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
	// lest the caller mistakenly thinks the only open is their own (but in
	// reality it's someone else's).
	//
	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
		return EINVAL;
	}
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
	}
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		goto done;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	/* vp may have been consumed (set NULL) by fsctl_internal(). */
	if (vp) {
		vnode_put(vp);
	}
	return error;
}
13080 /* ARGSUSED */
/*
 * ffsctl: fd-based filesystem control syscall.
 *
 * Like fsctl(), but operates on an already-open file descriptor
 * instead of a path.  Takes an iocount on the fd's vnode, runs the
 * MAC check, then hands off to fsctl_internal().
 */
int
ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();
	int fd = -1;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);

	/* Get the vnode for the file we are getting info on:  */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	fd = uap->fd;
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

#if CONFIG_MACF
	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
		file_drop(fd);
		vnode_put(vp);
		return error;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

	file_drop(fd);

	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
	if (vp) {
		vnode_put(vp);
	}

	return error;
}
13122 /* end of fsctl system call */
13123 
13124 #define FILESEC_ACCESS_ENTITLEMENT              \
13125 	"com.apple.private.vfs.filesec-access"
13126 
13127 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)13128 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
13129 {
13130 	if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
13131 		/*
13132 		 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
13133 		 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
13134 		 */
13135 		if ((!setting && vfs_context_issuser(ctx)) ||
13136 		    IOTaskHasEntitlement(vfs_context_task(ctx),
13137 		    FILESEC_ACCESS_ENTITLEMENT)) {
13138 			return 0;
13139 		}
13140 	}
13141 
13142 	return EPERM;
13143 }
13144 
13145 /*
13146  *  Retrieve the data of an extended attribute.
13147  */
/*
 * getxattr: retrieve the data of an extended attribute by path.
 *
 * Looks up uap->path, copies in the attribute name, enforces the
 * protected-xattr entitlement policy, and reads the attribute via
 * vn_getxattr().  When uap->value is NULL (or the size-probe hack
 * below is triggered), only the attribute's size is returned.
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* These options are kernel-internal; reject them from user space. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned...  in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app.  we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* With a uio, report bytes transferred; otherwise the attribute size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
13234 
13235 /*
13236  * Retrieve the data of an extended attribute.
13237  */
13238 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)13239 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
13240 {
13241 	vnode_t vp;
13242 	char attrname[XATTR_MAXNAMELEN + 1];
13243 	vfs_context_t ctx = vfs_context_current();
13244 	uio_t auio = NULL;
13245 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13246 	size_t attrsize = 0;
13247 	size_t namelen;
13248 	int error;
13249 	UIO_STACKBUF(uio_buf, 1);
13250 
13251 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13252 	    XATTR_NOFOLLOW_ANY)) {
13253 		return EINVAL;
13254 	}
13255 
13256 	if ((error = file_vnode(uap->fd, &vp))) {
13257 		return error;
13258 	}
13259 	if ((error = vnode_getwithref(vp))) {
13260 		file_drop(uap->fd);
13261 		return error;
13262 	}
13263 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13264 	if (error != 0) {
13265 		goto out;
13266 	}
13267 	if (xattr_protected(attrname) &&
13268 	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13269 		goto out;
13270 	}
13271 	if (uap->value && uap->size > 0) {
13272 		if (uap->size > (size_t)XATTR_MAXSIZE) {
13273 			uap->size = XATTR_MAXSIZE;
13274 		}
13275 
13276 		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13277 		    &uio_buf[0], sizeof(uio_buf));
13278 		uio_addiov(auio, uap->value, uap->size);
13279 	}
13280 
13281 	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13282 out:
13283 	(void)vnode_put(vp);
13284 	file_drop(uap->fd);
13285 
13286 	if (auio) {
13287 		*retval = uap->size - uio_resid(auio);
13288 	} else {
13289 		*retval = (user_ssize_t)attrsize;
13290 	}
13291 	return error;
13292 }
13293 
/*
 * Heap-allocated working state for setxattr(): the nameidata, attribute
 * name buffer and uio buffer together are too large to keep on the
 * kernel stack, so setxattr() kalloc's one of these per call.
 * (The old "checkdirs iteration" comment was a copy-paste leftover.)
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	UIO_STACKBUF(uio_buf, 1);
};
13300 
13301 /*
13302  * Set the data of an extended attribute.
13303  */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	/* XATTR_NOSECURITY / XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* Working state lives on the heap to keep this frame small. */
	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* Writing a protected attribute requires an entitlement. */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Also look up the parent so its directory lease can be broken. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		sactx->nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	/* Break the parent's directory lease, then drop its iocount. */
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13384 
13385 /*
13386  * Set the data of an extended attribute.
13387  */
13388 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13389 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13390 {
13391 	vnode_t vp;
13392 	char attrname[XATTR_MAXNAMELEN + 1];
13393 	vfs_context_t ctx = vfs_context_current();
13394 	uio_t auio = NULL;
13395 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13396 	size_t namelen;
13397 	int error;
13398 	UIO_STACKBUF(uio_buf, 1);
13399 
13400 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13401 	    XATTR_NOFOLLOW_ANY)) {
13402 		return EINVAL;
13403 	}
13404 
13405 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13406 	if (error != 0) {
13407 		if (error == EPERM) {
13408 			/* if the string won't fit in attrname, copyinstr emits EPERM */
13409 			return ENAMETOOLONG;
13410 		}
13411 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13412 		return error;
13413 	}
13414 	if (xattr_protected(attrname) &&
13415 	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13416 		return error;
13417 	}
13418 	if (uap->size != 0 && uap->value == 0) {
13419 		return EINVAL;
13420 	}
13421 	if (uap->size > INT_MAX) {
13422 		return E2BIG;
13423 	}
13424 	if ((error = file_vnode(uap->fd, &vp))) {
13425 		return error;
13426 	}
13427 	if ((error = vnode_getwithref(vp))) {
13428 		file_drop(uap->fd);
13429 		return error;
13430 	}
13431 
13432 #if CONFIG_FILE_LEASES
13433 	vnode_breakdirlease(vp, true, O_WRONLY);
13434 #endif
13435 
13436 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13437 	    &uio_buf[0], sizeof(uio_buf));
13438 	uio_addiov(auio, uap->value, uap->size);
13439 
13440 	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13441 #if CONFIG_FSE
13442 	if (error == 0) {
13443 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
13444 		    FSE_ARG_VNODE, vp,
13445 		    FSE_ARG_DONE);
13446 	}
13447 #endif
13448 	vnode_put(vp);
13449 	file_drop(uap->fd);
13450 	*retval = 0;
13451 	return error;
13452 }
13453 
13454 /*
13455  * Remove an extended attribute.
13456  * XXX Code duplication here.
13457  */
13458 int
removexattr(proc_t p,struct removexattr_args * uap,int * retval)13459 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
13460 {
13461 	vnode_t vp;
13462 	struct nameidata nd;
13463 	char attrname[XATTR_MAXNAMELEN + 1];
13464 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13465 	vfs_context_t ctx = vfs_context_current();
13466 	size_t namelen;
13467 	u_int32_t nameiflags;
13468 	int error;
13469 
13470 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13471 		return EINVAL;
13472 	}
13473 
13474 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13475 	if (error != 0) {
13476 		return error;
13477 	}
13478 	if (xattr_protected(attrname)) {
13479 		return EPERM;
13480 	}
13481 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13482 #if CONFIG_FILE_LEASES
13483 	nameiflags |= WANTPARENT;
13484 #endif
13485 	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
13486 	if (uap->options & XATTR_NOFOLLOW_ANY) {
13487 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
13488 	}
13489 
13490 	if ((error = namei(&nd))) {
13491 		return error;
13492 	}
13493 	vp = nd.ni_vp;
13494 #if CONFIG_FILE_LEASES
13495 	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
13496 	vnode_put(nd.ni_dvp);
13497 #endif
13498 	nameidone(&nd);
13499 
13500 	error = vn_removexattr(vp, attrname, uap->options, ctx);
13501 #if CONFIG_FSE
13502 	if (error == 0) {
13503 		add_fsevent(FSE_XATTR_REMOVED, ctx,
13504 		    FSE_ARG_VNODE, vp,
13505 		    FSE_ARG_DONE);
13506 	}
13507 #endif
13508 	vnode_put(vp);
13509 	*retval = 0;
13510 	return error;
13511 }
13512 
13513 /*
13514  * Remove an extended attribute.
13515  * XXX Code duplication here.
13516  */
13517 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)13518 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
13519 {
13520 	vnode_t vp;
13521 	char attrname[XATTR_MAXNAMELEN + 1];
13522 	size_t namelen;
13523 	int error;
13524 #if CONFIG_FSE
13525 	vfs_context_t ctx = vfs_context_current();
13526 #endif
13527 
13528 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13529 	    XATTR_NOFOLLOW_ANY)) {
13530 		return EINVAL;
13531 	}
13532 
13533 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13534 	if (error != 0) {
13535 		return error;
13536 	}
13537 	if (xattr_protected(attrname)) {
13538 		return EPERM;
13539 	}
13540 	if ((error = file_vnode(uap->fd, &vp))) {
13541 		return error;
13542 	}
13543 	if ((error = vnode_getwithref(vp))) {
13544 		file_drop(uap->fd);
13545 		return error;
13546 	}
13547 
13548 #if CONFIG_FILE_LEASES
13549 	vnode_breakdirlease(vp, true, O_WRONLY);
13550 #endif
13551 
13552 	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
13553 #if CONFIG_FSE
13554 	if (error == 0) {
13555 		add_fsevent(FSE_XATTR_REMOVED, ctx,
13556 		    FSE_ARG_VNODE, vp,
13557 		    FSE_ARG_DONE);
13558 	}
13559 #endif
13560 	vnode_put(vp);
13561 	file_drop(uap->fd);
13562 	*retval = 0;
13563 	return error;
13564 }
13565 
13566 /*
13567  * Retrieve the list of extended attribute names.
13568  * XXX Code duplication here.
13569  */
13570 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)13571 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
13572 {
13573 	vnode_t vp;
13574 	struct nameidata nd;
13575 	vfs_context_t ctx = vfs_context_current();
13576 	uio_t auio = NULL;
13577 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13578 	size_t attrsize = 0;
13579 	u_int32_t nameiflags;
13580 	int error;
13581 	UIO_STACKBUF(uio_buf, 1);
13582 
13583 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13584 		return EINVAL;
13585 	}
13586 
13587 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13588 	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
13589 	if (uap->options & XATTR_NOFOLLOW_ANY) {
13590 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
13591 	}
13592 
13593 	if ((error = namei(&nd))) {
13594 		return error;
13595 	}
13596 	vp = nd.ni_vp;
13597 	nameidone(&nd);
13598 	if (uap->namebuf != 0 && uap->bufsize > 0) {
13599 		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
13600 		    &uio_buf[0], sizeof(uio_buf));
13601 		uio_addiov(auio, uap->namebuf, uap->bufsize);
13602 	}
13603 
13604 	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
13605 
13606 	vnode_put(vp);
13607 	if (auio) {
13608 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13609 	} else {
13610 		*retval = (user_ssize_t)attrsize;
13611 	}
13612 	return error;
13613 }
13614 
13615 /*
13616  * Retrieve the list of extended attribute names.
13617  * XXX Code duplication here.
13618  */
13619 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)13620 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13621 {
13622 	vnode_t vp;
13623 	uio_t auio = NULL;
13624 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13625 	size_t attrsize = 0;
13626 	int error;
13627 	UIO_STACKBUF(uio_buf, 1);
13628 
13629 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13630 	    XATTR_NOFOLLOW_ANY)) {
13631 		return EINVAL;
13632 	}
13633 
13634 	if ((error = file_vnode(uap->fd, &vp))) {
13635 		return error;
13636 	}
13637 	if ((error = vnode_getwithref(vp))) {
13638 		file_drop(uap->fd);
13639 		return error;
13640 	}
13641 	if (uap->namebuf != 0 && uap->bufsize > 0) {
13642 		auio = uio_createwithbuffer(1, 0, spacetype,
13643 		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
13644 		uio_addiov(auio, uap->namebuf, uap->bufsize);
13645 	}
13646 
13647 	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13648 
13649 	vnode_put(vp);
13650 	file_drop(uap->fd);
13651 	if (auio) {
13652 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13653 	} else {
13654 		*retval = (user_ssize_t)attrsize;
13655 	}
13656 	return error;
13657 }
13658 
13659 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13660 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13661     vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13662 {
13663 	int error;
13664 	struct mount *mp = NULL;
13665 	vnode_t vp;
13666 	int length;
13667 	int bpflags;
13668 	/* maximum number of times to retry build_path */
13669 	unsigned int retries = 0x10;
13670 
13671 	if (bufsize > FSGETPATH_MAXBUFLEN) {
13672 		return EINVAL;
13673 	}
13674 
13675 	if (buf == NULL) {
13676 		return ENOMEM;
13677 	}
13678 
13679 retry:
13680 	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13681 		error = ENOTSUP;  /* unexpected failure */
13682 		return ENOTSUP;
13683 	}
13684 
13685 #if CONFIG_UNION_MOUNTS
13686 unionget:
13687 #endif /* CONFIG_UNION_MOUNTS */
13688 	if (objid == 2) {
13689 		struct vfs_attr vfsattr;
13690 		int use_vfs_root = TRUE;
13691 
13692 		VFSATTR_INIT(&vfsattr);
13693 		VFSATTR_WANTED(&vfsattr, f_capabilities);
13694 		if (!(options & FSOPT_ISREALFSID) &&
13695 		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13696 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13697 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13698 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13699 				use_vfs_root = FALSE;
13700 			}
13701 		}
13702 
13703 		if (use_vfs_root) {
13704 			error = VFS_ROOT(mp, &vp, ctx);
13705 		} else {
13706 			error = VFS_VGET(mp, objid, &vp, ctx);
13707 		}
13708 	} else {
13709 		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13710 	}
13711 
13712 #if CONFIG_UNION_MOUNTS
13713 	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13714 		/*
13715 		 * If the fileid isn't found and we're in a union
13716 		 * mount volume, then see if the fileid is in the
13717 		 * mounted-on volume.
13718 		 */
13719 		struct mount *tmp = mp;
13720 		mp = vnode_mount(tmp->mnt_vnodecovered);
13721 		vfs_unbusy(tmp);
13722 		if (vfs_busy(mp, LK_NOWAIT) == 0) {
13723 			goto unionget;
13724 		}
13725 	} else {
13726 		vfs_unbusy(mp);
13727 	}
13728 #else
13729 	vfs_unbusy(mp);
13730 #endif /* CONFIG_UNION_MOUNTS */
13731 
13732 	if (error) {
13733 		return error;
13734 	}
13735 
13736 #if CONFIG_MACF
13737 	error = mac_vnode_check_fsgetpath(ctx, vp);
13738 	if (error) {
13739 		vnode_put(vp);
13740 		return error;
13741 	}
13742 #endif
13743 
13744 	/* Obtain the absolute path to this vnode. */
13745 	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13746 	if (options & FSOPT_NOFIRMLINKPATH) {
13747 		bpflags |= BUILDPATH_NO_FIRMLINK;
13748 	}
13749 	bpflags |= BUILDPATH_CHECK_MOVED;
13750 	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13751 	vnode_put(vp);
13752 
13753 	if (error) {
13754 		/* there was a race building the path, try a few more times */
13755 		if (error == EAGAIN) {
13756 			--retries;
13757 			if (retries > 0) {
13758 				goto retry;
13759 			}
13760 
13761 			error = ENOENT;
13762 		}
13763 		goto out;
13764 	}
13765 
13766 	AUDIT_ARG(text, buf);
13767 
13768 	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13769 		unsigned long path_words[NUMPARMS];
13770 		size_t path_len = sizeof(path_words);
13771 
13772 		if ((size_t)length < path_len) {
13773 			memcpy((char *)path_words, buf, length);
13774 			memset((char *)path_words + length, 0, path_len - length);
13775 
13776 			path_len = length;
13777 		} else {
13778 			memcpy((char *)path_words, buf + (length - path_len), path_len);
13779 		}
13780 
13781 		kdebug_vfs_lookup(path_words, (int)path_len, vp,
13782 		    KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13783 	}
13784 
13785 	*pathlen = length; /* may be superseded by error */
13786 
13787 out:
13788 	return error;
13789 }
13790 
13791 /*
13792  * Obtain the full pathname of a file system object by id.
13793  */
13794 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)13795 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
13796     uint32_t options, user_ssize_t *retval)
13797 {
13798 	vfs_context_t ctx = vfs_context_current();
13799 	fsid_t fsid;
13800 	char *realpath;
13801 	int length;
13802 	int error;
13803 
13804 	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
13805 		return EINVAL;
13806 	}
13807 
13808 	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
13809 		return error;
13810 	}
13811 	AUDIT_ARG(value32, fsid.val[0]);
13812 	AUDIT_ARG(value64, objid);
13813 	/* Restrict output buffer size for now. */
13814 
13815 	if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
13816 		return EINVAL;
13817 	}
13818 	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
13819 	if (realpath == NULL) {
13820 		return ENOMEM;
13821 	}
13822 
13823 	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
13824 	    options, &length);
13825 
13826 	if (error) {
13827 		goto out;
13828 	}
13829 
13830 	error = copyout((caddr_t)realpath, buf, length);
13831 
13832 	*retval = (user_ssize_t)length; /* may be superseded by error */
13833 out:
13834 	kfree_data(realpath, bufsize);
13835 	return error;
13836 }
13837 
13838 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)13839 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
13840 {
13841 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13842 	           0, retval);
13843 }
13844 
13845 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)13846 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
13847 {
13848 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13849 	           uap->options, retval);
13850 }
13851 
13852 /*
13853  * Common routine to handle various flavors of statfs data heading out
13854  *	to user space.
13855  *
13856  * Returns:	0			Success
13857  *		EFAULT
13858  */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int             error;
	int             my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		/* Zero everything so spare/padding bytes don't leak to user space. */
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* The "partial" flavor omits the trailing reserved fields. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case.   We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int             shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large.  At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	if (sizep != NULL) {
		/* Report the full (unshortened) structure size to the caller. */
		*sizep = my_size;
	}
	return error;
}
13972 
13973 /*
13974  * copy stat structure into user_stat structure.
13975  */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	/* Zero the destination so padding and spare fields copy out as 0. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/*
	 * Timestamp field names depend on whether the user-visible struct
	 * was defined with _POSIX_C_SOURCE; both layouts carry sec + nsec.
	 */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14012 
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	/* Zero the destination so padding and spare fields copy out as 0. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/*
	 * 32-bit user processes: the explicit casts below narrow the kernel's
	 * 64-bit time fields to the 32-bit user types (truncating).
	 */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14049 
14050 /*
14051  * copy stat64 structure into user_stat64 structure.
14052  */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	/* Zero the destination so padding and spare fields copy out as 0. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/*
	 * stat64 adds the birthtime; field names depend on whether the
	 * user-visible struct was defined with _POSIX_C_SOURCE.
	 */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14093 
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	/* Zero the destination so padding and spare fields copy out as 0. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/*
	 * 32-bit user processes: the explicit casts below narrow the kernel's
	 * 64-bit time fields (including birthtime) to 32-bit user types.
	 */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14134 
14135 /*
14136  * Purge buffer cache for simulating cold starts
14137  */
14138 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)14139 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
14140 {
14141 	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
14142 
14143 	return VNODE_RETURNED;
14144 }
14145 
14146 static int
vfs_purge_callback(mount_t mp,__unused void * arg)14147 vfs_purge_callback(mount_t mp, __unused void * arg)
14148 {
14149 	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
14150 
14151 	return VFS_RETURNED;
14152 }
14153 
/*
 * When non-zero (the default; settable via the "vfs_purge_vm_pagers"
 * boot-arg or the vfs.purge_vm_pagers sysctl), vfs_purge() also purges
 * file-backed VM pagers in addition to the buffer/UBC caches.
 */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
14156 
/*
 * vfs_purge() system call: flush and invalidate the cached pages of
 * every vnode on every mounted filesystem, simulating a cold start.
 * Returns EPERM unless the caller is the superuser.
 */
int
vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
{
	/* Root only: a global cache purge is too disruptive for other users. */
	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	/* Walk every mount, msync'ing + invalidating each of its vnodes. */
	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);

	/* also flush any VM pagers backed by files */
	if (vfs_purge_vm_pagers) {
		vm_purge_filebacked_pagers();
	}

	return 0;
}
14173 
14174 /*
14175  * gets the vnode associated with the (unnamed) snapshot directory
14176  * for a Filesystem. The snapshot directory vnode is returned with
14177  * an iocount on it.
14178  */
int
vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
{
	/* Delegate to the filesystem's VGET_SNAPDIR op; callers in this file
	 * hold an iocount on rvp across this call. */
	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
}
14184 
14185 /*
14186  * Get the snapshot vnode.
14187  *
14188  * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
14189  * needs nameidone() on ndp.
14190  *
14191  * If the snapshot vnode exists it is returned in ndp->ni_vp.
14192  *
14193  * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
14194  * not needed.
14195  */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;	/* ZV_NAMEI buffer holding the copied-in name */
	size_t name_len;	/* length including the terminating NUL */
	struct vfs_attr vfa;

	/* Start with clean outputs so the error path knows what to release. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* Take an iocount on the vnode backing dirfd. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* Snapshot operations are only valid on the root of a volume. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Get the (unnamed) snapshot directory; iocount returned in *sdvpp. */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; the loop stops early iff one is present. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC hooks exist only for create/delete; other ops pass through
	 * with error still 0. */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On any error, drop both iocounts so the caller owns nothing
	 * and need not call nameidone(). */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
14298 
14299 /*
14300  * create a filesystem snapshot (for supporting filesystems)
14301  *
14302  * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
14303  * We get to the (unnamed) snapshot directory vnode and create the vnode
14304  * for the snapshot in it.
14305  *
14306  * Restrictions:
14307  *
14308  *    a) Passed in name for snapshot cannot have slashes.
14309  *    b) name can't be "." or ".."
14310  *
14311  * Since this requires superuser privileges, vnode_authorize calls are not
14312  * made.
14313  */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* struct nameidata is too large for the kernel stack; heap-allocate. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/*
	 * On success we hold iocounts on rvp and snapdvp and owe a
	 * nameidone(); ndp->ni_vp is non-NULL iff the name already exists.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* A snapshot of that name already exists. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		/* Snapshots are created as mode-0 regular files. */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* Caller was already vetted as privileged (see block comment
		 * above), so skip authorization and ACL/attr inheritance. */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14360 
14361 /*
14362  * Delete a Filesystem snapshot
14363  *
14364  * get the vnode for the unnamed snapshot directory and the snapshot and
14365  * delete the snapshot.
14366  */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* struct nameidata is too large for the kernel stack; heap-allocate. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* On success: iocounts held on rvp, snapdvp and ndp->ni_vp (the
	 * snapshot itself), plus an owed nameidone(). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Remove the snapshot; suppress the namespace event for this unlink. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14395 
14396 /*
14397  * Revert a filesystem to a snapshot
14398  *
14399  * Marks the filesystem to revert to the given snapshot on next mount.
14400  */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;	/* includes the terminating NUL */

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy the snapshot name in from user space. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Package the name as a LOOKUP componentname for the filesystem. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	/* NOTE(review): only sr_cnp is initialized here; presumably the FS
	 * reads no other fields of fs_snapshot_revert_args -- confirm. */
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/* Look up the snapshot vnode itself and ask it to revert. */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
14484 
14485 /*
14486  * rename a Filesystem snapshot
14487  *
14488  * get the vnode for the unnamed snapshot directory and the snapshot and
14489  * rename the snapshot. This is a very specialised (and simple) case of
14490  * rename(2) (which has to deal with a lot more complications). It differs
14491  * slightly from rename(2) in that EEXIST is returned if the new name exists.
14492  */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;	/* includes the terminating NUL */
	vnode_t fvp;		/* the existing snapshot vnode (rename source) */
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/* Resolve the source: iocounts on rvp, snapdvp, fromnd->ni_vp. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp  = fromnd->ni_vp;

	/* Copy in the destination name. */
	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is suppossed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; the loop stops early iff one is present. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Renaming to a new name is treated as creating that name. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Look up the destination name in the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Both names live in snapdvp; this is a same-directory rename. */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14587 
14588 /*
14589  * Mount a Filesystem snapshot
14590  *
14591  * get the vnode for the unnamed snapshot directory and the snapshot and
14592  * mount the snapshot.
14593  */
14594 static int __attribute__((noinline))
snapshot_mount(int dirfd,user_addr_t name,user_addr_t directory,__unused user_addr_t mnt_data,__unused uint32_t flags,vfs_context_t ctx)14595 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
14596     __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
14597 {
14598 	mount_t mp;
14599 	vnode_t rvp, snapdvp, snapvp, vp, pvp;
14600 	struct fs_snapshot_mount_args smnt_data;
14601 	int error;
14602 	struct nameidata *snapndp, *dirndp;
14603 	/* carving out a chunk for structs that are too big to be on stack. */
14604 	struct {
14605 		struct nameidata snapnd;
14606 		struct nameidata dirnd;
14607 	} * __snapshot_mount_data;
14608 
14609 	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
14610 	snapndp = &__snapshot_mount_data->snapnd;
14611 	dirndp = &__snapshot_mount_data->dirnd;
14612 
14613 	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
14614 	    OP_LOOKUP, ctx);
14615 	if (error) {
14616 		goto out;
14617 	}
14618 
14619 	snapvp  = snapndp->ni_vp;
14620 	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
14621 		error = EIO;
14622 		goto out1;
14623 	}
14624 
14625 	/* Get the vnode to be covered */
14626 	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
14627 	    UIO_USERSPACE, directory, ctx);
14628 	error = namei(dirndp);
14629 	if (error) {
14630 		goto out1;
14631 	}
14632 
14633 	vp = dirndp->ni_vp;
14634 	pvp = dirndp->ni_dvp;
14635 	mp = vnode_mount(rvp);
14636 
14637 	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
14638 		error = EINVAL;
14639 		goto out2;
14640 	}
14641 
14642 #if CONFIG_MACF
14643 	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
14644 	    mp->mnt_vfsstat.f_fstypename);
14645 	if (error) {
14646 		goto out2;
14647 	}
14648 #endif
14649 
14650 	smnt_data.sm_mp  = mp;
14651 	smnt_data.sm_cnp = &snapndp->ni_cnd;
14652 	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
14653 	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & (MNT_DONTBROWSE | MNT_IGNORE_OWNERSHIP),
14654 	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);
14655 
14656 out2:
14657 	vnode_put(vp);
14658 	vnode_put(pvp);
14659 	nameidone(dirndp);
14660 out1:
14661 	vnode_put(snapvp);
14662 	vnode_put(snapdvp);
14663 	vnode_put(rvp);
14664 	nameidone(snapndp);
14665 out:
14666 	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
14667 	return error;
14668 }
14669 
14670 /*
14671  * Root from a snapshot of the filesystem
14672  *
14673  * Marks the filesystem to root from the given snapshot on next boot.
14674  */
static int __attribute__((noinline))
snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;	/* includes the terminating NUL */

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy the snapshot name in from user space. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Package the name as a LOOKUP componentname for the filesystem. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	/* NOTE(review): only sr_cnp is initialized; presumably the FS reads
	 * no other fields of fs_snapshot_root_args -- confirm. */
	root_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	return error;
}
14730 
14731 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14732 vfs_context_can_snapshot(vfs_context_t ctx)
14733 {
14734 	static const char * const snapshot_entitlements[] = {
14735 		"com.apple.private.vfs.snapshot",
14736 		"com.apple.developer.vfs.snapshot",
14737 		"com.apple.private.apfs.arv.limited.snapshot",
14738 	};
14739 	static const size_t nentitlements =
14740 	    sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14741 	size_t i;
14742 
14743 	task_t task = vfs_context_task(ctx);
14744 	for (i = 0; i < nentitlements; i++) {
14745 		if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14746 			return TRUE;
14747 		}
14748 	}
14749 	return FALSE;
14750 }
14751 
14752 /*
14753  * FS snapshot operations dispatcher
14754  */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All ops require one of the snapshot entitlements. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Allowed if the caller is root, can write the backing device,
		 * or holds the user-snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		/* name1 = old name, name2 = new name */
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		/* name1 = snapshot name, name2 = directory to cover */
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14843