xref: /xnu-11215.81.4/bsd/vfs/vfs_syscalls.c (revision d4514f0bc1d3f944c22d92e68b646ac3fb40d452)
1 /*
2  * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1989, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  * (c) UNIX System Laboratories, Inc.
32  * All or some portions of this file are derived from material licensed
33  * to the University of California by American Telephone and Telegraph
34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35  * the permission of UNIX System Laboratories, Inc.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.41 (Berkeley) 6/15/95
66  */
67 /*
68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69  * support for mandatory and extensible security protections.  This notice
70  * is included in support of clause 2.2 (b) of the Apple Public License,
71  * Version 2.0.
72  */
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112 
113 #include <vfs/vfs_disk_conditioner.h>
114 #if CONFIG_EXCLAVES
115 #include <vfs/vfs_exclave_fs.h>
116 #endif
117 
118 #include <security/audit/audit.h>
119 #include <bsm/audit_kevents.h>
120 
121 #include <mach/mach_types.h>
122 #include <kern/kern_types.h>
123 #include <kern/kalloc.h>
124 #include <kern/task.h>
125 
126 #include <vm/vm_pageout.h>
127 #include <vm/vm_protos.h>
128 #include <vm/memory_object_xnu.h>
129 
130 #include <libkern/OSAtomic.h>
131 #include <os/atomic_private.h>
132 #include <pexpert/pexpert.h>
133 #include <IOKit/IOBSD.h>
134 
135 // deps for MIG call
136 #include <kern/host.h>
137 #include <kern/ipc_misc.h>
138 #include <mach/host_priv.h>
139 #include <mach/vfs_nspace.h>
140 #include <os/log.h>
141 
142 #include <nfs/nfs_conf.h>
143 
144 #if ROUTEFS
145 #include <miscfs/routefs/routefs.h>
146 #endif /* ROUTEFS */
147 
148 #if CONFIG_MACF
149 #include <security/mac.h>
150 #include <security/mac_framework.h>
151 #endif
152 
153 #if CONFIG_FSE
154 #define GET_PATH(x) \
155 	((x) = get_pathbuff())
156 #define RELEASE_PATH(x) \
157 	release_pathbuff(x)
158 #else
159 #define GET_PATH(x)     \
160 	((x) = zalloc(ZV_NAMEI))
161 #define RELEASE_PATH(x) \
162 	zfree(ZV_NAMEI, x)
163 #endif /* CONFIG_FSE */
164 
165 #ifndef HFS_GET_BOOT_INFO
166 #define HFS_GET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00004)
167 #endif
168 
169 #ifndef HFS_SET_BOOT_INFO
170 #define HFS_SET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00005)
171 #endif
172 
173 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
174 #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
175 #endif
176 
177 extern void disk_conditioner_unmount(mount_t mp);
178 
179 /* struct for checkdirs iteration */
180 struct cdirargs {
181 	vnode_t olddp;
182 	vnode_t newdp;
183 };
184 /* callback  for checkdirs iteration */
185 static int checkdirs_callback(proc_t p, void * arg);
186 
187 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
188 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
189 void enablequotas(struct mount *mp, vfs_context_t ctx);
190 static int getfsstat_callback(mount_t mp, void * arg);
191 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
192 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
193 static int sync_callback(mount_t, void *);
194 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
195     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
196     boolean_t partial_copy);
197 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
198 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
199     struct componentname *cnp, user_addr_t fsmountargs,
200     int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
201 void vfs_notify_mount(vnode_t pdvp);
202 
203 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
204 
205 struct fd_vn_data * fg_vn_data_alloc(void);
206 
207 /*
208  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
209  * Concurrent lookups (or lookups by ids) on hard links can cause the
210  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
211  * does) to return ENOENT as the path cannot be returned from the name cache
212  * alone. We have no option but to retry and hope to get one namei->reverse path
213  * generation done without an intervening lookup, lookup by id on the hard link
214  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
215  * which currently are the MAC hooks for rename, unlink and rmdir.
216  */
217 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
218 
219 /* Max retry limit for rename due to vnode recycling. */
220 #define MAX_RENAME_ERECYCLE_RETRIES 1024
221 
222 #define MAX_LINK_ENOENT_RETRIES 1024
223 
224 /* Max retries for concurrent mounts on the same covered vnode. */
225 #define MAX_MOUNT_RETRIES       10
226 
227 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
228     int unlink_flags);
229 
230 #ifdef CONFIG_IMGSRC_ACCESS
231 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
232 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
233 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
234 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
235 static void mount_end_update(mount_t mp);
236 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
237 #endif /* CONFIG_IMGSRC_ACCESS */
238 
239 //snapshot functions
240 #if CONFIG_MNT_ROOTSNAP
241 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
242 #else
243 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
244 #endif
245 
246 __private_extern__
247 int sync_internal(void);
248 
249 __private_extern__
250 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
251 
252 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
253 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
254 
255 /* vars for sync mutex */
256 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
257 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
258 
259 extern lck_rw_t rootvnode_rw_lock;
260 
261 VFS_SMR_DECLARE;
262 extern uint32_t nc_smr_enabled;
263 
264 /*
265  * incremented each time a mount or unmount operation occurs
266  * used to invalidate the cached value of the rootvp in the
267  * mount structure utilized by cache_lookup_path
268  */
269 uint32_t mount_generation = 0;
270 
271 /* counts number of mount and unmount operations */
272 unsigned int vfs_nummntops = 0;
273 
274 /* system-wide, per-boot unique mount ID */
275 static _Atomic uint64_t mount_unique_id = 1;
276 
277 extern const struct fileops vnops;
278 #if CONFIG_APPLEDOUBLE
279 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
280 #endif /* CONFIG_APPLEDOUBLE */
281 
282 /* Maximum buffer length supported by fsgetpath(2) */
283 #define FSGETPATH_MAXBUFLEN  8192
284 
285 /*
286  * Virtual File System System Calls
287  */
288 
289 /*
290  * Private in-kernel mounting spi (specific use-cases only)
291  */
292 boolean_t
vfs_iskernelmount(mount_t mp)293 vfs_iskernelmount(mount_t mp)
294 {
295 	return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
296 }
297 
/*
 * kernel_mount:
 *	Private in-kernel mount entry point (used by vfs_mount_at_path() and
 *	other kernel-internal callers, not by the mount(2) syscall path).
 *
 * Parameters:	fstype		filesystem type name passed to mount_common()
 *		pvp/vp		parent / covered vnode; if vp is NULLVP both are
 *				obtained here by a namei() lookup of `path`
 *		path		mount-on path (kernel address space)
 *		data		filesystem-specific mount arguments
 *		syscall_flags	MNT_* flags; MNT_NOFOLLOW disables symlink
 *				traversal for the lookup
 *		kern_flags	KERNEL_MOUNT_* flags; sanitized here and tagged
 *				with KERNEL_MOUNT_KMOUNT before mount_common()
 *
 * Returns:	0 on success, errno otherwise.
 *
 * Note: when the lookup is done here, the iocounts on vp and pvp acquired by
 * namei() are dropped (and nameidone() called) after mount_common() returns;
 * when the caller supplied vp/pvp, their references are left untouched.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
	if (syscall_flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	/* Strip any kernel flags the caller is not allowed to pass in. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Log lookup failures for snapshot / by-role mounts to aid triage. */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the vnodes; hand-populate the componentname
		 * path buffer that mount_common() expects.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	/* Mark this as a kernel-initiated mount for mount_common(). */
	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Only release what we acquired ourselves via namei(). */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
350 
351 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)352 vfs_mount_at_path(const char *fstype, const char *path,
353     vnode_t pvp, vnode_t vp, void *data, size_t datalen,
354     int mnt_flags, int flags)
355 {
356 	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
357 	int error, km_flags = 0;
358 	vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
359 
360 	/*
361 	 * This call is currently restricted to specific use cases.
362 	 */
363 	if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
364 		return ENOTSUP;
365 	}
366 
367 #if !defined(XNU_TARGET_OS_OSX)
368 	if (strcmp(fstype, "lifs") == 0) {
369 		syscall_flags |= MNT_NOEXEC;
370 	}
371 #endif
372 
373 	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
374 		km_flags |= KERNEL_MOUNT_NOAUTH;
375 	}
376 	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
377 		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
378 	}
379 
380 	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
381 	    syscall_flags, km_flags, ctx);
382 	if (error) {
383 		printf("%s: mount on %s failed, error %d\n", __func__, path,
384 		    error);
385 	}
386 
387 	return error;
388 }
389 
390 /*
391  * Mount a file system.
392  */
393 /* ARGSUSED */
394 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)395 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
396 {
397 	struct __mac_mount_args muap;
398 
399 	muap.type = uap->type;
400 	muap.path = uap->path;
401 	muap.flags = uap->flags;
402 	muap.data = uap->data;
403 	muap.mac_p = USER_ADDR_NULL;
404 	return __mac_mount(p, &muap, retval);
405 }
406 
407 int
fmount(__unused proc_t p,struct fmount_args * uap,__unused int32_t * retval)408 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
409 {
410 	struct componentname    cn;
411 	vfs_context_t           ctx = vfs_context_current();
412 	size_t                  dummy = 0;
413 	int                     error;
414 	int                     flags = uap->flags;
415 	char                    fstypename[MFSNAMELEN];
416 	char                    *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
417 	vnode_t                 pvp;
418 	vnode_t                 vp;
419 
420 	AUDIT_ARG(fd, uap->fd);
421 	AUDIT_ARG(fflags, flags);
422 	/* fstypename will get audited by mount_common */
423 
424 	/* Sanity check the flags */
425 	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
426 		return ENOTSUP;
427 	}
428 
429 	if (flags & MNT_UNION) {
430 		return EPERM;
431 	}
432 
433 	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
434 	if (error) {
435 		return error;
436 	}
437 
438 	if ((error = file_vnode(uap->fd, &vp)) != 0) {
439 		return error;
440 	}
441 
442 	if ((error = vnode_getwithref(vp)) != 0) {
443 		file_drop(uap->fd);
444 		return error;
445 	}
446 
447 	pvp = vnode_getparent(vp);
448 	if (pvp == NULL) {
449 		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
450 			error = EBUSY;
451 		} else {
452 			error = EINVAL;
453 		}
454 		vnode_put(vp);
455 		file_drop(uap->fd);
456 		return error;
457 	}
458 
459 	memset(&cn, 0, sizeof(struct componentname));
460 	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
461 	cn.cn_pnlen = MAXPATHLEN;
462 
463 	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
464 		zfree(ZV_NAMEI, cn.cn_pnbuf);
465 		vnode_put(pvp);
466 		vnode_put(vp);
467 		file_drop(uap->fd);
468 		return error;
469 	}
470 
471 	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);
472 
473 	zfree(ZV_NAMEI, cn.cn_pnbuf);
474 	vnode_put(pvp);
475 	vnode_put(vp);
476 	file_drop(uap->fd);
477 
478 	return error;
479 }
480 
481 #define MAX_GRAFT_METADATA_SIZE             16384 /* bytes */
482 
483 /*
484  * Get the size of a graft file (a manifest or payload file).
485  * The vp should be an iocounted vnode.
486  */
487 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)488 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
489 {
490 	struct stat64 sb = {};
491 	int error;
492 
493 	*size = 0;
494 
495 	error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
496 	if (error) {
497 		return error;
498 	}
499 
500 	if (sb.st_size == 0) {
501 		error = ENODATA;
502 	} else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
503 		error = EFBIG;
504 	} else {
505 		*size = (size_t) sb.st_size;
506 	}
507 
508 	return error;
509 }
510 
511 /*
512  * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
513  * `size` must already be validated.
514  */
515 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)516 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
517 {
518 	return vn_rdwr(UIO_READ, graft_vp,
519 	           (caddr_t) buf, (int) size, /* offset */ 0,
520 	           UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
521 	           vfs_context_ucred(vctx), /* resid */ NULL,
522 	           vfs_context_proc(vctx));
523 }
524 
525 /*
526  * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
527  * and read it into `buf`.
528  * If `path_prefix` is non-NULL, verify that the file path has that prefix.
529  */
530 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,const char * path_prefix,size_t * size,void * buf)531 graft_secureboot_read_fd(int fd, vfs_context_t vctx, const char *path_prefix, size_t *size, void *buf)
532 {
533 	vnode_t metadata_vp = NULLVP;
534 	char *path = NULL;
535 	int error;
536 
537 	// Convert this graft fd to a vnode.
538 	if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
539 		goto out;
540 	}
541 
542 	// Verify that the vnode path starts with `path_prefix` if it was passed.
543 	if (path_prefix) {
544 		int len = MAXPATHLEN;
545 		path = zalloc(ZV_NAMEI);
546 		if ((error = vn_getpath(metadata_vp, path, &len))) {
547 			goto out;
548 		}
549 		if (strncmp(path, path_prefix, strlen(path_prefix))) {
550 			error = EINVAL;
551 			goto out;
552 		}
553 	}
554 
555 	// Get (and validate) size information.
556 	if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
557 		goto out;
558 	}
559 
560 	// Read each file into the provided buffer - we must get the expected amount of bytes.
561 	if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
562 		goto out;
563 	}
564 
565 out:
566 	if (path) {
567 		zfree(ZV_NAMEI, path);
568 	}
569 	if (metadata_vp) {
570 		vnode_put(metadata_vp);
571 		metadata_vp = NULLVP;
572 	}
573 
574 	return error;
575 }
576 
577 #if XNU_TARGET_OS_OSX
578 #if defined(__arm64e__)
579 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/manifests/"
580 #else /* x86_64 */
581 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/"
582 #endif /* x86_64 */
583 #else /* !XNU_TARGET_OS_OSX */
584 #define MOBILE_ASSET_DATA_VAULT_PATH "/private/var/MobileAsset/AssetsV2/manifests/"
585 #endif /* !XNU_TARGET_OS_OSX */
586 
587 /*
588  * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
589  * provided in `gfs`, saving the size of data read in `gfs`.
590  */
591 static int
graft_secureboot_read_metadata(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)592 graft_secureboot_read_metadata(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
593     vfs_context_t vctx, fsioc_graft_fs_t *gfs)
594 {
595 	const char *manifest_path_prefix = NULL;
596 	int error;
597 
598 	// For Mobile Asset, make sure that the manifest comes from a data vault.
599 	if (graft_type == GRAFTDMG_CRYPTEX_MOBILE_ASSET) {
600 		manifest_path_prefix = MOBILE_ASSET_DATA_VAULT_PATH;
601 	}
602 
603 	// Read the authentic manifest.
604 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
605 	    manifest_path_prefix, &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
606 		return error;
607 	}
608 
609 	// The user manifest is currently unused, but set its size.
610 	gfs->user_manifest_size = 0;
611 
612 	// Read the payload.
613 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
614 	    NULL, &gfs->payload_size, gfs->payload))) {
615 		return error;
616 	}
617 
618 	return 0;
619 }
620 
/*
 * Call into the filesystem to verify and graft a cryptex.
 *
 * Pre-flights the caller-supplied arguments, reads the manifest and payload
 * file descriptors into kernel buffers, translates the SBC_* flags into
 * FSCTL_GRAFT_* flags, and issues FSIOC_GRAFT_FS on `cryptex_vp`.
 * `mounton_vp` (the graft target directory) is optional; when present it must
 * be a directory on the same volume as `cryptex_vp`.
 *
 * Returns:	0 on success, errno otherwise.
 */
static int
graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
{
	fsioc_graft_fs_t gfs = {};
	uint64_t graft_dir_ino = 0;
	struct stat64 sb = {};
	int error;

	// Pre-flight arguments.
	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
		// Make sure that this graft version matches what we support.
		return ENOTSUP;
	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
		// For this type, cryptex VP must live on same volume as the target of graft.
		return EXDEV;
	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
		// We cannot graft upon non-directories.
		return ENOTDIR;
	} else if (cryptex_vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) {
		// We do not allow grafts inside disk images.
		return ENODEV;
	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
	    sbc_args->sbc_payload_fd < 0) {
		// We cannot graft without a manifest and payload.
		return EINVAL;
	}

	if (mounton_vp) {
		// Get the mounton's inode number.
		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
		if (error) {
			return error;
		}
		graft_dir_ino = (uint64_t) sb.st_ino;
	}

	// Create buffers (of our maximum-defined size) to store authentication info.
	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);

	if (!gfs.authentic_manifest || !gfs.payload) {
		error = ENOMEM;
		goto out;
	}

	// Read our fd's into our buffers.
	// (Note that this will set the buffer size fields in `gfs`.)
	error = graft_secureboot_read_metadata(graft_type, sbc_args, vctx, &gfs);
	if (error) {
		goto out;
	}

	// Translate the caller's SBC_* flags into the FSCTL_GRAFT_* request.
	gfs.graft_version = FSIOC_GRAFT_VERSION;
	gfs.graft_type = graft_type;
	gfs.graft_4cc = sbc_args->sbc_4cc;
	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
	}
	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
	}
	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
	}
	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
	}
	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
	}
	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
	}
	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)

	// Call into the FS to perform the graft (and validation).
	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);

out:
	if (gfs.authentic_manifest) {
		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
		gfs.authentic_manifest = NULL;
	}
	if (gfs.payload) {
		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
		gfs.payload = NULL;
	}

	return error;
}
715 
716 #define GRAFTDMG_ENTITLEMENT  "com.apple.private.vfs.graftdmg"
717 
/*
 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
 *
 * Requires the com.apple.private.vfs.graftdmg entitlement.  The mount
 * directory is optional; when omitted the filesystem chooses the target
 * (see graft_secureboot_cryptex()).
 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	/* Entitlement gate: only privileged tasks may graft. */
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy the argument union in from user space. */
	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		/* On failure, namei() cleans up after itself, so we return directly. */
		error = namei(&nd);
		if (error) {
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		goto graftout;
	}

	/* Range-check the graft type before dispatching. */
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
	}

graftout:
	/* Drop whatever iocounts we acquired; nameidone() only if namei() succeeded. */
	if (cryptex_vp) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(mounton_vp);
		mounton_vp = NULLVP;
	}
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
789 
790 /*
791  * Ungraft a cryptex disk image (via mount dir FD)
792  * { int ungraftdmg(const char *mountdir, uint64_t flags); }
793  */
794 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)795 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
796 {
797 	int error = 0;
798 	user_addr_t ua_mountdir = uap->mountdir;
799 	fsioc_ungraft_fs_t ugfs;
800 	vnode_t mounton_vp = NULLVP;
801 	struct nameidata nd = {};
802 	vfs_context_t ctx = vfs_context_current();
803 
804 	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
805 		return EPERM;
806 	}
807 
808 	if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
809 		return EINVAL;
810 	}
811 
812 	ugfs.ungraft_flags = 0;
813 
814 	// Acquire vnode for mount-on path
815 	NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
816 	    UIO_USERSPACE, ua_mountdir, ctx);
817 
818 	error = namei(&nd);
819 	if (error) {
820 		return error;
821 	}
822 	mounton_vp = nd.ni_vp;
823 
824 	// Call into the FS to perform the ungraft
825 	error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
826 
827 	vnode_put(mounton_vp);
828 	nameidone(&nd);
829 
830 	return error;
831 }
832 
833 
/*
 * vfs_notify_mount:
 *	Notify interested parties that a mount occurred: raises the VQ_MOUNT
 *	vfs event and posts a NOTE_WRITE knote on `pdvp`, the parent directory
 *	of the covered vnode.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
840 
841 /*
842  * __mac_mount:
843  *	Mount a file system taking into account MAC label behavior.
844  *	See mount(2) man page for more information
845  *
846  * Parameters:    p                        Process requesting the mount
847  *                uap                      User argument descriptor (see below)
848  *                retval                   (ignored)
849  *
850  * Indirect:      uap->type                Filesystem type
851  *                uap->path                Path to mount
852  *                uap->data                Mount arguments
853  *                uap->mac_p               MAC info
854  *                uap->flags               Mount flags
855  *
856  *
857  * Returns:        0                       Success
858  *                !0                       Not success
859  */
860 boolean_t root_fs_upgrade_try = FALSE;
861 
862 #define MAX_NESTED_UNION_MOUNTS  10
863 
864 int
__mac_mount(struct proc * p,register struct __mac_mount_args * uap,__unused int32_t * retval)865 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
866 {
867 	vnode_t pvp = NULLVP;
868 	vnode_t vp = NULLVP;
869 	int need_nameidone = 0;
870 	vfs_context_t ctx = vfs_context_current();
871 	char fstypename[MFSNAMELEN];
872 	struct nameidata nd;
873 	size_t dummy = 0;
874 	char *labelstr = NULL;
875 	size_t labelsz = 0;
876 	int flags = uap->flags;
877 	int error;
878 	int num_retries = 0;
879 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
880 	boolean_t is_64bit = IS_64BIT_PROCESS(p);
881 #else
882 #pragma unused(p)
883 #endif
884 	/*
885 	 * Get the fs type name from user space
886 	 */
887 	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
888 	if (error) {
889 		return error;
890 	}
891 
892 retry:
893 	/*
894 	 * Get the vnode to be covered
895 	 */
896 	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
897 	    UIO_USERSPACE, uap->path, ctx);
898 	if (flags & MNT_NOFOLLOW) {
899 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
900 	}
901 	error = namei(&nd);
902 	if (error) {
903 		goto out;
904 	}
905 	need_nameidone = 1;
906 	vp = nd.ni_vp;
907 	pvp = nd.ni_dvp;
908 
909 #ifdef CONFIG_IMGSRC_ACCESS
910 	/* Mounting image source cannot be batched with other operations */
911 	if (flags == MNT_IMGSRC_BY_INDEX) {
912 		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
913 		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
914 		goto out;
915 	}
916 #endif /* CONFIG_IMGSRC_ACCESS */
917 
918 #if CONFIG_MACF
919 	/*
920 	 * Get the label string (if any) from user space
921 	 */
922 	if (uap->mac_p != USER_ADDR_NULL) {
923 		struct user_mac mac;
924 		size_t ulen = 0;
925 
926 		if (is_64bit) {
927 			struct user64_mac mac64;
928 			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
929 			mac.m_buflen = (user_size_t)mac64.m_buflen;
930 			mac.m_string = (user_addr_t)mac64.m_string;
931 		} else {
932 			struct user32_mac mac32;
933 			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
934 			mac.m_buflen = mac32.m_buflen;
935 			mac.m_string = mac32.m_string;
936 		}
937 		if (error) {
938 			goto out;
939 		}
940 		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
941 		    (mac.m_buflen < 2)) {
942 			error = EINVAL;
943 			goto out;
944 		}
945 		labelsz = mac.m_buflen;
946 		labelstr = kalloc_data(labelsz, Z_WAITOK);
947 		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
948 		if (error) {
949 			goto out;
950 		}
951 		AUDIT_ARG(mac_string, labelstr);
952 	}
953 #endif /* CONFIG_MACF */
954 
955 	AUDIT_ARG(fflags, flags);
956 
957 	if (flags & MNT_UNION) {
958 #if CONFIG_UNION_MOUNTS
959 		mount_t mp = vp->v_mount;
960 		int nested_union_mounts = 0;
961 
962 		name_cache_lock_shared();
963 
964 		/* Walk up the vnodecovered chain and check for nested union mounts. */
965 		mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);
966 		while (mp) {
967 			if (!(mp->mnt_flag & MNT_UNION)) {
968 				break;
969 			}
970 			mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);
971 
972 			/*
973 			 * Limit the max nested unon mounts to prevent stack exhaustion
974 			 * when calling lookup_traverse_union().
975 			 */
976 			if (++nested_union_mounts >= MAX_NESTED_UNION_MOUNTS) {
977 				error = ELOOP;
978 				break;
979 			}
980 		}
981 
982 		name_cache_unlock();
983 		if (error) {
984 			goto out;
985 		}
986 #else
987 		error = EPERM;
988 		goto out;
989 #endif /* CONFIG_UNION_MOUNTS */
990 	}
991 
992 	if ((vp->v_flag & VROOT) &&
993 	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
994 #if CONFIG_UNION_MOUNTS
995 		if (!(flags & MNT_UNION)) {
996 			flags |= MNT_UPDATE;
997 		} else {
998 			/*
999 			 * For a union mount on '/', treat it as fresh
1000 			 * mount instead of update.
1001 			 * Otherwise, union mouting on '/' used to panic the
1002 			 * system before, since mnt_vnodecovered was found to
1003 			 * be NULL for '/' which is required for unionlookup
1004 			 * after it gets ENOENT on union mount.
1005 			 */
1006 			flags = (flags & ~(MNT_UPDATE));
1007 		}
1008 #else
1009 		flags |= MNT_UPDATE;
1010 #endif /* CONFIG_UNION_MOUNTS */
1011 
1012 #if SECURE_KERNEL
1013 		if ((flags & MNT_RDONLY) == 0) {
1014 			/* Release kernels are not allowed to mount "/" as rw */
1015 			error = EPERM;
1016 			goto out;
1017 		}
1018 #endif
1019 
1020 		/*
1021 		 * See 7392553 for more details on why this check exists.
1022 		 * Suffice to say: If this check is ON and something tries
1023 		 * to mount the rootFS RW, we'll turn off the codesign
1024 		 * bitmap optimization.
1025 		 */
1026 #if CHECK_CS_VALIDATION_BITMAP
1027 		if ((flags & MNT_RDONLY) == 0) {
1028 			root_fs_upgrade_try = TRUE;
1029 		}
1030 #endif
1031 	}
1032 
1033 	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
1034 	    labelstr, ctx);
1035 
1036 out:
1037 
1038 #if CONFIG_MACF
1039 	kfree_data(labelstr, labelsz);
1040 #endif /* CONFIG_MACF */
1041 
1042 	if (vp) {
1043 		vnode_put(vp);
1044 		vp = NULLVP;
1045 	}
1046 	if (pvp) {
1047 		vnode_put(pvp);
1048 		pvp = NULLVP;
1049 	}
1050 	if (need_nameidone) {
1051 		nameidone(&nd);
1052 		need_nameidone = 0;
1053 	}
1054 
1055 	if (error == EBUSY) {
1056 		/* Retry the lookup and mount again due to concurrent mounts. */
1057 		if (++num_retries < MAX_MOUNT_RETRIES) {
1058 			goto retry;
1059 		}
1060 	}
1061 
1062 	return error;
1063 }
1064 
/*
 * common mount implementation (final stage of mounting)
 *
 * Arguments:
 *  fstypename	file system type (ie its vfs name)
 *  pvp		parent of covered vnode
 *  vp		covered vnode
 *  cnp		component name (ie path) of covered vnode
 *  fsmountargs	file system specific data
 *  flags	generic mount flags
 *  internal_flags	KERNEL_MOUNT_* flags; KERNEL_MOUNT_KMOUNT marks
 *			mounts initiated from inside the kernel
 *  labelstr	optional MAC label
 *  ctx		caller's context
 */
1079 static int
mount_common(const char * fstypename,vnode_t pvp,vnode_t vp,struct componentname * cnp,user_addr_t fsmountargs,int flags,uint32_t internal_flags,char * labelstr,vfs_context_t ctx)1080 mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
1081     struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
1082     char *labelstr, vfs_context_t ctx)
1083 {
1084 #if !CONFIG_MACF
1085 #pragma unused(labelstr)
1086 #endif
1087 	struct vnode *devvp = NULLVP;
1088 	struct vnode *device_vnode = NULLVP;
1089 #if CONFIG_MACF
1090 	struct vnode *rvp;
1091 #endif
1092 	struct mount *mp = NULL;
1093 	struct vfstable *vfsp = (struct vfstable *)0;
1094 	struct proc *p = vfs_context_proc(ctx);
1095 	int error, flag = 0;
1096 	bool flag_set = false;
1097 	user_addr_t devpath = USER_ADDR_NULL;
1098 	int ronly = 0;
1099 	int mntalloc = 0;
1100 	boolean_t vfsp_ref = FALSE;
1101 	boolean_t is_rwlock_locked = FALSE;
1102 	boolean_t did_rele = FALSE;
1103 	boolean_t have_usecount = FALSE;
1104 	boolean_t did_set_lmount = FALSE;
1105 	boolean_t did_set_vmount = FALSE;
1106 	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
1107 
1108 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
1109 	/* Check for mutually-exclusive flag bits */
1110 	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
1111 	int bitcount = 0;
1112 	while (checkflags != 0) {
1113 		checkflags &= (checkflags - 1);
1114 		bitcount++;
1115 	}
1116 
1117 	if (bitcount > 1) {
1118 		//not allowed to request multiple mount-by-role flags
1119 		error = EINVAL;
1120 		goto out1;
1121 	}
1122 #endif
1123 
1124 	/*
1125 	 * Process an update for an existing mount
1126 	 */
1127 	if (flags & MNT_UPDATE) {
1128 		if ((vp->v_flag & VROOT) == 0) {
1129 			error = EINVAL;
1130 			goto out1;
1131 		}
1132 		mp = vp->v_mount;
1133 
1134 		/* if unmount or mount in progress, return error */
1135 		mount_lock_spin(mp);
1136 		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
1137 			mount_unlock(mp);
1138 			error = EBUSY;
1139 			goto out1;
1140 		}
1141 		mp->mnt_lflag |= MNT_LMOUNT;
1142 		did_set_lmount = TRUE;
1143 		mount_unlock(mp);
1144 		lck_rw_lock_exclusive(&mp->mnt_rwlock);
1145 		is_rwlock_locked = TRUE;
1146 		/*
1147 		 * We only allow the filesystem to be reloaded if it
1148 		 * is currently mounted read-only.
1149 		 */
1150 		if ((flags & MNT_RELOAD) &&
1151 		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1152 			error = ENOTSUP;
1153 			goto out1;
1154 		}
1155 
1156 		/*
1157 		 * If content protection is enabled, update mounts are not
1158 		 * allowed to turn it off.
1159 		 */
1160 		if ((mp->mnt_flag & MNT_CPROTECT) &&
1161 		    ((flags & MNT_CPROTECT) == 0)) {
1162 			error = EINVAL;
1163 			goto out1;
1164 		}
1165 
1166 		/*
1167 		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
1168 		 * failure to return an error for this so we'll just silently
1169 		 * add it if it is not passed in.
1170 		 */
1171 		if ((mp->mnt_flag & MNT_REMOVABLE) &&
1172 		    ((flags & MNT_REMOVABLE) == 0)) {
1173 			flags |= MNT_REMOVABLE;
1174 		}
1175 
1176 		/* Can't downgrade the backer of the root FS */
1177 		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
1178 		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
1179 			error = ENOTSUP;
1180 			goto out1;
1181 		}
1182 
1183 		/*
1184 		 * Only root, or the user that did the original mount is
1185 		 * permitted to update it.
1186 		 */
1187 		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1188 		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
1189 			goto out1;
1190 		}
1191 #if CONFIG_MACF
1192 		error = mac_mount_check_remount(ctx, mp, flags);
1193 		if (error != 0) {
1194 			goto out1;
1195 		}
1196 #endif
1197 		/*
1198 		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
1199 		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
1200 		 */
1201 		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1202 			flags |= MNT_NOSUID | MNT_NODEV;
1203 			if (mp->mnt_flag & MNT_NOEXEC) {
1204 				flags |= MNT_NOEXEC;
1205 			}
1206 		}
1207 		flag = mp->mnt_flag;
1208 		flag_set = true;
1209 
1210 
1211 
1212 		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
1213 
1214 		vfsp = mp->mnt_vtable;
1215 		goto update;
1216 	} // MNT_UPDATE
1217 
1218 	/*
1219 	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
1220 	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
1221 	 */
1222 	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1223 		flags |= MNT_NOSUID | MNT_NODEV;
1224 		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
1225 			flags |= MNT_NOEXEC;
1226 		}
1227 	}
1228 
1229 	/* XXXAUDIT: Should we capture the type on the error path as well? */
1230 	/* XXX cast-away const (audit_arg_text() does not modify its input) */
1231 	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
1232 	mount_list_lock();
1233 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1234 		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
1235 			vfsp->vfc_refcount++;
1236 			vfsp_ref = TRUE;
1237 			break;
1238 		}
1239 	}
1240 	mount_list_unlock();
1241 	if (vfsp == NULL) {
1242 		error = ENODEV;
1243 		goto out1;
1244 	}
1245 
1246 	/*
1247 	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
1248 	 * except in ROSV configs and for the initial BaseSystem root.
1249 	 */
1250 	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
1251 	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
1252 	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
1253 		error = EINVAL;  /* unsupported request */
1254 		goto out1;
1255 	}
1256 
1257 	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
1258 	if (error != 0) {
1259 		goto out1;
1260 	}
1261 
1262 	/*
1263 	 * Upon successful of prepare_coveredvp(), VMOUNT is set for the covered vp.
1264 	 */
1265 	did_set_vmount = TRUE;
1266 
1267 	/*
1268 	 * Allocate and initialize the filesystem (mount_t)
1269 	 */
1270 	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
1271 	mntalloc = 1;
1272 
1273 	/* Initialize the default IO constraints */
1274 	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
1275 	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
1276 	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
1277 	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
1278 	mp->mnt_devblocksize = DEV_BSIZE;
1279 	mp->mnt_alignmentmask = PAGE_MASK;
1280 	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
1281 	mp->mnt_ioscale = 1;
1282 	mp->mnt_ioflags = 0;
1283 	mp->mnt_realrootvp = NULLVP;
1284 	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
1285 
1286 	mp->mnt_lflag |= MNT_LMOUNT;
1287 	did_set_lmount = TRUE;
1288 
1289 	TAILQ_INIT(&mp->mnt_vnodelist);
1290 	TAILQ_INIT(&mp->mnt_workerqueue);
1291 	TAILQ_INIT(&mp->mnt_newvnodes);
1292 	mount_lock_init(mp);
1293 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
1294 	is_rwlock_locked = TRUE;
1295 	mp->mnt_op = vfsp->vfc_vfsops;
1296 	mp->mnt_vtable = vfsp;
1297 	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
1298 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
1299 	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
1300 	do {
1301 		size_t pathlen = MAXPATHLEN;
1302 
1303 		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
1304 			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1305 		}
1306 	} while (0);
1307 	mp->mnt_vnodecovered = vp;
1308 	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
1309 	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
1310 	mp->mnt_devbsdunit = 0;
1311 	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
1312 
1313 	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
1314 	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
1315 
1316 	if (kernelmount) {
1317 		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
1318 	}
1319 	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
1320 		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
1321 	}
1322 
1323 	if (KERNEL_MOUNT_DEVFS & internal_flags) {
1324 		// kernel mounted devfs
1325 		mp->mnt_kern_flag |= MNTK_SYSTEM;
1326 	}
1327 
1328 update:
1329 
1330 	/*
1331 	 * Set the mount level flags.
1332 	 */
1333 	if (flags & MNT_RDONLY) {
1334 		mp->mnt_flag |= MNT_RDONLY;
1335 	} else if (mp->mnt_flag & MNT_RDONLY) {
1336 		// disallow read/write upgrades of file systems that
1337 		// had the TYPENAME_OVERRIDE feature set.
1338 		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
1339 			error = EPERM;
1340 			goto out1;
1341 		}
1342 		mp->mnt_kern_flag |= MNTK_WANTRDWR;
1343 	}
1344 	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1345 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1346 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1347 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1348 	    MNT_QUARANTINE | MNT_CPROTECT);
1349 
1350 #if SECURE_KERNEL
1351 #if !CONFIG_MNT_SUID
1352 	/*
1353 	 * On release builds of iOS based platforms, always enforce NOSUID on
1354 	 * all mounts. We do this here because we can catch update mounts as well as
1355 	 * non-update mounts in this case.
1356 	 */
1357 	mp->mnt_flag |= (MNT_NOSUID);
1358 #endif
1359 #endif
1360 
1361 	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1362 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1363 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1364 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1365 	    MNT_QUARANTINE | MNT_CPROTECT);
1366 
1367 #if CONFIG_MACF
1368 	if (flags & MNT_MULTILABEL) {
1369 		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
1370 			error = EINVAL;
1371 			goto out1;
1372 		}
1373 		mp->mnt_flag |= MNT_MULTILABEL;
1374 	}
1375 #endif
1376 	/*
1377 	 * Process device path for local file systems if requested.
1378 	 *
1379 	 * Snapshot and mount-by-role mounts do not use this path; they are
1380 	 * passing other opaque data in the device path field.
1381 	 *
1382 	 * Basesystemroot mounts pass a device path to be resolved here,
1383 	 * but it's just a char * already inside the kernel, which
1384 	 * kernel_mount() shoved into a user_addr_t to call us. So for such
1385 	 * mounts we must skip copyin (both of the address and of the string
1386 	 * (in NDINIT).
1387 	 */
1388 	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
1389 	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
1390 		boolean_t do_copyin_devpath = true;
1391 #if CONFIG_BASESYSTEMROOT
1392 		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1393 			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worh nothing:
1394 			// We have been passed fsmountargs, which is typed as a user_addr_t,
1395 			// but is actually a char ** pointing to a (kernelspace) string.
1396 			// We manually unpack it with a series of casts and dereferences
1397 			// that reverses what was done just above us on the stack in
1398 			// imageboot_pivot_image().
1399 			// After retrieving the path to the dev node (which we will NDINIT
1400 			// in a moment), we pass NULL fsmountargs on to the filesystem.
1401 			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
1402 			char **devnamepp = (char **)fsmountargs;
1403 			char *devnamep = *devnamepp;
1404 			devpath = CAST_USER_ADDR_T(devnamep);
1405 			do_copyin_devpath = false;
1406 			fsmountargs = USER_ADDR_NULL;
1407 
1408 			//Now that we have a mp, denote that this mount is for the basesystem.
1409 			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
1410 		}
1411 #endif // CONFIG_BASESYSTEMROOT
1412 
1413 		if (do_copyin_devpath) {
1414 			if (vfs_context_is64bit(ctx)) {
1415 				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1416 					goto out1;
1417 				}
1418 				fsmountargs += sizeof(devpath);
1419 			} else {
1420 				user32_addr_t tmp;
1421 				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1422 					goto out1;
1423 				}
1424 				/* munge into LP64 addr */
1425 				devpath = CAST_USER_ADDR_T(tmp);
1426 				fsmountargs += sizeof(tmp);
1427 			}
1428 		}
1429 
1430 		/* Lookup device and authorize access to it */
1431 		if ((devpath)) {
1432 			struct nameidata nd;
1433 
1434 			enum uio_seg seg = UIO_USERSPACE;
1435 #if CONFIG_BASESYSTEMROOT
1436 			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1437 				seg = UIO_SYSSPACE;
1438 			}
1439 #endif // CONFIG_BASESYSTEMROOT
1440 
1441 			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
1442 			if (flags & MNT_NOFOLLOW) {
1443 				nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
1444 			}
1445 			if ((error = namei(&nd))) {
1446 				goto out1;
1447 			}
1448 
1449 			devvp = nd.ni_vp;
1450 
1451 			if (devvp->v_type != VBLK) {
1452 				error = ENOTBLK;
1453 				nameidone(&nd);
1454 				goto out2;
1455 			}
1456 			if (major(devvp->v_rdev) >= nblkdev) {
1457 				error = ENXIO;
1458 				nameidone(&nd);
1459 				goto out2;
1460 			}
1461 			/*
1462 			 * If mount by non-root, then verify that user has necessary
1463 			 * permissions on the device.
1464 			 */
1465 			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1466 				kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;
1467 
1468 				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1469 					accessmode |= KAUTH_VNODE_WRITE_DATA;
1470 				}
1471 				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1472 					nameidone(&nd);
1473 					goto out2;
1474 				}
1475 			}
1476 
1477 			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1478 			nameidone(&nd);
1479 		}
1480 		/* On first mount, preflight and open device */
1481 		if (devpath && ((flags & MNT_UPDATE) == 0)) {
1482 			if ((error = vnode_ref(devvp))) {
1483 				goto out2;
1484 			}
1485 			/*
1486 			 * Disallow multiple mounts of the same device.
1487 			 * Disallow mounting of a device that is currently in use
1488 			 * (except for root, which might share swap device for miniroot).
1489 			 * Flush out any old buffers remaining from a previous use.
1490 			 */
1491 			if ((error = vfs_setmounting(devvp))) {
1492 				vnode_rele(devvp);
1493 				goto out2;
1494 			}
1495 
1496 			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1497 				error = EBUSY;
1498 				goto out3;
1499 			}
1500 			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1501 				error = ENOTBLK;
1502 				goto out3;
1503 			}
1504 			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1505 				goto out3;
1506 			}
1507 
1508 			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1509 #if CONFIG_MACF
1510 			error = mac_vnode_check_open(ctx,
1511 			    devvp,
1512 			    ronly ? FREAD : FREAD | FWRITE);
1513 			if (error) {
1514 				goto out3;
1515 			}
1516 #endif /* MAC */
1517 			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1518 				goto out3;
1519 			}
1520 
1521 			mp->mnt_devvp = devvp;
1522 			device_vnode = devvp;
1523 		} else if ((mp->mnt_flag & MNT_RDONLY) &&
1524 		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1525 		    (device_vnode = mp->mnt_devvp)) {
1526 			dev_t dev;
1527 			int maj;
1528 			/*
1529 			 * If upgrade to read-write by non-root, then verify
1530 			 * that user has necessary permissions on the device.
1531 			 */
1532 			vnode_getalways(device_vnode);
1533 
1534 			if (suser(vfs_context_ucred(ctx), NULL) &&
1535 			    (error = vnode_authorize(device_vnode, NULL,
1536 			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1537 			    ctx)) != 0) {
1538 				vnode_put(device_vnode);
1539 				goto out2;
1540 			}
1541 
1542 			/* Tell the device that we're upgrading */
1543 			dev = (dev_t)device_vnode->v_rdev;
1544 			maj = major(dev);
1545 
1546 			if ((u_int)maj >= (u_int)nblkdev) {
1547 				panic("Volume mounted on a device with invalid major number.");
1548 			}
1549 
1550 			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1551 			vnode_put(device_vnode);
1552 			device_vnode = NULLVP;
1553 			if (error != 0) {
1554 				goto out2;
1555 			}
1556 		}
1557 	} // localargs && !(snapshot | data | vm)
1558 
1559 #if CONFIG_MACF
1560 	if ((flags & MNT_UPDATE) == 0) {
1561 		mac_mount_label_init(mp);
1562 		mac_mount_label_associate(ctx, mp);
1563 	}
1564 	if (labelstr) {
1565 		if ((flags & MNT_UPDATE) != 0) {
1566 			error = mac_mount_check_label_update(ctx, mp);
1567 			if (error != 0) {
1568 				goto out3;
1569 			}
1570 		}
1571 	}
1572 #endif
1573 	/*
1574 	 * Mount the filesystem.  We already asserted that internal_flags
1575 	 * cannot have more than one mount-by-role bit set.
1576 	 */
1577 	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1578 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1579 		    (caddr_t)fsmountargs, 0, ctx);
1580 	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1581 #if CONFIG_ROSV_STARTUP
1582 		struct mount *origin_mp = (struct mount*)fsmountargs;
1583 		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1584 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1585 		if (error) {
1586 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1587 		} else {
1588 			/* Mark volume associated with system volume */
1589 			mp->mnt_kern_flag |= MNTK_SYSTEM;
1590 
1591 			/* Attempt to acquire the mnt_devvp and set it up */
1592 			struct vnode *mp_devvp = NULL;
1593 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1594 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1595 				    0, &mp_devvp, vfs_context_kernel());
1596 				if (!lerr) {
1597 					mp->mnt_devvp = mp_devvp;
1598 					//vnode_lookup took an iocount, need to drop it.
1599 					vnode_put(mp_devvp);
1600 					// now set `device_vnode` to the devvp that was acquired.
1601 					// this is needed in order to ensure vfs_init_io_attributes is invoked.
1602 					// note that though the iocount above was dropped, the mount acquires
1603 					// an implicit reference against the device.
1604 					device_vnode = mp_devvp;
1605 				}
1606 			}
1607 		}
1608 #else
1609 		error = EINVAL;
1610 #endif
1611 	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1612 #if CONFIG_MOUNT_VM
1613 		struct mount *origin_mp = (struct mount*)fsmountargs;
1614 		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1615 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1616 		if (error) {
1617 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1618 		} else {
1619 			/* Mark volume associated with system volume and a swap mount */
1620 			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1621 			/* Attempt to acquire the mnt_devvp and set it up */
1622 			struct vnode *mp_devvp = NULL;
1623 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1624 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1625 				    0, &mp_devvp, vfs_context_kernel());
1626 				if (!lerr) {
1627 					mp->mnt_devvp = mp_devvp;
1628 					//vnode_lookup took an iocount, need to drop it.
1629 					vnode_put(mp_devvp);
1630 
1631 					// now set `device_vnode` to the devvp that was acquired.
1632 					// note that though the iocount above was dropped, the mount acquires
1633 					// an implicit reference against the device.
1634 					device_vnode = mp_devvp;
1635 				}
1636 			}
1637 		}
1638 #else
1639 		error = EINVAL;
1640 #endif
1641 	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1642 #if CONFIG_MOUNT_PREBOOTRECOVERY
1643 		struct mount *origin_mp = (struct mount*)fsmountargs;
1644 		uint32_t mount_role = 0;
1645 		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1646 			mount_role = VFS_PREBOOT_ROLE;
1647 		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1648 			mount_role = VFS_RECOVERY_ROLE;
1649 		}
1650 
1651 		if (mount_role != 0) {
1652 			fs_role_mount_args_t frma = {origin_mp, mount_role};
1653 			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1654 			if (error) {
1655 				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1656 			} else {
1657 				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1658 				/* Mark volume associated with system volume */
1659 				//mp->mnt_kern_flag |= MNTK_SYSTEM;
1660 				/* Attempt to acquire the mnt_devvp and set it up */
1661 				struct vnode *mp_devvp = NULL;
1662 				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1663 					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1664 					    0, &mp_devvp, vfs_context_kernel());
1665 					if (!lerr) {
1666 						mp->mnt_devvp = mp_devvp;
1667 						//vnode_lookup took an iocount, need to drop it.
1668 						vnode_put(mp_devvp);
1669 
1670 						// now set `device_vnode` to the devvp that was acquired.
1671 						// note that though the iocount above was dropped, the mount acquires
1672 						// an implicit reference against the device.
1673 						device_vnode = mp_devvp;
1674 					}
1675 				}
1676 			}
1677 		} else {
1678 			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1679 			error = EINVAL;
1680 		}
1681 #else
1682 		error = EINVAL;
1683 #endif
1684 	} else {
1685 		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1686 	}
1687 
1688 	if (flags & MNT_UPDATE) {
1689 		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1690 			mp->mnt_flag &= ~MNT_RDONLY;
1691 		}
1692 		mp->mnt_flag &= ~
1693 		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1694 		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1695 		if (error) {
1696 			mp->mnt_flag = flag;  /* restore flag value */
1697 		}
1698 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1699 		lck_rw_done(&mp->mnt_rwlock);
1700 		is_rwlock_locked = FALSE;
1701 		if (!error) {
1702 			enablequotas(mp, ctx);
1703 		}
1704 		goto exit;
1705 	}
1706 
1707 	/*
1708 	 * Put the new filesystem on the mount list after root.
1709 	 */
1710 	if (error == 0) {
1711 		struct vfs_attr vfsattr;
1712 		if (device_vnode) {
1713 			/*
1714 			 *   cache the IO attributes for the underlying physical media...
1715 			 *   an error return indicates the underlying driver doesn't
1716 			 *   support all the queries necessary... however, reasonable
1717 			 *   defaults will have been set, so no reason to bail or care
1718 			 *
1719 			 *   Need to do this before calling the MAC hook as it needs
1720 			 *   information from this call.
1721 			 */
1722 			vfs_init_io_attributes(device_vnode, mp);
1723 		}
1724 
1725 #if CONFIG_MACF
1726 		error = mac_mount_check_mount_late(ctx, mp);
1727 		if (error != 0) {
1728 			goto out4;
1729 		}
1730 
1731 		if (vfs_flags(mp) & MNT_MULTILABEL) {
1732 			error = VFS_ROOT(mp, &rvp, ctx);
1733 			if (error) {
1734 				printf("%s() VFS_ROOT returned %d\n", __func__, error);
1735 				goto out4;
1736 			}
1737 			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1738 			/*
1739 			 * drop reference provided by VFS_ROOT
1740 			 */
1741 			vnode_put(rvp);
1742 
1743 			if (error) {
1744 				goto out4;
1745 			}
1746 		}
1747 #endif  /* MAC */
1748 
1749 		vnode_lock_spin(vp);
1750 		CLR(vp->v_flag, VMOUNT);
1751 		vp->v_mountedhere = mp;
1752 		SET(vp->v_flag, VMOUNTEDHERE);
1753 
1754 		/*
1755 		 * Wakeup any waiter(s) in prepare_coveredvp() that is waiting for the
1756 		 * 'v_mountedhere' to be planted.
1757 		 */
1758 		wakeup(&vp->v_flag);
1759 		vnode_unlock(vp);
1760 
1761 		/*
1762 		 * taking the name_cache_lock exclusively will
1763 		 * insure that everyone is out of the fast path who
1764 		 * might be trying to use a now stale copy of
1765 		 * vp->v_mountedhere->mnt_realrootvp
1766 		 * bumping mount_generation causes the cached values
1767 		 * to be invalidated
1768 		 */
1769 		name_cache_lock();
1770 		mount_generation++;
1771 		name_cache_unlock();
1772 
1773 		error = vnode_ref(vp);
1774 		if (error != 0) {
1775 			goto out4;
1776 		}
1777 
1778 		have_usecount = TRUE;
1779 
1780 		error = checkdirs(vp, ctx);
1781 		if (error != 0) {
1782 			/* Unmount the filesystem as cdir/rdirs cannot be updated */
1783 			goto out4;
1784 		}
1785 		/*
1786 		 * there is no cleanup code here so I have made it void
1787 		 * we need to revisit this
1788 		 */
1789 		(void)VFS_START(mp, 0, ctx);
1790 
1791 		if (mount_list_add(mp) != 0) {
1792 			/*
1793 			 * The system is shutting down trying to umount
1794 			 * everything, so fail with a plausible errno.
1795 			 */
1796 			error = EBUSY;
1797 			goto out4;
1798 		}
1799 		lck_rw_done(&mp->mnt_rwlock);
1800 		is_rwlock_locked = FALSE;
1801 
1802 		/* Check if this mounted file system supports EAs or named streams. */
1803 		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1804 		VFSATTR_INIT(&vfsattr);
1805 		VFSATTR_WANTED(&vfsattr, f_capabilities);
1806 		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1807 		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1808 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1809 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1810 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1811 				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1812 			}
1813 #if NAMEDSTREAMS
1814 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1815 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1816 				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1817 			}
1818 #endif
1819 			/* Check if this file system supports path from id lookups. */
1820 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1821 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1822 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1823 			} else if (mp->mnt_flag & MNT_DOVOLFS) {
1824 				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1825 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1826 			}
1827 
1828 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1829 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1830 				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1831 			}
1832 		}
1833 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1834 			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1835 		}
1836 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1837 			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1838 		}
1839 		/* increment the operations count */
1840 		OSAddAtomic(1, &vfs_nummntops);
1841 		enablequotas(mp, ctx);
1842 
1843 		if (device_vnode) {
1844 			vfs_setmountedon(device_vnode);
1845 		}
1846 
1847 		/* Now that mount is setup, notify the listeners */
1848 		vfs_notify_mount(pvp);
1849 		IOBSDMountChange(mp, kIOMountChangeMount);
1850 	} else {
1851 		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1852 		if (mp->mnt_vnodelist.tqh_first != NULL) {
1853 			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1854 			    mp->mnt_vtable->vfc_name, error);
1855 		}
1856 
1857 		vnode_lock_spin(vp);
1858 		CLR(vp->v_flag, VMOUNT);
1859 		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
1860 		wakeup(&vp->v_flag);
1861 		vnode_unlock(vp);
1862 		mount_list_lock();
1863 		mp->mnt_vtable->vfc_refcount--;
1864 		mount_list_unlock();
1865 
1866 		if (device_vnode) {
1867 			vnode_rele(device_vnode);
1868 			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1869 			vfs_clearmounting(device_vnode);
1870 		}
1871 		lck_rw_done(&mp->mnt_rwlock);
1872 		is_rwlock_locked = FALSE;
1873 
1874 		if (nc_smr_enabled) {
1875 			vfs_smr_synchronize();
1876 		}
1877 
1878 		/*
1879 		 * if we get here, we have a mount structure that needs to be freed,
1880 		 * but since the coveredvp hasn't yet been updated to point at it,
1881 		 * no need to worry about other threads holding a crossref on this mp
1882 		 * so it's ok to just free it
1883 		 */
1884 		mount_lock_destroy(mp);
1885 #if CONFIG_MACF
1886 		mac_mount_label_destroy(mp);
1887 #endif
1888 		zfree(mount_zone, mp);
1889 		did_set_lmount = false;
1890 	}
1891 exit:
1892 	/*
1893 	 * drop I/O count on the device vp if there was one
1894 	 */
1895 	if (devpath && devvp) {
1896 		vnode_put(devvp);
1897 	}
1898 
1899 	if (did_set_lmount) {
1900 		mount_lock_spin(mp);
1901 		mp->mnt_lflag &= ~MNT_LMOUNT;
1902 		mount_unlock(mp);
1903 	}
1904 
1905 	return error;
1906 
1907 /* Error condition exits */
1908 out4:
1909 	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1910 
1911 	/*
1912 	 * If the mount has been placed on the covered vp,
1913 	 * it may have been discovered by now, so we have
1914 	 * to treat this just like an unmount
1915 	 */
1916 	mount_lock_spin(mp);
1917 	mp->mnt_lflag |= MNT_LDEAD;
1918 	mount_unlock(mp);
1919 
1920 	if (device_vnode != NULLVP) {
1921 		vnode_rele(device_vnode);
1922 		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1923 		    ctx);
1924 		vfs_clearmounting(device_vnode);
1925 		did_rele = TRUE;
1926 	}
1927 
1928 	vnode_lock_spin(vp);
1929 
1930 	mp->mnt_crossref++;
1931 	CLR(vp->v_flag, VMOUNTEDHERE);
1932 	vp->v_mountedhere = (mount_t) 0;
1933 
1934 	vnode_unlock(vp);
1935 
1936 	if (have_usecount) {
1937 		vnode_rele(vp);
1938 	}
1939 out3:
1940 	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1941 		vnode_rele(devvp);
1942 		vfs_clearmounting(devvp);
1943 	}
1944 out2:
1945 	if (devpath && devvp) {
1946 		vnode_put(devvp);
1947 	}
1948 out1:
1949 	/* Release mnt_rwlock only when it was taken */
1950 	if (is_rwlock_locked == TRUE) {
1951 		if (flag_set) {
1952 			mp->mnt_flag = flag;  /* restore mnt_flag value */
1953 		}
1954 		lck_rw_done(&mp->mnt_rwlock);
1955 	}
1956 
1957 	if (did_set_lmount) {
1958 		mount_lock_spin(mp);
1959 		mp->mnt_lflag &= ~MNT_LMOUNT;
1960 		mount_unlock(mp);
1961 	}
1962 
1963 	if (did_set_vmount) {
1964 		vnode_lock_spin(vp);
1965 		CLR(vp->v_flag, VMOUNT);
1966 		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
1967 		wakeup(&vp->v_flag);
1968 		vnode_unlock(vp);
1969 	}
1970 
1971 	if (mntalloc) {
1972 		if (mp->mnt_crossref) {
1973 			mount_dropcrossref(mp, vp, 0);
1974 		} else {
1975 			if (nc_smr_enabled) {
1976 				vfs_smr_synchronize();
1977 			}
1978 
1979 			mount_lock_destroy(mp);
1980 #if CONFIG_MACF
1981 			mac_mount_label_destroy(mp);
1982 #endif
1983 			zfree(mount_zone, mp);
1984 		}
1985 	}
1986 	if (vfsp_ref) {
1987 		mount_list_lock();
1988 		vfsp->vfc_refcount--;
1989 		mount_list_unlock();
1990 	}
1991 
1992 	return error;
1993 }
1994 
1995 /*
1996  * Flush in-core data, check for competing mount attempts,
1997  * and set VMOUNT
1998  */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	/* Caller-supplied behavior flags (see KERNEL_MOUNT_* definitions). */
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_kmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Flush dirty data on the covered vnode before it disappears behind the mount. */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	/* Invalidate any cached buffers associated with the covered vnode. */
	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Only directories may be covered by a mount. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	vnode_lock_spin(vp);

	if (is_fmount && (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL))) {
		/* fmount() path: fail immediately on any competing mount. */
		error = EBUSY;
	} else if (!is_kmount && (ISSET(vp->v_flag, VMOUNT) ||
	    (vp->v_mountedhere != NULL))) {
		/*
		 * For mount triggered from mount() call, we want to wait for the
		 * current in-progress mount to complete, redo lookup and retry the
		 * mount again. Similarly, we also want to retry if we lost the race
		 * due to concurrent mounts and the 'VMOUNT' flag has been cleared and
		 * 'v_mountedhere' has been planted after initial lookup.
		 */
		if (ISSET(vp->v_flag, VMOUNT)) {
			/* Convert from spin-lock before sleeping on the flag word. */
			vnode_lock_convert(vp);
			msleep(&vp->v_flag, &vp->v_lock, PVFS, "vnode_waitformount", NULL);
		}
		error = EBUSY;
	} else if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
		/* Kernel-initiated mount path: busy only if both indicators are set. */
		error = EBUSY;
	}

	if (error) {
		vnode_unlock(vp);
		goto out;
	}
	/* Claim the vnode: VMOUNT marks a mount-in-progress on this cover point. */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		/* MAC denied the mount: undo the VMOUNT claim taken above. */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
2083 
2084 #if CONFIG_IMGSRC_ACCESS
2085 
2086 #define DEBUG_IMGSRC 0
2087 
2088 #if DEBUG_IMGSRC
2089 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
2090 #else
2091 #define IMGSRC_DEBUG(args...) do { } while(0)
2092 #endif
2093 
2094 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)2095 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
2096 {
2097 	struct nameidata nd;
2098 	vnode_t vp, realdevvp;
2099 	kauth_action_t accessmode;
2100 	int error;
2101 	enum uio_seg uio = UIO_USERSPACE;
2102 
2103 	if (ctx == vfs_context_kernel()) {
2104 		uio = UIO_SYSSPACE;
2105 	}
2106 
2107 	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
2108 	if ((error = namei(&nd))) {
2109 		IMGSRC_DEBUG("namei() failed with %d\n", error);
2110 		return error;
2111 	}
2112 
2113 	vp = nd.ni_vp;
2114 
2115 	if (!vnode_isblk(vp)) {
2116 		IMGSRC_DEBUG("Not block device.\n");
2117 		error = ENOTBLK;
2118 		goto out;
2119 	}
2120 
2121 	realdevvp = mp->mnt_devvp;
2122 	if (realdevvp == NULLVP) {
2123 		IMGSRC_DEBUG("No device backs the mount.\n");
2124 		error = ENXIO;
2125 		goto out;
2126 	}
2127 
2128 	error = vnode_getwithref(realdevvp);
2129 	if (error != 0) {
2130 		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
2131 		goto out;
2132 	}
2133 
2134 	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
2135 		IMGSRC_DEBUG("Wrong dev_t.\n");
2136 		error = ENXIO;
2137 		goto out1;
2138 	}
2139 
2140 	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
2141 
2142 	/*
2143 	 * If mount by non-root, then verify that user has necessary
2144 	 * permissions on the device.
2145 	 */
2146 	if (!vfs_context_issuser(ctx)) {
2147 		accessmode = KAUTH_VNODE_READ_DATA;
2148 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2149 			accessmode |= KAUTH_VNODE_WRITE_DATA;
2150 		}
2151 		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2152 			IMGSRC_DEBUG("Access denied.\n");
2153 			goto out1;
2154 		}
2155 	}
2156 
2157 	*devvpp = vp;
2158 
2159 out1:
2160 	vnode_put(realdevvp);
2161 
2162 out:
2163 	nameidone(&nd);
2164 
2165 	if (error) {
2166 		vnode_put(vp);
2167 	}
2168 
2169 	return error;
2170 }
2171 
2172 /*
2173  * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2174  * and call checkdirs()
2175  */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Atomically swap the "mount in progress" marker for the real mount. */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	SET(vp->v_flag, VMOUNTEDHERE);
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Hold a usecount on the covered vnode for the lifetime of the mount. */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		/* Failure: detach the mount from the covered vnode again. */
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2224 
/*
 * Reverse the effects of place_mount_and_checkdirs(): drop the usecount
 * taken on the covered vnode, clear the mount markers and v_mountedhere,
 * wake any threads waiting on the mount-in-progress flag, and detach the
 * mount from its covered vnode.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2238 
2239 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2240 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2241 {
2242 	int error;
2243 
2244 	/* unmount in progress return error */
2245 	mount_lock_spin(mp);
2246 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2247 		mount_unlock(mp);
2248 		return EBUSY;
2249 	}
2250 	mount_unlock(mp);
2251 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2252 
2253 	/*
2254 	 * We only allow the filesystem to be reloaded if it
2255 	 * is currently mounted read-only.
2256 	 */
2257 	if ((flags & MNT_RELOAD) &&
2258 	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2259 		error = ENOTSUP;
2260 		goto out;
2261 	}
2262 
2263 	/*
2264 	 * Only root, or the user that did the original mount is
2265 	 * permitted to update it.
2266 	 */
2267 	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2268 	    (!vfs_context_issuser(ctx))) {
2269 		error = EPERM;
2270 		goto out;
2271 	}
2272 #if CONFIG_MACF
2273 	error = mac_mount_check_remount(ctx, mp, flags);
2274 	if (error != 0) {
2275 		goto out;
2276 	}
2277 #endif
2278 
2279 out:
2280 	if (error) {
2281 		lck_rw_done(&mp->mnt_rwlock);
2282 	}
2283 
2284 	return error;
2285 }
2286 
/*
 * Finish an update begun with mount_begin_update() by releasing the
 * exclusively-held mount rwlock.
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2292 
2293 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2294 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2295 {
2296 	vnode_t vp;
2297 
2298 	if (height >= MAX_IMAGEBOOT_NESTING) {
2299 		return EINVAL;
2300 	}
2301 
2302 	vp = imgsrc_rootvnodes[height];
2303 	if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2304 		*rvpp = vp;
2305 		return 0;
2306 	} else {
2307 		return ENOENT;
2308 	}
2309 }
2310 
/*
 * Move an imageboot source filesystem (selected by nesting 'height' when
 * by_index is set, otherwise level 0 for binary compatibility) so that it
 * is mounted on 'vp' instead of its original location. Root-only. On
 * success the mount is re-placed, added to the mount list, marked
 * MNTK_HAS_MOVED (a one-shot operation), and listeners are notified via
 * vfs_notify_mount(pvp). Errors unwind through the out3/out2/out1/out0
 * labels in reverse order of what has been accomplished.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		/* Copy in a mnt_imgsrc_args struct sized for the caller's ABI. */
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flag bits are currently defined for this operation. */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* Takes an iocount on rvp that is dropped at out0/success. */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once.  Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the validation; drop the returned iocount. */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Preserve the old mount-on name so out3 can restore it on failure. */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	/* Roll back the mount-on name and the one-shot MNTK_HAS_MOVED flag. */
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2533 
2534 #endif /* CONFIG_IMGSRC_ACCESS */
2535 
/*
 * Enable disk quotas on a freshly mounted HFS filesystem when the
 * per-type quota trigger files exist. Best-effort: all failures are
 * ignored so they cannot interfere with the mount itself.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Probe for the per-type quota "ops" trigger file at the mount root. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue;           /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Trigger file found; turn quotas on using the actual quota file. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2569 
2570 
/*
 * Per-process callback for checkdirs(): if the process's current or root
 * directory is the just-covered vnode (cdrp->olddp), retarget it to the
 * new mount's root (cdrp->newdp), transferring vnode references
 * accordingly. Always returns PROC_RETURNED so iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* new_* start as newdp and are NULLed when consumed; old_* collect drops. */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2650 
2651 
2652 
2653 /*
2654  * Scan all active processes to see if any of them have a current
2655  * or root directory onto which the new filesystem has just been
2656  * mounted. If so, replace them with the new mount point.
2657  */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* A lone usecount means nobody holds olddp as a cdir/rdir; nothing to do. */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the root of the filesystem now covering olddp (iocount held). */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	if (rootvnode == olddp) {
		/* The system root itself was covered; swap it under the rw lock. */
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2695 
2696 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT        \
2697 	"com.apple.private.vfs.role-account-unmount"
2698 
2699 /*
2700  * Unmount a file system.
2701  *
2702  * Note: unmount takes a path to the vnode mounted on as argument,
2703  * not special file (as before).
2704  */
2705 /* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int flags = uap->flags;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		/* Caller asked us not to traverse symlinks anywhere in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/* Take a mount ref before dropping the iocount from namei(). */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, flags, ctx);
}
2757 
2758 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2759 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2760 {
2761 	mount_t mp;
2762 
2763 	mp = mount_list_lookupby_fsid(fsid, 0, 1);
2764 	if (mp == (mount_t)0) {
2765 		return ENOENT;
2766 	}
2767 	mount_ref(mp, 0);
2768 	mount_iterdrop(mp);
2769 	/* safedounmount consumes the mount ref */
2770 	return safedounmount(mp, flags, ctx);
2771 }
2772 
2773 /*
2774  * The mount struct comes with a mount ref which will be consumed.
2775  * Do the actual file system unmount, prevent some common foot shooting.
2776  */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_lflag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* dounmount() takes over ownership of the mount ref. */
	return dounmount(mp, flags, 1, ctx);

out:
	/* Policy check failed: drop the caller's mount ref here. */
	mount_drop(mp, 0);
	return error;
}
2840 
2841 /*
2842  * Do the actual file system unmount.
2843  */
2844 int
dounmount(struct mount * mp,int flags,int withref,vfs_context_t ctx)2845 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
2846 {
2847 	vnode_t coveredvp = (vnode_t)0;
2848 	int error;
2849 	int needwakeup = 0;
2850 	int forcedunmount = 0;
2851 	int lflags = 0;
2852 	struct vnode *devvp = NULLVP;
2853 #if CONFIG_TRIGGERS
2854 	proc_t p = vfs_context_proc(ctx);
2855 	int did_vflush = 0;
2856 	int pflags_save = 0;
2857 #endif /* CONFIG_TRIGGERS */
2858 
2859 #if CONFIG_FSE
2860 	if (!(flags & MNT_FORCE)) {
2861 		fsevent_unmount(mp, ctx);  /* has to come first! */
2862 	}
2863 #endif
2864 
2865 	mount_lock(mp);
2866 
2867 	/*
2868 	 * If already an unmount in progress just return EBUSY.
2869 	 * Even a forced unmount cannot override.
2870 	 */
2871 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2872 		if (withref != 0) {
2873 			mount_drop(mp, 1);
2874 		}
2875 		mount_unlock(mp);
2876 		return EBUSY;
2877 	}
2878 
2879 	if (flags & MNT_FORCE) {
2880 		forcedunmount = 1;
2881 		mp->mnt_lflag |= MNT_LFORCE;
2882 	}
2883 
2884 #if CONFIG_TRIGGERS
2885 	if (flags & MNT_NOBLOCK && p != kernproc) {
2886 		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
2887 	}
2888 #endif
2889 
2890 	mp->mnt_kern_flag |= MNTK_UNMOUNT;
2891 	mp->mnt_lflag |= MNT_LUNMOUNT;
2892 	mp->mnt_flag &= ~MNT_ASYNC;
2893 	/*
2894 	 * anyone currently in the fast path that
2895 	 * trips over the cached rootvp will be
2896 	 * dumped out and forced into the slow path
2897 	 * to regenerate a new cached value
2898 	 */
2899 	mp->mnt_realrootvp = NULLVP;
2900 	mount_unlock(mp);
2901 
2902 	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2903 		/*
2904 		 * Force unmount any mounts in this filesystem.
2905 		 * If any unmounts fail - just leave them dangling.
2906 		 * Avoids recursion.
2907 		 */
2908 		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2909 	}
2910 
2911 	/*
2912 	 * taking the name_cache_lock exclusively will
2913 	 * insure that everyone is out of the fast path who
2914 	 * might be trying to use a now stale copy of
2915 	 * vp->v_mountedhere->mnt_realrootvp
2916 	 * bumping mount_generation causes the cached values
2917 	 * to be invalidated
2918 	 */
2919 	name_cache_lock();
2920 	mount_generation++;
2921 	name_cache_unlock();
2922 
2923 
2924 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2925 	if (withref != 0) {
2926 		mount_drop(mp, 0);
2927 	}
2928 	error = 0;
2929 	if (forcedunmount == 0) {
2930 		ubc_umount(mp); /* release cached vnodes */
2931 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2932 			error = VFS_SYNC(mp, MNT_WAIT, ctx);
2933 			if (error) {
2934 				mount_lock(mp);
2935 				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2936 				mp->mnt_lflag &= ~MNT_LUNMOUNT;
2937 				mp->mnt_lflag &= ~MNT_LFORCE;
2938 				goto out;
2939 			}
2940 		}
2941 	}
2942 
2943 	IOBSDMountChange(mp, kIOMountChangeUnmount);
2944 
2945 #if CONFIG_TRIGGERS
2946 	vfs_nested_trigger_unmounts(mp, flags, ctx);
2947 	did_vflush = 1;
2948 #endif
2949 	if (forcedunmount) {
2950 		lflags |= FORCECLOSE;
2951 	}
2952 	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
2953 	if ((forcedunmount == 0) && error) {
2954 		mount_lock(mp);
2955 		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2956 		mp->mnt_lflag &= ~MNT_LUNMOUNT;
2957 		mp->mnt_lflag &= ~MNT_LFORCE;
2958 		goto out;
2959 	}
2960 
2961 	/* make sure there are no one in the mount iterations or lookup */
2962 	mount_iterdrain(mp);
2963 
2964 	error = VFS_UNMOUNT(mp, flags, ctx);
2965 	if (error) {
2966 		mount_iterreset(mp);
2967 		mount_lock(mp);
2968 		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2969 		mp->mnt_lflag &= ~MNT_LUNMOUNT;
2970 		mp->mnt_lflag &= ~MNT_LFORCE;
2971 		goto out;
2972 	}
2973 
2974 	/* increment the operations count */
2975 	if (!error) {
2976 		OSAddAtomic(1, &vfs_nummntops);
2977 	}
2978 
2979 	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2980 		/* hold an io reference and drop the usecount before close */
2981 		devvp = mp->mnt_devvp;
2982 		vnode_getalways(devvp);
2983 		vnode_rele(devvp);
2984 		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
2985 		    ctx);
2986 		vnode_clearmountedon(devvp);
2987 		vnode_put(devvp);
2988 	}
2989 	lck_rw_done(&mp->mnt_rwlock);
2990 	mount_list_remove(mp);
2991 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2992 
2993 	/* mark the mount point hook in the vp but not drop the ref yet */
2994 	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2995 		/*
2996 		 * The covered vnode needs special handling. Trying to get an
2997 		 * iocount must not block here as this may lead to deadlocks
2998 		 * if the Filesystem to which the covered vnode belongs is
2999 		 * undergoing forced unmounts. Since we hold a usecount, the
3000 		 * vnode cannot be reused (it can, however, still be terminated)
3001 		 */
3002 		vnode_getalways(coveredvp);
3003 		vnode_lock_spin(coveredvp);
3004 
3005 		mp->mnt_crossref++;
3006 		coveredvp->v_mountedhere = (struct mount *)0;
3007 		CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
3008 		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
3009 		wakeup(&coveredvp->v_flag);
3010 		vnode_unlock(coveredvp);
3011 		vnode_put(coveredvp);
3012 	}
3013 
3014 	mount_list_lock();
3015 	mp->mnt_vtable->vfc_refcount--;
3016 	mount_list_unlock();
3017 
3018 	cache_purgevfs(mp);     /* remove cache entries for this file sys */
3019 	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
3020 	mount_lock(mp);
3021 	mp->mnt_lflag |= MNT_LDEAD;
3022 
3023 	if (mp->mnt_lflag & MNT_LWAIT) {
3024 		/*
3025 		 * do the wakeup here
3026 		 * in case we block in mount_refdrain
3027 		 * which will drop the mount lock
3028 		 * and allow anyone blocked in vfs_busy
3029 		 * to wakeup and see the LDEAD state
3030 		 */
3031 		mp->mnt_lflag &= ~MNT_LWAIT;
3032 		wakeup((caddr_t)mp);
3033 	}
3034 	mount_refdrain(mp);
3035 
3036 	/* free disk_conditioner_info structure for this mount */
3037 	disk_conditioner_unmount(mp);
3038 
3039 out:
3040 	if (mp->mnt_lflag & MNT_LWAIT) {
3041 		mp->mnt_lflag &= ~MNT_LWAIT;
3042 		needwakeup = 1;
3043 	}
3044 
3045 #if CONFIG_TRIGGERS
3046 	if (flags & MNT_NOBLOCK && p != kernproc) {
3047 		// Restore P_NOREMOTEHANG bit to its previous value
3048 		if ((pflags_save & P_NOREMOTEHANG) == 0) {
3049 			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
3050 		}
3051 	}
3052 
3053 	/*
3054 	 * Callback and context are set together under the mount lock, and
3055 	 * never cleared, so we're safe to examine them here, drop the lock,
3056 	 * and call out.
3057 	 */
3058 	if (mp->mnt_triggercallback != NULL) {
3059 		mount_unlock(mp);
3060 		if (error == 0) {
3061 			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
3062 		} else if (did_vflush) {
3063 			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
3064 		}
3065 	} else {
3066 		mount_unlock(mp);
3067 	}
3068 #else
3069 	mount_unlock(mp);
3070 #endif /* CONFIG_TRIGGERS */
3071 
3072 	lck_rw_done(&mp->mnt_rwlock);
3073 
3074 	if (needwakeup) {
3075 		wakeup((caddr_t)mp);
3076 	}
3077 
3078 	if (!error) {
3079 		if ((coveredvp != NULLVP)) {
3080 			vnode_t pvp = NULLVP;
3081 
3082 			/*
3083 			 * The covered vnode needs special handling. Trying to
3084 			 * get an iocount must not block here as this may lead
3085 			 * to deadlocks if the Filesystem to which the covered
3086 			 * vnode belongs is undergoing forced unmounts. Since we
3087 			 * hold a usecount, the  vnode cannot be reused
3088 			 * (it can, however, still be terminated).
3089 			 */
3090 			vnode_getalways(coveredvp);
3091 
3092 			mount_dropcrossref(mp, coveredvp, 0);
3093 			/*
3094 			 * We'll _try_ to detect if this really needs to be
3095 			 * done. The coveredvp can only be in termination (or
3096 			 * terminated) if the coveredvp's mount point is in a
3097 			 * forced unmount (or has been) since we still hold the
3098 			 * ref.
3099 			 */
3100 			if (!vnode_isrecycled(coveredvp)) {
3101 				pvp = vnode_getparent(coveredvp);
3102 #if CONFIG_TRIGGERS
3103 				if (coveredvp->v_resolve) {
3104 					vnode_trigger_rearm(coveredvp, ctx);
3105 				}
3106 #endif
3107 			}
3108 
3109 			vnode_rele(coveredvp);
3110 			vnode_put(coveredvp);
3111 			coveredvp = NULLVP;
3112 
3113 			if (pvp) {
3114 				lock_vnode_and_post(pvp, NOTE_WRITE);
3115 				vnode_put(pvp);
3116 			}
3117 		} else if (mp->mnt_flag & MNT_ROOTFS) {
3118 			if (nc_smr_enabled) {
3119 				vfs_smr_synchronize();
3120 			}
3121 
3122 			mount_lock_destroy(mp);
3123 #if CONFIG_MACF
3124 			mac_mount_label_destroy(mp);
3125 #endif
3126 			zfree(mount_zone, mp);
3127 		} else {
3128 			panic("dounmount: no coveredvp");
3129 		}
3130 	}
3131 	return error;
3132 }
3133 
3134 /*
3135  * Unmount any mounts in this filesystem.
3136  */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;	/* m indexes the last fsid recorded in fsids[] */
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	/* Count every mount so the array is big enough in the worst case. */
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/*
	 * Z_NOWAIT because the mount list lock is held; on allocation
	 * failure just skip unmounting submounts (best effort).
	 */
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				/*
				 * Covered vnode lives on a mount already slated
				 * for unmount, so smp is a (transitive)
				 * submount of mp -- record it too.
				 */
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			/* withref=1: dounmount() drops the reference taken above. */
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
3194 
/*
 * Drop one cross reference held on mp for the covered vnode dp (taken in
 * dounmount() when the mount was detached from dp).  When the last cross
 * reference goes away and dp no longer points at mp, destroy the mount.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	/* Hold dp so it cannot be freed while we hold its lock. */
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/*
	 * Last cross reference gone and dp is no longer covered by mp:
	 * the mount is fully detached, so it is safe to tear it down.
	 */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		/* Wait out SMR readers before freeing the mount structure. */
		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3228 
3229 
3230 /*
3231  * Sync each mounted filesystem.
3232  */
#if DIAGNOSTIC
int syncprt = 0;	/* when set, sync paths also dump buffer statistics (vfs_bufstats) */
#endif

int print_vmpage_stat = 0;	/* when set, sync paths dump dirty VM page counts */
3238 
3239 /*
3240  * sync_callback:	simple wrapper that calls VFS_SYNC() on volumes
3241  *			mounted read-write with the passed waitfor value.
3242  *
3243  * Parameters:	mp	mount-point descriptor per mounted file-system instance.
3244  *		arg	user argument (please see below)
3245  *
3246  * User argument is a pointer to 32 bit unsigned integer which describes the
3247  * type of waitfor value to set for calling VFS_SYNC().  If user argument is
3248  * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3249  * waitfor value.
3250  *
3251  * Returns:		VFS_RETURNED
3252  */
3253 static int
sync_callback(mount_t mp,void * arg)3254 sync_callback(mount_t mp, void *arg)
3255 {
3256 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3257 		int asyncflag = mp->mnt_flag & MNT_ASYNC;
3258 		unsigned waitfor = MNT_NOWAIT;
3259 
3260 		if (arg) {
3261 			waitfor = *(uint32_t*)arg;
3262 		}
3263 
3264 		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
3265 		if (waitfor != MNT_WAIT &&
3266 		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
3267 		    waitfor != MNT_NOWAIT &&
3268 		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3269 		    waitfor != MNT_DWAIT &&
3270 		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3271 			panic("Passed inappropriate waitfor %u to "
3272 			    "sync_callback()", waitfor);
3273 		}
3274 
3275 		mp->mnt_flag &= ~MNT_ASYNC;
3276 		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3277 		if (asyncflag) {
3278 			mp->mnt_flag |= MNT_ASYNC;
3279 		}
3280 	}
3281 
3282 	return VFS_RETURNED;
3283 }
3284 
3285 /* ARGSUSED */
3286 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)3287 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
3288 {
3289 	vfs_iterate(LK_NOWAIT, sync_callback, NULL);
3290 
3291 	if (print_vmpage_stat) {
3292 		vm_countdirtypages();
3293 	}
3294 
3295 #if DIAGNOSTIC
3296 	if (syncprt) {
3297 		vfs_bufstats();
3298 	}
3299 #endif /* DIAGNOSTIC */
3300 	return 0;
3301 }
3302 
/* Media selector for sync_internal_callback(). */
typedef enum {
	SYNC_ALL = 0,                   /* sync every mount */
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* only local, non-virtual-device mounts */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* only virtual-device and/or non-local mounts */
} sync_type_t;
3308 
3309 static int
sync_internal_callback(mount_t mp,void * arg)3310 sync_internal_callback(mount_t mp, void *arg)
3311 {
3312 	if (arg) {
3313 		int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3314 		    (mp->mnt_flag & MNT_LOCAL);
3315 		sync_type_t sync_type = *((sync_type_t *)arg);
3316 
3317 		if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3318 			return VFS_RETURNED;
3319 		} else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3320 			return VFS_RETURNED;
3321 		}
3322 	}
3323 
3324 	(void)sync_callback(mp, NULL);
3325 
3326 	return VFS_RETURNED;
3327 }
3328 
/* State shared with the sync worker thread; protected by sync_mtx_lck. */
int sync_thread_state = 0;
/* How long sync_internal() waits for the worker before giving up. */
int sync_timeout_seconds = 5;

#define SYNC_THREAD_RUN       0x0001	/* work requested; worker should (re)run */
#define SYNC_THREAD_RUNNING   0x0002	/* worker thread currently exists */

#if CONFIG_PHYS_WRITE_ACCT
/* Identifies the sync worker thread for physical-write accounting. */
thread_t pm_sync_thread;
#endif /* CONFIG_PHYS_WRITE_ACCT */
3338 
/*
 * Body of the worker thread started by sync_internal(): keep syncing as
 * long as new requests arrive, then announce its exit and terminate.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	/* Loop while sync_internal() keeps re-arming SYNC_THREAD_RUN. */
	while (sync_thread_state & SYNC_THREAD_RUN) {
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		/* Sync reliable (local, non-virtual) media first, then the rest. */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3382 
/* Last time a sync timeout was logged; rate-limits the message (see sync_internal). */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3384 
3385 /*
3386  * An in-kernel sync for power management to call.
3387  * This function always returns within sync_timeout seconds.
3388  */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	/* Bound the wait so the caller is never blocked indefinitely. */
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* Ask the worker to (re)run; spawn it if it is not already alive. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			/* Best effort: the failure is logged but not reported. */
			return 0;
		}
		thread_created = TRUE;
	}

	/* Sleep until the worker's wakeup or the timeout; PDROP releases the lock. */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* Rate-limit the timeout message to once every 120 seconds. */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
3431 
3432 /*
3433  * Change filesystem quotas.
3434  */
3435 #if QUOTA
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;          /* command-specific payload handed to VFS_QUOTACTL */
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	/* Resolve uap->path only to find its mount; the vnode itself is not kept. */
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	mp = nd.ni_vp->v_mount;
	/* Keep the mount alive after the vnode iocount is dropped. */
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		/* datap is freed in the second switch below, even on error. */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit userspace: copy in the wide layout, then narrow it. */
			struct user_dqblk       my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	/* Only call down to the filesystem if the copyin phase succeeded. */
	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Post-processing: free Q_QUOTAON's buffer and copy results out. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk       my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3542 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support was not compiled into this kernel (QUOTA is off). */
	return EOPNOTSUPP;
}
3548 #endif /* QUOTA */
3549 
3550 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3551 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3552 {
3553 	int error;
3554 	vfs_context_t ctx = vfs_context_current();
3555 
3556 #if CONFIG_MACF
3557 	error = mac_mount_check_stat(ctx, mp);
3558 	if (error != 0) {
3559 		return error;
3560 	}
3561 #endif
3562 
3563 	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3564 	if (error != 0) {
3565 		return error;
3566 	}
3567 
3568 	return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3569 }
3570 
3571 /*
3572  * Get filesystem statistics.
3573  *
3574  * Returns:	0			Success
3575  *	namei:???
3576  *	vfs_update_vfsstat:???
3577  *	munge_statfs:EFAULT
3578  */
3579 /* ARGSUSED */
3580 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3581 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3582 {
3583 	int error;
3584 	struct mount *mp;
3585 	struct nameidata nd;
3586 	vfs_context_t ctx = vfs_context_current();
3587 	vnode_t vp;
3588 
3589 	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3590 	    UIO_USERSPACE, uap->path, ctx);
3591 	error = namei(&nd);
3592 	if (error != 0) {
3593 		return error;
3594 	}
3595 	vp = nd.ni_vp;
3596 	mp = vp->v_mount;
3597 	nameidone(&nd);
3598 
3599 	error = statfs_internal(p, mp, uap->buf);
3600 	vnode_put(vp);
3601 
3602 	return error;
3603 }
3604 
3605 /*
3606  * Get filesystem statistics.
3607  */
3608 /* ARGSUSED */
3609 int
fstatfs(proc_t p,struct fstatfs_args * uap,__unused int32_t * retval)3610 fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3611 {
3612 	int error;
3613 	vnode_t vp = NULL;
3614 	struct mount *mp;
3615 
3616 	AUDIT_ARG(fd, uap->fd);
3617 
3618 	if ((error = file_vnode(uap->fd, &vp)) ||
3619 	    (error = vnode_getwithref(vp))) {
3620 		goto out;
3621 	}
3622 
3623 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3624 
3625 	mp = vp->v_mount;
3626 	if (!mp) {
3627 		error = EBADF;
3628 		goto out_vnode;
3629 	}
3630 
3631 	error = statfs_internal(p, mp, uap->buf);
3632 
3633 out_vnode:
3634 	vnode_put(vp);
3635 
3636 out:
3637 	if (vp != NULL) {
3638 		file_drop(uap->fd);
3639 	}
3640 
3641 	return error;
3642 }
3643 
3644 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3645 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3646 {
3647 	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3648 
3649 	bzero(sfs, sizeof(*sfs));
3650 
3651 	sfs->f_bsize = vsfs->f_bsize;
3652 	sfs->f_iosize = (int32_t)vsfs->f_iosize;
3653 	sfs->f_blocks = vsfs->f_blocks;
3654 	sfs->f_bfree = vsfs->f_bfree;
3655 	sfs->f_bavail = vsfs->f_bavail;
3656 	sfs->f_files = vsfs->f_files;
3657 	sfs->f_ffree = vsfs->f_ffree;
3658 	sfs->f_fsid = vsfs->f_fsid;
3659 	sfs->f_owner = vsfs->f_owner;
3660 	sfs->f_type = mp->mnt_vtable->vfc_typenum;
3661 	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3662 	sfs->f_fssubtype = vsfs->f_fssubtype;
3663 	sfs->f_flags_ext = vfs_getextflags(mp);
3664 	vfs_getfstypename(mp, sfs->f_fstypename, MFSTYPENAMELEN);
3665 	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3666 	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3667 }
3668 
3669 /*
3670  * Get file system statistics in 64-bit mode
3671  */
3672 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3673 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3674 {
3675 	struct mount *mp;
3676 	int error;
3677 	struct nameidata *ndp;
3678 	struct statfs64 *sfsp;
3679 	vfs_context_t ctxp = vfs_context_current();
3680 	vnode_t vp;
3681 	struct {
3682 		struct nameidata nd;
3683 		struct statfs64 sfs;
3684 	} *__nameidata_statfs64;
3685 
3686 	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3687 	    Z_WAITOK);
3688 	ndp = &__nameidata_statfs64->nd;
3689 
3690 	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3691 	    UIO_USERSPACE, uap->path, ctxp);
3692 	error = namei(ndp);
3693 	if (error != 0) {
3694 		goto out;
3695 	}
3696 	vp = ndp->ni_vp;
3697 	mp = vp->v_mount;
3698 	nameidone(ndp);
3699 
3700 #if CONFIG_MACF
3701 	error = mac_mount_check_stat(ctxp, mp);
3702 	if (error != 0) {
3703 		vnode_put(vp);
3704 		goto out;
3705 	}
3706 #endif
3707 
3708 	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3709 	if (error != 0) {
3710 		vnode_put(vp);
3711 		goto out;
3712 	}
3713 
3714 	sfsp = &__nameidata_statfs64->sfs;
3715 	vfs_get_statfs64(mp, sfsp);
3716 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3717 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3718 		/* This process does not want to see a seperate data volume mountpoint */
3719 		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3720 	}
3721 	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3722 	vnode_put(vp);
3723 
3724 out:
3725 	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3726 
3727 	return error;
3728 }
3729 
3730 /*
3731  * Get file system statistics in 64-bit mode
3732  */
3733 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3734 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3735 {
3736 	struct vnode *vp;
3737 	struct mount *mp;
3738 	struct statfs64 sfs;
3739 	int error;
3740 
3741 	AUDIT_ARG(fd, uap->fd);
3742 
3743 	if ((error = file_vnode(uap->fd, &vp))) {
3744 		return error;
3745 	}
3746 
3747 	error = vnode_getwithref(vp);
3748 	if (error) {
3749 		file_drop(uap->fd);
3750 		return error;
3751 	}
3752 
3753 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3754 
3755 	mp = vp->v_mount;
3756 	if (!mp) {
3757 		error = EBADF;
3758 		goto out;
3759 	}
3760 
3761 #if CONFIG_MACF
3762 	error = mac_mount_check_stat(vfs_context_current(), mp);
3763 	if (error != 0) {
3764 		goto out;
3765 	}
3766 #endif
3767 
3768 	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3769 		goto out;
3770 	}
3771 
3772 	vfs_get_statfs64(mp, &sfs);
3773 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3774 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3775 		/* This process does not want to see a seperate data volume mountpoint */
3776 		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3777 	}
3778 	error = copyout(&sfs, uap->buf, sizeof(sfs));
3779 
3780 out:
3781 	file_drop(uap->fd);
3782 	vnode_put(vp);
3783 
3784 	return error;
3785 }
3786 
/* Per-call state threaded through the getfsstat*() vfs_iterate callbacks. */
struct getfsstat_struct {
	user_addr_t     sfsp;       /* user buffer cursor: next statfs slot to fill */
	user_addr_t     *mp;        /* optional per-mount MAC label buffers, or NULL */
	int             count;      /* mounts visited so far (copied or not) */
	int             maxcount;   /* capacity of the user buffer, in entries */
	int             flags;      /* caller's MNT_WAIT/MNT_NOWAIT/MNT_DWAIT flags */
	int             error;      /* first error encountered, if any */
};
3795 
3796 
/*
 * vfs_iterate() callback for __mac_getfsstat(): copy one mount's statfs
 * (and optionally its MAC label) into the user buffers described by arg,
 * a struct getfsstat_struct.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	/* Only copy out while there is room left in the user buffer. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* Dead mount, or the refresh failed: skip it but keep iterating. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* munge_statfs() reports the bytes written; advance the cursor. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
			/* Caller also wants this mount's MAC label. */
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	/* Count every mount, copied or not, so callers learn the total. */
	fstp->count++;
	return VFS_RETURNED;
}
3850 
3851 /*
3852  * Get statistics on all filesystems.
3853  */
3854 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3855 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3856 {
3857 	struct __mac_getfsstat_args muap;
3858 
3859 	muap.buf = uap->buf;
3860 	muap.bufsize = uap->bufsize;
3861 	muap.mac = USER_ADDR_NULL;
3862 	muap.macsize = 0;
3863 	muap.flags = uap->flags;
3864 
3865 	return __mac_getfsstat(p, &muap, retval);
3866 }
3867 
3868 /*
3869  * __mac_getfsstat: Get MAC-related file system statistics
3870  *
3871  * Parameters:    p                        (ignored)
3872  *                uap                      User argument descriptor (see below)
3873  *                retval                   Count of file system statistics (N stats)
3874  *
3875  * Indirect:      uap->bufsize             Buffer size
3876  *                uap->macsize             MAC info size
3877  *                uap->buf                 Buffer where information will be returned
3878  *                uap->mac                 MAC info
3879  *                uap->flags               File system flags
3880  *
3881  *
3882  * Returns:        0                       Success
3883  *                !0                       Not success
3884  *
3885  */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject sizes that would overflow the int arithmetic below. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* How many statfs entries (sized per caller bitness) fit in the buffer. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* One label pointer (4 or 8 bytes each) is required per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	/* Walk every mount, including ones in the middle of unmounting. */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/*
	 * If the buffer was too small, report its capacity; otherwise
	 * report the total number of mounts seen.
	 */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3979 
/*
 * vfs_iterate() callback for getfsstat64(): copy one mount's statfs64
 * into the user buffer described by arg, a struct getfsstat_struct.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	/* Only copy out while there is room left in the user buffer. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		/* NOTE(review): sp is assigned but never read in this function. */
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			/* Dead mount, or the refresh failed: skip it but keep iterating. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += sizeof(sfs);
	}
	/* Count every mount, copied or not, so callers learn the total. */
	fstp->count++;
	return VFS_RETURNED;
}
4024 
4025 /*
4026  * Get statistics on all file systems in 64 bit mode.
4027  */
4028 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)4029 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
4030 {
4031 	user_addr_t sfsp;
4032 	int count, maxcount;
4033 	struct getfsstat_struct fst;
4034 
4035 	maxcount = uap->bufsize / sizeof(struct statfs64);
4036 
4037 	sfsp = uap->buf;
4038 	count = 0;
4039 
4040 	fst.sfsp = sfsp;
4041 	fst.flags = uap->flags;
4042 	fst.count = 0;
4043 	fst.error = 0;
4044 	fst.maxcount = maxcount;
4045 
4046 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
4047 
4048 	if (fst.error) {
4049 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
4050 		return fst.error;
4051 	}
4052 
4053 	if (fst.sfsp && fst.count > fst.maxcount) {
4054 		*retval = fst.maxcount;
4055 	} else {
4056 		*retval = fst.count;
4057 	}
4058 
4059 	return 0;
4060 }
4061 
4062 /*
4063  * gets the associated vnode with the file descriptor passed.
4064  * as input
4065  *
4066  * INPUT
4067  * ctx - vfs context of caller
4068  * fd - file descriptor for which vnode is required.
4069  * vpp - Pointer to pointer to vnode to be returned.
4070  *
4071  * The vnode is returned with an iocount so any vnode obtained
4072  * by this call needs a vnode_put
4073  *
4074  */
4075 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)4076 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
4077 {
4078 	int error;
4079 	vnode_t vp;
4080 	struct fileproc *fp;
4081 	proc_t p = vfs_context_proc(ctx);
4082 
4083 	*vpp =  NULLVP;
4084 
4085 	error = fp_getfvp(p, fd, &fp, &vp);
4086 	if (error) {
4087 		return error;
4088 	}
4089 
4090 	error = vnode_getwithref(vp);
4091 	if (error) {
4092 		(void)fp_drop(p, fd, fp, 0);
4093 		return error;
4094 	}
4095 
4096 	(void)fp_drop(p, fd, fp, 0);
4097 	*vpp = vp;
4098 	return error;
4099 }
4100 
4101 /*
4102  * Wrapper function around namei to start lookup from a directory
4103  * specified by a file descriptor ni_dirfd.
4104  *
4105  * In addition to all the errors returned by namei, this call can
4106  * return ENOTDIR if the file descriptor does not refer to a directory.
4107  * and EBADF if the file descriptor is not valid.
4108  */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only divert through the directory fd when the caller supplied a
	 * real fd (not AT_FDCWD), this is not a continued lookup, and the
	 * caller has not already provided a starting vnode via USEDVP.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		/* Relative path: resolve it against the directory at dirfd. */
		if (c != '/') {
			vnode_t dvp_at;

			/* Takes an iocount on dvp_at; released below. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/*
			 * Hand the starting directory to namei via USEDVP and
			 * clear the flag afterwards so the nameidata carries
			 * no stale starting point if reused.
			 */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path or AT_FDCWD: ordinary lookup. */
	return namei(ndp);
}
4152 
4153 /*
4154  * Change current working directory to a given file descriptor.
4155  */
4156 /* ARGSUSED */
int
fchdir(proc_t p, vfs_context_t ctx, int fd, bool per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;

	AUDIT_ARG(fd, fd);
	if (per_thread && fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding.  The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	/* Resolve the fd to its vnode and take an iocount on it. */
	if ((error = file_vnode(fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* Only a directory can become the CWD. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs search permission on the target directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If the directory is a mount point, descend to the root of the
	 * mounted file system (repeating for stacked mounts).  Each step
	 * swaps the held iocount from the covered vnode to the mount root.
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the transient iocount for a persistent usecount on the CWD. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		/* Install as this thread's CWD and flag the process. */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Install as the per-process CWD under the directory/fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous CWD's usecount, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(fd);

	return error;
}
4268 
int
sys_fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	/* fchdir(2): set the per-process CWD to the directory open at fd. */
	return fchdir(p, vfs_context_current(), uap->fd, false);
}
4274 
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	/*
	 * Per-thread fchdir; fd == -1 reverts the calling thread to the
	 * per-process CWD (see fchdir()).
	 */
	return fchdir(p, vfs_context_current(), uap->fd, true);
}
4280 
4281 
4282 /*
4283  * Change current working directory (".").
4284  *
4285  * Returns:	0			Success
4286  *	change_dir:ENOTDIR
4287  *	change_dir:???
4288  *	vnode_ref:ENOENT		No such file or directory
4289  */
4290 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* Look up the target; on success we hold an iocount on ndp->ni_vp. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Take a persistent usecount to back the CWD reference. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		/* Install as this thread's CWD and flag the process. */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Install as the per-process CWD under the directory/fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous CWD's usecount, if there was one. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4336 
4337 
4338 /*
4339  * Change current working directory (".").
4340  *
4341  * Returns:	0			Success
4342  *	chdir_internal:ENOTDIR
4343  *	chdir_internal:ENOENT		No such file or directory
4344  *	chdir_internal:???
4345  */
4346 /* ARGSUSED */
static int
common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
{
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/*
	 * Shared implementation for chdir(2) and __pthread_chdir();
	 * uap->path is a user-space path in both cases.
	 */
	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	return chdir_internal(p, ctx, &nd, per_thread);
}
4358 
4359 
4360 /*
4361  * chdir
4362  *
4363  * Change current working directory (".") for the entire process
4364  *
4365  * Parameters:  p       Process requesting the call
4366  *              uap     User argument descriptor (see below)
4367  *              retval  (ignored)
4368  *
4369  * Indirect parameters:	uap->path	Directory path
4370  *
4371  * Returns:	0			Success
4372  *              common_chdir: ENOTDIR
4373  *              common_chdir: ENOENT	No such file or directory
4374  *              common_chdir: ???
4375  *
4376  */
int
sys_chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	/* chdir(2): change the per-process working directory. */
	return common_chdir(p, (void *)uap, 0);
}
4382 
4383 /*
4384  * __pthread_chdir
4385  *
4386  * Change current working directory (".") for a single thread
4387  *
4388  * Parameters:  p       Process requesting the call
4389  *              uap     User argument descriptor (see below)
4390  *              retval  (ignored)
4391  *
4392  * Indirect parameters:	uap->path	Directory path
4393  *
4394  * Returns:	0			Success
4395  *              common_chdir: ENOTDIR
4396  *		common_chdir: ENOENT	No such file or directory
4397  *		common_chdir: ???
4398  *
4399  */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/* Change the working directory for the calling thread only. */
	return common_chdir(p, (void *)uap, 1);
}
4405 
4406 
4407 /*
4408  * Change notion of root (``/'') directory.
4409  */
4410 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* chroot(2) is restricted to the superuser. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/* Look up the new root; on success we hold an iocount on nd.ni_vp. */
	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Trade the transient iocount for a persistent usecount on fd_rdir. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Release the previous root's usecount, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4468 
4469 #define PATHSTATICBUFLEN 256
4470 #define PIVOT_ROOT_ENTITLEMENT              \
4471        "com.apple.private.vfs.pivot-root"
4472 
4473 #if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Small on-stack path buffers; long paths fall back to ZV_NAMEI heap buffers. */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/*
	 * Copy in the path of the incoming root file system; retry with a
	 * MAXPATHLEN heap buffer if it overflows the static buffer.
	 */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same treatment for the path where the outgoing root will appear. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Select whichever buffer ended up holding each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	/* Perform the root switch; virtual devices are prohibited. */
	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4565 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root is only implemented for macOS targets. */
	return nosys(p, NULL, retval);
}
4571 #endif /* XNU_TARGET_OS_OSX */
4572 
4573 /*
4574  * Common routine for chroot and chdir.
4575  *
4576  * Returns:	0			Success
4577  *		ENOTDIR			Not a directory
4578  *		namei:???		[anything namei can return]
4579  *		vnode_authorize:???	[anything vnode_authorize can return]
4580  */
4581 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4582 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4583 {
4584 	vnode_t vp;
4585 	int error;
4586 
4587 	if ((error = namei(ndp))) {
4588 		return error;
4589 	}
4590 	nameidone(ndp);
4591 	vp = ndp->ni_vp;
4592 
4593 	if (vp->v_type != VDIR) {
4594 		vnode_put(vp);
4595 		return ENOTDIR;
4596 	}
4597 
4598 #if CONFIG_MACF
4599 	error = mac_vnode_check_chdir(ctx, vp);
4600 	if (error) {
4601 		vnode_put(vp);
4602 		return error;
4603 	}
4604 #endif
4605 
4606 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4607 	if (error) {
4608 		vnode_put(vp);
4609 		return error;
4610 	}
4611 
4612 	return error;
4613 }
4614 
4615 /*
4616  * Free the vnode data (for directories) associated with the file glob.
4617  */
4618 struct fd_vn_data *
fg_vn_data_alloc(void)4619 fg_vn_data_alloc(void)
4620 {
4621 	struct fd_vn_data *fvdata;
4622 
4623 	/* Allocate per fd vnode data */
4624 	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4625 	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4626 	return fvdata;
4627 }
4628 
4629 /*
4630  * Free the vnode data (for directories) associated with the file glob.
4631  */
4632 void
fg_vn_data_free(void * fgvndata)4633 fg_vn_data_free(void *fgvndata)
4634 {
4635 	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4636 
4637 	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4638 	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4639 	kfree_type(struct fd_vn_data, fvdata);
4640 }
4641 
4642 /*
4643  * Check permissions, allocate an open file structure,
4644  * and call the device open routine if any.
4645  *
4646  * Returns:	0			Success
4647  *		EINVAL
4648  *		EINTR
4649  *	falloc:ENFILE
4650  *	falloc:EMFILE
4651  *	falloc:ENOMEM
4652  *	vn_open_auth:???
4653  *	dupfdopen:???
4654  *	VNOP_ADVLOCK:???
4655  *	vnode_setsize:???
4656  *
4657  * XXX Need to implement uid, gid
4658  */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	kauth_cred_t p_cred = current_cached_proc_cred(PROC_NULL);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	/* Convert open(2) flags to kernel F* flags; strip caller-supplied
	 * encryption state bits. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve the file descriptor slot and fileproc up front. */
	if ((error = falloc_withinit(p, p_cred, ctx, &fp, &indx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Optional authentication vnode (openat_dprotected_np authfd). */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/*
		 * ENODEV/ENXIO with uu_dupfd set means fdesc_open was
		 * reached: service the open by duplicating that fd instead.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the opened vnode into the fileglob. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* Apply flock(2)-style locking requested via O_EXLOCK/O_SHLOCK. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	/* Policy for whether this file's pages may use the secluded pool. */
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer  eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len) ||
			    !strncmp(v_name, "cameracaptured", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "audiomxd", len) ||
			    !strncmp(v_name, "mediaplaybackd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 *
	 * NOTE(review): vp's iocount was dropped just above; presumably the
	 * fileglob's reference (fp_set_data) keeps vp valid here — confirm.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the fd to the process's table and return it. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Undo the open: unlock (if we locked), close, and free the fd slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4962 
4963 /*
4964  * While most of the *at syscall handlers can call nameiat() which
4965  * is a wrapper around namei, the use of namei and initialisation
4966  * of nameidata are far removed and in different functions  - namei
4967  * gets called in vn_open_auth for open1. So we'll just do here what
4968  * nameiat() does.
4969  */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
    int dirfd, int authfd)
{
	/*
	 * Mirror nameiat(): when a real directory fd was supplied and the
	 * caller has not already set a starting vnode, resolve relative
	 * paths against dirfd by handing namei a starting directory.
	 */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Takes an iocount on dvp_at; dropped below. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/*
			 * NOTE(review): unlike nameiat(), USEDVP is not
			 * cleared after the call here — presumably the flag is
			 * consumed inside vn_open_auth; confirm before
			 * changing.
			 */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
			    retval, authfd);
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path or AT_FDCWD: plain open1. */
	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
}
5013 
5014 /*
5015  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
5016  *
5017  * Parameters:	p			Process requesting the open
5018  *		uap			User argument descriptor (see below)
5019  *		retval			Pointer to an area to receive the
 *					return value from the system call
5021  *
5022  * Indirect:	uap->path		Path to open (same as 'open')
5023  *		uap->flags		Flags to open (same as 'open'
5024  *		uap->uid		UID to set, if creating
5025  *		uap->gid		GID to set, if creating
5026  *		uap->mode		File mode, if creating (same as 'open')
5027  *		uap->xsecurity		ACL to set, if creating
5028  *
5029  * Returns:	0			Success
5030  *		!0			errno value
5031  *
5032  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5033  *
5034  * XXX:		We should enummerate the possible errno values here, and where
5035  *		in the code they originated.
5036  */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the optional ACL (host byte order) before anything else. */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
		return ciferror;
	}

	/* Build creation attributes: mode (umask applied), uid/gid, ACL. */
	VATTR_INIT(&va);
	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != NULL) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
	/* open1 does not keep a reference to the filesec; free our copy. */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}

	return ciferror;
}
5079 
5080 /*
5081  * Go through the data-protected atomically controlled open (2)
5082  *
5083  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
5084  */
5085 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)5086 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5087     int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
5088 {
5089 	/*
5090 	 * Follow the same path as normal open(2)
5091 	 * Look up the item if it exists, and acquire the vnode.
5092 	 */
5093 	struct vnode_attr va;
5094 	struct nameidata nd;
5095 	int cmode;
5096 	int error;
5097 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5098 
5099 	VATTR_INIT(&va);
5100 	/* Mask off all but regular access permissions */
5101 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5102 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
5103 
5104 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
5105 	    path, ctx);
5106 
5107 	/*
5108 	 * Initialize the extra fields in vnode_attr to pass down our
5109 	 * extra fields.
5110 	 * 1. target cprotect class.
5111 	 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
5112 	 */
5113 	if (flags & O_CREAT) {
5114 		/* lower level kernel code validates that the class is valid before applying it. */
5115 		if (class != PROTECTION_CLASS_DEFAULT) {
5116 			/*
5117 			 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
5118 			 * file behave the same as open (2)
5119 			 */
5120 			VATTR_SET(&va, va_dataprotect_class, class);
5121 		}
5122 	}
5123 
5124 	if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
5125 		if (flags & (O_RDWR | O_WRONLY)) {
5126 			/*
5127 			 * Not allowed to write raw encrypted bytes or when opening authenticated.
5128 			 */
5129 			return EINVAL;
5130 		}
5131 		if (dpflags & O_DP_GETRAWENCRYPTED) {
5132 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
5133 		}
5134 		if (dpflags & O_DP_GETRAWUNENCRYPTED) {
5135 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
5136 		}
5137 		if (dpflags & O_DP_AUTHENTICATE) {
5138 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
5139 		}
5140 	}
5141 
5142 	error = open1at(vfs_context_current(), &nd, flags, &va,
5143 	    NULL, NULL, retval, fd, authfd);
5144 
5145 	return error;
5146 }
5147 
int
openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
{
	/* O_DP_AUTHENTICATE applies to existing files only: creating with it
	 * is rejected. */
	if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
		return EINVAL;
	}

	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
	           uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
}
5158 
5159 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)5160 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5161 {
5162 	if (uap->dpflags & O_DP_AUTHENTICATE) {
5163 		return EINVAL;
5164 	}
5165 
5166 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5167 	           uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5168 }
5169 
5170 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval)5171 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5172     int fd, enum uio_seg segflg, int *retval)
5173 {
5174 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5175 	struct {
5176 		struct vnode_attr va;
5177 		struct nameidata nd;
5178 	} *__open_data;
5179 	struct vnode_attr *vap;
5180 	struct nameidata *ndp;
5181 	int cmode;
5182 	int error;
5183 
5184 	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
5185 	vap = &__open_data->va;
5186 	ndp = &__open_data->nd;
5187 
5188 	VATTR_INIT(vap);
5189 	/* Mask off all but regular access permissions */
5190 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5191 	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
5192 
5193 	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
5194 	    segflg, path, ctx);
5195 
5196 	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);
5197 
5198 	kfree_type(typeof(*__open_data), __open_data);
5199 
5200 	return error;
5201 }
5202 
5203 int
open(proc_t p,struct open_args * uap,int32_t * retval)5204 open(proc_t p, struct open_args *uap, int32_t *retval)
5205 {
5206 	__pthread_testcancel(1);
5207 	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
5208 }
5209 
5210 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)5211 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
5212     int32_t *retval)
5213 {
5214 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5215 	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
5216 }
5217 
5218 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)5219 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
5220     int32_t *retval)
5221 {
5222 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5223 	           uap->mode, uap->fd, UIO_USERSPACE, retval);
5224 }
5225 
5226 int
openat(proc_t p,struct openat_args * uap,int32_t * retval)5227 openat(proc_t p, struct openat_args *uap, int32_t *retval)
5228 {
5229 	__pthread_testcancel(1);
5230 	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
5231 }
5232 
5233 #define OPEN_BY_ID_ENTITLEMENT  "com.apple.private.vfs.open-by-id"
5234 
5235 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5236 vfs_context_can_open_by_id(vfs_context_t ctx)
5237 {
5238 	if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5239 		return TRUE;
5240 	}
5241 
5242 	return IOTaskHasEntitlement(vfs_context_task(ctx),
5243 	           OPEN_BY_ID_ENTITLEMENT);
5244 }
5245 
5246 /*
5247  * openbyid_np: open a file given a file system id and a file system object id
5248  *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
5249  *	file systems that don't support object ids it is a node id (uint64_t).
5250  *
5251  * Parameters:	p			Process requesting the open
5252  *		uap			User argument descriptor (see below)
5253  *		retval			Pointer to an area to receive the
5254  *					return calue from the system call
5255  *
5256  * Indirect:	uap->path		Path to open (same as 'open')
5257  *
5258  *		uap->fsid		id of target file system
5259  *		uap->objid		id of target file system object
5260  *		uap->flags		Flags to open (same as 'open')
5261  *
5262  * Returns:	0			Success
5263  *		!0			errno value
5264  *
5265  *
5266  * XXX:		We should enummerate the possible errno values here, and where
5267  *		in the code they originated.
5268  */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Restricted to platform binaries or holders of the
	 * open-by-id entitlement. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/* Resolve a path from (fsid, objid), growing the buffer by
	 * MAXPATHLEN and retrying while the path does not fit (ENOSPC). */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* Ensure NUL termination before treating the buffer as a path. */
	buf[pathlen] = 0;

	/* Open the resolved path; it lives in kernel space (UIO_SYSSPACE). */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5325 
5326 
5327 /*
5328  * Create a special file.
5329  */
5330 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5331     int fd);
5332 
/*
 * Common implementation for mknod(2)/mknodat(2): create a character or
 * block special file at 'upath' relative to directory fd 'fd' (AT_FDCWD
 * for the cwd).  'vap' carries the mode and device number already
 * prepared by the caller.  FIFO creation is forwarded to mkfifo1().
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes requires superuser privileges. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block specials are handled here; FIFOs were
	 * dispatched above, anything else is rejected. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	/* Authorize adding an entry to the parent directory. */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* The new entry modifies the directory; break any directory lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int     update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5435 
5436 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5437 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5438 {
5439 	struct vnode_attr va;
5440 
5441 	VATTR_INIT(&va);
5442 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5443 	VATTR_SET(&va, va_rdev, uap->dev);
5444 
5445 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5446 }
5447 
5448 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5449 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5450 {
5451 	struct vnode_attr va;
5452 
5453 	VATTR_INIT(&va);
5454 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5455 	VATTR_SET(&va, va_rdev, uap->dev);
5456 
5457 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5458 }
5459 
5460 /*
5461  * Create a named pipe.
5462  *
5463  * Returns:	0			Success
5464  *		EEXIST
5465  *	namei:???
5466  *	vnode_authorize:???
5467  *	vn_create:???
5468  */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	/* Authorize creation of the new entry in the parent directory. */
	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5511 
5512 
5513 /*
5514  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5515  *
5516  * Parameters:	p			Process requesting the open
5517  *		uap			User argument descriptor (see below)
5518  *		retval			(Ignored)
5519  *
5520  * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
5521  *		uap->uid		UID to set
5522  *		uap->gid		GID to set
5523  *		uap->mode		File mode to set (same as 'mkfifo')
5524  *		uap->xsecurity		ACL to set, if creating
5525  *
5526  * Returns:	0			Success
5527  *		!0			errno value
5528  *
5529  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5530  *
5531  * XXX:		We should enummerate the possible errno values here, and where
5532  *		in the code they originated.
5533  */
int
mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied ACL, if one was provided. */
	xsecdst = KAUTH_FILESEC_NONE;
	if (uap->xsecurity != USER_ADDR_NULL) {
		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return ciferror;
		}
	}

	VATTR_INIT(&va);
	/* Requested mode, filtered through the process umask. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
	/* Owner and group are only applied when explicitly supplied. */
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != KAUTH_FILESEC_NONE) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);

	/* mkfifo1() does not take ownership of the filesec; free our copy. */
	if (xsecdst != KAUTH_FILESEC_NONE) {
		kauth_filesec_free(xsecdst);
	}
	return ciferror;
}
5570 
5571 /* ARGSUSED */
5572 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5573 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5574 {
5575 	struct vnode_attr va;
5576 
5577 	VATTR_INIT(&va);
5578 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5579 
5580 	return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5581 }
5582 
5583 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5584 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5585 {
5586 	struct vnode_attr va;
5587 
5588 	VATTR_INIT(&va);
5589 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5590 
5591 	return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5592 }
5593 
5594 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5595 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5596 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5597 
/*
 * Build the path for 'dvp' (plus an optional 'leafname' appended with a
 * '/') into 'path', a buffer of '_len' bytes.  NOTE(review): the append
 * arithmetic uses MAXPATHLEN rather than _len, so callers are expected
 * to pass MAXPATHLEN-sized buffers — confirm at call sites.  Never
 * fails outright: on lookup errors it degrades to an ancestor
 * directory, the mount point, or "/", reporting the degradation via
 * *truncated_path.  Returns the length of the string including the NUL.
 * 'firmlink' selects vn_getpath() vs. vn_getpath_no_firmlink().
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the NUL with '/' and append the leaf. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path resolved but there is no room to append; report truncation. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/* Walk up the parent chain until some ancestor's path fits,
		 * falling back to the mount point or "/" as a last resort. */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5665 
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Firmlink-following variant of safe_getpath_new(). */
	const int follow_firmlinks = 1;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           follow_firmlinks);
}
5671 
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Variant of safe_getpath_new() that does not follow firmlinks. */
	const int follow_firmlinks = 0;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           follow_firmlinks);
}
5677 
5678 /*
5679  * Make a hard file link.
5680  *
5681  * Returns:	0			Success
5682  *		EPERM
5683  *		EEXIST
5684  *		EXDEV
5685  *	namei:???
5686  *	vnode_authorize:???
5687  *	VNOP_LINK:???
5688  */
5689 /* ARGSUSED */
/*
 * Common implementation for link(2)/linkat(2): create a hard link at
 * (fd2, link) pointing to the object named by (fd1, path).  'flag' may
 * contain AT_SYMLINK_FOLLOW; 'segflg' gives the address space of both
 * path strings.  Retries a bounded number of times when the filesystem
 * reports a transient ENOENT from VNOP_LINK.
 */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	char *target_path = NULL;
	char  *no_firmlink_path = NULL;
	vnode_t locked_vp = NULLVP;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;
	int num_retries = 0;
	int need_event, has_listeners, need_kpath2;
	bool do_retry;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;

retry:
	do_retry = false;
	vp = dvp = lvp = NULLVP;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node -- the nameidata is reused for the second
	 * (CREATE) lookup of the new link's location */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

	/* Take the link lock on vp; released on every exit path below. */
	assert(locked_vp == NULLVP);
	vnode_link_lock(vp);
	locked_vp = vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	/* The new entry modifies the directory; break any directory lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		/* Retry the whole operation on a transient ENOENT, up to
		 * MAX_LINK_ENOENT_RETRIES times. */
		if (error == ENOENT && num_retries < MAX_LINK_ENOENT_RETRIES) {
			do_retry = true;
			num_retries += 1;
		}
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

	/* The link exists; drop the link lock before notification work. */
	assert(locked_vp == vp);
	vnode_link_unlock(locked_vp);
	locked_vp = NULLVP;

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Only build path strings when fsevents, kauth listeners, or audit
	 * actually need them. */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int  len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
		target_path = NULL;
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	if (locked_vp) {
		assert(locked_vp == vp);
		vnode_link_unlock(locked_vp);
		locked_vp = NULLVP;
	}
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);

	if (do_retry) {
		goto retry;
	}

	return error;
}
5931 
5932 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5933 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5934 {
5935 	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5936 	           AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5937 }
5938 
5939 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5940 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5941 {
5942 	if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5943 		return EINVAL;
5944 	}
5945 
5946 	return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5947 	           uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5948 }
5949 
5950 /*
5951  * Make a symbolic link.
5952  *
5953  * We could add support for ACLs here too...
5954  */
5955 /* ARGSUSED */
/*
 * Common implementation for symlink(2)/symlinkat(2): create a symbolic
 * link at (fd, link) whose target string is 'path_data' (in the address
 * space given by 'segflg').
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	/* Copy the target string into a kernel buffer when it comes from
	 * user space; otherwise use the caller's buffer directly. */
	error = 0;
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	/* Symlink permissions come from the umask; there is no mode argument. */
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The link name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	/* The new entry modifies the directory; break any directory lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int     update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int             len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Free the kernel copy of the target string, if one was made. */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
6119 
6120 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)6121 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
6122 {
6123 	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
6124 	           uap->link, UIO_USERSPACE);
6125 }
6126 
6127 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)6128 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
6129     __unused int32_t *retval)
6130 {
6131 	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
6132 	           uap->path2, UIO_USERSPACE);
6133 }
6134 
6135 /*
6136  * Delete a whiteout from the filesystem.
6137  * No longer supported.
6138  */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	/* Whiteout deletion support was removed; always report unsupported. */
	return ENOTSUP;
}
6144 
6145 /*
6146  * Delete a name from the filesystem.
6147  */
6148 /* ARGSUSED */
/*
 * Common backend for unlink(2)/unlinkat(2)/delete(2): remove the file
 * named by path_arg, resolved relative to fd or, when non-NULL, to
 * start_dvp (which takes precedence over fd).
 *
 * segflg selects whether path_arg is a user- or kernel-space address;
 * unlink_flags carries VNODE_REMOVE_* modifiers (Carbon busy-file
 * semantics, no-follow, skip namespace events, etc.).
 *
 * Returns 0 on success or an errno value.
 */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/*
	 * Scratch state is heap-allocated in a single chunk: the nameidata
	 * (plus, with CONFIG_FSE, the attributes and fsevent info) is too
	 * large to keep on the kernel stack.
	 */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char  *path = NULL;
	char  *no_firmlink_path = NULL;
	int  len_path = 0;
	int  len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	vnode_t locked_vp = NULLVP;	/* vp whose link lock we currently hold, if any */
	int do_retry;
	int retry_count = 0;
	int cn_flags;
	int nofollow_any = 0;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/* Translate the no-follow request into a namei flag and consume it. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* Reset per-attempt state; we come back here after an ENOENT race. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	/* Offer the filesystem a combined lookup+remove if it supports one. */
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	/* Update speculative telemetry with system discarded use state */
	if (unlink_flags & VNODE_REMOVE_SYSTEM_DISCARDED) {
		flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		/* Swap files may only be removed by the kernel itself. */
		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			/*
			 * Hold the link lock across authorization and the
			 * remove; record it in locked_vp so the "out" path
			 * drops it exactly once.
			 */
			vnode_link_lock(vp);
			locked_vp = vp;
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				if (error == ENOENT) {
					/*
					 * Likely a racing hardlink lookup hit
					 * the name cache; redrive a bounded
					 * number of times.
					 */
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				vnode_link_unlock(vp);
				locked_vp = NULLVP;
				goto out;
			}
		}
	} else {
		/* No vp: the FS does lookup+remove as one compound VNOP. */
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		/* Capture the path strings now, before the name goes away. */
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		/* Deleting a resource fork means removing the named stream. */
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			/* The FS asked us to continue the compound lookup. */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name  cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid.  this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	/* Drop the link lock if the non-batched path above took it. */
	if (locked_vp) {
		assert(locked_vp == vp);
		vnode_link_unlock(locked_vp);
		locked_vp = NULLVP;
	}

	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6451 
6452 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6453 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6454     enum uio_seg segflg, int unlink_flags)
6455 {
6456 	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6457 	           unlink_flags);
6458 }
6459 
6460 /*
6461  * Delete a name from the filesystem using Carbon semantics.
6462  */
6463 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6464 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6465 {
6466 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6467 	           uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6468 }
6469 
6470 /*
6471  * Delete a name from the filesystem using POSIX semantics.
6472  */
6473 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6474 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6475 {
6476 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6477 	           uap->path, UIO_USERSPACE, 0);
6478 }
6479 
6480 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6481 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6482 {
6483 	int unlink_flags = 0;
6484 
6485 	if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY | AT_SYSTEM_DISCARDED)) {
6486 		return EINVAL;
6487 	}
6488 
6489 	if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6490 		unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6491 	}
6492 
6493 	if (uap->flag & AT_SYSTEM_DISCARDED) {
6494 		unlink_flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
6495 	}
6496 
6497 	if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6498 		if (uap->flag & AT_REMOVEDIR_DATALESS) {
6499 			unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6500 		}
6501 		return rmdirat_internal(vfs_context_current(), uap->fd,
6502 		           uap->path, UIO_USERSPACE, unlink_flags);
6503 	} else {
6504 		return unlinkat_internal(vfs_context_current(), uap->fd,
6505 		           NULLVP, uap->path, UIO_USERSPACE, unlink_flags);
6506 	}
6507 }
6508 
6509 /*
6510  * Reposition read/write file offset.
6511  */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	/* Resolve fd to a vnode; ENOTSUP here means "not a vnode-backed fd". */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	/* Seeking on a FIFO is meaningless, per POSIX. */
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* lseek(fd, 0, SEEK_CUR) only reads the offset; check accordingly. */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* Convert the whence-relative request into an absolute offset. */
	switch (uap->whence) {
	case L_INCR:	/* SEEK_CUR */
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:	/* SEEK_END */
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:	/* SEEK_SET: offset is already absolute */
		break;
	case SEEK_HOLE:
		/* Ask the filesystem for the next hole at/after offset. */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		/* Ask the filesystem for the next data region at/after offset. */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001.  Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6603 
6604 
6605 /*
6606  * Check access permissions.
6607  *
6608  * Returns:	0			Success
6609  *		vnode_authorize:???
6610  */
6611 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6612 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6613 {
6614 	kauth_action_t action;
6615 	int error;
6616 
6617 	/*
6618 	 * If just the regular access bits, convert them to something
6619 	 * that vnode_authorize will understand.
6620 	 */
6621 	if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6622 		action = 0;
6623 		if (uflags & R_OK) {
6624 			action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
6625 		}
6626 		if (uflags & W_OK) {
6627 			if (vnode_isdir(vp)) {
6628 				action |= KAUTH_VNODE_ADD_FILE |
6629 				    KAUTH_VNODE_ADD_SUBDIRECTORY;
6630 				/* might want delete rights here too */
6631 			} else {
6632 				action |= KAUTH_VNODE_WRITE_DATA;
6633 			}
6634 		}
6635 		if (uflags & X_OK) {
6636 			if (vnode_isdir(vp)) {
6637 				action |= KAUTH_VNODE_SEARCH;
6638 			} else {
6639 				action |= KAUTH_VNODE_EXECUTE;
6640 			}
6641 		}
6642 	} else {
6643 		/* take advantage of definition of uflags */
6644 		action = uflags >> 8;
6645 	}
6646 
6647 #if CONFIG_MACF
6648 	error = mac_vnode_check_access(ctx, vp, uflags);
6649 	if (error) {
6650 		return error;
6651 	}
6652 #endif /* MAC */
6653 
6654 	/* action == 0 means only check for existence */
6655 	if (action != 0) {
6656 		error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6657 	} else {
6658 		error = 0;
6659 	}
6660 
6661 	return error;
6662 }
6663 
6664 
6665 
6666 /*
6667  * access_extended: Check access permissions in bulk.
6668  *
6669  * Description:	uap->entries		Pointer to an array of accessx
6670  *                                      descriptor structs, plus one or
6671  *                                      more NULL terminated strings (see
6672  *                                      "Notes" section below).
6673  *		uap->size		Size of the area pointed to by
6674  *					uap->entries.
6675  *		uap->results		Pointer to the results array.
6676  *
6677  * Returns:	0			Success
6678  *		ENOMEM			Insufficient memory
6679  *		EINVAL			Invalid arguments
6680  *		namei:EFAULT		Bad address
6681  *		namei:ENAMETOOLONG	Filename too long
6682  *		namei:ENOENT		No such file or directory
6683  *		namei:ELOOP		Too many levels of symbolic links
6684  *		namei:EBADF		Bad file descriptor
6685  *		namei:ENOTDIR		Not a directory
6686  *		namei:???
6687  *		access1:
6688  *
6689  * Implicit returns:
6690  *		uap->results		Array contents modified
6691  *
6692  * Notes:	The uap->entries are structured as an arbitrary length array
6693  *		of accessx descriptors, followed by one or more NULL terminated
6694  *		strings
6695  *
6696  *			struct accessx_descriptor[0]
6697  *			...
6698  *			struct accessx_descriptor[n]
6699  *			char name_data[0];
6700  *
6701  *		We determine the entry count by walking the buffer containing
6702  *		the uap->entries argument descriptor.  For each descriptor we
6703  *		see, the valid values for the offset ad_name_offset will be
6704  *		in the byte range:
6705  *
6706  *			[ uap->entries + sizeof(struct accessx_descriptor) ]
6707  *						to
6708  *				[ uap->entries + uap->size - 2 ]
6709  *
6710  *		since we must have at least one string, and the string must
6711  *		be at least one character plus the NULL terminator in length.
6712  *
6713  * XXX:		Need to support the check-as uid argument
6714  */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* NULL cred marks "not yet allocated" for the out: cleanup path. */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o	There must be sufficient room in the request for at least one
	 *	descriptor and a one byte NUL terminated string.
	 * o	The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* Small requests use the on-stack buffer; larger ones go to the heap. */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid nami() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  "Soft" errors are recorded per-entry;
		 * anything else aborts the whole call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	/* Release any vnodes still held from the last loop iteration. */
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6956 
6957 
6958 /*
6959  * Returns:	0			Success
6960  *		namei:EFAULT		Bad address
6961  *		namei:ENAMETOOLONG	Filename too long
6962  *		namei:ENOENT		No such file or directory
6963  *		namei:ELOOP		Too many levels of symbolic links
6964  *		namei:EBADF		Bad file descriptor
6965  *		namei:ENOTDIR		Not a directory
6966  *		namei:???
6967  *		access1:
6968  */
/*
 * Shared backend for access(2)/faccessat(2): check whether the file named
 * by path (resolved relative to fd) grants the amode permissions, using
 * either the real (default) or effective (AT_EACCESS) credential.
 *
 * Returns:	0			Success
 *	namei:???
 *	access1:???
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	/* Either no-follow variant suppresses following the final symlink. */
	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse symlinks anywhere in the path, not just the last component. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	/* The parent was only taken (WANTPARENT) for the deletion test. */
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* Drop the real-identity credential copied when AT_EACCESS was not set. */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
7050 
7051 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)7052 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
7053 {
7054 	return faccessat_internal(vfs_context_current(), AT_FDCWD,
7055 	           uap->path, uap->flags, 0, UIO_USERSPACE);
7056 }
7057 
7058 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)7059 faccessat(__unused proc_t p, struct faccessat_args *uap,
7060     __unused int32_t *retval)
7061 {
7062 	if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7063 		return EINVAL;
7064 	}
7065 
7066 	return faccessat_internal(vfs_context_current(), uap->fd,
7067 	           uap->path, uap->amode, uap->flag, UIO_USERSPACE);
7068 }
7069 
7070 /*
7071  * Returns:	0			Success
7072  *		EFAULT
7073  *	copyout:EFAULT
7074  *	namei:???
7075  *	vn_stat:???
7076  */
/*
 * Shared backend for the stat(2) family: look up the file named by path
 * (or, with AT_FDONLY, use fd's vnode directly), gather stat data in the
 * 32- or 64-bit-inode form selected by isstat64, munge it for the
 * caller's ABI, and copy it out to ub.  Optionally also copies out the
 * extended security (ACL) data to xsecurity/xsecurity_size.
 *
 * Returns:	0			Success
 *		EFAULT
 *	copyout:EFAULT
 *	namei:???
 *	vn_stat:???
 */
static int
fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
    user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
    enum uio_seg segflg, int fd, int flag)
{
	struct nameidata *ndp = NULL;
	int follow;
	union {
		struct stat sb;
		struct stat64 sb64;
	} source = {};
	union {
		struct user64_stat user64_sb;
		struct user32_stat user32_sb;
		struct user64_stat64 user64_sb64;
		struct user32_stat64 user32_sb64;
	} dest = {};
	caddr_t sbp;
	int error, my_size;
	kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
	size_t xsecurity_bufsize;
	void * statptr;
	struct fileproc *fp = NULL;
	int needsrealdev = 0;

	/* Either no-follow variant suppresses following the final symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	/* nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);
	NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	int is_namedstream = 0;
	/* stat calls are allowed for resource forks. */
	ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
#endif

	if (flag & AT_FDONLY) {
		/* fstat(2) path: use fd's vnode directly, no name lookup. */
		vnode_t fvp;

		error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
		if (error) {
			goto out;
		}
		if ((error = vnode_getwithref(fvp))) {
			file_drop(fd);
			goto out;
		}
		ndp->ni_vp = fvp;
	} else {
		error = nameiat(ndp, fd);
		if (error) {
			goto out;
		}
	}

	statptr = (void *)&source;

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(ndp->ni_vp) &&
	    (ndp->ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(ndp->ni_vp)) {
		is_namedstream = 1;
		vnode_ref(ndp->ni_vp);
	}
#endif

	needsrealdev = flag & AT_REALDEV ? 1 : 0;
	if (fp && (xsecurity == USER_ADDR_NULL)) {
		/*
		 * If the caller has the file open, and is not
		 * requesting extended security information, we are
		 * going to let them get the basic stat information.
		 */
		error = vn_stat_noauth(ndp->ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
		    fp->fp_glob->fg_cred);
	} else {
		error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
		    isstat64, needsrealdev, ctx);
	}

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(ndp->ni_vp);
	}
#endif
	vnode_put(ndp->ni_vp);
	nameidone(ndp);

	if (fp) {
		file_drop(fd);
		fp = NULL;
	}

	if (error) {
		goto out;
	}
	/* Zap spare fields */
	if (isstat64 != 0) {
		source.sb64.st_lspare = 0;
		source.sb64.st_qspare[0] = 0LL;
		source.sb64.st_qspare[1] = 0LL;
		/* Munge into the layout matching the caller's word size. */
		if (vfs_context_is64bit(ctx)) {
			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
			my_size = sizeof(dest.user64_sb64);
			sbp = (caddr_t)&dest.user64_sb64;
		} else {
			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
			my_size = sizeof(dest.user32_sb64);
			sbp = (caddr_t)&dest.user32_sb64;
		}
		/*
		 * Check if we raced (post lookup) against the last unlink of a file.
		 *
		 * NOTE(review): this adjustment modifies `source` AFTER it has
		 * already been munged into `dest`, so it does not appear to
		 * reach the buffer copied out below — confirm intent.
		 */
		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
			source.sb64.st_nlink = 1;
		}
	} else {
		source.sb.st_lspare = 0;
		source.sb.st_qspare[0] = 0LL;
		source.sb.st_qspare[1] = 0LL;
		/* Munge into the layout matching the caller's word size. */
		if (vfs_context_is64bit(ctx)) {
			munge_user64_stat(&source.sb, &dest.user64_sb);
			my_size = sizeof(dest.user64_sb);
			sbp = (caddr_t)&dest.user64_sb;
		} else {
			munge_user32_stat(&source.sb, &dest.user32_sb);
			my_size = sizeof(dest.user32_sb);
			sbp = (caddr_t)&dest.user32_sb;
		}

		/*
		 * Check if we raced (post lookup) against the last unlink of a file.
		 *
		 * NOTE(review): as above, this runs after the munge into
		 * `dest` — confirm it has the intended effect.
		 */
		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
			source.sb.st_nlink = 1;
		}
	}
	if ((error = copyout(sbp, ub, my_size)) != 0) {
		goto out;
	}

	/* caller wants extended security information? */
	if (xsecurity != USER_ADDR_NULL) {
		/* did we get any? */
		if (fsec == KAUTH_FILESEC_NONE) {
			if (susize(xsecurity_size, 0) != 0) {
				error = EFAULT;
				goto out;
			}
		} else {
			/* find the user buffer size */
			xsecurity_bufsize = fusize(xsecurity_size);

			/* copy out the actual data size */
			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
				error = EFAULT;
				goto out;
			}

			/* if the caller supplied enough room, copy out to it */
			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
			}
		}
	}
out:
	if (ndp) {
		kfree_type(struct nameidata, ndp);
	}
	if (fsec != KAUTH_FILESEC_NONE) {
		kauth_filesec_free(fsec);
	}
	return error;
}
7258 
7259 /*
7260  * stat_extended: Get file status; with extended security (ACL).
7261  *
7262  * Parameters:    p                       (ignored)
7263  *                uap                     User argument descriptor (see below)
7264  *                retval                  (ignored)
7265  *
7266  * Indirect:      uap->path               Path of file to get status from
7267  *                uap->ub                 User buffer (holds file status info)
7268  *                uap->xsecurity          ACL to get (extended security)
7269  *                uap->xsecurity_size     Size of ACL
7270  *
7271  * Returns:        0                      Success
7272  *                !0                      errno value
7273  *
7274  */
7275 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7276 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7277     __unused int32_t *retval)
7278 {
7279 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7280 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7281 	           0);
7282 }
7283 
7284 /*
7285  * Returns:	0			Success
7286  *	fstatat_internal:???		[see fstatat_internal() in this file]
7287  */
7288 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7289 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7290 {
7291 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7292 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7293 }
7294 
7295 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7296 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7297 {
7298 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7299 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7300 }
7301 
7302 /*
7303  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7304  *
7305  * Parameters:    p                       (ignored)
7306  *                uap                     User argument descriptor (see below)
7307  *                retval                  (ignored)
7308  *
7309  * Indirect:      uap->path               Path of file to get status from
7310  *                uap->ub                 User buffer (holds file status info)
7311  *                uap->xsecurity          ACL to get (extended security)
7312  *                uap->xsecurity_size     Size of ACL
7313  *
7314  * Returns:        0                      Success
7315  *                !0                      errno value
7316  *
7317  */
7318 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7319 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7320 {
7321 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7322 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7323 	           0);
7324 }
7325 
7326 /*
7327  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7328  *
7329  * Parameters:    p                       (ignored)
7330  *                uap                     User argument descriptor (see below)
7331  *                retval                  (ignored)
7332  *
7333  * Indirect:      uap->path               Path of file to get status from
7334  *                uap->ub                 User buffer (holds file status info)
7335  *                uap->xsecurity          ACL to get (extended security)
7336  *                uap->xsecurity_size     Size of ACL
7337  *
7338  * Returns:        0                      Success
7339  *                !0                      errno value
7340  *
7341  */
7342 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7343 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7344 {
7345 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7346 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7347 	           AT_SYMLINK_NOFOLLOW);
7348 }
7349 
7350 /*
7351  * Get file status; this version does not follow links.
7352  */
7353 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7354 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7355 {
7356 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7357 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7358 }
7359 
7360 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7361 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7362 {
7363 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7364 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7365 }
7366 
7367 /*
7368  * lstat64_extended: Get file status; can handle large inode numbers; does not
7369  * follow links; with extended security (ACL).
7370  *
7371  * Parameters:    p                       (ignored)
7372  *                uap                     User argument descriptor (see below)
7373  *                retval                  (ignored)
7374  *
7375  * Indirect:      uap->path               Path of file to get status from
7376  *                uap->ub                 User buffer (holds file status info)
7377  *                uap->xsecurity          ACL to get (extended security)
7378  *                uap->xsecurity_size     Size of ACL
7379  *
7380  * Returns:        0                      Success
7381  *                !0                      errno value
7382  *
7383  */
7384 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7385 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7386 {
7387 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7388 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7389 	           AT_SYMLINK_NOFOLLOW);
7390 }
7391 
7392 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7393 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7394 {
7395 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7396 		return EINVAL;
7397 	}
7398 
7399 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7400 	           0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7401 }
7402 
7403 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7404 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7405     __unused int32_t *retval)
7406 {
7407 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7408 		return EINVAL;
7409 	}
7410 
7411 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7412 	           0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7413 }
7414 
7415 /*
7416  * Get configurable pathname variables.
7417  *
7418  * Returns:	0			Success
7419  *	namei:???
7420  *	vn_pathconf:???
7421  *
7422  * Notes:	Global implementation  constants are intended to be
7423  *		implemented in this function directly; all other constants
7424  *		are per-FS implementation, and therefore must be handled in
7425  *		each respective FS, instead.
7426  *
7427  * XXX We implement some things globally right now that should actually be
7428  * XXX per-FS; we will need to deal with this at some point.
7429  */
7430 /* ARGSUSED */
7431 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7432 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7433 {
7434 	int error;
7435 	struct nameidata nd;
7436 	vfs_context_t ctx = vfs_context_current();
7437 
7438 	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7439 	    UIO_USERSPACE, uap->path, ctx);
7440 	error = namei(&nd);
7441 	if (error) {
7442 		return error;
7443 	}
7444 
7445 	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7446 
7447 	vnode_put(nd.ni_vp);
7448 	nameidone(&nd);
7449 	return error;
7450 }
7451 
7452 /*
7453  * Return target name of a symbolic link.
7454  */
7455 /* ARGSUSED */
7456 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7457 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7458     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7459     int *retval)
7460 {
7461 	vnode_t vp;
7462 	uio_t auio;
7463 	int error;
7464 	struct nameidata nd;
7465 	UIO_STACKBUF(uio_buf, 1);
7466 	bool put_vnode;
7467 
7468 	if (bufsize > INT32_MAX) {
7469 		return EINVAL;
7470 	}
7471 
7472 	if (lnk_vp) {
7473 		vp = lnk_vp;
7474 		put_vnode = false;
7475 	} else {
7476 		NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7477 		    seg, path, ctx);
7478 
7479 		error = nameiat(&nd, fd);
7480 		if (error) {
7481 			return error;
7482 		}
7483 		vp = nd.ni_vp;
7484 		put_vnode = true;
7485 		nameidone(&nd);
7486 	}
7487 
7488 	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7489 	    &uio_buf[0], sizeof(uio_buf));
7490 	uio_addiov(auio, buf, bufsize);
7491 	if (vp->v_type != VLNK) {
7492 		error = EINVAL;
7493 	} else {
7494 #if CONFIG_MACF
7495 		error = mac_vnode_check_readlink(ctx, vp);
7496 #endif
7497 		if (error == 0) {
7498 			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7499 			    ctx);
7500 		}
7501 		if (error == 0) {
7502 			error = VNOP_READLINK(vp, auio, ctx);
7503 		}
7504 	}
7505 
7506 	if (put_vnode) {
7507 		vnode_put(vp);
7508 	}
7509 
7510 	*retval = (int)(bufsize - uio_resid(auio));
7511 	return error;
7512 }
7513 
7514 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7515 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7516 {
7517 	enum uio_seg procseg;
7518 	vnode_t vp;
7519 	int error;
7520 
7521 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7522 
7523 	AUDIT_ARG(fd, uap->fd);
7524 
7525 	if ((error = file_vnode(uap->fd, &vp))) {
7526 		return error;
7527 	}
7528 	if ((error = vnode_getwithref(vp))) {
7529 		file_drop(uap->fd);
7530 		return error;
7531 	}
7532 
7533 	error = readlinkat_internal(vfs_context_current(), -1,
7534 	    vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7535 	    uap->bufsize, procseg, retval);
7536 
7537 	vnode_put(vp);
7538 	file_drop(uap->fd);
7539 	return error;
7540 }
7541 
7542 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7543 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7544 {
7545 	enum uio_seg procseg;
7546 
7547 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7548 	return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7549 	           CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7550 	           uap->count, procseg, retval);
7551 }
7552 
7553 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7554 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7555 {
7556 	enum uio_seg procseg;
7557 
7558 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7559 	return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7560 	           CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7561 	           retval);
7562 }
7563 
7564 /*
7565  * Change file flags, the deep inner layer.
7566  */
7567 static int
chflags0(vnode_t vp,struct vnode_attr * va,int (* setattr)(vnode_t,void *,vfs_context_t),void * arg,vfs_context_t ctx)7568 chflags0(vnode_t vp, struct vnode_attr *va,
7569     int (*setattr)(vnode_t, void *, vfs_context_t),
7570     void *arg, vfs_context_t ctx)
7571 {
7572 	kauth_action_t action = 0;
7573 	int error;
7574 
7575 #if CONFIG_MACF
7576 	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
7577 	if (error) {
7578 		goto out;
7579 	}
7580 #endif
7581 
7582 	/* request authorisation, disregard immutability */
7583 	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
7584 		goto out;
7585 	}
7586 	/*
7587 	 * Request that the auth layer disregard those file flags it's allowed to when
7588 	 * authorizing this operation; we need to do this in order to be able to
7589 	 * clear immutable flags.
7590 	 */
7591 	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
7592 		goto out;
7593 	}
7594 	error = (*setattr)(vp, arg, ctx);
7595 
7596 #if CONFIG_MACF
7597 	if (error == 0) {
7598 		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
7599 	}
7600 #endif
7601 
7602 out:
7603 	return error;
7604 }
7605 
7606 /*
7607  * Change file flags.
7608  *
7609  * NOTE: this will vnode_put() `vp'
7610  */
7611 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)7612 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
7613 {
7614 	struct vnode_attr va;
7615 	int error;
7616 
7617 	VATTR_INIT(&va);
7618 	VATTR_SET(&va, va_flags, flags);
7619 
7620 	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
7621 	vnode_put(vp);
7622 
7623 	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
7624 		error = ENOTSUP;
7625 	}
7626 
7627 	return error;
7628 }
7629 
7630 /*
7631  * Change flags of a file given a path name.
7632  */
7633 /* ARGSUSED */
7634 int
chflags(__unused proc_t p,struct chflags_args * uap,__unused int32_t * retval)7635 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
7636 {
7637 	vnode_t vp;
7638 	vfs_context_t ctx = vfs_context_current();
7639 	int error;
7640 	struct nameidata nd;
7641 	uint32_t wantparent = 0;
7642 
7643 #if CONFIG_FILE_LEASES
7644 	wantparent = WANTPARENT;
7645 #endif
7646 
7647 	AUDIT_ARG(fflags, uap->flags);
7648 	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
7649 	    UIO_USERSPACE, uap->path, ctx);
7650 	error = namei(&nd);
7651 	if (error) {
7652 		return error;
7653 	}
7654 	vp = nd.ni_vp;
7655 
7656 #if CONFIG_FILE_LEASES
7657 	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
7658 	vnode_put(nd.ni_dvp);
7659 #endif
7660 
7661 	nameidone(&nd);
7662 
7663 	/* we don't vnode_put() here because chflags1 does internally */
7664 	error = chflags1(vp, uap->flags, ctx);
7665 
7666 	return error;
7667 }
7668 
7669 /*
7670  * Change flags of a file given a file descriptor.
7671  */
7672 /* ARGSUSED */
7673 int
fchflags(__unused proc_t p,struct fchflags_args * uap,__unused int32_t * retval)7674 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
7675 {
7676 	vnode_t vp;
7677 	int error;
7678 
7679 	AUDIT_ARG(fd, uap->fd);
7680 	AUDIT_ARG(fflags, uap->flags);
7681 	if ((error = file_vnode(uap->fd, &vp))) {
7682 		return error;
7683 	}
7684 
7685 	if ((error = vnode_getwithref(vp))) {
7686 		file_drop(uap->fd);
7687 		return error;
7688 	}
7689 
7690 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7691 
7692 #if CONFIG_FILE_LEASES
7693 	vnode_breakdirlease(vp, true, O_WRONLY);
7694 #endif
7695 
7696 	/* we don't vnode_put() here because chflags1 does internally */
7697 	error = chflags1(vp, uap->flags, vfs_context_current());
7698 
7699 	file_drop(uap->fd);
7700 	return error;
7701 }
7702 
7703 /*
7704  * Change security information on a filesystem object.
7705  *
7706  * Returns:	0			Success
7707  *		EPERM			Operation not permitted
7708  *		vnode_authattr:???	[anything vnode_authattr can return]
7709  *		vnode_authorize:???	[anything vnode_authorize can return]
7710  *		vnode_setattr:???	[anything vnode_setattr can return]
7711  *
7712  * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
7713  *		translated to EPERM before being returned.
7714  */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* Give MAC policies a chance to veto the mode change... */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	/* ...the ownership change (-1 = that id not being changed)... */
	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	/* ...and the ACL change, if one was requested. */
	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* permission failures surface as EPERM here, not EACCES */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* Notify interested MAC policies of the changes that took effect. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7782 
7783 
7784 /*
7785  * Change mode of a file given a path name.
7786  *
7787  * Returns:	0			Success
7788  *		namei:???		[anything namei can return]
7789  *		chmod_vnode:???		[anything chmod_vnode can return]
7790  */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

	/*
	 * With file leases configured we also need the parent directory
	 * vnode (WANTPARENT) so any lease on it can be broken below.
	 */
#if CONFIG_FILE_LEASES
	wantparent = WANTPARENT;
#endif

	/* Either "no follow" flag suppresses following a trailing symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse to traverse a symlink in ANY path component. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any parent-directory lease (write event), then release it. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7823 
7824 static int
chmod_extended_init(struct vnode_attr * pva,kauth_filesec_t * pxsecdst,int mode,uid_t uid,gid_t gid,user_addr_t xsecurity)7825 chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
7826     gid_t gid, user_addr_t xsecurity)
7827 {
7828 	int error;
7829 
7830 	VATTR_INIT(pva);
7831 
7832 	if (mode != -1) {
7833 		VATTR_SET(pva, va_mode, mode & ALLPERMS);
7834 	} else {
7835 		pva->va_mode = 0;
7836 	}
7837 
7838 	if (uid != KAUTH_UID_NONE) {
7839 		VATTR_SET(pva, va_uid, uid);
7840 	}
7841 
7842 	if (gid != KAUTH_GID_NONE) {
7843 		VATTR_SET(pva, va_gid, gid);
7844 	}
7845 
7846 	*pxsecdst = NULL;
7847 	switch (xsecurity) {
7848 	case USER_ADDR_NULL:
7849 		break;
7850 
7851 	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
7852 		VATTR_SET(pva, va_acl, NULL);
7853 		break;
7854 
7855 	default:
7856 		if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
7857 			return error;
7858 		}
7859 
7860 		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
7861 		pva->va_vaflags |= VA_FILESEC_ACL;
7862 		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
7863 		break;
7864 	}
7865 
7866 	return 0;
7867 }
7868 
7869 /*
7870  * chmod_extended: Change the mode of a file given a path name; with extended
7871  * argument list (including extended security (ACL)).
7872  *
7873  * Parameters:	p			Process requesting the open
7874  *		uap			User argument descriptor (see below)
7875  *		retval			(ignored)
7876  *
7877  * Indirect:	uap->path		Path to object (same as 'chmod')
7878  *		uap->uid		UID to set
7879  *		uap->gid		GID to set
7880  *		uap->mode		File mode to set (same as 'chmod')
7881  *		uap->xsecurity		ACL to set (or delete)
7882  *
7883  * Returns:	0			Success
7884  *		!0			errno value
7885  *
7886  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
7887  *
7888  * XXX:		We should enummerate the possible errno values here, and where
7889  *		in the code they originated.
7890  */
7891 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7892 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7893 {
7894 	int error;
7895 	struct vnode_attr va;
7896 	kauth_filesec_t xsecdst = NULL;
7897 
7898 	AUDIT_ARG(owner, uap->uid, uap->gid);
7899 
7900 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7901 	    uap->gid, uap->xsecurity);
7902 
7903 	if (error) {
7904 		return error;
7905 	}
7906 
7907 	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7908 	    UIO_USERSPACE);
7909 
7910 	if (xsecdst != NULL) {
7911 		kauth_filesec_free(xsecdst);
7912 	}
7913 	return error;
7914 }
7915 
7916 /*
7917  * Returns:	0			Success
7918  *		chmodat:???		[anything chmodat can return]
7919  */
7920 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7921 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7922     int flag, enum uio_seg segflg)
7923 {
7924 	struct vnode_attr va;
7925 
7926 	VATTR_INIT(&va);
7927 	VATTR_SET(&va, va_mode, mode & ALLPERMS);
7928 
7929 	return chmodat(ctx, path, &va, fd, flag, segflg);
7930 }
7931 
7932 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7933 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7934 {
7935 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7936 	           AT_FDCWD, 0, UIO_USERSPACE);
7937 }
7938 
7939 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7940 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7941 {
7942 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7943 		return EINVAL;
7944 	}
7945 
7946 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7947 	           uap->fd, uap->flag, UIO_USERSPACE);
7948 }
7949 
7950 /*
7951  * Change mode of a file given a file descriptor.
7952  */
7953 static int
fchmod1(__unused proc_t p,int fd,struct vnode_attr * vap)7954 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
7955 {
7956 	vnode_t vp;
7957 	int error;
7958 
7959 	AUDIT_ARG(fd, fd);
7960 
7961 	if ((error = file_vnode(fd, &vp)) != 0) {
7962 		return error;
7963 	}
7964 	if ((error = vnode_getwithref(vp)) != 0) {
7965 		file_drop(fd);
7966 		return error;
7967 	}
7968 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7969 
7970 #if CONFIG_FILE_LEASES
7971 	vnode_breakdirlease(vp, true, O_WRONLY);
7972 #endif
7973 
7974 	error = chmod_vnode(vfs_context_current(), vp, vap);
7975 	(void)vnode_put(vp);
7976 	file_drop(fd);
7977 
7978 	return error;
7979 }
7980 
7981 /*
7982  * fchmod_extended: Change mode of a file given a file descriptor; with
7983  * extended argument list (including extended security (ACL)).
7984  *
7985  * Parameters:    p                       Process requesting to change file mode
7986  *                uap                     User argument descriptor (see below)
7987  *                retval                  (ignored)
7988  *
7989  * Indirect:      uap->mode               File mode to set (same as 'chmod')
7990  *                uap->uid                UID to set
7991  *                uap->gid                GID to set
7992  *                uap->xsecurity          ACL to set (or delete)
7993  *                uap->fd                 File descriptor of file to change mode
7994  *
7995  * Returns:        0                      Success
7996  *                !0                      errno value
7997  *
7998  */
7999 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)8000 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
8001 {
8002 	int error;
8003 	struct vnode_attr va;
8004 	kauth_filesec_t xsecdst = NULL;
8005 
8006 	AUDIT_ARG(owner, uap->uid, uap->gid);
8007 
8008 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
8009 	    uap->gid, uap->xsecurity);
8010 
8011 	if (error) {
8012 		return error;
8013 	}
8014 
8015 	error = fchmod1(p, uap->fd, &va);
8016 
8017 	if (xsecdst != NULL) {
8018 		kauth_filesec_free(xsecdst);
8019 	}
8020 	return error;
8021 }
8022 
8023 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)8024 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
8025 {
8026 	struct vnode_attr va;
8027 
8028 	VATTR_INIT(&va);
8029 	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
8030 
8031 	return fchmod1(p, uap->fd, &va);
8032 }
8033 
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	/* VNOVAL for either id means "leave that id unchanged". */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	/* Let MAC policies veto the ownership change. */
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* An ownership change is a write event for directory leases. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only after the change actually took effect. */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
8095 
8096 /*
8097  * Set ownership given a path name.
8098  */
8099 /* ARGSUSED */
8100 static int
fchownat_internal(vfs_context_t ctx,int fd,user_addr_t path,uid_t uid,gid_t gid,int flag,enum uio_seg segflg)8101 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
8102     gid_t gid, int flag, enum uio_seg segflg)
8103 {
8104 	vnode_t vp;
8105 	int error;
8106 	struct nameidata nd;
8107 	int follow;
8108 
8109 	AUDIT_ARG(owner, uid, gid);
8110 
8111 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
8112 	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
8113 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
8114 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
8115 	}
8116 
8117 	error = nameiat(&nd, fd);
8118 	if (error) {
8119 		return error;
8120 	}
8121 
8122 	vp = nd.ni_vp;
8123 	error = vn_chown_internal(ctx, vp, uid, gid);
8124 
8125 	nameidone(&nd);
8126 	vnode_put(vp);
8127 	return error;
8128 }
8129 
8130 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)8131 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
8132 {
8133 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8134 	           uap->uid, uap->gid, 0, UIO_USERSPACE);
8135 }
8136 
8137 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)8138 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
8139 {
8140 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8141 	           uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
8142 }
8143 
8144 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)8145 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
8146 {
8147 	if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
8148 		return EINVAL;
8149 	}
8150 
8151 	return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
8152 	           uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
8153 }
8154 
8155 /*
8156  * Set ownership given a file descriptor.
8157  */
8158 /* ARGSUSED */
8159 int
fchown(__unused proc_t p,struct fchown_args * uap,__unused int32_t * retval)8160 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
8161 {
8162 	vfs_context_t ctx = vfs_context_current();
8163 	vnode_t vp;
8164 	int error;
8165 
8166 	AUDIT_ARG(owner, uap->uid, uap->gid);
8167 	AUDIT_ARG(fd, uap->fd);
8168 
8169 	if ((error = file_vnode(uap->fd, &vp))) {
8170 		return error;
8171 	}
8172 
8173 	if ((error = vnode_getwithref(vp))) {
8174 		file_drop(uap->fd);
8175 		return error;
8176 	}
8177 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8178 
8179 	error = vn_chown_internal(ctx, vp, uap->uid, uap->gid);
8180 
8181 	(void)vnode_put(vp);
8182 	file_drop(uap->fd);
8183 	return error;
8184 }
8185 
8186 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)8187 getutimes(user_addr_t usrtvp, struct timespec *tsp)
8188 {
8189 	int error;
8190 
8191 	if (usrtvp == USER_ADDR_NULL) {
8192 		struct timeval old_tv;
8193 		/* XXX Y2038 bug because of microtime argument */
8194 		microtime(&old_tv);
8195 		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
8196 		tsp[1] = tsp[0];
8197 	} else {
8198 		if (IS_64BIT_PROCESS(current_proc())) {
8199 			struct user64_timeval tv[2];
8200 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
8201 			if (error) {
8202 				return error;
8203 			}
8204 			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
8205 			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
8206 		} else {
8207 			struct user32_timeval tv[2];
8208 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
8209 			if (error) {
8210 				return error;
8211 			}
8212 			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
8213 			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
8214 		}
8215 	}
8216 	return 0;
8217 }
8218 
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* ts[0] is the access time, ts[1] the modification time. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		/* Caller passed a NULL timeval: "set to current time". */
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	/*
	 * Explicit timestamps (nullflag clear) surface a permissions
	 * failure as EPERM rather than EACCES.
	 */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only after the change actually took effect. */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8275 
8276 /*
8277  * Set the access and modification times of a file.
8278  */
8279 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also look up the parent directory so its lease can be broken. */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
#if CONFIG_FILE_LEASES
	/* nd.ni_dvp is only referenced because WANTPARENT was set above. */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8328 
8329 /*
8330  * Set the access and modification times of a file.
8331  */
8332 /* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	/* Copy in the new times first; a NULL pointer means "now". */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	/* file_vnode() takes a reference on the fd that file_drop() releases. */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	/* Take an iocount on the vnode before operating on it. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease held on the file's parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8364 
8365 static int
truncate_validate_common(proc_t p,off_t length)8366 truncate_validate_common(proc_t p, off_t length)
8367 {
8368 	rlim_t fsize_limit;
8369 
8370 	if (length < 0) {
8371 		return EINVAL;
8372 	}
8373 
8374 	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8375 	if ((rlim_t)length > fsize_limit) {
8376 		psignal(p, SIGXFSZ);
8377 		return EFBIG;
8378 	}
8379 
8380 	return 0;
8381 }
8382 
/*
 * truncate_internal
 *
 * Set the data size of vp to length: MAC check, optional authorization,
 * lease breaking, then the actual vnode_setattr().
 *
 * Parameters:	vp		vnode to truncate (caller holds an iocount)
 *		length		new size; already validated by the caller
 *		cred		credential used for the MAC checks
 *		ctx		caller's VFS context
 *		need_auth	true for path-based truncate(); false for
 *				ftruncate(), whose open already authorized write
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open.  We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		/* vnode_authattr() may decide no further authorization is needed. */
		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only on a successful truncate. */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8433 
8434 /*
8435  * Truncate a file given its path name.
8436  */
8437 /* ARGSUSED */
8438 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8439 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8440 {
8441 	vfs_context_t ctx = vfs_context_current();
8442 	vnode_t vp;
8443 	int error;
8444 	struct nameidata nd;
8445 
8446 	if ((error = truncate_validate_common(p, uap->length))) {
8447 		return error;
8448 	}
8449 
8450 	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8451 	    UIO_USERSPACE, uap->path, ctx);
8452 
8453 	if ((error = namei(&nd))) {
8454 		return error;
8455 	}
8456 
8457 	vp = nd.ni_vp;
8458 	nameidone(&nd);
8459 
8460 	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8461 	vnode_put(vp);
8462 
8463 	return error;
8464 }
8465 
8466 /*
8467  * Truncate a file given a file descriptor.
8468  */
8469 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Validate length (>= 0) and the RLIMIT_FSIZE resource limit. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		/* POSIX shared memory objects have their own truncate path. */
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		/* Sockets, pipes, kqueues, etc. cannot be truncated. */
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/*
	 * need_auth is false: write access was effectively authorized when
	 * the file was opened FWRITE, so truncate_internal() skips it.
	 */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	vnode_put(vp);

out:
	file_drop(uap->fd);
	return error;
}
8520 
8521 
8522 /*
8523  * Sync an open file with synchronized I/O _file_ integrity completion
8524  */
8525 /* ARGSUSED */
int
fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
{
	/* fsync() is a pthread cancellation point; check before blocking. */
	__pthread_testcancel(1);
	/* MNT_WAIT: full file integrity (data and metadata) completion. */
	return fsync_common(p, uap, MNT_WAIT);
}
8532 
8533 
8534 /*
8535  * Sync an open file with synchronized I/O _file_ integrity completion
8536  *
8537  * Notes:	This is a legacy support function that does not test for
8538  *		thread cancellation points.
8539  */
8540 /* ARGSUSED */
int
fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
{
	/*
	 * Legacy variant of fsync() without the pthread cancellation check.
	 * The args structs are otherwise identical, hence the cast.
	 */
	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
}
8546 
8547 
8548 /*
8549  * Sync an open file with synchronized I/O _data_ integrity completion
8550  */
8551 /* ARGSUSED */
int
fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
{
	/* fdatasync() is a pthread cancellation point; check before blocking. */
	__pthread_testcancel(1);
	/* MNT_DWAIT: data-integrity-only completion (see fsync_common()). */
	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
}
8558 
8559 
8560 /*
8561  * fsync_common
8562  *
8563  * Common fsync code to support both synchronized I/O file integrity completion
8564  * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8565  *
8566  * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8567  * will only guarantee that the file data contents are retrievable.  If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8569  * includes additional metadata unnecessary for retrieving the file data
8570  * contents, such as atime, mtime, ctime, etc., also be committed to stable
8571  * storage.
8572  *
8573  * Parameters:	p				The process
8574  *		uap->fd				The descriptor to synchronize
8575  *		flags				The data integrity flags
8576  *
8577  * Returns:	int				Success
8578  *	fp_getfvp:EBADF				Bad file descriptor
8579  *	fp_getfvp:ENOTSUP			fd does not refer to a vnode
8580  *	VNOP_FSYNC:???				unspecified
8581  *
8582  * Notes:	We use struct fsync_args because it is a short name, and all
8583  *		caller argument structures are otherwise identical.
8584  */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve the descriptor to a vnode; fails for non-vnode fd types. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	/* Take an iocount so the vnode stays valid across the VNOP. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		/* Best-effort: a flush failure does not fail the fsync. */
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8622 
8623 /*
8624  * Duplicate files.  Source must be a file, target must be a file or
8625  * must not exist.
8626  *
8627  * XXX Copyfile authorisation checking is woefully inadequate, and will not
8628  *     perform inheritance correctly.
8629  */
8630 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source; on success we hold an iocount on fvp. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/*
	 * Look up the destination for creation.  SAVESTART keeps
	 * tond.ni_startdir referenced; it is released in the out path.
	 */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only permitted with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Neither the source nor an existing target may be a directory. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* Sockets are not copyable, except fdesc-backed ones. */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Need read access on the source... */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	/* ...delete access on an existing target... */
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	/* ...and add-file access on the target directory. */
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* The source cannot be the destination's parent directory. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		error = -1;     /* sentinel: treated as success below */
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* Same-file no-op (the -1 sentinel) is reported as success. */
	if (error == -1) {
		return 0;
	}
	return error;
}
8737 
8738 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8739 
8740 /*
8741  * Helper function for doing clones. The caller is expected to provide an
8742  * iocounted source vnode and release it.
8743  */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata *tondp = NULL;
	int error;
	int follow;
	boolean_t free_src_acl;		/* free the ACL vnode_getattr gave us */
	boolean_t attr_cleanup;		/* pair vn_attribute_prepare/cleanup */
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted = 0;
	/* The vnode_attr pair is heap-allocated to limit stack usage. */
	struct {
		struct vnode_attr va[2];
	} *va2p = NULL;
	struct vnode_attr *vap = NULL;	/* attributes read from the source */
	struct vnode_attr *nvap = NULL;	/* attributes for the new clone */
	uint32_t vnop_flags;

	/* Only regular files, symlinks, and plain directories can be cloned. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
	/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		/* Volume roots and mount points cannot be cloned. */
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Look up the destination; WANTPARENT also returns tdvp referenced. */
	tondp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(tondp, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if (flags & CLONE_NOFOLLOW_ANY) {
		/* Refuse to traverse any symlink in the destination path. */
		tondp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = nameiat(tondp, dst_dirfd))) {
		kfree_type(struct nameidata, tondp);
		return error;
	}
	cnp = &tondp->ni_cnd;
	tdvp = tondp->ni_dvp;
	tvp = tondp->ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* The destination must not already exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Cloning only works within a single filesystem. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	/* Authorize creating the new entry in the destination directory. */
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * Authorize reading the source.  READ_DATA is skipped when the
	 * caller already proved it (e.g. fd opened FREAD in fclonefileat).
	 */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	va2p = kalloc_type(typeof(*va2p), Z_WAITOK | Z_NOFAIL);
	vap = &va2p->va[0];
	nvap = &va2p->va[1];

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACLs unless the CLONE_ACL
	 * flag is specified. By default, the clone file will inherit the target
	 * directory's ACLs unless the CLONE_ACL flag is specified then it
	 * will inherit the source file's ACLs instead.
	 */
	VATTR_INIT(vap);
	VATTR_WANTED(vap, va_uid);
	VATTR_WANTED(vap, va_gid);
	VATTR_WANTED(vap, va_mode);
	VATTR_WANTED(vap, va_flags);
	if (flags & CLONE_ACL) {
		VATTR_WANTED(vap, va_acl);
	}

	if ((error = vnode_getattr(fvp, vap, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(nvap);
	VATTR_SET(nvap, va_type, v_type);
	if (VATTR_IS_SUPPORTED(vap, va_acl) && vap->va_acl != NULL) {
		VATTR_SET(nvap, va_acl, vap->va_acl);
		/* The returned ACL must be freed via kauth_acl_free below. */
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, nvap, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, nvap, &defaulted, ctx);
		if (error) {
			goto out;
		}
		/* Prepared attributes must be paired with vn_attribute_cleanup. */
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(vap, va_uid)) {
			VATTR_SET(nvap, va_uid, vap->va_uid);
		}
		if (VATTR_IS_SUPPORTED(vap, va_gid)) {
			VATTR_SET(nvap, va_gid, vap->va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(vap, va_mode)) {
		VATTR_SET(nvap, va_mode, vap->va_mode);
	}
	if (VATTR_IS_SUPPORTED(vap, va_flags)) {
		VATTR_SET(nvap, va_flags,
		    ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, nvap, vnop_flags, ctx);

	if (!error && tvp) {
		int     update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(nvap)) {
			(void)vnode_setattr_fallback(tvp, nvap, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
		/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(nvap, defaulted);
	}
	if (free_src_acl && vap->va_acl) {
		kauth_acl_free(vap->va_acl);
	}
	if (va2p) {
		kfree_type(typeof(*va2p), va2p);
	}
	nameidone(tondp);
	kfree_type(struct nameidata, tondp);
	/* On VNOP success, tvp was returned with an iocount we must drop. */
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8990 
8991 /*
8992  * clone files or directories, target must not exist.
8993  */
8994 /* ARGSUSED */
int
clonefileat(__unused proc_t p, struct clonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct nameidata *ndp = NULL;
	int follow;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
	    CLONE_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_dirfd);

	/* struct nameidata is heap-allocated to limit stack usage. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(ndp, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
	    UIO_USERSPACE, uap->src, ctx);
	if (uap->flags & CLONE_NOFOLLOW_ANY) {
		/* Refuse to traverse any symlink in the source path. */
		ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = nameiat(ndp, uap->src_dirfd))) {
		kfree_type(struct nameidata, ndp);
		return error;
	}

	/* Keep the iocount on fvp; release the rest of the lookup state. */
	fvp = ndp->ni_vp;
	nameidone(ndp);
	kfree_type(struct nameidata, ndp);

	/* FALSE: full read authorization (incl. READ_DATA) is required. */
	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
	return error;
}
9037 
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
	    CLONE_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
	if (error) {
		return error;
	}

	/* The source descriptor must be open for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if ((error = vnode_getwithref(fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	/*
	 * TRUE: the FREAD check above stands in for the READ_DATA
	 * authorization performed when the file was opened.
	 */
	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
9079 
/*
 * Mount-iteration callback used when a mount point is renamed: for each
 * mount whose mount-on path lies strictly beneath pmp's (passed via arg),
 * refresh its f_mntonname from the covered vnode's current path.
 * Returns -1 to the iterator if the mount cannot be busied.
 */
static int
rename_submounts_callback(mount_t mp, void *arg)
{
	int error = 0;
	mount_t pmp = (mount_t)arg;
	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);

	/* Skip mounts whose mount-on path doesn't share pmp's prefix. */
	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
		return 0;
	}

	/* Require a path-component boundary, not just a string prefix. */
	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
		return 0;
	}

	/* Busy the mount so it cannot be unmounted while we rewrite its path. */
	if ((error = vfs_busy(mp, LK_NOWAIT))) {
		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
		return -1;
	}

	/* Recompute the mount-on path from the covered vnode (best effort). */
	size_t pathlen = MAXPATHLEN;
	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
	}

	vfs_unbusy(mp);

	return error;
}
9109 
9110 /*
9111  * Rename files.  Source and destination must either both be directories,
9112  * or both not be directories.  If target is a directory, it must be empty.
9113  */
9114 /* ARGSUSED */
9115 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)9116 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
9117     int tofd, user_addr_t to, int segflg, u_int uflags)
9118 {
9119 	vnode_t tvp, tdvp;
9120 	vnode_t fvp, fdvp;
9121 	vnode_t mnt_fvp;
9122 	struct nameidata *fromnd, *tond;
9123 	int error = 0;
9124 	int do_retry;
9125 	int retry_count;
9126 	int mntrename;
9127 	int need_event;
9128 	int need_kpath2;
9129 	int has_listeners;
9130 	const char *oname = NULL;
9131 	char *from_name = NULL, *to_name = NULL;
9132 	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
9133 	int from_len = 0, to_len = 0;
9134 	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
9135 	int holding_mntlock;
9136 	int vn_authorize_skipped;
9137 	mount_t locked_mp = NULL;
9138 	vnode_t oparent = NULLVP;
9139 	vnode_t locked_vp = NULLVP;
9140 #if CONFIG_FSE
9141 	fse_info from_finfo = {}, to_finfo;
9142 #endif
9143 	int from_truncated = 0, to_truncated = 0;
9144 	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
9145 	int batched = 0;
9146 	struct vnode_attr *fvap, *tvap;
9147 	int continuing = 0;
9148 	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
9149 	int32_t nofollow_any = 0;
9150 	/* carving out a chunk for structs that are too big to be on stack. */
9151 	struct {
9152 		struct nameidata from_node, to_node;
9153 		struct vnode_attr fv_attr, tv_attr;
9154 	} * __rename_data;
9155 
9156 	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
9157 	fromnd = &__rename_data->from_node;
9158 	tond = &__rename_data->to_node;
9159 
9160 	holding_mntlock = 0;
9161 	do_retry = 0;
9162 	retry_count = 0;
9163 retry:
9164 	fvp = tvp = NULL;
9165 	fdvp = tdvp = NULL;
9166 	fvap = tvap = NULL;
9167 	mnt_fvp = NULLVP;
9168 	mntrename = FALSE;
9169 	vn_authorize_skipped = FALSE;
9170 
9171 	if (uflags & RENAME_NOFOLLOW_ANY) {
9172 		nofollow_any = NAMEI_NOFOLLOW_ANY;
9173 	}
9174 	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
9175 	    segflg, from, ctx);
9176 	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
9177 
9178 	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
9179 	    segflg, to, ctx);
9180 	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
9181 
9182 continue_lookup:
9183 	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9184 		if ((error = nameiat(fromnd, fromfd))) {
9185 			goto out1;
9186 		}
9187 		fdvp = fromnd->ni_dvp;
9188 		fvp  = fromnd->ni_vp;
9189 
9190 		if (fvp && fvp->v_type == VDIR) {
9191 			tond->ni_cnd.cn_flags |= WILLBEDIR;
9192 		}
9193 	}
9194 
9195 	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9196 		if ((error = nameiat(tond, tofd))) {
9197 			/*
9198 			 * Translate error code for rename("dir1", "dir2/.").
9199 			 */
9200 			if (error == EISDIR && fvp->v_type == VDIR) {
9201 				error = EINVAL;
9202 			}
9203 			goto out1;
9204 		}
9205 		tdvp = tond->ni_dvp;
9206 		tvp  = tond->ni_vp;
9207 	}
9208 
9209 #if DEVELOPMENT || DEBUG
9210 	/*
9211 	 * XXX VSWAP: Check for entitlements or special flag here
9212 	 * so we can restrict access appropriately.
9213 	 */
9214 #else /* DEVELOPMENT || DEBUG */
9215 
9216 	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
9217 		error = EPERM;
9218 		goto out1;
9219 	}
9220 
9221 	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
9222 		error = EPERM;
9223 		goto out1;
9224 	}
9225 #endif /* DEVELOPMENT || DEBUG */
9226 
9227 	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
9228 		error = ENOENT;
9229 		goto out1;
9230 	}
9231 
9232 	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
9233 		int32_t pval = 0;
9234 		int err = 0;
9235 
9236 		/*
9237 		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
9238 		 * has the same name as target iff the following conditions are met:
9239 		 * 1. the target file system is case insensitive
9240 		 * 2. source and target directories are the same
9241 		 * 3. source and target files are the same
9242 		 * 4. name only differs in case (determined by underlying filesystem)
9243 		 */
9244 		if (fvp != tvp || fdvp != tdvp) {
9245 			error = EEXIST;
9246 			goto out1;
9247 		}
9248 
9249 		/*
9250 		 * Assume that the target file system is case sensitive if
9251 		 * _PC_CASE_SENSITIVE selector isn't supported.
9252 		 */
9253 		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
9254 		if (err != 0 || pval != 0) {
9255 			error = EEXIST;
9256 			goto out1;
9257 		}
9258 	}
9259 
9260 	batched = vnode_compound_rename_available(fdvp);
9261 
9262 #if CONFIG_FSE
9263 	need_event = need_fsevent(FSE_RENAME, fdvp);
9264 	if (need_event) {
9265 		if (fvp) {
9266 			get_fse_info(fvp, &from_finfo, ctx);
9267 		} else {
9268 			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
9269 			if (error) {
9270 				goto out1;
9271 			}
9272 
9273 			fvap = &__rename_data->fv_attr;
9274 		}
9275 
9276 		if (tvp) {
9277 			get_fse_info(tvp, &to_finfo, ctx);
9278 		} else if (batched) {
9279 			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
9280 			if (error) {
9281 				goto out1;
9282 			}
9283 
9284 			tvap = &__rename_data->tv_attr;
9285 		}
9286 	}
9287 #else
9288 	need_event = 0;
9289 #endif /* CONFIG_FSE */
9290 
9291 	has_listeners = kauth_authorize_fileop_has_listeners();
9292 
9293 	need_kpath2 = 0;
9294 #if CONFIG_AUDIT
9295 	if (AUDIT_RECORD_EXISTS()) {
9296 		need_kpath2 = 1;
9297 	}
9298 #endif
9299 
9300 	if (need_event || has_listeners) {
9301 		if (from_name == NULL) {
9302 			GET_PATH(from_name);
9303 		}
9304 
9305 		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
9306 
9307 		if (from_name_no_firmlink == NULL) {
9308 			GET_PATH(from_name_no_firmlink);
9309 		}
9310 
9311 		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
9312 	}
9313 
9314 	if (need_event || need_kpath2 || has_listeners) {
9315 		if (to_name == NULL) {
9316 			GET_PATH(to_name);
9317 		}
9318 
9319 		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
9320 
9321 		if (to_name_no_firmlink == NULL) {
9322 			GET_PATH(to_name_no_firmlink);
9323 		}
9324 
9325 		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
9326 		if (to_name && need_kpath2) {
9327 			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9328 		}
9329 	}
9330 	if (!fvp) {
9331 		/*
9332 		 * Claim: this check will never reject a valid rename.
9333 		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9334 		 * Suppose fdvp and tdvp are not on the same mount.
9335 		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
9336 		 *      then you can't move it to within another dir on the same mountpoint.
9337 		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9338 		 *
9339 		 * If this check passes, then we are safe to pass these vnodes to the same FS.
9340 		 */
9341 		if (fdvp->v_mount != tdvp->v_mount) {
9342 			error = EXDEV;
9343 			goto out1;
9344 		}
9345 		goto skipped_lookup;
9346 	}
9347 
9348 	/*
9349 	 * If the source and destination are the same (i.e. they're
9350 	 * links to the same vnode) and the target file system is
9351 	 * case sensitive, then there is nothing to do.
9352 	 *
9353 	 * XXX Come back to this.
9354 	 */
9355 	if (fvp == tvp) {
9356 		int pathconf_val;
9357 
9358 		/*
9359 		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9360 		 * then assume that this file system is case sensitive.
9361 		 */
9362 		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9363 		    pathconf_val != 0) {
9364 			vn_authorize_skipped = TRUE;
9365 			goto out1;
9366 		}
9367 	}
9368 
9369 	/*
9370 	 * Allow the renaming of mount points.
9371 	 * - target must not exist
9372 	 * - target must reside in the same directory as source
9373 	 * - union mounts cannot be renamed
9374 	 * - the root fs, and tightly-linked system volumes, cannot be renamed
9375 	 *
9376 	 * XXX Handle this in VFS after a continued lookup (if we missed
9377 	 * in the cache to start off)
9378 	 *
9379 	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9380 	 * we'll skip past here.  The file system is responsible for
9381 	 * checking that @tvp is not a descendent of @fvp and vice versa
9382 	 * so it should always return EINVAL if either @tvp or @fvp is the
9383 	 * root of a volume.
9384 	 */
9385 	if ((fvp->v_flag & VROOT) &&
9386 	    (fvp->v_type == VDIR) &&
9387 	    (tvp == NULL) &&
9388 	    (fvp->v_mountedhere == NULL) &&
9389 	    (fdvp == tdvp) &&
9390 	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9391 	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9392 	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9393 		vnode_t coveredvp;
9394 
9395 		/* switch fvp to the covered vnode */
9396 		coveredvp = fvp->v_mount->mnt_vnodecovered;
9397 		if ((vnode_getwithref(coveredvp))) {
9398 			error = ENOENT;
9399 			goto out1;
9400 		}
9401 		/*
9402 		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9403 		 * later.
9404 		 */
9405 		mnt_fvp = fvp;
9406 
9407 		fvp = coveredvp;
9408 		mntrename = TRUE;
9409 	}
9410 	/*
9411 	 * Check for cross-device rename.
9412 	 * For rename on mountpoint, we want to also check the source and its parent
9413 	 * belong to the same mountpoint.
9414 	 */
9415 	if ((fvp->v_mount != tdvp->v_mount) ||
9416 	    (fvp->v_mount != fdvp->v_mount) ||
9417 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
9418 		error = EXDEV;
9419 		goto out1;
9420 	}
9421 
9422 	/*
9423 	 * If source is the same as the destination (that is the
9424 	 * same inode number) then there is nothing to do...
9425 	 * EXCEPT if the underlying file system supports case
9426 	 * insensitivity and is case preserving.  In this case
9427 	 * the file system needs to handle the special case of
9428 	 * getting the same vnode as target (fvp) and source (tvp).
9429 	 *
9430 	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9431 	 * and _PC_CASE_PRESERVING can have this exception, and they need to
9432 	 * handle the special case of getting the same vnode as target and
9433 	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
9434 	 * so not to cause locking problems. There is a single reference on tvp.
9435 	 *
9436 	 * NOTE - that fvp == tvp also occurs if they are hard linked and
9437 	 * that correct behaviour then is just to return success without doing
9438 	 * anything.
9439 	 *
9440 	 * XXX filesystem should take care of this itself, perhaps...
9441 	 */
9442 	if (fvp == tvp && fdvp == tdvp) {
9443 		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9444 		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
9445 		    fromnd->ni_cnd.cn_namelen)) {
9446 			vn_authorize_skipped = TRUE;
9447 			goto out1;
9448 		}
9449 	}
9450 
9451 	if (holding_mntlock && fvp->v_mount != locked_mp) {
9452 		/*
9453 		 * we're holding a reference and lock
9454 		 * on locked_mp, but it no longer matches
9455 		 * what we want to do... so drop our hold
9456 		 */
9457 		mount_unlock_renames(locked_mp);
9458 		mount_drop(locked_mp, 0);
9459 		holding_mntlock = 0;
9460 	}
9461 	if (tdvp != fdvp && fvp->v_type == VDIR) {
9462 		/*
9463 		 * serialize renames that re-shape
9464 		 * the tree... if holding_mntlock is
9465 		 * set, then we're ready to go...
9466 		 * otherwise we
9467 		 * first need to drop the iocounts
9468 		 * we picked up, second take the
9469 		 * lock to serialize the access,
9470 		 * then finally start the lookup
9471 		 * process over with the lock held
9472 		 */
9473 		if (!holding_mntlock) {
9474 			/*
9475 			 * need to grab a reference on
9476 			 * the mount point before we
9477 			 * drop all the iocounts... once
9478 			 * the iocounts are gone, the mount
9479 			 * could follow
9480 			 */
9481 			locked_mp = fvp->v_mount;
9482 			mount_ref(locked_mp, 0);
9483 
9484 			/*
9485 			 * nameidone has to happen before we vnode_put(tvp)
9486 			 * since it may need to release the fs_nodelock on the tvp
9487 			 */
9488 			nameidone(tond);
9489 
9490 			if (tvp) {
9491 				vnode_put(tvp);
9492 			}
9493 			vnode_put(tdvp);
9494 
9495 			/*
9496 			 * nameidone has to happen before we vnode_put(fdvp)
9497 			 * since it may need to release the fs_nodelock on the fvp
9498 			 */
9499 			nameidone(fromnd);
9500 
9501 			vnode_put(fvp);
9502 			vnode_put(fdvp);
9503 
9504 			if (mnt_fvp != NULLVP) {
9505 				vnode_put(mnt_fvp);
9506 			}
9507 
9508 			mount_lock_renames(locked_mp);
9509 			holding_mntlock = 1;
9510 
9511 			goto retry;
9512 		}
9513 	} else {
9514 		/*
9515 		 * when we dropped the iocounts to take
9516 		 * the lock, we allowed the identity of
9517 		 * the various vnodes to change... if they did,
9518 		 * we may no longer be dealing with a rename
9519 		 * that reshapes the tree... once we're holding
9520 		 * the iocounts, the vnodes can't change type
9521 		 * so we're free to drop the lock at this point
9522 		 * and continue on
9523 		 */
9524 		if (holding_mntlock) {
9525 			mount_unlock_renames(locked_mp);
9526 			mount_drop(locked_mp, 0);
9527 			holding_mntlock = 0;
9528 		}
9529 	}
9530 
9531 	if (!batched) {
9532 		assert(locked_vp == NULLVP);
9533 		vnode_link_lock(fvp);
9534 		locked_vp = fvp;
9535 		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
9536 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9537 		    flags, NULL);
9538 		if (error) {
9539 			if (error == ENOENT) {
9540 				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9541 					/*
9542 					 * We encountered a race where after doing the namei,
9543 					 * tvp stops being valid. If so, simply re-drive the rename
9544 					 * call from the top.
9545 					 */
9546 					do_retry = 1;
9547 					retry_count += 1;
9548 				}
9549 			}
9550 			vnode_link_unlock(fvp);
9551 			locked_vp = NULLVP;
9552 			goto out1;
9553 		}
9554 	}
9555 
9556 	/* Release the 'mnt_fvp' now that it is no longer needed. */
9557 	if (mnt_fvp != NULLVP) {
9558 		vnode_put(mnt_fvp);
9559 		mnt_fvp = NULLVP;
9560 	}
9561 
9562 	// save these off so we can later verify that fvp is the same
9563 	oname   = fvp->v_name;
9564 	oparent = fvp->v_parent;
9565 
9566 skipped_lookup:
9567 #if CONFIG_FILE_LEASES
9568 	/* Lease break needed for source's parent dir? */
9569 	vnode_breakdirlease(fdvp, false, O_WRONLY);
9570 
9571 	/* Lease break needed for target's parent dir? */
9572 	vnode_breakdirlease(tdvp, false, O_WRONLY);
9573 #endif
9574 
9575 	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
9576 	    tdvp, &tvp, &tond->ni_cnd, tvap,
9577 	    flags, ctx);
9578 
9579 	if (locked_vp) {
9580 		vnode_link_unlock(fvp);
9581 		locked_vp = NULLVP;
9582 	}
9583 
9584 	if (holding_mntlock) {
9585 		/*
9586 		 * we can drop our serialization
9587 		 * lock now
9588 		 */
9589 		mount_unlock_renames(locked_mp);
9590 		mount_drop(locked_mp, 0);
9591 		holding_mntlock = 0;
9592 	}
9593 	if (error) {
9594 		if (error == EDATALESS) {
9595 			/*
9596 			 * If we've been here before, something has gone
9597 			 * horribly wrong and we should just get out lest
9598 			 * we spiral around the drain forever.
9599 			 */
9600 			if (flags & VFS_RENAME_DATALESS) {
9601 				error = EIO;
9602 				goto out1;
9603 			}
9604 
9605 			/*
9606 			 * The object we're renaming is dataless (or has a
9607 			 * dataless descendent) and requires materialization
9608 			 * before the rename occurs.  But we're holding the
9609 			 * mount point's rename lock, so it's not safe to
9610 			 * make the upcall.
9611 			 *
9612 			 * In this case, we release the lock (above), perform
9613 			 * the materialization, and start the whole thing over.
9614 			 */
9615 			error = vfs_materialize_reparent(fvp, tdvp);
9616 			if (error == 0) {
9617 				/*
9618 				 * The next time around we need to tell the
9619 				 * file system that the materializtaion has
9620 				 * been performed.
9621 				 */
9622 				flags |= VFS_RENAME_DATALESS;
9623 				do_retry = 1;
9624 			}
9625 			goto out1;
9626 		}
9627 		if (error == EKEEPLOOKING) {
9628 			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9629 				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9630 					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
9631 				}
9632 			}
9633 
9634 			fromnd->ni_vp = fvp;
9635 			tond->ni_vp = tvp;
9636 
9637 			goto continue_lookup;
9638 		}
9639 
9640 		/*
9641 		 * We may encounter a race in the VNOP where the destination didn't
9642 		 * exist when we did the namei, but it does by the time we go and
9643 		 * try to create the entry. In this case, we should re-drive this rename
9644 		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
9645 		 * but other filesystems susceptible to this race could return it, too.
9646 		 */
9647 		if (error == ERECYCLE) {
9648 			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
9649 				do_retry = 1;
9650 				retry_count += 1;
9651 			} else {
9652 				printf("rename retry limit due to ERECYCLE reached\n");
9653 				error = ENOENT;
9654 			}
9655 		}
9656 
9657 		/*
9658 		 * For compound VNOPs, the authorization callback may return
9659 		 * ENOENT in case of racing hardlink lookups hitting the name
9660 		 * cache, redrive the lookup.
9661 		 */
9662 		if (batched && error == ENOENT) {
9663 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9664 				do_retry = 1;
9665 				retry_count += 1;
9666 			}
9667 		}
9668 
9669 		goto out1;
9670 	}
9671 
9672 	/* call out to allow 3rd party notification of rename.
9673 	 * Ignore result of kauth_authorize_fileop call.
9674 	 */
9675 	kauth_authorize_fileop(vfs_context_ucred(ctx),
9676 	    KAUTH_FILEOP_RENAME,
9677 	    (uintptr_t)from_name, (uintptr_t)to_name);
9678 	if (flags & VFS_RENAME_SWAP) {
9679 		kauth_authorize_fileop(vfs_context_ucred(ctx),
9680 		    KAUTH_FILEOP_RENAME,
9681 		    (uintptr_t)to_name, (uintptr_t)from_name);
9682 	}
9683 
9684 #if CONFIG_FSE
9685 	if (from_name != NULL && to_name != NULL) {
9686 		if (from_truncated || to_truncated) {
9687 			// set it here since only the from_finfo gets reported up to user space
9688 			from_finfo.mode |= FSE_TRUNCATED_PATH;
9689 		}
9690 
9691 		if (tvap && tvp) {
9692 			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
9693 		}
9694 		if (fvap) {
9695 			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
9696 		}
9697 
9698 		if (tvp) {
9699 			add_fsevent(FSE_RENAME, ctx,
9700 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9701 			    FSE_ARG_FINFO, &from_finfo,
9702 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9703 			    FSE_ARG_FINFO, &to_finfo,
9704 			    FSE_ARG_DONE);
9705 			if (flags & VFS_RENAME_SWAP) {
9706 				/*
9707 				 * Strictly speaking, swap is the equivalent of
9708 				 * *three* renames.  FSEvents clients should only take
9709 				 * the events as a hint, so we only bother reporting
9710 				 * two.
9711 				 */
9712 				add_fsevent(FSE_RENAME, ctx,
9713 				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9714 				    FSE_ARG_FINFO, &to_finfo,
9715 				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9716 				    FSE_ARG_FINFO, &from_finfo,
9717 				    FSE_ARG_DONE);
9718 			}
9719 		} else {
9720 			add_fsevent(FSE_RENAME, ctx,
9721 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9722 			    FSE_ARG_FINFO, &from_finfo,
9723 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9724 			    FSE_ARG_DONE);
9725 		}
9726 	}
9727 #endif /* CONFIG_FSE */
9728 
9729 	/*
9730 	 * update filesystem's mount point data
9731 	 */
9732 	if (mntrename) {
9733 		char *cp, *pathend, *mpname;
9734 		char * tobuf;
9735 		struct mount *mp;
9736 		int maxlen;
9737 		size_t len = 0;
9738 
9739 		mp = fvp->v_mountedhere;
9740 
9741 		if (vfs_busy(mp, LK_NOWAIT)) {
9742 			error = EBUSY;
9743 			goto out1;
9744 		}
9745 		tobuf = zalloc(ZV_NAMEI);
9746 
9747 		if (UIO_SEG_IS_USER_SPACE(segflg)) {
9748 			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
9749 		} else {
9750 			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
9751 		}
9752 		if (!error) {
9753 			/* find current mount point prefix */
9754 			pathend = &mp->mnt_vfsstat.f_mntonname[0];
9755 			for (cp = pathend; *cp != '\0'; ++cp) {
9756 				if (*cp == '/') {
9757 					pathend = cp + 1;
9758 				}
9759 			}
9760 			/* find last component of target name */
9761 			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
9762 				if (*cp == '/') {
9763 					mpname = cp + 1;
9764 				}
9765 			}
9766 
9767 			/* Update f_mntonname of sub mounts */
9768 			vfs_iterate(0, rename_submounts_callback, (void *)mp);
9769 
9770 			/* append name to prefix */
9771 			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
9772 			bzero(pathend, maxlen);
9773 
9774 			strlcpy(pathend, mpname, maxlen);
9775 		}
9776 		zfree(ZV_NAMEI, tobuf);
9777 
9778 		vfs_unbusy(mp);
9779 
9780 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
9781 	}
9782 	/*
9783 	 * fix up name & parent pointers.  note that we first
9784 	 * check that fvp has the same name/parent pointers it
9785 	 * had before the rename call... this is a 'weak' check
9786 	 * at best...
9787 	 *
9788 	 * XXX oparent and oname may not be set in the compound vnop case
9789 	 */
9790 	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
9791 		int update_flags;
9792 
9793 		update_flags = VNODE_UPDATE_NAME;
9794 
9795 		if (fdvp != tdvp) {
9796 			update_flags |= VNODE_UPDATE_PARENT;
9797 		}
9798 
9799 		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
9800 	}
9801 out1:
9802 	/*
9803 	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
9804 	 * skipped earlier as no actual rename was performed.
9805 	 */
9806 	if (vn_authorize_skipped && error == 0) {
9807 		error = vn_authorize_renamex_with_paths(fdvp, fvp,
9808 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9809 		    flags, NULL);
9810 		if (error && error == ENOENT) {
9811 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9812 				do_retry = 1;
9813 				retry_count += 1;
9814 			}
9815 		}
9816 	}
9817 	if (to_name != NULL) {
9818 		RELEASE_PATH(to_name);
9819 		to_name = NULL;
9820 	}
9821 	if (to_name_no_firmlink != NULL) {
9822 		RELEASE_PATH(to_name_no_firmlink);
9823 		to_name_no_firmlink = NULL;
9824 	}
9825 	if (from_name != NULL) {
9826 		RELEASE_PATH(from_name);
9827 		from_name = NULL;
9828 	}
9829 	if (from_name_no_firmlink != NULL) {
9830 		RELEASE_PATH(from_name_no_firmlink);
9831 		from_name_no_firmlink = NULL;
9832 	}
9833 	if (holding_mntlock) {
9834 		mount_unlock_renames(locked_mp);
9835 		mount_drop(locked_mp, 0);
9836 		holding_mntlock = 0;
9837 	}
9838 	if (tdvp) {
9839 		/*
9840 		 * nameidone has to happen before we vnode_put(tdvp)
9841 		 * since it may need to release the fs_nodelock on the tdvp
9842 		 */
9843 		nameidone(tond);
9844 
9845 		if (tvp) {
9846 			vnode_put(tvp);
9847 		}
9848 		vnode_put(tdvp);
9849 	}
9850 	if (fdvp) {
9851 		/*
9852 		 * nameidone has to happen before we vnode_put(fdvp)
9853 		 * since it may need to release the fs_nodelock on the fdvp
9854 		 */
9855 		nameidone(fromnd);
9856 
9857 		if (fvp) {
9858 			vnode_put(fvp);
9859 		}
9860 		vnode_put(fdvp);
9861 	}
9862 	if (mnt_fvp != NULLVP) {
9863 		vnode_put(mnt_fvp);
9864 	}
9865 	/*
9866 	 * If things changed after we did the namei, then we will re-drive
9867 	 * this rename call from the top.
9868 	 */
9869 	if (do_retry) {
9870 		do_retry = 0;
9871 		goto retry;
9872 	}
9873 
9874 	kfree_type(typeof(*__rename_data), __rename_data);
9875 	return error;
9876 }
9877 
9878 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9879 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9880 {
9881 	return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9882 	           AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9883 }
9884 
9885 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9886 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9887 {
9888 	if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9889 		return EINVAL;
9890 	}
9891 
9892 	if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9893 		return EINVAL;
9894 	}
9895 
9896 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9897 	           uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9898 }
9899 
9900 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9901 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9902 {
9903 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9904 	           uap->tofd, uap->to, UIO_USERSPACE, 0);
9905 }
9906 
9907 /*
9908  * Make a directory file.
9909  *
9910  * Returns:	0			Success
9911  *		EEXIST
9912  *	namei:???
9913  *	vnode_authorize:???
9914  *	vn_create:???
9915  */
9916 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/*
	 * Look up the parent of the new entry.  LOCKPARENT keeps an iocount
	 * on the parent directory (nd.ni_dvp) across the create.
	 */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	/* Let the FS combine the lookup and the mkdir into one compound VNOP. */
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A vnode for the last component means the target already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Release the first lookup's state before re-driving it. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any directory lease on the parent before modifying it. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		/*
		 * EKEEPLOOKING: the compound VNOP needs the lookup continued;
		 * re-drive nameiat() with the state saved in 'nd'.
		 */
		if (error == EKEEPLOOKING) {
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
10032 
10033 /*
10034  * mkdir_extended: Create a directory; with extended security (ACL).
10035  *
10036  * Parameters:    p                       Process requesting to create the directory
10037  *                uap                     User argument descriptor (see below)
10038  *                retval                  (ignored)
10039  *
10040  * Indirect:      uap->path               Path of directory to create
10041  *                uap->mode               Access permissions to set
10042  *                uap->xsecurity          ACL to set
10043  *
10044  * Returns:        0                      Success
10045  *                !0                      Not success
10046  *
10047  */
10048 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)10049 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
10050 {
10051 	int ciferror;
10052 	kauth_filesec_t xsecdst;
10053 	struct vnode_attr va;
10054 
10055 	AUDIT_ARG(owner, uap->uid, uap->gid);
10056 
10057 	xsecdst = NULL;
10058 	if ((uap->xsecurity != USER_ADDR_NULL) &&
10059 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
10060 		return ciferror;
10061 	}
10062 
10063 	VATTR_INIT(&va);
10064 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10065 	if (xsecdst != NULL) {
10066 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
10067 		va.va_vaflags |= VA_FILESEC_ACL;
10068 	}
10069 
10070 	ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10071 	    UIO_USERSPACE);
10072 	if (xsecdst != NULL) {
10073 		kauth_filesec_free(xsecdst);
10074 	}
10075 	return ciferror;
10076 }
10077 
10078 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)10079 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
10080 {
10081 	struct vnode_attr va;
10082 
10083 	VATTR_INIT(&va);
10084 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10085 
10086 	return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10087 	           UIO_USERSPACE);
10088 }
10089 
10090 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)10091 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
10092 {
10093 	struct vnode_attr va;
10094 
10095 	VATTR_INIT(&va);
10096 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10097 
10098 	return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
10099 	           UIO_USERSPACE);
10100 }
10101 
/*
 * Common implementation for rmdir(2)/unlinkat(2) on directories: look up
 * 'dirpath' relative to 'fd' and remove the directory, retrying on races
 * (redriven compound lookups, orphaned AppleDouble files).
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated to keep the large nameidata off the kernel stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char     *path = NULL;
	char     *no_firmlink_path = NULL;
	int       len_path = 0;
	int       len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;
	int batched;

	int restart_flag;
	int nofollow_any = 0;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/* Translate the remove flag into the equivalent namei flag. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		/* Allow the FS to fold lookup+rmdir into one compound VNOP. */
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Only the kernel itself may remove a swap-backing vnode. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete.  Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					/*
					 * ENOENT here may be a racing lookup that hit a
					 * stale name-cache entry; retry a bounded number
					 * of times.
					 */
					if (error == ENOENT) {
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: the FS deferred the lookup to the compound rmdir. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info  finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Batched case: have the FS fill in notify attributes. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			/* Build the full paths only when someone will consume them. */
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Break any directory lease on the parent before modifying it. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued; re-drive it. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * Only do this if the directory is actually
			 * marked as DATALESS.
			 */
			struct vnode_attr *lvap =
			    kalloc_type(struct vnode_attr, Z_WAITOK);

			VATTR_INIT(lvap);
			VATTR_WANTED(lvap, va_flags);
			if (vnode_getattr(vp, lvap, ctx) == 0 &&
			    VATTR_IS_SUPPORTED(lvap, va_flags) &&
			    (lvap->va_flags & SF_DATALESS) != 0) {
				/*
				 * If this fails, we want to keep the original
				 * error.
				 */
				if (vn_remove(dvp, &vp, ndp,
				    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
					error = 0;
				}
			}
			kfree_type(struct vnode_attr, lvap);
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		if (restart_flag == 0) {
			/* Wake any peer sleeping below on the same channel. */
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		/* Briefly sleep before restarting the appleDouble-race loop. */
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10403 
10404 /*
10405  * Remove a directory file.
10406  */
10407 /* ARGSUSED */
10408 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10409 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10410 {
10411 	return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10412 	           CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10413 }
10414 
/*
 * Get direntry length padded to 8 byte alignment.
 *
 * struct direntry embeds a d_name[MAXPATHLEN] array (which includes room
 * for the NUL), so the true record length is the struct size minus the
 * unused portion of the name buffer, rounded up to an 8-byte boundary.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 *
 * struct dirent embeds a d_name[__DARWIN_MAXNAMLEN + 1] array; subtract
 * the unused name space and round up to a 4-byte boundary.  The macro
 * parameter is fully parenthesized (unlike the original `namelen + 1`)
 * so that expression arguments expand safely.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + ((namelen) + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the address of the last byte of this dirent record */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10426 
10427 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10428 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10429     int *numdirent, vfs_context_t ctxp)
10430 {
10431 	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
10432 	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10433 	    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10434 		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10435 	} else {
10436 		size_t bufsize;
10437 		void * bufptr;
10438 		uio_t auio;
10439 		struct direntry *entry64;
10440 		struct dirent *dep;
10441 		size_t bytesread;
10442 		int error;
10443 
10444 		/*
10445 		 * We're here because the underlying file system does not
10446 		 * support direnties or we mounted denying support so we must
10447 		 * fall back to dirents and convert them to direntries.
10448 		 *
10449 		 * Our kernel buffer needs to be smaller since re-packing will
10450 		 * expand each dirent.  The worse case (when the name length
10451 		 * is 3 or less) corresponds to a struct direntry size of 32
10452 		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10453 		 * (4-byte aligned).  So having a buffer that is 3/8 the size
10454 		 * will prevent us from reading more than we can pack.
10455 		 *
10456 		 * Since this buffer is wired memory, we will limit the
10457 		 * buffer size to a maximum of 32K. We would really like to
10458 		 * use 32K in the MIN(), but we use magic number 87371 to
10459 		 * prevent uio_resid() * 3 / 8 from overflowing.
10460 		 */
10461 		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10462 		bufptr = kalloc_data(bufsize, Z_WAITOK);
10463 		if (bufptr == NULL) {
10464 			return ENOMEM;
10465 		}
10466 
10467 		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10468 		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10469 		auio->uio_offset = uio->uio_offset;
10470 
10471 		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10472 
10473 		dep = (struct dirent *)bufptr;
10474 		bytesread = bufsize - uio_resid(auio);
10475 
10476 		entry64 = kalloc_type(struct direntry, Z_WAITOK);
10477 		/*
10478 		 * Convert all the entries and copy them out to user's buffer.
10479 		 */
10480 		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10481 			/* First check that the dirent struct up to d_name is within the buffer */
10482 			if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10483 			    /* Check that the length of the entire dirent is within the buffer */
10484 			    DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10485 			    /* Check that the actual length including the name doesn't exceed d_reclen */
10486 			    DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10487 				printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10488 				    vp->v_mount->mnt_vfsstat.f_mntonname,
10489 				    vp->v_name ? vp->v_name : "<unknown>");
10490 				error = EIO;
10491 				break;
10492 			}
10493 
10494 			size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
10495 
10496 			bzero(entry64, enbufsize);
10497 			/* Convert a dirent to a dirent64. */
10498 			entry64->d_ino = dep->d_ino;
10499 			entry64->d_seekoff = 0;
10500 			entry64->d_reclen = (uint16_t)enbufsize;
10501 			entry64->d_namlen = dep->d_namlen;
10502 			entry64->d_type = dep->d_type;
10503 			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10504 
10505 			/* Move to next entry. */
10506 			dep = (struct dirent *)((char *)dep + dep->d_reclen);
10507 
10508 			/* Copy entry64 to user's buffer. */
10509 			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10510 		}
10511 
10512 		/* Update the real offset using the offset we got from VNOP_READDIR. */
10513 		if (error == 0) {
10514 			uio->uio_offset = auio->uio_offset;
10515 		}
10516 		uio_free(auio);
10517 		kfree_data(bufptr, bufsize);
10518 		kfree_type(struct direntry, entry64);
10519 		return error;
10520 	}
10521 }
10522 
#define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)

/*
 * Read a block of directory entries in a file system independent format.
 *
 * Shared implementation for the getdirentries() and getdirentries64()
 * syscalls.  Reads from the directory open on 'fd' into the user buffer
 * 'bufp'/'bufsize', advancing the file offset under the per-fileglob
 * offset lock.  On success *bytesread is the number of bytes produced,
 * *offset (if non-NULL) the offset the read started at, and *eofflag is
 * set by the file system.  'flags' selects extended (struct direntry)
 * versus legacy output.  Returns 0 or an errno.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	UIO_STACKBUF(uio_buf, 1);

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * The fd's backing vnode can be swapped (e.g. by a union-mount
	 * traversal in a concurrent reader) between the lookup above and
	 * taking the offset lock; if that happened, drop and retry so the
	 * lock and the vnode we operate on agree.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Silently clamp oversized requests rather than failing them. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Build a uio over the user's buffer, starting at the fd offset. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * An empty read from the top layer of a union mount means we have
	 * exhausted it: descend to the covered directory, repoint the fd
	 * at it, and restart the read there.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10641 
10642 
10643 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10644 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10645 {
10646 	off_t offset;
10647 	ssize_t bytesread;
10648 	int error, eofflag;
10649 
10650 	AUDIT_ARG(fd, uap->fd);
10651 	error = getdirentries_common(uap->fd, uap->buf, uap->count,
10652 	    &bytesread, &offset, &eofflag, 0);
10653 
10654 	if (error == 0) {
10655 		if (proc_is64bit(p)) {
10656 			user64_long_t base = (user64_long_t)offset;
10657 			error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10658 		} else {
10659 			user32_long_t base = (user32_long_t)offset;
10660 			error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10661 		}
10662 		*retval = (int)bytesread;
10663 	}
10664 	return error;
10665 }
10666 
10667 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10668 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10669 {
10670 	off_t offset;
10671 	ssize_t bytesread;
10672 	int error, eofflag;
10673 	user_size_t bufsize;
10674 
10675 	AUDIT_ARG(fd, uap->fd);
10676 
10677 	/*
10678 	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10679 	 * then the kernel carves out the last 4 bytes to return extended
10680 	 * information to userspace (namely whether we reached EOF with this call).
10681 	 */
10682 	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10683 		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10684 	} else {
10685 		bufsize = uap->bufsize;
10686 	}
10687 
10688 	error = getdirentries_common(uap->fd, uap->buf, bufsize,
10689 	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10690 
10691 	if (error == 0) {
10692 		*retval = bytesread;
10693 		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10694 
10695 		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10696 			getdirentries64_flags_t flags = 0;
10697 			if (eofflag) {
10698 				flags |= GETDIRENTRIES64_EOF;
10699 			}
10700 			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10701 			    sizeof(flags));
10702 		}
10703 	}
10704 	return error;
10705 }
10706 
10707 
10708 /*
10709  * Set the mode mask for creation of filesystem nodes.
10710  * XXX implement xsecurity
10711  */
10712 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
10713 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10714 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10715 {
10716 	AUDIT_ARG(mask, newmask);
10717 	proc_fdlock(p);
10718 	*retval = p->p_fd.fd_cmask;
10719 	p->p_fd.fd_cmask = newmask & ALLPERMS;
10720 	proc_fdunlock(p);
10721 	return 0;
10722 }
10723 
10724 /*
10725  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10726  *
10727  * Parameters:    p                       Process requesting to set the umask
10728  *                uap                     User argument descriptor (see below)
10729  *                retval                  umask of the process (parameter p)
10730  *
10731  * Indirect:      uap->newmask            umask to set
10732  *                uap->xsecurity          ACL to set
10733  *
10734  * Returns:        0                      Success
10735  *                !0                      Not success
10736  *
10737  */
10738 int
umask_extended(proc_t p,struct umask_extended_args * uap,int32_t * retval)10739 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
10740 {
10741 	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
10742 }
10743 
/*
 * umask() syscall: set the process file-creation mask, returning the
 * previous mask via *retval.  Leaves any existing extended security
 * (xsecurity) state alone.
 */
int
umask(proc_t p, struct umask_args *uap, int32_t *retval)
{
	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
}
10749 
#define REVOKE_MOUNTED_DEVICE_ENTITLEMENT                               \
	"com.apple.private.vfs.revoke-mounted-device"

/*
 * Void all references to file by ripping underlying filesystem
 * away from vnode.
 *
 * Only block/character special files are eligible (ENOTSUP otherwise),
 * and a block device with a mounted file system on it is refused with
 * EBUSY.  The caller must own the node or be superuser.
 */
/* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* revoke(2) here only applies to device special files. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to yank a block device out from under a mounted fs. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Permission check: caller must be the owner or superuser. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only bother revoking if someone actually holds the node open. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10810 
10811 
10812 /*
 *  HFS/HFS Plus SPECIFIC SYSTEM CALLS
10814  *  The following system calls are designed to support features
10815  *  which are specific to the HFS & HFS Plus volume formats
10816  */
10817 
10818 
10819 /*
10820  * Obtain attribute information on objects in a directory while enumerating
10821  * the directory.
10822  */
10823 /* ARGSUSED */
10824 int
getdirentriesattr(proc_t p,struct getdirentriesattr_args * uap,int32_t * retval)10825 getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
10826 {
10827 	vnode_t vp;
10828 	struct fileproc *fp;
10829 	uio_t auio = NULL;
10830 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10831 	uint32_t count = 0, savecount = 0;
10832 	uint32_t newstate = 0;
10833 	int error, eofflag = 0;
10834 	off_t loff = 0;
10835 	struct attrlist attributelist;
10836 	vfs_context_t ctx = vfs_context_current();
10837 	int fd = uap->fd;
10838 	UIO_STACKBUF(uio_buf, 1);
10839 	kauth_action_t action;
10840 
10841 	AUDIT_ARG(fd, fd);
10842 
10843 	/* Get the attributes into kernel space */
10844 	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
10845 		return error;
10846 	}
10847 	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
10848 		return error;
10849 	}
10850 	savecount = count;
10851 
10852 get_from_fd:
10853 	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
10854 		return error;
10855 	}
10856 
10857 	vn_offset_lock(fp->fp_glob);
10858 	if (((vnode_t)fp_get_data(fp)) != vp) {
10859 		vn_offset_unlock(fp->fp_glob);
10860 		file_drop(fd);
10861 		goto get_from_fd;
10862 	}
10863 
10864 	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
10865 		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
10866 		error = EBADF;
10867 		goto out;
10868 	}
10869 
10870 
10871 #if CONFIG_MACF
10872 	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
10873 	    fp->fp_glob);
10874 	if (error) {
10875 		goto out;
10876 	}
10877 #endif
10878 
10879 
10880 	if ((error = vnode_getwithref(vp))) {
10881 		goto out;
10882 	}
10883 
10884 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
10885 
10886 #if CONFIG_UNION_MOUNTS
10887 unionread:
10888 #endif /* CONFIG_UNION_MOUNTS */
10889 	if (vp->v_type != VDIR) {
10890 		(void)vnode_put(vp);
10891 		error = EINVAL;
10892 		goto out;
10893 	}
10894 
10895 #if CONFIG_MACF
10896 	error = mac_vnode_check_readdir(ctx, vp);
10897 	if (error != 0) {
10898 		(void)vnode_put(vp);
10899 		goto out;
10900 	}
10901 #endif /* MAC */
10902 
10903 	/* set up the uio structure which will contain the users return buffer */
10904 	loff = fp->fp_glob->fg_offset;
10905 	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
10906 	uio_addiov(auio, uap->buffer, uap->buffersize);
10907 
10908 	/*
10909 	 * If the only item requested is file names, we can let that past with
10910 	 * just LIST_DIRECTORY.  If they want any other attributes, that means
10911 	 * they need SEARCH as well.
10912 	 */
10913 	action = KAUTH_VNODE_LIST_DIRECTORY;
10914 	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
10915 	    attributelist.fileattr || attributelist.dirattr) {
10916 		action |= KAUTH_VNODE_SEARCH;
10917 	}
10918 
10919 	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
10920 		/* Believe it or not, uap->options only has 32-bits of valid
10921 		 * info, so truncate before extending again */
10922 
10923 		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
10924 		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
10925 	}
10926 
10927 	if (error) {
10928 		(void) vnode_put(vp);
10929 		goto out;
10930 	}
10931 
10932 #if CONFIG_UNION_MOUNTS
10933 	/*
10934 	 * If we've got the last entry of a directory in a union mount
10935 	 * then reset the eofflag and pretend there's still more to come.
10936 	 * The next call will again set eofflag and the buffer will be empty,
10937 	 * so traverse to the underlying directory and do the directory
10938 	 * read there.
10939 	 */
10940 	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
10941 		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
10942 			eofflag = 0;
10943 		} else {                                                // Empty buffer
10944 			vnode_t uvp;
10945 			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
10946 				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
10947 					fp_set_data(fp, uvp);
10948 					fp->fp_glob->fg_offset = 0; // reset index for new dir
10949 					count = savecount;
10950 					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
10951 					vnode_put(vp);
10952 					vp = uvp;
10953 					goto unionread;
10954 				} else {
10955 					/* could not get a ref, can't replace in fd */
10956 					vnode_put(uvp);
10957 				}
10958 			}
10959 		}
10960 	}
10961 #endif /* CONFIG_UNION_MOUNTS */
10962 
10963 	(void)vnode_put(vp);
10964 
10965 	if (error) {
10966 		goto out;
10967 	}
10968 	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
10969 
10970 	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
10971 		goto out;
10972 	}
10973 	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
10974 		goto out;
10975 	}
10976 	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
10977 		goto out;
10978 	}
10979 
10980 	*retval = eofflag;  /* similar to getdirentries */
10981 	error = 0;
10982 out:
10983 	vn_offset_unlock(fp->fp_glob);
10984 	file_drop(fd);
10985 	return error; /* return error earlier, an retval of 0 or 1 now */
10986 } /* end of getdirentriesattr system call */
10987 
10988 /*
10989  * Exchange data between two files
10990  */
10991 
10992 /* ARGSUSED */
10993 int
exchangedata(__unused proc_t p,struct exchangedata_args * uap,__unused int32_t * retval)10994 exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
10995 {
10996 	struct nameidata fnd, snd;
10997 	vfs_context_t ctx = vfs_context_current();
10998 	vnode_t fvp;
10999 	vnode_t svp;
11000 	int error;
11001 	u_int32_t nameiflags;
11002 	char *fpath = NULL;
11003 	char *spath = NULL;
11004 	int   flen = 0, slen = 0;
11005 	int from_truncated = 0, to_truncated = 0;
11006 #if CONFIG_FSE
11007 	fse_info f_finfo, s_finfo;
11008 #endif
11009 
11010 	nameiflags = 0;
11011 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11012 		nameiflags |= FOLLOW;
11013 	}
11014 
11015 	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
11016 	    UIO_USERSPACE, uap->path1, ctx);
11017 
11018 	error = namei(&fnd);
11019 	if (error) {
11020 		goto out2;
11021 	}
11022 
11023 	nameidone(&fnd);
11024 	fvp = fnd.ni_vp;
11025 
11026 	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
11027 	    UIO_USERSPACE, uap->path2, ctx);
11028 
11029 	error = namei(&snd);
11030 	if (error) {
11031 		vnode_put(fvp);
11032 		goto out2;
11033 	}
11034 	nameidone(&snd);
11035 	svp = snd.ni_vp;
11036 
11037 	/*
11038 	 * if the files are the same, return an inval error
11039 	 */
11040 	if (svp == fvp) {
11041 		error = EINVAL;
11042 		goto out;
11043 	}
11044 
11045 	/*
11046 	 * if the files are on different volumes, return an error
11047 	 */
11048 	if (svp->v_mount != fvp->v_mount) {
11049 		error = EXDEV;
11050 		goto out;
11051 	}
11052 
11053 	/* If they're not files, return an error */
11054 	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
11055 		error = EINVAL;
11056 		goto out;
11057 	}
11058 
11059 #if CONFIG_MACF
11060 	error = mac_vnode_check_exchangedata(ctx,
11061 	    fvp, svp);
11062 	if (error) {
11063 		goto out;
11064 	}
11065 #endif
11066 	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
11067 	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
11068 		goto out;
11069 	}
11070 
11071 	if (
11072 #if CONFIG_FSE
11073 		need_fsevent(FSE_EXCHANGE, fvp) ||
11074 #endif
11075 		kauth_authorize_fileop_has_listeners()) {
11076 		GET_PATH(fpath);
11077 		GET_PATH(spath);
11078 
11079 		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
11080 		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
11081 
11082 #if CONFIG_FSE
11083 		get_fse_info(fvp, &f_finfo, ctx);
11084 		get_fse_info(svp, &s_finfo, ctx);
11085 		if (from_truncated || to_truncated) {
11086 			// set it here since only the f_finfo gets reported up to user space
11087 			f_finfo.mode |= FSE_TRUNCATED_PATH;
11088 		}
11089 #endif
11090 	}
11091 	/* Ok, make the call */
11092 	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
11093 
11094 	if (error == 0) {
11095 		const char *tmpname;
11096 
11097 		if (fpath != NULL && spath != NULL) {
11098 			/* call out to allow 3rd party notification of exchangedata.
11099 			 * Ignore result of kauth_authorize_fileop call.
11100 			 */
11101 			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
11102 			    (uintptr_t)fpath, (uintptr_t)spath);
11103 		}
11104 		name_cache_lock();
11105 
11106 		tmpname     = fvp->v_name;
11107 		fvp->v_name = svp->v_name;
11108 		svp->v_name = tmpname;
11109 
11110 		if (fvp->v_parent != svp->v_parent) {
11111 			vnode_t tmp;
11112 
11113 			tmp           = fvp->v_parent;
11114 			fvp->v_parent = svp->v_parent;
11115 			svp->v_parent = tmp;
11116 		}
11117 		name_cache_unlock();
11118 
11119 #if CONFIG_FSE
11120 		if (fpath != NULL && spath != NULL) {
11121 			add_fsevent(FSE_EXCHANGE, ctx,
11122 			    FSE_ARG_STRING, flen, fpath,
11123 			    FSE_ARG_FINFO, &f_finfo,
11124 			    FSE_ARG_STRING, slen, spath,
11125 			    FSE_ARG_FINFO, &s_finfo,
11126 			    FSE_ARG_DONE);
11127 		}
11128 #endif
11129 	}
11130 
11131 out:
11132 	if (fpath != NULL) {
11133 		RELEASE_PATH(fpath);
11134 	}
11135 	if (spath != NULL) {
11136 		RELEASE_PATH(spath);
11137 	}
11138 	vnode_put(svp);
11139 	vnode_put(fvp);
11140 out2:
11141 	return error;
11142 }
11143 
11144 /*
11145  * Return (in MB) the amount of freespace on the given vnode's volume.
11146  */
11147 uint32_t freespace_mb(vnode_t vp);
11148 
11149 uint32_t
freespace_mb(vnode_t vp)11150 freespace_mb(vnode_t vp)
11151 {
11152 	vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
11153 	return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
11154 	       vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
11155 }
11156 
11157 #if CONFIG_SEARCHFS
11158 
11159 /* ARGSUSED */
11160 
11161 int
searchfs(proc_t p,struct searchfs_args * uap,__unused int32_t * retval)11162 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
11163 {
11164 	vnode_t vp, tvp;
11165 	int i, error = 0;
11166 	int fserror = 0;
11167 	struct nameidata nd;
11168 	struct user64_fssearchblock searchblock;
11169 	struct searchstate *state;
11170 	struct attrlist *returnattrs;
11171 	struct timeval timelimit;
11172 	void *searchparams1, *searchparams2;
11173 	uio_t auio = NULL;
11174 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11175 	uint32_t nummatches;
11176 	size_t mallocsize;
11177 	uint32_t nameiflags;
11178 	vfs_context_t ctx = vfs_context_current();
11179 	UIO_STACKBUF(uio_buf, 1);
11180 
11181 	/* Start by copying in fsearchblock parameter list */
11182 	if (IS_64BIT_PROCESS(p)) {
11183 		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
11184 		timelimit.tv_sec = searchblock.timelimit.tv_sec;
11185 		timelimit.tv_usec = searchblock.timelimit.tv_usec;
11186 	} else {
11187 		struct user32_fssearchblock tmp_searchblock;
11188 
11189 		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
11190 		// munge into 64-bit version
11191 		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
11192 		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
11193 		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
11194 		searchblock.maxmatches = tmp_searchblock.maxmatches;
11195 		/*
11196 		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
11197 		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
11198 		 */
11199 		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
11200 		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
11201 		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
11202 		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
11203 		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
11204 		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
11205 		searchblock.searchattrs = tmp_searchblock.searchattrs;
11206 	}
11207 	if (error) {
11208 		return error;
11209 	}
11210 
11211 	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
11212 	 */
11213 	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
11214 	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
11215 		return EINVAL;
11216 	}
11217 
11218 	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
11219 	/* It all has to do into local memory and it's not that big so we might as well  put it all together. */
11220 	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
11221 	/* block.                                                                                             */
11222 	/*												      */
11223 	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
11224 	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
11225 	/*       assumes the size is still 556 bytes it will continue to work				      */
11226 
11227 	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
11228 	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
11229 
11230 	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);
11231 
11232 	/* Now set up the various pointers to the correct place in our newly allocated memory */
11233 
11234 	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
11235 	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
11236 	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
11237 
11238 	/* Now copy in the stuff given our local variables. */
11239 
11240 	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
11241 		goto freeandexit;
11242 	}
11243 
11244 	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
11245 		goto freeandexit;
11246 	}
11247 
11248 	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
11249 		goto freeandexit;
11250 	}
11251 
11252 	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
11253 		goto freeandexit;
11254 	}
11255 
11256 	/*
11257 	 * When searching a union mount, need to set the
11258 	 * start flag at the first call on each layer to
11259 	 * reset state for the new volume.
11260 	 */
11261 	if (uap->options & SRCHFS_START) {
11262 		state->ss_union_layer = 0;
11263 	} else {
11264 		uap->options |= state->ss_union_flags;
11265 	}
11266 	state->ss_union_flags = 0;
11267 
11268 	/*
11269 	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
11270 	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
11271 	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
11272 	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
11273 	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
11274 	 */
11275 
11276 	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
11277 		attrreference_t* string_ref;
11278 		u_int32_t* start_length;
11279 		user64_size_t param_length;
11280 
11281 		/* validate searchparams1 */
11282 		param_length = searchblock.sizeofsearchparams1;
11283 		/* skip the word that specifies length of the buffer */
11284 		start_length = (u_int32_t*) searchparams1;
11285 		start_length = start_length + 1;
11286 		string_ref = (attrreference_t*) start_length;
11287 
11288 		/* ensure no negative offsets or too big offsets */
11289 		if (string_ref->attr_dataoffset < 0) {
11290 			error = EINVAL;
11291 			goto freeandexit;
11292 		}
11293 		if (string_ref->attr_length > MAXPATHLEN) {
11294 			error = EINVAL;
11295 			goto freeandexit;
11296 		}
11297 
11298 		/* Check for pointer overflow in the string ref */
11299 		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
11300 			error = EINVAL;
11301 			goto freeandexit;
11302 		}
11303 
11304 		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
11305 			error = EINVAL;
11306 			goto freeandexit;
11307 		}
11308 		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
11309 			error = EINVAL;
11310 			goto freeandexit;
11311 		}
11312 	}
11313 
11314 	/* set up the uio structure which will contain the users return buffer */
11315 	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
11316 	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
11317 
11318 	nameiflags = 0;
11319 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11320 		nameiflags |= FOLLOW;
11321 	}
11322 	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
11323 	    UIO_USERSPACE, uap->path, ctx);
11324 
11325 	error = namei(&nd);
11326 	if (error) {
11327 		goto freeandexit;
11328 	}
11329 	vp = nd.ni_vp;
11330 	nameidone(&nd);
11331 
11332 	/*
11333 	 * Switch to the root vnode for the volume
11334 	 */
11335 	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
11336 	vnode_put(vp);
11337 	if (error) {
11338 		goto freeandexit;
11339 	}
11340 	vp = tvp;
11341 
11342 #if CONFIG_UNION_MOUNTS
11343 	/*
11344 	 * If it's a union mount, the path lookup takes
11345 	 * us to the top layer. But we may need to descend
11346 	 * to a lower layer. For non-union mounts the layer
11347 	 * is always zero.
11348 	 */
11349 	for (i = 0; i < (int) state->ss_union_layer; i++) {
11350 		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
11351 			break;
11352 		}
11353 		tvp = vp;
11354 		vp = vp->v_mount->mnt_vnodecovered;
11355 		if (vp == NULL) {
11356 			vnode_put(tvp);
11357 			error = ENOENT;
11358 			goto freeandexit;
11359 		}
11360 		error = vnode_getwithref(vp);
11361 		vnode_put(tvp);
11362 		if (error) {
11363 			goto freeandexit;
11364 		}
11365 	}
11366 #endif /* CONFIG_UNION_MOUNTS */
11367 
11368 #if CONFIG_MACF
11369 	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
11370 	if (error) {
11371 		vnode_put(vp);
11372 		goto freeandexit;
11373 	}
11374 #endif
11375 
11376 
11377 	/*
11378 	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
11380 	 */
11381 	if (searchblock.maxmatches == 0) {
11382 		nummatches = 0;
11383 		goto saveandexit;
11384 	}
11385 
11386 	/*
	 * All right, we have everything we need, so let's make that call.
11388 	 *
11389 	 * We keep special track of the return value from the file system:
11390 	 * EAGAIN is an acceptable error condition that shouldn't keep us
11391 	 * from copying out any results...
11392 	 */
11393 
11394 	fserror = VNOP_SEARCHFS(vp,
11395 	    searchparams1,
11396 	    searchparams2,
11397 	    &searchblock.searchattrs,
11398 	    (uint32_t)searchblock.maxmatches,
11399 	    &timelimit,
11400 	    returnattrs,
11401 	    &nummatches,
11402 	    (uint32_t)uap->scriptcode,
11403 	    (uint32_t)uap->options,
11404 	    auio,
11405 	    (struct searchstate *) &state->ss_fsstate,
11406 	    ctx);
11407 
11408 #if CONFIG_UNION_MOUNTS
11409 	/*
11410 	 * If it's a union mount we need to be called again
11411 	 * to search the mounted-on filesystem.
11412 	 */
11413 	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
11414 		state->ss_union_flags = SRCHFS_START;
11415 		state->ss_union_layer++;        // search next layer down
11416 		fserror = EAGAIN;
11417 	}
11418 #endif /* CONFIG_UNION_MOUNTS */
11419 
11420 saveandexit:
11421 
11422 	vnode_put(vp);
11423 
11424 	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 *  search state.  Everything was already put into the return buffer by the vop call. */
11426 
11427 	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
11428 		goto freeandexit;
11429 	}
11430 
11431 	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
11432 		goto freeandexit;
11433 	}
11434 
11435 	error = fserror;
11436 
11437 freeandexit:
11438 
11439 	kfree_data(searchparams1, mallocsize);
11440 
11441 	return error;
11442 } /* end of searchfs system call */
11443 
11444 #else /* CONFIG_SEARCHFS */
11445 
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	/* searchfs(2) is not compiled into this kernel configuration. */
	return ENOTSUP;
}
11451 
11452 #endif /* CONFIG_SEARCHFS */
11453 
11454 
11455 #if CONFIG_DATALESS_FILES
11456 
11457 /*
11458  * === Namespace Resolver Up-call Mechanism ===
11459  *
11460  * When I/O is performed to a dataless file or directory (read, write,
11461  * lookup-in, etc.), the file system performs an upcall to the namespace
11462  * resolver (filecoordinationd) to materialize the object.
11463  *
11464  * We need multiple up-calls to be in flight at once, and we need these
11465  * up-calls to be interruptible, thus the following implementation:
11466  *
11467  * => The nspace_resolver_request represents the in-kernel request state.
11468  *    It contains a request ID, storage space for the errno code returned
11469  *    by filecoordinationd, and flags.
11470  *
11471  * => The request ID is simply a global monotonically incrementing 32-bit
11472  *    number.  Outstanding requests are stored in a hash table, and the
11473  *    hash function is extremely simple.
11474  *
11475  * => When an upcall is to be made to filecoordinationd, a request structure
11476  *    is allocated on the stack (it is small, and needs to live only during
11477  *    the duration of the call to resolve_nspace_item_ext()).  It is
11478  *    initialized and inserted into the table.  Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11480  *    can be inserted into the table (and thus limiting the number of
11481  *    outstanding requests issued to filecoordinationd); waiting for an
11482  *    available slot is interruptible.
11483  *
11484  * => Once the request has been inserted into the table, the up-call is made
11485  *    to filecoordinationd via a MiG-generated stub.  The up-call returns
11486  *    immediately and filecoordinationd processes the request asynchronously.
11487  *
 * => The caller now waits for the request to complete.  This is achieved by
11489  *    sleeping on the address of the request structure and waiting for
11490  *    filecoordinationd to mark the request structure as complete.  This
11491  *    is an interruptible sleep call; if interrupted, the request structure
11492  *    is removed from the table and EINTR is returned to the caller.  If
11493  *    this occurs, an advisory up-call is made to filecoordinationd with
11494  *    the request ID to indicate that the request can be aborted or
11495  *    de-prioritized at the discretion of filecoordinationd.
11496  *
11497  * => When filecoordinationd has completed the request, it signals completion
11498  *    by writing to the vfs.nspace.complete sysctl node.  Only a process
11499  *    decorated as a namespace resolver can write to this sysctl node.  The
11500  *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11501  *    The request ID is looked up in the table, and if the request is found,
11502  *    the error code is stored in the request structure and a wakeup()
11503  *    issued on the address of the request structure.  If the request is not
11504  *    found, we simply drop the completion notification, assuming that the
11505  *    caller was interrupted.
11506  *
11507  * => When the waiting thread wakes up, it extracts the error code from the
11508  *    request structure, removes the request from the table, and returns the
11509  *    error code to the calling function.  Fini!
11510  */
11511 
/*
 * In-kernel state for one outstanding up-call to the namespace
 * resolver (filecoordinationd).  Lives on the requesting thread's
 * stack for the duration of the call; see the overview comment above.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* entry in the request hash table */
	vnode_t         r_vp;           /* vnode the request was issued for */
	vnode_t         r_tdvp;         /* destination directory vnode, if any */
	uint32_t        r_req_id;       /* ID matched against the resolver's completion */
	int             r_resolver_error; /* errno reported back by the resolver */
	int             r_flags;        /* RRF_* flags below */
};
11520 
11521 #define RRF_COMPLETE    0x0001
11522 #define RRF_COMPLETING  0x0002
11523 
/*
 * Completion tuple delivered by the resolver through the
 * vfs.nspace.complete sysctl (see sysctl_nspace_complete()).
 */
struct nspace_resolver_completion_data {
	uint32_t req_id;         /* ID of the request being completed */
	int32_t  resolver_error; /* errno result (0 on success) */
	uint64_t orig_gencount;  /* optional criterion: expected recursive gencount (0 = unused) */
	uint64_t orig_syncroot;  /* optional criterion: expected sync-root ID (0 = unused) */
};
11530 
11531 static uint32_t
next_nspace_req_id(void)11532 next_nspace_req_id(void)
11533 {
11534 	static uint32_t next_req_id;
11535 
11536 	return OSAddAtomic(1, &next_req_id);
11537 }
11538 
11539 #define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
11540 #define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */
11541 
11542 static LIST_HEAD(nspace_resolver_requesthead,
11543     nspace_resolver_request) * nspace_resolver_request_hashtbl;
11544 static u_long nspace_resolver_request_hashmask;
11545 static u_int nspace_resolver_request_count;
11546 static bool nspace_resolver_request_wait_slot;
11547 static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
11548 static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
11549     &nspace_resolver_request_lck_grp);
11550 
11551 #define NSPACE_REQ_LOCK() \
11552 	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
11553 #define NSPACE_REQ_UNLOCK() \
11554 	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)
11555 
11556 #define NSPACE_RESOLVER_HASH(req_id)    \
11557 	(&nspace_resolver_request_hashtbl[(req_id) & \
11558 	 nspace_resolver_request_hashmask])
11559 
11560 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id,bool skip_completing)11561 nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11562 {
11563 	struct nspace_resolver_requesthead *bucket;
11564 	struct nspace_resolver_request *req;
11565 
11566 	bucket = NSPACE_RESOLVER_HASH(req_id);
11567 	LIST_FOREACH(req, bucket, r_hashlink) {
11568 		if (req->r_req_id == req_id) {
11569 			/*
11570 			 * If this request already has a completion
11571 			 * pending, don't return it again.
11572 			 */
11573 			if ((req->r_flags & RRF_COMPLETING) != 0 &&
11574 			    skip_completing) {
11575 				req = NULL;
11576 			}
11577 			return req;
11578 		}
11579 	}
11580 
11581 	return NULL;
11582 }
11583 
/*
 * Insert a new resolver request into the hash table.  Applies
 * backpressure: when NSPACE_RESOLVER_MAX_OUTSTANDING requests are
 * already outstanding, sleeps (interruptibly) until a slot frees up.
 * Returns 0 on success or the msleep() error (e.g. EINTR).
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	NSPACE_REQ_LOCK();

	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		/* Table full; ask removers to wake us when a slot opens. */
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			/* Interrupted while waiting for a slot. */
			NSPACE_REQ_UNLOCK();
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	NSPACE_REQ_UNLOCK();

	return 0;
}
11615 
/*
 * Wait for any in-flight completion on 'req' to finish.
 * Called with NSPACE_REQ_LOCK held; msleep() drops and re-takes it.
 */
static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
{
	/*
	 * If a completion is in-progress, we have to wait for the
	 * completion handler to finish because it's still using 'req',
	 * which is allocated on our stack a couple of frames up.
	 */
	while ((req->r_flags & RRF_COMPLETING) != 0) {
		(void) msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS, "nspacecmplt", NULL);
	}
}
11629 
/*
 * Unlink 'req' from the hash table, wake any thread waiting for a
 * table slot, wait for any in-flight completion to stop using 'req'
 * (it lives on a caller's stack), and drop the lock.
 * Called with NSPACE_REQ_LOCK held; returns with it released.
 */
static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;

	/* We're called with NSPACE_REQ_LOCK held. */

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert((req->r_flags & RRF_COMPLETING) == 0);
	assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
#endif /* DIAGNOSTIC */
	LIST_REMOVE(req, r_hashlink);
	nspace_resolver_request_count--;

	if (nspace_resolver_request_wait_slot) {
		/* Someone is waiting in nspace_resolver_req_add(). */
		nspace_resolver_request_wait_slot = false;
		wakeup(&nspace_resolver_request_count);
	}

	nspace_resolver_req_wait_pending_completion(req);

	NSPACE_REQ_UNLOCK();
}
11654 
/*
 * Remove 'req' from the request table.  Acquires NSPACE_REQ_LOCK;
 * the lock is released by nspace_resolver_req_remove_and_unlock().
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
11661 
/*
 * Send an advisory cancellation for 'req_id' to the namespace
 * resolver (filecoordinationd) over its Mach port.  Best-effort
 * only; all failures are ignored or merely logged.
 */
static void
nspace_resolver_req_cancel(uint32_t req_id)
{
	kern_return_t kr;
	mach_port_t mp;

	// Failures here aren't fatal -- the cancellation message
	// sent to the resolver is merely advisory.

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		return;
	}

	kr = send_nspace_resolve_cancel(mp, req_id);
	if (kr != KERN_SUCCESS) {
		os_log_error(OS_LOG_DEFAULT,
		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
	}

	/* Drop the send right obtained above. */
	ipc_port_release_send(mp);
}
11684 
/*
 * Wait (interruptibly) for the resolver to complete 'req'.  If the
 * sleep is interrupted, record EINTR (or ETIMEDOUT for other errors)
 * as the result and send an advisory cancel to the resolver.
 * Returns the request's resolver error code.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted: synthesize our own result code. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		/* Advisory only; failures are ignored. */
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11717 
11718 static void
nspace_resolver_req_mark_complete(struct nspace_resolver_request * req,int resolver_error)11719 nspace_resolver_req_mark_complete(
11720 	struct nspace_resolver_request *req,
11721 	int resolver_error)
11722 {
11723 	req->r_resolver_error = resolver_error;
11724 	req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
11725 	wakeup(req);
11726 }
11727 
/*
 * Mark 'req' as having a completion in progress so lookups with
 * skip_completing won't return it and teardown waits for the
 * completion handler to finish.  Called with NSPACE_REQ_LOCK held.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
11733 
/*
 * Handle a completion notification from the resolver for request
 * 'c->req_id'.  If the resolver supplied namespace-shape criteria
 * (orig_gencount / orig_syncroot), verify them under the mount
 * rename lock before letting the original operation proceed; a
 * mismatch completes the request with EBUSY.  Unknown or
 * already-completing request IDs are silently dropped.
 */
static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
{
	struct nspace_resolver_request *req;
	int error;
	struct vnode_attr va;
	vnode_t vp;

	NSPACE_REQ_LOCK();

	req = nspace_resolver_req_lookup(c->req_id, true);
	if (req == NULL) {
		/*
		 * If we don't find the request corresponding to our req_id,
		 * just drop the completion on the floor; it's likely that
		 * the requester interrupted with a signal, or it may already
		 * be completing.
		 */
		NSPACE_REQ_UNLOCK();
		return;
	}

	/*
	 * Get out now if the resolver reported an error.
	 */
	if ((error = c->resolver_error) != 0) {
		goto out;
	}

	/*
	 * If the resolver did not specify any namespace shape criteria
	 * for letting the operation proceed, then get out now.
	 */
	if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
		goto out;
	}

	/*
	 * We're going to have to acquire the mount rename lock and do
	 * some I/O in order to verify the criteria.  Mark the request
	 * as pending so no one else messes with it after we drop the
	 * NSPACE_REQ_LOCK.
	 */
	nspace_resolver_req_mark_completion_pending(req);
	NSPACE_REQ_UNLOCK();

	/*
	 * Lock out renames from changing the shape of the tree while
	 * we validate the criteria.
	 */
	mount_t locked_mp = req->r_vp->v_mount;
	mount_ref(locked_mp, 0);
	mount_lock_renames(locked_mp);

	if (c->orig_gencount != 0) {
		vp = req->r_vp;
		/* NOTE(review): 'error' is always 0 here; this check looks vestigial. */
		if (error) {
			goto out_dropmount;
		}

		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_recursive_gencount);
		error = vnode_getattr(vp, &va, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		/* The directory tree must not have changed since the request was made. */
		if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
		    va.va_recursive_gencount != c->orig_gencount) {
			printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
			    c->orig_gencount, va.va_recursive_gencount);
			error = EBUSY;
			goto out_dropmount;
		}
	}

	/*
	 * Ignore orig_syncroot if a destination directory wasn't specified
	 * in the request.
	 */
	if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
		uint64_t syncroot_id;

		/* NOTE(review): 'error' is always 0 here; this check looks vestigial. */
		if (error) {
			goto out_dropmount;
		}

#ifndef APFSIOC_GET_SYNC_ROOT
#define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
#endif

		error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
		    (caddr_t)&syncroot_id, 0, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		/* The destination must still be under the same sync root. */
		if (syncroot_id != c->orig_syncroot) {
			printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
			    c->orig_syncroot, syncroot_id);
			error = EBUSY;
			goto out_dropmount;
		}
	}

out_dropmount:
	mount_unlock_renames(locked_mp);
	mount_drop(locked_mp, 0);
	NSPACE_REQ_LOCK();

out:
	/* Record the result, clear COMPLETING, and wake the waiter. */
	nspace_resolver_req_mark_complete(req, error);
	NSPACE_REQ_UNLOCK();
}
11846 
11847 static struct proc *nspace_resolver_proc;
11848 
11849 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)11850 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11851 {
11852 	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11853 	    p == nspace_resolver_proc) ? 1 : 0;
11854 	return 0;
11855 }
11856 
11857 static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
11858 
/*
 * Decorate (is_resolver != 0) or un-decorate (is_resolver == 0) 'p'
 * as the system namespace resolver.  The caller must be root and
 * hold the dataless-resolver entitlement (EPERM otherwise); only one
 * resolver may be registered at a time (EBUSY otherwise).
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			/* A resolver is already registered. */
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11898 
11899 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11900 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11901 {
11902 	if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11903 	    (p->p_vfs_iopolicy &
11904 	    P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11905 		*is_prevented = 1;
11906 	} else {
11907 		*is_prevented = 0;
11908 	}
11909 	return 0;
11910 }
11911 
11912 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)11913 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
11914 {
11915 	if (p->p_lflag & P_LNSPACE_RESOLVER) {
11916 		return is_prevented ? 0 : EBUSY;
11917 	}
11918 
11919 	if (is_prevented) {
11920 		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
11921 	} else {
11922 		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
11923 	}
11924 	return 0;
11925 }
11926 
11927 static int
nspace_materialization_get_thread_state(int * is_prevented)11928 nspace_materialization_get_thread_state(int *is_prevented)
11929 {
11930 	uthread_t ut = current_uthread();
11931 
11932 	*is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11933 	return 0;
11934 }
11935 
11936 static int
nspace_materialization_set_thread_state(int is_prevented)11937 nspace_materialization_set_thread_state(int is_prevented)
11938 {
11939 	uthread_t ut = current_uthread();
11940 
11941 	if (is_prevented) {
11942 		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11943 	} else {
11944 		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11945 	}
11946 	return 0;
11947 }
11948 
11949 /* the vfs.nspace branch */
11950 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11951 
11952 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11953 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11954     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11955 {
11956 	struct proc *p = req->p;
11957 	int new_value, old_value, changed = 0;
11958 	int error;
11959 
11960 	error = nspace_resolver_get_proc_state(p, &old_value);
11961 	if (error) {
11962 		return error;
11963 	}
11964 
11965 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11966 	    &changed);
11967 	if (error == 0 && changed) {
11968 		error = nspace_resolver_set_proc_state(p, new_value);
11969 	}
11970 	return error;
11971 }
11972 
11973 /* decorate this process as the dataless file resolver */
11974 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
11975     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11976     0, 0, sysctl_nspace_resolver, "I", "");
11977 
11978 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11979 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11980     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11981 {
11982 	struct proc *p = req->p;
11983 	int new_value, old_value, changed = 0;
11984 	int error;
11985 
11986 	error = nspace_materialization_get_proc_state(p, &old_value);
11987 	if (error) {
11988 		return error;
11989 	}
11990 
11991 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11992 	    &changed);
11993 	if (error == 0 && changed) {
11994 		error = nspace_materialization_set_proc_state(p, new_value);
11995 	}
11996 	return error;
11997 }
11998 
11999 /* decorate this process as not wanting to materialize dataless files */
12000 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
12001     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12002     0, 0, sysctl_nspace_prevent_materialization, "I", "");
12003 
12004 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12005 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
12006     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12007 {
12008 	int new_value, old_value, changed = 0;
12009 	int error;
12010 
12011 	error = nspace_materialization_get_thread_state(&old_value);
12012 	if (error) {
12013 		return error;
12014 	}
12015 
12016 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12017 	    &changed);
12018 	if (error == 0 && changed) {
12019 		error = nspace_materialization_set_thread_state(new_value);
12020 	}
12021 	return error;
12022 }
12023 
12024 /* decorate this thread as not wanting to materialize dataless files */
12025 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
12026     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12027     0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
12028 
/*
 * Sysctl handler for vfs.nspace.complete: the resolver reports
 * request completions here.  The payload is a { req_id, errno }
 * pair of uint32_t's, optionally followed by a uint64_t gencount
 * and a uint64_t syncroot ID used as namespace-shape criteria.
 * Only the decorated resolver process may write (EPERM otherwise).
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	/* Only the registered resolver may report completions. */
	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	/* Mandatory part of the payload: the request ID and errno. */
	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed.  Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID.
	 */
	error = sysctl_io_opaque(req, &syncroot, sizeof(syncroot),
	    &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(&cd);
	}
	return error;
}
12091 
12092 /* Resolver reports completed reqs here. */
12093 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
12094     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12095     0, 0, sysctl_nspace_complete, "-", "");
12096 
12097 #endif /* CONFIG_DATALESS_FILES */
12098 
12099 #if CONFIG_DATALESS_FILES
12100 #define __no_dataless_unused    /* nothing */
12101 #else
12102 #define __no_dataless_unused    __unused
12103 #endif
12104 
/*
 * Decide whether the given vfs context is prevented from
 * materializing dataless files.  Returns:
 *   0           - materialization may proceed
 *   EDEADLK     - materialization is prevented
 *   EJUSTRETURN - the context is a dataless manipulator; the
 *                 operation proceeds as if the object were not
 *                 dataless
 * The checks below are ordered; per-thread decorations override the
 * process-wide ones.
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
12161 
/*
 * One-time initialization: allocate the resolver request hash table.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
12171 
/*
 * Process-exit hook (also used to voluntarily unregister): if 'p' is
 * the decorated namespace resolver, fail every outstanding request
 * with ETIMEDOUT (waking the waiters) and clear the registration.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Sweep every hash bucket and fail the requests found there. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				/* Let any in-flight completion finish with 'req' first. */
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
12198 
12199 #define DATALESS_RESOLVER_ENTITLEMENT     \
12200 	"com.apple.private.vfs.dataless-resolver"
12201 #define DATALESS_MANIPULATION_ENTITLEMENT \
12202 	"com.apple.private.vfs.dataless-manipulation"
12203 
12204 #if CONFIG_DATALESS_FILES
12205 /*
12206  * Return TRUE if the vfs context is associated with the dataless
12207  * resolver.
12208  */
12209 static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)12210 vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
12211 {
12212 	return IOTaskHasEntitlement(vfs_context_task(ctx),
12213 	           DATALESS_RESOLVER_ENTITLEMENT);
12214 }
12215 #endif /* CONFIG_DATALESS_FILES */
12216 
12217 /*
12218  * Return TRUE if the vfs context is associated with a process entitled
12219  * for dataless manipulation.
12220  *
12221  * XXX Arguably belongs in vfs_subr.c, but is here because of the
12222  * complication around CONFIG_DATALESS_FILES.
12223  */
12224 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)12225 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
12226 {
12227 #if CONFIG_DATALESS_FILES
12228 	task_t task = vfs_context_task(ctx);
12229 	return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
12230 	       IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
12231 #else
12232 	return false;
12233 #endif /* CONFIG_DATALESS_FILES */
12234 }
12235 
#if CONFIG_DATALESS_FILES
/*
 * Log (at debug level) that materialization of 'vp' was prevented for
 * the current process because it is decorated as no-materialization.
 * On DEVELOPMENT kernels the fsid/fileid of the vnode is included when
 * it can be fetched.
 */
static void
log_materialization_prevented(vnode_t vp, uint64_t op)
{
	char p_name[MAXCOMLEN + 1];
	char *vntype;
	proc_selfname(&p_name[0], sizeof(p_name));

	/* Map the vnode type to a short human-readable tag. */
	if (vp->v_type == VREG) {
		vntype = "File";
	} else if (vp->v_type == VDIR) {
		vntype = "Dir";
	} else if (vp->v_type == VLNK) {
		vntype = "SymLink";
	} else {
		vntype = "Other";
	}

#if DEVELOPMENT
	struct vnode_attr *vap = kalloc_type(struct vnode_attr, Z_WAITOK);

	VATTR_INIT(vap);
	VATTR_WANTED(vap, va_fsid);
	VATTR_WANTED(vap, va_fileid);
	if (vnode_getattr(vp, vap, vfs_context_current()) == 0) {
		/* va_fsid is printed twice on purpose: once hex, once decimal. */
		os_log_debug(OS_LOG_DEFAULT,
		    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) fsid 0x%08x/%u fileid=%llu",
		    p_name, proc_selfpid(), op, vntype,
		    vap->va_fsid, vap->va_fsid, vap->va_fileid);
	} else
	/*
	 * NB: the '#endif' below deliberately splits the if/else so the
	 * block that follows is the else-branch on DEVELOPMENT kernels
	 * and the unconditional path otherwise.
	 */
#endif
	{
		os_log_debug(OS_LOG_DEFAULT,
		    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
		    p_name, proc_selfpid(), op, vntype);
	}
#if DEVELOPMENT
	kfree_type(struct vnode_attr, vap);
#endif
}
#endif /* CONFIG_DATALESS_FILES */
12277 
12278 static int
vfs_materialize_item(vnode_t vp __no_dataless_unused,uint32_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused,vnode_t tdvp __no_dataless_unused)12279 vfs_materialize_item(
12280 	vnode_t vp __no_dataless_unused,
12281 	uint32_t op __no_dataless_unused,
12282 	int64_t offset __no_dataless_unused,
12283 	int64_t size __no_dataless_unused,
12284 	char *lookup_name __no_dataless_unused,
12285 	size_t const namelen __no_dataless_unused,
12286 	vnode_t tdvp __no_dataless_unused)
12287 {
12288 #if CONFIG_DATALESS_FILES
12289 	kern_return_t kern_ret;
12290 	mach_port_t mach_port;
12291 	char *path = NULL;
12292 	vfs_context_t context;
12293 	int path_len;
12294 	int error;
12295 	audit_token_t atoken;
12296 	enum vtype vp_vtype;
12297 
12298 	/* Swap files are special; ignore them */
12299 	if (vnode_isswap(vp)) {
12300 		return 0;
12301 	}
12302 
12303 	/*
12304 	 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
12305 	 * are no longer used nor supported.
12306 	 */
12307 	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
12308 		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
12309 		return ENOTSUP;
12310 	}
12311 	if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
12312 		os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
12313 		return ENOTSUP;
12314 	}
12315 
12316 	/* Normalize 'op'. */
12317 	op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;
12318 
12319 	/*
12320 	 * To-directory is only meaningful for rename operations;
12321 	 * ignore it if someone handed one to us unexpectedly.
12322 	 */
12323 	if (op != NAMESPACE_HANDLER_RENAME_OP) {
12324 		tdvp = NULL;
12325 	}
12326 
12327 	context = vfs_context_current();
12328 
12329 	/* Remember this for later. */
12330 	vp_vtype = vnode_vtype(vp);
12331 
12332 	error = vfs_context_dataless_materialization_is_prevented(context);
12333 	if (error) {
12334 		log_materialization_prevented(vp, op);
12335 		goto out_check_errors;
12336 	}
12337 
12338 	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
12339 	    &mach_port);
12340 	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
12341 		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
12342 		/*
12343 		 * Treat this like being unable to access the backing store
12344 		 * server.
12345 		 */
12346 		return ETIMEDOUT;
12347 	}
12348 
12349 	int path_alloc_len = MAXPATHLEN;
12350 	do {
12351 		path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
12352 		if (path == NULL) {
12353 			return ENOMEM;
12354 		}
12355 
12356 		path_len = path_alloc_len;
12357 		error = vn_getpath(vp, path, &path_len);
12358 		if (error == 0) {
12359 			break;
12360 		} else if (error == ENOSPC) {
12361 			kfree_data(path, path_alloc_len);
12362 			path = NULL;
12363 		} else {
12364 			goto out_release_port;
12365 		}
12366 	} while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);
12367 
12368 	error = vfs_context_copy_audit_token(context, &atoken);
12369 	if (error) {
12370 		goto out_release_port;
12371 	}
12372 
12373 	struct nspace_resolver_request req = {
12374 		.r_req_id = next_nspace_req_id(),
12375 		.r_vp = vp,
12376 		.r_tdvp = tdvp,
12377 	};
12378 
12379 	error = nspace_resolver_req_add(&req);
12380 	if (error) {
12381 		goto out_release_port;
12382 	}
12383 
12384 	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12385 
12386 	if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
12387 		char *dest_path = NULL;
12388 		int dest_path_len;
12389 
12390 		dest_path = zalloc(ZV_NAMEI);
12391 		dest_path_len = MAXPATHLEN;
12392 
12393 		error = vn_getpath(tdvp, dest_path, &dest_path_len);
12394 		if (error) {
12395 			zfree(ZV_NAMEI, dest_path);
12396 			goto out_release_port;
12397 		}
12398 
12399 		/*
12400 		 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
12401 		 * compatibility with existing agents in user-space
12402 		 * who get passed this value.
12403 		 */
12404 		kern_ret = send_vfs_resolve_reparent_with_audit_token(mach_port,
12405 		    req.r_req_id,
12406 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12407 		    path, dest_path, atoken);
12408 
12409 		zfree(ZV_NAMEI, dest_path);
12410 	} else if (vp_vtype == VDIR) {
12411 		char *tmpname = NULL;
12412 
12413 		/*
12414 		 * If the caller provided a lookup_name *and* a name length,
12415 		 * then we assume the lookup_name is not NUL-terminated.
12416 		 * Allocate a temporary buffer in this case to provide
12417 		 * a NUL-terminated path name to the IPC call.
12418 		 */
12419 		if (lookup_name != NULL && namelen != 0) {
12420 			if (namelen >= PATH_MAX) {
12421 				error = EINVAL;
12422 				goto out_req_remove;
12423 			}
12424 			tmpname = zalloc(ZV_NAMEI);
12425 			strlcpy(tmpname, lookup_name, namelen + 1);
12426 			lookup_name = tmpname;
12427 		} else if (lookup_name != NULL) {
12428 			/*
12429 			 * If the caller provided a lookup_name with a
12430 			 * zero name length, then we assume it's NUL-
12431 			 * terminated.  Verify it has a valid length.
12432 			 */
12433 			if (strlen(lookup_name) >= PATH_MAX) {
12434 				error = EINVAL;
12435 				goto out_req_remove;
12436 			}
12437 		}
12438 
12439 		/* (See above.) */
12440 		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12441 		    req.r_req_id,
12442 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12443 		    lookup_name == NULL ? "" : lookup_name, path, atoken);
12444 
12445 		if (tmpname != NULL) {
12446 			zfree(ZV_NAMEI, tmpname);
12447 
12448 			/*
12449 			 * Poison lookup_name rather than reference
12450 			 * freed memory.
12451 			 */
12452 			lookup_name = NULL;
12453 		}
12454 	} else {
12455 		/* (See above.) */
12456 		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12457 		    req.r_req_id,
12458 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12459 		    offset, size, path, atoken);
12460 	}
12461 	if (kern_ret != KERN_SUCCESS) {
12462 		/*
12463 		 * Also treat this like being unable to access the backing
12464 		 * store server.
12465 		 */
12466 		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12467 		    kern_ret);
12468 		error = ETIMEDOUT;
12469 		goto out_req_remove;
12470 	}
12471 
12472 	/*
12473 	 * Give back the memory we allocated earlier while we wait; we
12474 	 * no longer need it.
12475 	 */
12476 	kfree_data(path, path_alloc_len);
12477 	path = NULL;
12478 
12479 	/*
12480 	 * Request has been submitted to the resolver. Now (interruptibly)
12481 	 * wait for completion. Upon requrn, the request will have been
12482 	 * removed from the lookup table.
12483 	 */
12484 	error = nspace_resolver_req_wait(&req);
12485 
12486 out_release_port:
12487 	if (path != NULL) {
12488 		kfree_data(path, path_alloc_len);
12489 		path = NULL;
12490 	}
12491 	ipc_port_release_send(mach_port);
12492 
12493 out_check_errors:
12494 	/*
12495 	 * The file resolver owns the logic about what error to return
12496 	 * to the caller.  We only need to handle a couple of special
12497 	 * cases here:
12498 	 */
12499 	if (error == EJUSTRETURN) {
12500 		/*
12501 		 * The requesting process is allowed to interact with
12502 		 * dataless objects.  Make a couple of sanity-checks
12503 		 * here to ensure the action makes sense.
12504 		 */
12505 		switch (op) {
12506 		case NAMESPACE_HANDLER_WRITE_OP:
12507 		case NAMESPACE_HANDLER_TRUNCATE_OP:
12508 		case NAMESPACE_HANDLER_RENAME_OP:
12509 			/*
12510 			 * This handles the case of the resolver itself
12511 			 * writing data to the file (or throwing it
12512 			 * away).
12513 			 */
12514 			error = 0;
12515 			break;
12516 		case NAMESPACE_HANDLER_READ_OP:
12517 		case NAMESPACE_HANDLER_LOOKUP_OP:
12518 			/*
12519 			 * This handles the case of the resolver needing
12520 			 * to look up inside of a dataless directory while
12521 			 * it's in the process of materializing it (for
12522 			 * example, creating files or directories).
12523 			 */
12524 			error = (vp_vtype == VDIR) ? 0 : EBADF;
12525 			break;
12526 		default:
12527 			error = EBADF;
12528 			break;
12529 		}
12530 	}
12531 
12532 	return error;
12533 
12534 out_req_remove:
12535 	nspace_resolver_req_remove(&req);
12536 	goto out_release_port;
12537 #else
12538 	return ENOTSUP;
12539 #endif /* CONFIG_DATALESS_FILES */
12540 }
12541 
12542 /*
12543  * vfs_materialize_file: Materialize a regular file.
12544  *
12545  * Inputs:
12546  * vp		The dataless file to be materialized.
12547  *
12548  * op		What kind of operation is being performed:
12549  *		-> NAMESPACE_HANDLER_READ_OP
12550  *		-> NAMESPACE_HANDLER_WRITE_OP
12551  *		-> NAMESPACE_HANDLER_LINK_CREATE
12552  *		-> NAMESPACE_HANDLER_DELETE_OP
12553  *		-> NAMESPACE_HANDLER_TRUNCATE_OP
12554  *		-> NAMESPACE_HANDLER_RENAME_OP
12555  *
12556  * offset	offset of I/O for READ or WRITE.  Ignored for
12557  *		other ops.
12558  *
12559  * size		size of I/O for READ or WRITE  Ignored for
12560  *		other ops.
12561  *
12562  * If offset or size are -1 for a READ or WRITE, then the resolver should
12563  * consider the range to be unknown.
12564  *
12565  * Upon successful return, the caller may proceed with the operation.
12566  * N.B. the file may still be "dataless" in this case.
12567  */
12568 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12569 vfs_materialize_file(
12570 	struct vnode *vp,
12571 	uint64_t op,
12572 	int64_t offset,
12573 	int64_t size)
12574 {
12575 	if (vp->v_type != VREG) {
12576 		return EFTYPE;
12577 	}
12578 	return vfs_materialize_item(vp, (uint32_t)op, offset, size, NULL, 0,
12579 	           NULL);
12580 }
12581 
12582 /*
12583  * vfs_materialize_dir:
12584  *
12585  * Inputs:
12586  * vp		The dataless directory to be materialized.
12587  *
12588  * op		What kind of operation is being performed:
12589  *		-> NAMESPACE_HANDLER_READ_OP
12590  *		-> NAMESPACE_HANDLER_WRITE_OP
12591  *		-> NAMESPACE_HANDLER_DELETE_OP
12592  *		-> NAMESPACE_HANDLER_RENAME_OP
12593  *		-> NAMESPACE_HANDLER_LOOKUP_OP
12594  *
12595  * lookup_name	Name being looked up for a LOOKUP op.  Ignored for
12596  *		other ops.  May or may not be NUL-terminated; see below.
12597  *
12598  * namelen	If non-zero, then lookup_name is assumed to not be NUL-
12599  *		terminated and namelen is the number of valid bytes in
12600  *		lookup_name. If zero, then lookup_name is assumed to be
12601  *		NUL-terminated.
12602  *
12603  * Upon successful return, the caller may proceed with the operation.
12604  * N.B. the directory may still be "dataless" in this case.
12605  */
12606 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12607 vfs_materialize_dir(
12608 	struct vnode *vp,
12609 	uint64_t op,
12610 	char *lookup_name,
12611 	size_t namelen)
12612 {
12613 	if (vp->v_type != VDIR) {
12614 		return EFTYPE;
12615 	}
12616 	if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12617 		return EINVAL;
12618 	}
12619 	return vfs_materialize_item(vp, (uint32_t)op, 0, 0, lookup_name,
12620 	           namelen, NULL);
12621 }
12622 
12623 /*
12624  * vfs_materialize_reparent:
12625  *
12626  * Inputs:
12627  * vp		The dataless file or directory to be materialized.
12628  *
12629  * tdvp		The new parent directory for the dataless file.
12630  *
12631  * Upon successful return, the caller may proceed with the operation.
12632  * N.B. the item may still be "dataless" in this case.
12633  */
12634 int
vfs_materialize_reparent(vnode_t vp,vnode_t tdvp)12635 vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
12636 {
12637 	if (vp->v_type != VDIR && vp->v_type != VREG) {
12638 		return EFTYPE;
12639 	}
12640 	return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
12641 	           0, 0, NULL, 0, tdvp);
12642 }
12643 
#if 0
/*
 * Compiled out: legacy helper that formatted a "/.vol/<fsid>/<fileid>"
 * (volfs-style) path for a vnode into 'path', storing the resulting
 * length (including the NUL) through 'len'.  Kept for reference only.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		/* Placeholder path when the attributes cannot be fetched. */
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12666 
12667 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12668 fsctl_bogus_command_compat(unsigned long cmd)
12669 {
12670 	switch (cmd) {
12671 	case IOCBASECMD(FSIOC_SYNC_VOLUME):
12672 		return FSIOC_SYNC_VOLUME;
12673 	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12674 		return FSIOC_ROUTEFS_SETROUTEID;
12675 	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12676 		return FSIOC_SET_PACKAGE_EXTS;
12677 	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12678 		return FSIOC_SET_FSTYPENAME_OVERRIDE;
12679 	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12680 		return DISK_CONDITIONER_IOC_GET;
12681 	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12682 		return DISK_CONDITIONER_IOC_SET;
12683 	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12684 		return FSIOC_FIOSEEKHOLE;
12685 	case IOCBASECMD(FSIOC_FIOSEEKDATA):
12686 		return FSIOC_FIOSEEKDATA;
12687 	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12688 		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12689 	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12690 		return SPOTLIGHT_IOC_GET_LAST_MTIME;
12691 	}
12692 
12693 	return cmd;
12694 }
12695 
/*
 * setattr callback handed to chflags0() by handle_flags(): asks the
 * filesystem to perform the compare-and-swap of the BSD flags.  'arg'
 * is the caller's struct fsioc_cas_bsdflags.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
12701 
/*
 * FSIOC_SYNC_VOLUME handler: sync the volume containing 'vp'.
 * 'data' points at the user-supplied FSCTL_SYNC_* flags word.
 *
 * Drops the caller's iocount on vp (and NULLs *arg_vp to signal that)
 * so the sync cannot deadlock against vnode iteration while we hold
 * the root vnode.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	/* Hold keeps the vnode memory valid after the iocount is dropped. */
	vnode_hold(vp);
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests 'arg' (which now holds MNT_* sync
	 * flags) against the FSCTL_SYNC_FULLSYNC bit from the fsctl
	 * flag namespace; presumably *(uint32_t*)data was intended —
	 * confirm against the FSCTL_SYNC_* / MNT_* definitions.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12766 
#if ROUTEFS
/*
 * FSIOC_ROUTEFS_SETROUTEID handler: copy a path string in from user
 * space and mount the route filesystem there.  Superuser only.
 */
static int __attribute__((noinline))
handle_routes(user_addr_t udata)
{
	char routepath[MAXPATHLEN] = {0};
	size_t copied = 0;
	int error;

	error = suser(kauth_cred_get(), &(current_proc()->p_acflag));
	if (error != 0) {
		return error;
	}

	error = copyinstr(udata, &routepath[0], MAXPATHLEN, &copied);
	if (error != 0) {
		return error;
	}

	return routefs_kernel_mount(routepath);
}
#endif
12787 
/*
 * FSIOC_CAS_BSDFLAGS handler: compare-and-swap the BSD flags word on
 * 'vp'.  'data' points at a struct fsioc_cas_bsdflags; the filesystem
 * performs the swap via cas_bsdflags_setattr() and reports the flags
 * value it actually found in cas->actual_flags.
 */
static int __attribute__((noinline))
handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
{
	struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_flags, cas->new_flags);

	/* chflags0() performs authorization, then invokes our CAS helper. */
	error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);

#if CONFIG_FSE
	/* Only emit a stat-changed fsevent when the CAS actually took. */
	if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
		add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
	}
#endif

	return error;
}
12808 
12809 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12810 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12811 {
12812 	struct mount *mp = NULL;
12813 	errno_t rootauth = 0;
12814 
12815 	mp = vp->v_mount;
12816 
12817 	/*
12818 	 * query the underlying FS and see if it reports something
12819 	 * sane for this vnode. If volume is authenticated via
12820 	 * chunklist, leave that for the caller to determine.
12821 	 */
12822 	rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12823 
12824 	return rootauth;
12825 }
12826 
/* Entitlement required by FSIOC_SET_PACKAGE_EXTS (see fsctl_internal()). */
#define SET_PACKAGE_EXTENSION_ENTITLEMENT \
	"com.apple.private.kernel.set-package-extensions"
12829 
/*
 * Make a filesystem-specific control call:
 *
 * Common implementation for the fsctl(2) and ffsctl(2) system calls.
 * Copies the ioctl-style argument in/out around a dispatch on the
 * (compat-normalized) command.  NOTE: FSIOC_SYNC_VOLUME may drop the
 * iocount on *arg_vp and set it to NULL; callers must re-check.
 */
/* ARGSUSED */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* fsctl is not supported on device special files. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Stage the argument on the stack when it fits, on the heap otherwise. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			/* Zero-size IOC_IN: the argument is udata itself. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* NOTE: may release the vnode and NULL out *arg_vp. */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t    num_entries;
		uint32_t    max_width;

		/* Installing the package-extension table is entitlement-gated. */
		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct.  otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width   = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width   = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		mount_t mp;

		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if ((mp = vp->v_mount) != NULL) {
			mount_lock(mp);
			if (data[0] != 0) {
				/* Require NUL termination within MFSTYPENAMELEN bytes. */
				for (int i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data
				 * string which has no NULL termination in
				 * its first MFSTYPENAMELEN bytes.  This is
				 * bogus, let's avoid strlcpy-ing the read
				 * data and return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				vfs_setfstypename_locked(mp, data);
				if (vfs_isrdonly(mp) &&
				    strcmp(data, "mtmfs") == 0) {
					mp->mnt_kern_flag |=
					    MNTK_EXTENDED_SECURITY;
					mp->mnt_kern_flag &=
					    ~MNTK_AUTH_OPAQUE;
				}
			} else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
				/* An empty name clears any existing override. */
				const char *name =
				    vfs_getfstypenameref_locked(mp, NULL);
				if (strcmp(name, "mtmfs") == 0) {
					mp->mnt_kern_flag &=
					    ~MNTK_EXTENDED_SECURITY;
				}
				vfs_setfstypename_locked(mp, NULL);
			}
unlock:
			mount_unlock(mp);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/* Succeed only when the caller holds the sole use reference. */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
		break;

#if CONFIG_EXCLAVES
	case FSIOC_EXCLAVE_FS_REGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_register(((fsioc_exclave_fs_register_t *)data)->fs_tag, vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_UNREGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_unregister(vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: {
		exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data);
		exclave_fs_base_dir_t *dirs = NULL;
		if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = EPERM;
			break;
		}
		/* base_dirs == 0 means the caller only wants the count back. */
		if (get_base_dirs->base_dirs) {
			if ((get_base_dirs->count == 0) || (get_base_dirs->count > EXCLAVE_FS_GET_BASE_DIRS_MAX_COUNT)) {
				error = EINVAL;
				break;
			}
			dirs = kalloc_type(exclave_fs_base_dir_t, get_base_dirs->count, Z_WAITOK | Z_ZERO);
			if (!dirs) {
				error = ENOSPC;
				break;
			}
		}
		error = vfs_exclave_fs_get_base_dirs(dirs, &get_base_dirs->count);
		if (!error && dirs) {
			error = copyout(dirs, (user_addr_t)get_base_dirs->base_dirs,
			    get_base_dirs->count * sizeof(exclave_fs_base_dir_t));
		}
		if (dirs) {
			kfree_type(exclave_fs_base_dir_t, get_base_dirs->count, dirs);
		}
	}
	break;
#endif

	default: {
		/*
		 * Other, known commands shouldn't be passed down here.
		 * (When adding a selector to this list, it may be prudent
		 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
		 */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
		case F_SPECULATIVE_READ:
		case F_ATTRIBUTION_TAG:
		case F_TRANSFEREXTENTS:
		case F_ASSERT_BG_ACCESS:
		case F_RELEASE_BG_ACCESS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
13142 
/*
 * fsctl(2): path-based filesystem control call.  Looks up the path,
 * runs the MAC check, and hands off to fsctl_internal().  Note that
 * fsctl_internal() may drop the iocount and NULL out 'vp'.
 */
/* ARGSUSED */
int
fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	uint32_t nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on:  */
	nameiflags = 0;
	//
	// if we come through fsctl() then the file is by definition not open.
	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
	// lest the caller mistakenly thinks the only open is their own (but in
	// reality it's someone elses).
	//
	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
		return EINVAL;
	}
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	/* Firmlink control must resolve the firmlink itself, uncached. */
	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
	}
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		goto done;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	if (vp) {
		vnode_put(vp);
	}
	return error;
}
/*
 * ffsctl(2): file-descriptor-based variant of fsctl(2).  Resolves the
 * vnode from the fd, runs the MAC check, and hands off to
 * fsctl_internal().
 */
/* ARGSUSED */
int
ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();
	int fd = -1;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);

	/* Get the vnode for the file we are getting info on:  */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	fd = uap->fd;
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

#if CONFIG_MACF
	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
		file_drop(fd);
		vnode_put(vp);
		return error;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

	file_drop(fd);

	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
	if (vp) {
		vnode_put(vp);
	}

	return error;
}
13237 /* end of fsctl system call */
13238 
13239 #define FILESEC_ACCESS_ENTITLEMENT              \
13240 	"com.apple.private.vfs.filesec-access"
13241 
13242 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)13243 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
13244 {
13245 	if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
13246 		/*
13247 		 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
13248 		 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
13249 		 */
13250 		if ((!setting && vfs_context_issuser(ctx)) ||
13251 		    IOTaskHasEntitlement(vfs_context_task(ctx),
13252 		    FILESEC_ACCESS_ENTITLEMENT)) {
13253 			return 0;
13254 		}
13255 	}
13256 
13257 	return EPERM;
13258 }
13259 
/*
 *  Retrieve the data of an extended attribute.
 *
 *  Path-based entry point: looks up uap->path (following symlinks unless
 *  XATTR_NOFOLLOW is set), validates the attribute name, and reads the
 *  attribute via vn_getxattr().  On success *retval is the number of bytes
 *  read (or, when no buffer was supplied, the attribute's total size).
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;	/* holds an iocount, released at "out" */
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected names are readable only by root / entitled tasks. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned...  in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app.  we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* Clamp so filesystems don't wire arbitrary amounts of memory. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	/* With a NULL uio, vn_getxattr() reports only the attribute's size. */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
13349 
/*
 * Retrieve the data of an extended attribute.
 *
 * File-descriptor-based variant of getxattr().  Path-lookup options
 * (XATTR_NOFOLLOW / XATTR_NOFOLLOW_ANY) are rejected since there is no
 * path to traverse.  Holds the fd reference and a vnode iocount across
 * the operation; both are released on every exit path.
 */
int
fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* Path-lookup and kernel-internal options are invalid on an fd. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
	    XATTR_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected names are readable only by root / entitled tasks. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	if (uap->value && uap->size > 0) {
		/* Clamp so filesystems don't wire arbitrary amounts of memory. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}

	/* With a NULL uio, vn_getxattr() reports only the attribute's size. */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
out:
	(void)vnode_put(vp);
	file_drop(uap->fd);

	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13408 
/*
 * Bundled lookup state for setxattr(): the nameidata, attribute name, and
 * uio buffer are grouped here and heap-allocated (see kalloc_type() in
 * setxattr()) rather than kept on the kernel stack.
 * NOTE(review): the previous comment said "struct for checkdirs iteration",
 * which does not match its only visible use in setxattr().
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	UIO_STACKBUF(uio_buf, 1);
};
13415 
/*
 * Set the data of an extended attribute.
 *
 * Path-based entry point: validates the attribute name and options, looks
 * up uap->path, then writes the attribute via vn_setxattr().  Lookup state
 * lives in a heap-allocated setxattr_ctx.  On success an FSE_XATTR_MODIFIED
 * fsevent is posted.  *retval is always 0; errors come back as the return
 * value.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* Protected names may be set only by entitled tasks. */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	/* A non-zero size requires a buffer to read from. */
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Need the parent directory so its lease can be broken before the write. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		sactx->nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;	/* holds an iocount, released below */
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13499 
/*
 * Set the data of an extended attribute.
 *
 * File-descriptor-based variant of setxattr().  Path-lookup options are
 * rejected.  Breaks any directory lease on the file's parent (when file
 * leases are configured), writes via vn_setxattr(), and posts
 * FSE_XATTR_MODIFIED on success.  *retval is always 0.
 */
int
fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* Path-lookup and kernel-internal options are invalid on an fd. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
	    XATTR_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			return ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		return error;
	}
	/* Protected names may be set only by entitled tasks. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
		return error;
	}
	/* A non-zero size requires a buffer to read from. */
	if (uap->size != 0 && uap->value == 0) {
		return EINVAL;
	}
	if (uap->size > INT_MAX) {
		return E2BIG;
	}
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
13568 
/*
 * Remove an extended attribute.
 * XXX Code duplication here.
 *
 * Path-based entry point: validates the name (protected names can never
 * be removed from user space), looks up uap->path, removes the attribute
 * via vn_removexattr(), and posts FSE_XATTR_REMOVED on success.
 * *retval is always 0.
 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* No entitlement override for removal: protected names are always denied. */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Need the parent directory so its lease can be broken before the write. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;	/* holds an iocount, released below */
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
13627 
/*
 * Remove an extended attribute.
 * XXX Code duplication here.
 *
 * File-descriptor-based variant of removexattr().  Path-lookup options
 * are rejected; protected names are always denied.  Posts
 * FSE_XATTR_REMOVED on success.  *retval is always 0.
 */
int
fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	size_t namelen;
	int error;
#if CONFIG_FSE
	vfs_context_t ctx = vfs_context_current();
#endif

	/* Path-lookup and kernel-internal options are invalid on an fd. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
	    XATTR_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* No entitlement override for removal: protected names are always denied. */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
13680 
13681 /*
13682  * Retrieve the list of extended attribute names.
13683  * XXX Code duplication here.
13684  */
13685 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)13686 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
13687 {
13688 	vnode_t vp;
13689 	struct nameidata nd;
13690 	vfs_context_t ctx = vfs_context_current();
13691 	uio_t auio = NULL;
13692 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13693 	size_t attrsize = 0;
13694 	u_int32_t nameiflags;
13695 	int error;
13696 	UIO_STACKBUF(uio_buf, 1);
13697 
13698 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13699 		return EINVAL;
13700 	}
13701 
13702 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13703 	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
13704 	if (uap->options & XATTR_NOFOLLOW_ANY) {
13705 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
13706 	}
13707 
13708 	if ((error = namei(&nd))) {
13709 		return error;
13710 	}
13711 	vp = nd.ni_vp;
13712 	nameidone(&nd);
13713 	if (uap->namebuf != 0 && uap->bufsize > 0) {
13714 		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
13715 		    &uio_buf[0], sizeof(uio_buf));
13716 		uio_addiov(auio, uap->namebuf, uap->bufsize);
13717 	}
13718 
13719 	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
13720 
13721 	vnode_put(vp);
13722 	if (auio) {
13723 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13724 	} else {
13725 		*retval = (user_ssize_t)attrsize;
13726 	}
13727 	return error;
13728 }
13729 
13730 /*
13731  * Retrieve the list of extended attribute names.
13732  * XXX Code duplication here.
13733  */
13734 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)13735 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13736 {
13737 	vnode_t vp;
13738 	uio_t auio = NULL;
13739 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13740 	size_t attrsize = 0;
13741 	int error;
13742 	UIO_STACKBUF(uio_buf, 1);
13743 
13744 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13745 	    XATTR_NOFOLLOW_ANY)) {
13746 		return EINVAL;
13747 	}
13748 
13749 	if ((error = file_vnode(uap->fd, &vp))) {
13750 		return error;
13751 	}
13752 	if ((error = vnode_getwithref(vp))) {
13753 		file_drop(uap->fd);
13754 		return error;
13755 	}
13756 	if (uap->namebuf != 0 && uap->bufsize > 0) {
13757 		auio = uio_createwithbuffer(1, 0, spacetype,
13758 		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
13759 		uio_addiov(auio, uap->namebuf, uap->bufsize);
13760 	}
13761 
13762 	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13763 
13764 	vnode_put(vp);
13765 	file_drop(uap->fd);
13766 	if (auio) {
13767 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13768 	} else {
13769 		*retval = (user_ssize_t)attrsize;
13770 	}
13771 	return error;
13772 }
13773 
13774 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13775 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13776     vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13777 {
13778 	int error;
13779 	struct mount *mp = NULL;
13780 	vnode_t vp;
13781 	int length;
13782 	int bpflags;
13783 	/* maximum number of times to retry build_path */
13784 	unsigned int retries = 0x10;
13785 
13786 	if (bufsize > FSGETPATH_MAXBUFLEN) {
13787 		return EINVAL;
13788 	}
13789 
13790 	if (buf == NULL) {
13791 		return ENOMEM;
13792 	}
13793 
13794 retry:
13795 	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13796 		error = ENOTSUP;  /* unexpected failure */
13797 		return ENOTSUP;
13798 	}
13799 
13800 #if CONFIG_UNION_MOUNTS
13801 unionget:
13802 #endif /* CONFIG_UNION_MOUNTS */
13803 	if (objid == 2) {
13804 		struct vfs_attr vfsattr;
13805 		int use_vfs_root = TRUE;
13806 
13807 		VFSATTR_INIT(&vfsattr);
13808 		VFSATTR_WANTED(&vfsattr, f_capabilities);
13809 		if (!(options & FSOPT_ISREALFSID) &&
13810 		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13811 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13812 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13813 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13814 				use_vfs_root = FALSE;
13815 			}
13816 		}
13817 
13818 		if (use_vfs_root) {
13819 			error = VFS_ROOT(mp, &vp, ctx);
13820 		} else {
13821 			error = VFS_VGET(mp, objid, &vp, ctx);
13822 		}
13823 	} else {
13824 		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13825 	}
13826 
13827 #if CONFIG_UNION_MOUNTS
13828 	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13829 		/*
13830 		 * If the fileid isn't found and we're in a union
13831 		 * mount volume, then see if the fileid is in the
13832 		 * mounted-on volume.
13833 		 */
13834 		struct mount *tmp = mp;
13835 		mp = vnode_mount(tmp->mnt_vnodecovered);
13836 		vfs_unbusy(tmp);
13837 		if (vfs_busy(mp, LK_NOWAIT) == 0) {
13838 			goto unionget;
13839 		}
13840 	} else {
13841 		vfs_unbusy(mp);
13842 	}
13843 #else
13844 	vfs_unbusy(mp);
13845 #endif /* CONFIG_UNION_MOUNTS */
13846 
13847 	if (error) {
13848 		return error;
13849 	}
13850 
13851 #if CONFIG_MACF
13852 	error = mac_vnode_check_fsgetpath(ctx, vp);
13853 	if (error) {
13854 		vnode_put(vp);
13855 		return error;
13856 	}
13857 #endif
13858 
13859 	/* Obtain the absolute path to this vnode. */
13860 	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13861 	if (options & FSOPT_NOFIRMLINKPATH) {
13862 		bpflags |= BUILDPATH_NO_FIRMLINK;
13863 	}
13864 	bpflags |= BUILDPATH_CHECK_MOVED;
13865 	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13866 	vnode_put(vp);
13867 
13868 	if (error) {
13869 		/* there was a race building the path, try a few more times */
13870 		if (error == EAGAIN) {
13871 			--retries;
13872 			if (retries > 0) {
13873 				goto retry;
13874 			}
13875 
13876 			error = ENOENT;
13877 		}
13878 		goto out;
13879 	}
13880 
13881 	AUDIT_ARG(text, buf);
13882 
13883 	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13884 		unsigned long path_words[NUMPARMS];
13885 		size_t path_len = sizeof(path_words);
13886 
13887 		if ((size_t)length < path_len) {
13888 			memcpy((char *)path_words, buf, length);
13889 			memset((char *)path_words + length, 0, path_len - length);
13890 
13891 			path_len = length;
13892 		} else {
13893 			memcpy((char *)path_words, buf + (length - path_len), path_len);
13894 		}
13895 
13896 		kdebug_vfs_lookup(path_words, (int)path_len, vp,
13897 		    KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13898 	}
13899 
13900 	*pathlen = length; /* may be superseded by error */
13901 
13902 out:
13903 	return error;
13904 }
13905 
13906 /*
13907  * Obtain the full pathname of a file system object by id.
13908  */
13909 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)13910 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
13911     uint32_t options, user_ssize_t *retval)
13912 {
13913 	vfs_context_t ctx = vfs_context_current();
13914 	fsid_t fsid;
13915 	char *realpath;
13916 	int length;
13917 	int error;
13918 
13919 	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
13920 		return EINVAL;
13921 	}
13922 
13923 	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
13924 		return error;
13925 	}
13926 	AUDIT_ARG(value32, fsid.val[0]);
13927 	AUDIT_ARG(value64, objid);
13928 	/* Restrict output buffer size for now. */
13929 
13930 	if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
13931 		return EINVAL;
13932 	}
13933 	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
13934 	if (realpath == NULL) {
13935 		return ENOMEM;
13936 	}
13937 
13938 	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
13939 	    options, &length);
13940 
13941 	if (error) {
13942 		goto out;
13943 	}
13944 
13945 	error = copyout((caddr_t)realpath, buf, length);
13946 
13947 	*retval = (user_ssize_t)length; /* may be superseded by error */
13948 out:
13949 	kfree_data(realpath, bufsize);
13950 	return error;
13951 }
13952 
13953 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)13954 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
13955 {
13956 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13957 	           0, retval);
13958 }
13959 
13960 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)13961 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
13962 {
13963 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13964 	           uap->options, retval);
13965 }
13966 
/*
 * Common routine to handle various flavors of statfs data heading out
 *	to user space.
 *
 * Fills a user64_statfs or user32_statfs from the mount and vfsstatfs and
 * copies it out to 'bufp'.  When 'partial_copy' is set, the trailing
 * reserved fields are omitted from the copyout.  If 'sizep' is non-NULL it
 * receives the full (untruncated) structure size.
 *
 * Returns:	0			Success
 *		EFAULT
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int             error;
	int             my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		/* Zero first so padding and unset fields are deterministic. */
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Omit the trailing reserved fields from the copyout. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case.   We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int             shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large.  At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Omit the trailing reserved fields from the copyout. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
14087 
/*
 * copy stat structure into user_stat structure.
 *
 * Field-by-field copy of the kernel 'struct stat' into the 64-bit user
 * layout.  The destination is zeroed first so padding and unset fields
 * are deterministic.
 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ with _POSIX_C_SOURCE; values are the same. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14127 
/*
 * Copy the kernel 'struct stat' into the 32-bit user layout.  The
 * destination is zeroed first; timestamp fields are explicitly narrowed
 * to the 32-bit user types.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ with _POSIX_C_SOURCE; values are the same. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14164 
14165 /*
14166  * copy stat64 structure into user_stat64 structure.
14167  */
14168 void
munge_user64_stat64(struct stat64 * sbp,struct user64_stat64 * usbp)14169 munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
14170 {
14171 	bzero(usbp, sizeof(*usbp));
14172 
14173 	usbp->st_dev = sbp->st_dev;
14174 	usbp->st_ino = sbp->st_ino;
14175 	usbp->st_mode = sbp->st_mode;
14176 	usbp->st_nlink = sbp->st_nlink;
14177 	usbp->st_uid = sbp->st_uid;
14178 	usbp->st_gid = sbp->st_gid;
14179 	usbp->st_rdev = sbp->st_rdev;
14180 #ifndef _POSIX_C_SOURCE
14181 	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
14182 	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
14183 	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
14184 	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
14185 	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
14186 	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
14187 	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
14188 	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
14189 #else
14190 	usbp->st_atime = sbp->st_atime;
14191 	usbp->st_atimensec = sbp->st_atimensec;
14192 	usbp->st_mtime = sbp->st_mtime;
14193 	usbp->st_mtimensec = sbp->st_mtimensec;
14194 	usbp->st_ctime = sbp->st_ctime;
14195 	usbp->st_ctimensec = sbp->st_ctimensec;
14196 	usbp->st_birthtime = sbp->st_birthtime;
14197 	usbp->st_birthtimensec = sbp->st_birthtimensec;
14198 #endif
14199 	usbp->st_size = sbp->st_size;
14200 	usbp->st_blocks = sbp->st_blocks;
14201 	usbp->st_blksize = sbp->st_blksize;
14202 	usbp->st_flags = sbp->st_flags;
14203 	usbp->st_gen = sbp->st_gen;
14204 	usbp->st_lspare = sbp->st_lspare;
14205 	usbp->st_qspare[0] = sbp->st_qspare[0];
14206 	usbp->st_qspare[1] = sbp->st_qspare[1];
14207 }
14208 
/*
 * Copy a kernel stat64 structure into the 32-bit user-process layout
 * (struct user32_stat64) for copyout.  The destination is zeroed first
 * so padding and unset fields never leak kernel memory.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/*
	 * Timestamps (including birthtime) narrow to the 32-bit user
	 * types; out-of-range values are truncated by these casts.
	 */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14249 
14250 /*
14251  * Purge buffer cache for simulating cold starts
14252  */
14253 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)14254 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
14255 {
14256 	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
14257 
14258 	return VNODE_RETURNED;
14259 }
14260 
14261 static int
vfs_purge_callback(mount_t mp,__unused void * arg)14262 vfs_purge_callback(mount_t mp, __unused void * arg)
14263 {
14264 	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
14265 
14266 	return VFS_RETURNED;
14267 }
14268 
/*
 * Tunable (boot-arg) and sysctl "vfs.purge_vm_pagers": when non-zero
 * (TRUE here by default), vfs_purge() also asks the VM layer to purge
 * file-backed pagers after flushing the buffer cache.
 */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
14271 
14272 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)14273 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
14274 {
14275 	if (!kauth_cred_issuser(kauth_cred_get())) {
14276 		return EPERM;
14277 	}
14278 
14279 	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
14280 
14281 	/* also flush any VM pagers backed by files */
14282 	if (vfs_purge_vm_pagers) {
14283 		vm_purge_filebacked_pagers();
14284 	}
14285 
14286 	return 0;
14287 }
14288 
14289 /*
14290  * gets the vnode associated with the (unnamed) snapshot directory
14291  * for a Filesystem. The snapshot directory vnode is returned with
14292  * an iocount on it.
14293  */
14294 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)14295 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
14296 {
14297 	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
14298 }
14299 
14300 /*
14301  * Get the snapshot vnode.
14302  *
14303  * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
14304  * needs nameidone() on ndp.
14305  *
14306  * If the snapshot vnode exists it is returned in ndp->ni_vp.
14307  *
14308  * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
14309  * not needed.
14310  */
14311 static int
vnode_get_snapshot(int dirfd,vnode_t * rvpp,vnode_t * sdvpp,user_addr_t name,struct nameidata * ndp,int32_t op,__unused enum path_operation pathop,vfs_context_t ctx)14312 vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
14313     user_addr_t name, struct nameidata *ndp, int32_t op,
14314 #if !CONFIG_TRIGGERS
14315     __unused
14316 #endif
14317     enum path_operation pathop,
14318     vfs_context_t ctx)
14319 {
14320 	int error, i;
14321 	caddr_t name_buf;
14322 	size_t name_len;
14323 	struct vfs_attr vfa;
14324 
14325 	*sdvpp = NULLVP;
14326 	*rvpp = NULLVP;
14327 
14328 	error = vnode_getfromfd(ctx, dirfd, rvpp);
14329 	if (error) {
14330 		return error;
14331 	}
14332 
14333 	if (!vnode_isvroot(*rvpp)) {
14334 		error = EINVAL;
14335 		goto out;
14336 	}
14337 
14338 	/* Make sure the filesystem supports snapshots */
14339 	VFSATTR_INIT(&vfa);
14340 	VFSATTR_WANTED(&vfa, f_capabilities);
14341 	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
14342 	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
14343 	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
14344 	    VOL_CAP_INT_SNAPSHOT)) ||
14345 	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
14346 	    VOL_CAP_INT_SNAPSHOT))) {
14347 		error = ENOTSUP;
14348 		goto out;
14349 	}
14350 
14351 	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
14352 	if (error) {
14353 		goto out;
14354 	}
14355 
14356 	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14357 	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14358 	if (error) {
14359 		goto out1;
14360 	}
14361 
14362 	/*
14363 	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
14364 	 * (the length returned by copyinstr includes the terminating NUL)
14365 	 */
14366 	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
14367 	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
14368 		error = EINVAL;
14369 		goto out1;
14370 	}
14371 	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
14372 		;
14373 	}
14374 	if (i < (int)name_len) {
14375 		error = EINVAL;
14376 		goto out1;
14377 	}
14378 
14379 #if CONFIG_MACF
14380 	if (op == CREATE) {
14381 		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
14382 		    name_buf);
14383 	} else if (op == DELETE) {
14384 		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
14385 		    name_buf);
14386 	}
14387 	if (error) {
14388 		goto out1;
14389 	}
14390 #endif
14391 
14392 	/* Check if the snapshot already exists ... */
14393 	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
14394 	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
14395 	ndp->ni_dvp = *sdvpp;
14396 
14397 	error = namei(ndp);
14398 out1:
14399 	zfree(ZV_NAMEI, name_buf);
14400 out:
14401 	if (error) {
14402 		if (*sdvpp) {
14403 			vnode_put(*sdvpp);
14404 			*sdvpp = NULLVP;
14405 		}
14406 		if (*rvpp) {
14407 			vnode_put(*rvpp);
14408 			*rvpp = NULLVP;
14409 		}
14410 	}
14411 	return error;
14412 }
14413 
14414 /*
14415  * create a filesystem snapshot (for supporting filesystems)
14416  *
14417  * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
14418  * We get to the (unnamed) snapshot directory vnode and create the vnode
14419  * for the snapshot in it.
14420  *
14421  * Restrictions:
14422  *
14423  *    a) Passed in name for snapshot cannot have slashes.
14424  *    b) name can't be "." or ".."
14425  *
14426  * Since this requires superuser privileges, vnode_authorize calls are not
14427  * made.
14428  */
14429 static int __attribute__((noinline))
snapshot_create(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14430 snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
14431     vfs_context_t ctx)
14432 {
14433 	vnode_t rvp, snapdvp;
14434 	int error;
14435 	struct nameidata *ndp;
14436 
14437 	ndp = kalloc_type(struct nameidata, Z_WAITOK);
14438 
14439 	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
14440 	    OP_LINK, ctx);
14441 	if (error) {
14442 		goto out;
14443 	}
14444 
14445 	if (ndp->ni_vp) {
14446 		vnode_put(ndp->ni_vp);
14447 		error = EEXIST;
14448 	} else {
14449 		struct vnode_attr *vap;
14450 		vnode_t vp = NULLVP;
14451 
14452 		vap = kalloc_type(struct vnode_attr, Z_WAITOK);
14453 
14454 		VATTR_INIT(vap);
14455 		VATTR_SET(vap, va_type, VREG);
14456 		VATTR_SET(vap, va_mode, 0);
14457 
14458 		error = vn_create(snapdvp, &vp, ndp, vap,
14459 		    VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
14460 		if (!error && vp) {
14461 			vnode_put(vp);
14462 		}
14463 
14464 		kfree_type(struct vnode_attr, vap);
14465 	}
14466 
14467 	nameidone(ndp);
14468 	vnode_put(snapdvp);
14469 	vnode_put(rvp);
14470 out:
14471 	kfree_type(struct nameidata, ndp);
14472 
14473 	return error;
14474 }
14475 
14476 /*
14477  * Delete a Filesystem snapshot
14478  *
14479  * get the vnode for the unnamed snapshot directory and the snapshot and
14480  * delete the snapshot.
14481  */
14482 static int __attribute__((noinline))
snapshot_delete(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14483 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
14484     vfs_context_t ctx)
14485 {
14486 	vnode_t rvp, snapdvp;
14487 	int error;
14488 	struct nameidata *ndp;
14489 
14490 	ndp = kalloc_type(struct nameidata, Z_WAITOK);
14491 
14492 	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
14493 	    OP_UNLINK, ctx);
14494 	if (error) {
14495 		goto out;
14496 	}
14497 
14498 	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
14499 	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
14500 
14501 	vnode_put(ndp->ni_vp);
14502 	nameidone(ndp);
14503 	vnode_put(snapdvp);
14504 	vnode_put(rvp);
14505 out:
14506 	kfree_type(struct nameidata, ndp);
14507 
14508 	return error;
14509 }
14510 
14511 /*
14512  * Revert a filesystem to a snapshot
14513  *
14514  * Marks the filesystem to revert to the given snapshot on next mount.
14515  */
14516 static int __attribute__((noinline))
snapshot_revert(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14517 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
14518     vfs_context_t ctx)
14519 {
14520 	int error;
14521 	vnode_t rvp;
14522 	mount_t mp;
14523 	struct fs_snapshot_revert_args revert_data;
14524 	struct componentname cnp;
14525 	caddr_t name_buf;
14526 	size_t name_len;
14527 
14528 	error = vnode_getfromfd(ctx, dirfd, &rvp);
14529 	if (error) {
14530 		return error;
14531 	}
14532 	mp = vnode_mount(rvp);
14533 
14534 	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14535 	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14536 	if (error) {
14537 		zfree(ZV_NAMEI, name_buf);
14538 		vnode_put(rvp);
14539 		return error;
14540 	}
14541 
14542 #if CONFIG_MACF
14543 	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
14544 	if (error) {
14545 		zfree(ZV_NAMEI, name_buf);
14546 		vnode_put(rvp);
14547 		return error;
14548 	}
14549 #endif
14550 
14551 	/*
14552 	 * Grab mount_iterref so that we can release the vnode,
14553 	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
14554 	 */
14555 	error = mount_iterref(mp, 0);
14556 	vnode_put(rvp);
14557 	if (error) {
14558 		zfree(ZV_NAMEI, name_buf);
14559 		return error;
14560 	}
14561 
14562 	memset(&cnp, 0, sizeof(cnp));
14563 	cnp.cn_pnbuf = (char *)name_buf;
14564 	cnp.cn_nameiop = LOOKUP;
14565 	cnp.cn_flags = ISLASTCN | HASBUF;
14566 	cnp.cn_pnlen = MAXPATHLEN;
14567 	cnp.cn_nameptr = cnp.cn_pnbuf;
14568 	cnp.cn_namelen = (int)name_len;
14569 	revert_data.sr_cnp = &cnp;
14570 
14571 	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
14572 	mount_iterdrop(mp);
14573 	zfree(ZV_NAMEI, name_buf);
14574 
14575 	if (error) {
14576 		/* If there was any error, try again using VNOP_IOCTL */
14577 
14578 		vnode_t snapdvp;
14579 		struct nameidata namend;
14580 
14581 		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
14582 		    OP_LOOKUP, ctx);
14583 		if (error) {
14584 			return error;
14585 		}
14586 
14587 
14588 		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
14589 		    0, ctx);
14590 
14591 		vnode_put(namend.ni_vp);
14592 		nameidone(&namend);
14593 		vnode_put(snapdvp);
14594 		vnode_put(rvp);
14595 	}
14596 
14597 	return error;
14598 }
14599 
14600 /*
14601  * rename a Filesystem snapshot
14602  *
14603  * get the vnode for the unnamed snapshot directory and the snapshot and
14604  * rename the snapshot. This is a very specialised (and simple) case of
14605  * rename(2) (which has to deal with a lot more complications). It differs
14606  * slightly from rename(2) in that EEXIST is returned if the new name exists.
14607  */
14608 static int __attribute__((noinline))
snapshot_rename(int dirfd,user_addr_t old,user_addr_t new,__unused uint32_t flags,vfs_context_t ctx)14609 snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
14610     __unused uint32_t flags, vfs_context_t ctx)
14611 {
14612 	vnode_t rvp, snapdvp;
14613 	int error, i;
14614 	caddr_t newname_buf;
14615 	size_t name_len;
14616 	vnode_t fvp;
14617 	struct nameidata *fromnd, *tond;
14618 	/* carving out a chunk for structs that are too big to be on stack. */
14619 	struct {
14620 		struct nameidata from_node;
14621 		struct nameidata to_node;
14622 	} * __rename_data;
14623 
14624 	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
14625 	fromnd = &__rename_data->from_node;
14626 	tond = &__rename_data->to_node;
14627 
14628 	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
14629 	    OP_UNLINK, ctx);
14630 	if (error) {
14631 		goto out;
14632 	}
14633 	fvp  = fromnd->ni_vp;
14634 
14635 	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14636 	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
14637 	if (error) {
14638 		goto out1;
14639 	}
14640 
14641 	/*
14642 	 * Some sanity checks- new name can't be empty, "." or ".." or have
14643 	 * slashes.
14644 	 * (the length returned by copyinstr includes the terminating NUL)
14645 	 *
14646 	 * The FS rename VNOP is suppossed to handle this but we'll pick it
14647 	 * off here itself.
14648 	 */
14649 	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
14650 	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
14651 		error = EINVAL;
14652 		goto out1;
14653 	}
14654 	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
14655 		;
14656 	}
14657 	if (i < (int)name_len) {
14658 		error = EINVAL;
14659 		goto out1;
14660 	}
14661 
14662 #if CONFIG_MACF
14663 	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
14664 	    newname_buf);
14665 	if (error) {
14666 		goto out1;
14667 	}
14668 #endif
14669 
14670 	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
14671 	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
14672 	tond->ni_dvp = snapdvp;
14673 
14674 	error = namei(tond);
14675 	if (error) {
14676 		goto out2;
14677 	} else if (tond->ni_vp) {
14678 		/*
14679 		 * snapshot rename behaves differently than rename(2) - if the
14680 		 * new name exists, EEXIST is returned.
14681 		 */
14682 		vnode_put(tond->ni_vp);
14683 		error = EEXIST;
14684 		goto out2;
14685 	}
14686 
14687 	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
14688 	    &tond->ni_cnd, ctx);
14689 
14690 out2:
14691 	nameidone(tond);
14692 out1:
14693 	zfree(ZV_NAMEI, newname_buf);
14694 	vnode_put(fvp);
14695 	vnode_put(snapdvp);
14696 	vnode_put(rvp);
14697 	nameidone(fromnd);
14698 out:
14699 	kfree_type(typeof(*__rename_data), __rename_data);
14700 	return error;
14701 }
14702 
14703 /*
14704  * Mount a Filesystem snapshot
14705  *
14706  * get the vnode for the unnamed snapshot directory and the snapshot and
14707  * mount the snapshot.
14708  */
14709 static int __attribute__((noinline))
snapshot_mount(int dirfd,user_addr_t name,user_addr_t directory,__unused user_addr_t mnt_data,__unused uint32_t flags,vfs_context_t ctx)14710 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
14711     __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
14712 {
14713 	mount_t mp;
14714 	vnode_t rvp, snapdvp, snapvp, vp, pvp;
14715 	struct fs_snapshot_mount_args smnt_data;
14716 	int error, mount_flags = 0;
14717 	struct nameidata *snapndp, *dirndp;
14718 	/* carving out a chunk for structs that are too big to be on stack. */
14719 	struct {
14720 		struct nameidata snapnd;
14721 		struct nameidata dirnd;
14722 	} * __snapshot_mount_data;
14723 
14724 	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
14725 	snapndp = &__snapshot_mount_data->snapnd;
14726 	dirndp = &__snapshot_mount_data->dirnd;
14727 
14728 	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
14729 	    OP_LOOKUP, ctx);
14730 	if (error) {
14731 		goto out;
14732 	}
14733 
14734 	snapvp  = snapndp->ni_vp;
14735 	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
14736 		error = EIO;
14737 		goto out1;
14738 	}
14739 
14740 	/* Convert snapshot_mount flags to mount flags */
14741 	if (flags & SNAPSHOT_MNT_NOSUID) {
14742 		mount_flags |= MNT_NOSUID;
14743 	}
14744 	if (flags & SNAPSHOT_MNT_NODEV) {
14745 		mount_flags |= MNT_NODEV;
14746 	}
14747 	if (flags & SNAPSHOT_MNT_DONTBROWSE) {
14748 		mount_flags |= MNT_DONTBROWSE;
14749 	}
14750 	if (flags & SNAPSHOT_MNT_IGNORE_OWNERSHIP) {
14751 		mount_flags |= MNT_IGNORE_OWNERSHIP;
14752 	}
14753 	if (flags & SNAPSHOT_MNT_NOFOLLOW) {
14754 		mount_flags |= MNT_NOFOLLOW;
14755 	}
14756 
14757 	/* Get the vnode to be covered */
14758 	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
14759 	    UIO_USERSPACE, directory, ctx);
14760 	if (mount_flags & MNT_NOFOLLOW) {
14761 		dirndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
14762 	}
14763 
14764 	error = namei(dirndp);
14765 	if (error) {
14766 		goto out1;
14767 	}
14768 
14769 	vp = dirndp->ni_vp;
14770 	pvp = dirndp->ni_dvp;
14771 	mp = vnode_mount(rvp);
14772 
14773 	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
14774 		error = EINVAL;
14775 		goto out2;
14776 	}
14777 
14778 #if CONFIG_MACF
14779 	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
14780 	    mp->mnt_vfsstat.f_fstypename);
14781 	if (error) {
14782 		goto out2;
14783 	}
14784 #endif
14785 
14786 	smnt_data.sm_mp  = mp;
14787 	smnt_data.sm_cnp = &snapndp->ni_cnd;
14788 	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
14789 	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), mount_flags,
14790 	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);
14791 
14792 out2:
14793 	vnode_put(vp);
14794 	vnode_put(pvp);
14795 	nameidone(dirndp);
14796 out1:
14797 	vnode_put(snapvp);
14798 	vnode_put(snapdvp);
14799 	vnode_put(rvp);
14800 	nameidone(snapndp);
14801 out:
14802 	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
14803 	return error;
14804 }
14805 
14806 /*
14807  * Root from a snapshot of the filesystem
14808  *
14809  * Marks the filesystem to root from the given snapshot on next boot.
14810  */
14811 static int __attribute__((noinline))
snapshot_root(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14812 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
14813     vfs_context_t ctx)
14814 {
14815 	int error;
14816 	vnode_t rvp;
14817 	mount_t mp;
14818 	struct fs_snapshot_root_args root_data;
14819 	struct componentname cnp;
14820 	caddr_t name_buf;
14821 	size_t name_len;
14822 
14823 	error = vnode_getfromfd(ctx, dirfd, &rvp);
14824 	if (error) {
14825 		return error;
14826 	}
14827 	mp = vnode_mount(rvp);
14828 
14829 	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14830 	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14831 	if (error) {
14832 		zfree(ZV_NAMEI, name_buf);
14833 		vnode_put(rvp);
14834 		return error;
14835 	}
14836 
14837 	// XXX MAC checks ?
14838 
14839 	/*
14840 	 * Grab mount_iterref so that we can release the vnode,
14841 	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
14842 	 */
14843 	error = mount_iterref(mp, 0);
14844 	vnode_put(rvp);
14845 	if (error) {
14846 		zfree(ZV_NAMEI, name_buf);
14847 		return error;
14848 	}
14849 
14850 	memset(&cnp, 0, sizeof(cnp));
14851 	cnp.cn_pnbuf = (char *)name_buf;
14852 	cnp.cn_nameiop = LOOKUP;
14853 	cnp.cn_flags = ISLASTCN | HASBUF;
14854 	cnp.cn_pnlen = MAXPATHLEN;
14855 	cnp.cn_nameptr = cnp.cn_pnbuf;
14856 	cnp.cn_namelen = (int)name_len;
14857 	root_data.sr_cnp = &cnp;
14858 
14859 	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
14860 
14861 	mount_iterdrop(mp);
14862 	zfree(ZV_NAMEI, name_buf);
14863 
14864 	return error;
14865 }
14866 
14867 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14868 vfs_context_can_snapshot(vfs_context_t ctx)
14869 {
14870 	static const char * const snapshot_entitlements[] = {
14871 		"com.apple.private.vfs.snapshot",
14872 		"com.apple.developer.vfs.snapshot",
14873 		"com.apple.private.apfs.arv.limited.snapshot",
14874 	};
14875 	static const size_t nentitlements =
14876 	    sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14877 	size_t i;
14878 
14879 	task_t task = vfs_context_task(ctx);
14880 	for (i = 0; i < nentitlements; i++) {
14881 		if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14882 			return TRUE;
14883 		}
14884 	}
14885 	return FALSE;
14886 }
14887 
14888 /*
14889  * FS snapshot operations dispatcher
14890  */
14891 int
fs_snapshot(__unused proc_t p,struct fs_snapshot_args * uap,__unused int32_t * retval)14892 fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
14893     __unused int32_t *retval)
14894 {
14895 	int error;
14896 	vfs_context_t ctx = vfs_context_current();
14897 
14898 	AUDIT_ARG(fd, uap->dirfd);
14899 	AUDIT_ARG(value32, uap->op);
14900 
14901 	if (!vfs_context_can_snapshot(ctx)) {
14902 		return EPERM;
14903 	}
14904 
14905 	/*
14906 	 * Enforce user authorization for snapshot modification operations,
14907 	 * or if trying to root from snapshot.
14908 	 */
14909 	if (uap->op != SNAPSHOT_OP_MOUNT) {
14910 		vnode_t dvp = NULLVP;
14911 		vnode_t devvp = NULLVP;
14912 		mount_t mp;
14913 
14914 		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
14915 		if (error) {
14916 			return error;
14917 		}
14918 		mp = vnode_mount(dvp);
14919 		devvp = mp->mnt_devvp;
14920 
14921 		/* get an iocount on devvp */
14922 		if (devvp == NULLVP) {
14923 			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
14924 			/* for mounts which arent block devices */
14925 			if (error == ENOENT) {
14926 				error = ENXIO;
14927 			}
14928 		} else {
14929 			error = vnode_getwithref(devvp);
14930 		}
14931 
14932 		if (error) {
14933 			vnode_put(dvp);
14934 			return error;
14935 		}
14936 
14937 		if ((vfs_context_issuser(ctx) == 0) &&
14938 		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
14939 		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
14940 			error = EPERM;
14941 		}
14942 		vnode_put(dvp);
14943 		vnode_put(devvp);
14944 
14945 		if (error) {
14946 			return error;
14947 		}
14948 	}
14949 
14950 	switch (uap->op) {
14951 	case SNAPSHOT_OP_CREATE:
14952 		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
14953 		break;
14954 	case SNAPSHOT_OP_DELETE:
14955 		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
14956 		break;
14957 	case SNAPSHOT_OP_RENAME:
14958 		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
14959 		    uap->flags, ctx);
14960 		break;
14961 	case SNAPSHOT_OP_MOUNT:
14962 		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
14963 		    uap->data, uap->flags, ctx);
14964 		break;
14965 	case SNAPSHOT_OP_REVERT:
14966 		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
14967 		break;
14968 #if CONFIG_MNT_ROOTSNAP
14969 	case SNAPSHOT_OP_ROOT:
14970 		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
14971 		break;
14972 #endif /* CONFIG_MNT_ROOTSNAP */
14973 	default:
14974 		error = ENOSYS;
14975 	}
14976 
14977 	return error;
14978 }
14979