xref: /xnu-11215.61.5/bsd/vfs/vfs_syscalls.c (revision 4f1223e81cd707a65cc109d0b8ad6653699da3c4) !
1 /*
2  * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1989, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  * (c) UNIX System Laboratories, Inc.
32  * All or some portions of this file are derived from material licensed
33  * to the University of California by American Telephone and Telegraph
34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35  * the permission of UNIX System Laboratories, Inc.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.41 (Berkeley) 6/15/95
66  */
67 /*
68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69  * support for mandatory and extensible security protections.  This notice
70  * is included in support of clause 2.2 (b) of the Apple Public License,
71  * Version 2.0.
72  */
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112 
113 #include <vfs/vfs_disk_conditioner.h>
114 #if CONFIG_EXCLAVES
115 #include <vfs/vfs_exclave_fs.h>
116 #endif
117 
118 #include <security/audit/audit.h>
119 #include <bsm/audit_kevents.h>
120 
121 #include <mach/mach_types.h>
122 #include <kern/kern_types.h>
123 #include <kern/kalloc.h>
124 #include <kern/task.h>
125 
126 #include <vm/vm_pageout.h>
127 #include <vm/vm_protos.h>
128 #include <vm/memory_object_xnu.h>
129 
130 #include <libkern/OSAtomic.h>
131 #include <os/atomic_private.h>
132 #include <pexpert/pexpert.h>
133 #include <IOKit/IOBSD.h>
134 
135 // deps for MIG call
136 #include <kern/host.h>
137 #include <kern/ipc_misc.h>
138 #include <mach/host_priv.h>
139 #include <mach/vfs_nspace.h>
140 #include <os/log.h>
141 
142 #include <nfs/nfs_conf.h>
143 
144 #if ROUTEFS
145 #include <miscfs/routefs/routefs.h>
146 #endif /* ROUTEFS */
147 
148 #if CONFIG_MACF
149 #include <security/mac.h>
150 #include <security/mac_framework.h>
151 #endif
152 
153 #if CONFIG_FSE
154 #define GET_PATH(x) \
155 	((x) = get_pathbuff())
156 #define RELEASE_PATH(x) \
157 	release_pathbuff(x)
158 #else
159 #define GET_PATH(x)     \
160 	((x) = zalloc(ZV_NAMEI))
161 #define RELEASE_PATH(x) \
162 	zfree(ZV_NAMEI, x)
163 #endif /* CONFIG_FSE */
164 
165 #ifndef HFS_GET_BOOT_INFO
166 #define HFS_GET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00004)
167 #endif
168 
169 #ifndef HFS_SET_BOOT_INFO
170 #define HFS_SET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00005)
171 #endif
172 
173 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
174 #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
175 #endif
176 
177 extern void disk_conditioner_unmount(mount_t mp);
178 
179 /* struct for checkdirs iteration */
180 struct cdirargs {
181 	vnode_t olddp;
182 	vnode_t newdp;
183 };
184 /* callback  for checkdirs iteration */
185 static int checkdirs_callback(proc_t p, void * arg);
186 
187 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
188 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
189 void enablequotas(struct mount *mp, vfs_context_t ctx);
190 static int getfsstat_callback(mount_t mp, void * arg);
191 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
192 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
193 static int sync_callback(mount_t, void *);
194 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
195     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
196     boolean_t partial_copy);
197 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
198 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
199     struct componentname *cnp, user_addr_t fsmountargs,
200     int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
201 void vfs_notify_mount(vnode_t pdvp);
202 
203 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
204 
205 struct fd_vn_data * fg_vn_data_alloc(void);
206 
207 /*
208  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
209  * Concurrent lookups (or lookups by ids) on hard links can cause the
210  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
211  * does) to return ENOENT as the path cannot be returned from the name cache
212  * alone. We have no option but to retry and hope to get one namei->reverse path
213  * generation done without an intervening lookup, lookup by id on the hard link
214  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
215  * which currently are the MAC hooks for rename, unlink and rmdir.
216  */
217 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
218 
219 /* Max retry limit for rename due to vnode recycling. */
220 #define MAX_RENAME_ERECYCLE_RETRIES 1024
221 
222 #define MAX_LINK_ENOENT_RETRIES 1024
223 
224 /* Max retries for concurrent mounts on the same covered vnode. */
225 #define MAX_MOUNT_RETRIES       10
226 
227 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
228     int unlink_flags);
229 
230 #ifdef CONFIG_IMGSRC_ACCESS
231 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
232 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
233 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
234 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
235 static void mount_end_update(mount_t mp);
236 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
237 #endif /* CONFIG_IMGSRC_ACCESS */
238 
239 //snapshot functions
240 #if CONFIG_MNT_ROOTSNAP
241 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
242 #else
243 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
244 #endif
245 
246 __private_extern__
247 int sync_internal(void);
248 
249 __private_extern__
250 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
251 
252 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
253 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
254 
255 /* vars for sync mutex */
256 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
257 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
258 
259 extern lck_rw_t rootvnode_rw_lock;
260 
261 VFS_SMR_DECLARE;
262 extern uint32_t nc_smr_enabled;
263 
264 /*
265  * incremented each time a mount or unmount operation occurs
266  * used to invalidate the cached value of the rootvp in the
267  * mount structure utilized by cache_lookup_path
268  */
269 uint32_t mount_generation = 0;
270 
271 /* counts number of mount and unmount operations */
272 unsigned int vfs_nummntops = 0;
273 
274 /* system-wide, per-boot unique mount ID */
275 static _Atomic uint64_t mount_unique_id = 1;
276 
277 extern const struct fileops vnops;
278 #if CONFIG_APPLEDOUBLE
279 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
280 #endif /* CONFIG_APPLEDOUBLE */
281 
282 /* Maximum buffer length supported by fsgetpath(2) */
283 #define FSGETPATH_MAXBUFLEN  8192
284 
285 /*
286  * Virtual File System System Calls
287  */
288 
289 /*
290  * Private in-kernel mounting spi (specific use-cases only)
291  */
292 boolean_t
vfs_iskernelmount(mount_t mp)293 vfs_iskernelmount(mount_t mp)
294 {
295 	return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
296 }
297 
/*
 * In-kernel mount entry point: mount filesystem `fstype` on the vnode named
 * by `path`, or directly on the caller-supplied covered vnode `vp`/`pvp`.
 *
 * When `vp` is NULLVP the path is resolved here with namei() and the
 * resulting iocounts are dropped before returning; when the caller supplies
 * `vp` (and `pvp`), it keeps ownership of those references.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
	if (syscall_flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	/* Restrict caller-provided kernel flags to the sanitize mask. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the covered vnode; fabricate only the
		 * pathname-buffer fields of the componentname for
		 * mount_common()'s benefit.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Drop the lookup's iocounts only if we performed the lookup. */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
350 
351 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)352 vfs_mount_at_path(const char *fstype, const char *path,
353     vnode_t pvp, vnode_t vp, void *data, size_t datalen,
354     int mnt_flags, int flags)
355 {
356 	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
357 	int error, km_flags = 0;
358 	vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
359 
360 	/*
361 	 * This call is currently restricted to specific use cases.
362 	 */
363 	if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
364 		return ENOTSUP;
365 	}
366 
367 #if !defined(XNU_TARGET_OS_OSX)
368 	if (strcmp(fstype, "lifs") == 0) {
369 		syscall_flags |= MNT_NOEXEC;
370 	}
371 #endif
372 
373 	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
374 		km_flags |= KERNEL_MOUNT_NOAUTH;
375 	}
376 	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
377 		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
378 	}
379 
380 	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
381 	    syscall_flags, km_flags, ctx);
382 	if (error) {
383 		printf("%s: mount on %s failed, error %d\n", __func__, path,
384 		    error);
385 	}
386 
387 	return error;
388 }
389 
390 /*
391  * Mount a file system.
392  */
393 /* ARGSUSED */
394 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)395 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
396 {
397 	struct __mac_mount_args muap;
398 
399 	muap.type = uap->type;
400 	muap.path = uap->path;
401 	muap.flags = uap->flags;
402 	muap.data = uap->data;
403 	muap.mac_p = USER_ADDR_NULL;
404 	return __mac_mount(p, &muap, retval);
405 }
406 
407 int
fmount(__unused proc_t p,struct fmount_args * uap,__unused int32_t * retval)408 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
409 {
410 	struct componentname    cn;
411 	vfs_context_t           ctx = vfs_context_current();
412 	size_t                  dummy = 0;
413 	int                     error;
414 	int                     flags = uap->flags;
415 	char                    fstypename[MFSNAMELEN];
416 	char                    *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
417 	vnode_t                 pvp;
418 	vnode_t                 vp;
419 
420 	AUDIT_ARG(fd, uap->fd);
421 	AUDIT_ARG(fflags, flags);
422 	/* fstypename will get audited by mount_common */
423 
424 	/* Sanity check the flags */
425 	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
426 		return ENOTSUP;
427 	}
428 
429 	if (flags & MNT_UNION) {
430 		return EPERM;
431 	}
432 
433 	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
434 	if (error) {
435 		return error;
436 	}
437 
438 	if ((error = file_vnode(uap->fd, &vp)) != 0) {
439 		return error;
440 	}
441 
442 	if ((error = vnode_getwithref(vp)) != 0) {
443 		file_drop(uap->fd);
444 		return error;
445 	}
446 
447 	pvp = vnode_getparent(vp);
448 	if (pvp == NULL) {
449 		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
450 			error = EBUSY;
451 		} else {
452 			error = EINVAL;
453 		}
454 		vnode_put(vp);
455 		file_drop(uap->fd);
456 		return error;
457 	}
458 
459 	memset(&cn, 0, sizeof(struct componentname));
460 	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
461 	cn.cn_pnlen = MAXPATHLEN;
462 
463 	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
464 		zfree(ZV_NAMEI, cn.cn_pnbuf);
465 		vnode_put(pvp);
466 		vnode_put(vp);
467 		file_drop(uap->fd);
468 		return error;
469 	}
470 
471 	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);
472 
473 	zfree(ZV_NAMEI, cn.cn_pnbuf);
474 	vnode_put(pvp);
475 	vnode_put(vp);
476 	file_drop(uap->fd);
477 
478 	return error;
479 }
480 
481 #define MAX_GRAFT_METADATA_SIZE             16384 /* bytes */
482 
483 /*
484  * Get the size of a graft file (a manifest or payload file).
485  * The vp should be an iocounted vnode.
486  */
487 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)488 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
489 {
490 	struct stat64 sb = {};
491 	int error;
492 
493 	*size = 0;
494 
495 	error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
496 	if (error) {
497 		return error;
498 	}
499 
500 	if (sb.st_size == 0) {
501 		error = ENODATA;
502 	} else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
503 		error = EFBIG;
504 	} else {
505 		*size = (size_t) sb.st_size;
506 	}
507 
508 	return error;
509 }
510 
511 /*
512  * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
513  * `size` must already be validated.
514  */
515 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)516 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
517 {
518 	return vn_rdwr(UIO_READ, graft_vp,
519 	           (caddr_t) buf, (int) size, /* offset */ 0,
520 	           UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
521 	           vfs_context_ucred(vctx), /* resid */ NULL,
522 	           vfs_context_proc(vctx));
523 }
524 
525 /*
526  * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
527  * and read it into `buf`.
528  * If `path_prefix` is non-NULL, verify that the file path has that prefix.
529  */
530 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,const char * path_prefix,size_t * size,void * buf)531 graft_secureboot_read_fd(int fd, vfs_context_t vctx, const char *path_prefix, size_t *size, void *buf)
532 {
533 	vnode_t metadata_vp = NULLVP;
534 	char *path = NULL;
535 	int error;
536 
537 	// Convert this graft fd to a vnode.
538 	if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
539 		goto out;
540 	}
541 
542 	// Verify that the vnode path starts with `path_prefix` if it was passed.
543 	if (path_prefix) {
544 		int len = MAXPATHLEN;
545 		path = zalloc(ZV_NAMEI);
546 		if ((error = vn_getpath(metadata_vp, path, &len))) {
547 			goto out;
548 		}
549 		if (strncmp(path, path_prefix, strlen(path_prefix))) {
550 			error = EINVAL;
551 			goto out;
552 		}
553 	}
554 
555 	// Get (and validate) size information.
556 	if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
557 		goto out;
558 	}
559 
560 	// Read each file into the provided buffer - we must get the expected amount of bytes.
561 	if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
562 		goto out;
563 	}
564 
565 out:
566 	if (path) {
567 		zfree(ZV_NAMEI, path);
568 	}
569 	if (metadata_vp) {
570 		vnode_put(metadata_vp);
571 		metadata_vp = NULLVP;
572 	}
573 
574 	return error;
575 }
576 
577 #if XNU_TARGET_OS_OSX
578 #if defined(__arm64e__)
579 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/manifests/"
580 #else /* x86_64 */
581 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/"
582 #endif /* x86_64 */
583 #else /* !XNU_TARGET_OS_OSX */
584 #define MOBILE_ASSET_DATA_VAULT_PATH "/private/var/MobileAsset/AssetsV2/manifests/"
585 #endif /* !XNU_TARGET_OS_OSX */
586 
587 /*
588  * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
589  * provided in `gfs`, saving the size of data read in `gfs`.
590  */
591 static int
graft_secureboot_read_metadata(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)592 graft_secureboot_read_metadata(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
593     vfs_context_t vctx, fsioc_graft_fs_t *gfs)
594 {
595 	const char *manifest_path_prefix = NULL;
596 	int error;
597 
598 	// For Mobile Asset, make sure that the manifest comes from a data vault.
599 	if (graft_type == GRAFTDMG_CRYPTEX_MOBILE_ASSET) {
600 		manifest_path_prefix = MOBILE_ASSET_DATA_VAULT_PATH;
601 	}
602 
603 	// Read the authentic manifest.
604 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
605 	    manifest_path_prefix, &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
606 		return error;
607 	}
608 
609 	// The user manifest is currently unused, but set its size.
610 	gfs->user_manifest_size = 0;
611 
612 	// Read the payload.
613 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
614 	    NULL, &gfs->payload_size, gfs->payload))) {
615 		return error;
616 	}
617 
618 	return 0;
619 }
620 
621 /*
622  * Call into the filesystem to verify and graft a cryptex.
623  */
624 static int
graft_secureboot_cryptex(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,vnode_t cryptex_vp,vnode_t mounton_vp)625 graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
626     vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
627 {
628 	fsioc_graft_fs_t gfs = {};
629 	uint64_t graft_dir_ino = 0;
630 	struct stat64 sb = {};
631 	int error;
632 
633 	// Pre-flight arguments.
634 	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
635 		// Make sure that this graft version matches what we support.
636 		return ENOTSUP;
637 	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
638 		// For this type, cryptex VP must live on same volume as the target of graft.
639 		return EXDEV;
640 	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
641 		// We cannot graft upon non-directories.
642 		return ENOTDIR;
643 	} else if (cryptex_vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) {
644 		// We do not allow grafts inside disk images.
645 		return ENODEV;
646 	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
647 	    sbc_args->sbc_payload_fd < 0) {
648 		// We cannot graft without a manifest and payload.
649 		return EINVAL;
650 	}
651 
652 	if (mounton_vp) {
653 		// Get the mounton's inode number.
654 		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
655 		if (error) {
656 			return error;
657 		}
658 		graft_dir_ino = (uint64_t) sb.st_ino;
659 	}
660 
661 	// Create buffers (of our maximum-defined size) to store authentication info.
662 	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
663 	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
664 
665 	if (!gfs.authentic_manifest || !gfs.payload) {
666 		error = ENOMEM;
667 		goto out;
668 	}
669 
670 	// Read our fd's into our buffers.
671 	// (Note that this will set the buffer size fields in `gfs`.)
672 	error = graft_secureboot_read_metadata(graft_type, sbc_args, vctx, &gfs);
673 	if (error) {
674 		goto out;
675 	}
676 
677 	gfs.graft_version = FSIOC_GRAFT_VERSION;
678 	gfs.graft_type = graft_type;
679 	gfs.graft_4cc = sbc_args->sbc_4cc;
680 	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
681 		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
682 	}
683 	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
684 		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
685 	}
686 	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
687 		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
688 	}
689 	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
690 		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
691 	}
692 	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
693 		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
694 	}
695 	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
696 		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
697 	}
698 	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)
699 
700 	// Call into the FS to perform the graft (and validation).
701 	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);
702 
703 out:
704 	if (gfs.authentic_manifest) {
705 		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
706 		gfs.authentic_manifest = NULL;
707 	}
708 	if (gfs.payload) {
709 		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
710 		gfs.payload = NULL;
711 	}
712 
713 	return error;
714 }
715 
716 #define GRAFTDMG_ENTITLEMENT  "com.apple.private.vfs.graftdmg"
717 
/*
 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	/* Grafting is gated on a private entitlement. */
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy the user graft-argument union into the kernel. */
	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(&nd);
		if (error) {
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		goto graftout;
	}

	/* Reject out-of-range graft types; otherwise hand off to the verifier. */
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
	}

graftout:
	if (cryptex_vp) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(mounton_vp);
		mounton_vp = NULLVP;
	}
	/* nameidone() must be paired with namei(): only if a mountdir was given. */
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
789 
790 /*
791  * Ungraft a cryptex disk image (via mount dir FD)
792  * { int ungraftdmg(const char *mountdir, uint64_t flags); }
793  */
794 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)795 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
796 {
797 	int error = 0;
798 	user_addr_t ua_mountdir = uap->mountdir;
799 	fsioc_ungraft_fs_t ugfs;
800 	vnode_t mounton_vp = NULLVP;
801 	struct nameidata nd = {};
802 	vfs_context_t ctx = vfs_context_current();
803 
804 	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
805 		return EPERM;
806 	}
807 
808 	if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
809 		return EINVAL;
810 	}
811 
812 	ugfs.ungraft_flags = 0;
813 
814 	// Acquire vnode for mount-on path
815 	NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
816 	    UIO_USERSPACE, ua_mountdir, ctx);
817 
818 	error = namei(&nd);
819 	if (error) {
820 		return error;
821 	}
822 	mounton_vp = nd.ni_vp;
823 
824 	// Call into the FS to perform the ungraft
825 	error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
826 
827 	vnode_put(mounton_vp);
828 	nameidone(&nd);
829 
830 	return error;
831 }
832 
833 
void
vfs_notify_mount(vnode_t pdvp)
{
	/*
	 * Broadcast a VQ_MOUNT vfs event, then post a NOTE_WRITE knote on
	 * the supplied directory vnode so watchers observe the change.
	 */
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
840 
841 /*
842  * __mac_mount:
843  *	Mount a file system taking into account MAC label behavior.
844  *	See mount(2) man page for more information
845  *
846  * Parameters:    p                        Process requesting the mount
847  *                uap                      User argument descriptor (see below)
848  *                retval                   (ignored)
849  *
850  * Indirect:      uap->type                Filesystem type
851  *                uap->path                Path to mount
852  *                uap->data                Mount arguments
853  *                uap->mac_p               MAC info
854  *                uap->flags               Mount flags
855  *
856  *
857  * Returns:        0                       Success
858  *                !0                       Not success
859  */
860 boolean_t root_fs_upgrade_try = FALSE;
861 
862 #define MAX_NESTED_UNION_MOUNTS  10
863 
864 int
__mac_mount(struct proc * p,register struct __mac_mount_args * uap,__unused int32_t * retval)865 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
866 {
867 	vnode_t pvp = NULLVP;
868 	vnode_t vp = NULLVP;
869 	int need_nameidone = 0;
870 	vfs_context_t ctx = vfs_context_current();
871 	char fstypename[MFSNAMELEN];
872 	struct nameidata nd;
873 	size_t dummy = 0;
874 	char *labelstr = NULL;
875 	size_t labelsz = 0;
876 	int flags = uap->flags;
877 	int error;
878 	int num_retries = 0;
879 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
880 	boolean_t is_64bit = IS_64BIT_PROCESS(p);
881 #else
882 #pragma unused(p)
883 #endif
884 	/*
885 	 * Get the fs type name from user space
886 	 */
887 	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
888 	if (error) {
889 		return error;
890 	}
891 
892 retry:
893 	/*
894 	 * Get the vnode to be covered
895 	 */
896 	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
897 	    UIO_USERSPACE, uap->path, ctx);
898 	if (flags & MNT_NOFOLLOW) {
899 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
900 	}
901 	error = namei(&nd);
902 	if (error) {
903 		goto out;
904 	}
905 	need_nameidone = 1;
906 	vp = nd.ni_vp;
907 	pvp = nd.ni_dvp;
908 
909 #ifdef CONFIG_IMGSRC_ACCESS
910 	/* Mounting image source cannot be batched with other operations */
911 	if (flags == MNT_IMGSRC_BY_INDEX) {
912 		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
913 		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
914 		goto out;
915 	}
916 #endif /* CONFIG_IMGSRC_ACCESS */
917 
918 #if CONFIG_MACF
919 	/*
920 	 * Get the label string (if any) from user space
921 	 */
922 	if (uap->mac_p != USER_ADDR_NULL) {
923 		struct user_mac mac;
924 		size_t ulen = 0;
925 
926 		if (is_64bit) {
927 			struct user64_mac mac64;
928 			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
929 			mac.m_buflen = (user_size_t)mac64.m_buflen;
930 			mac.m_string = (user_addr_t)mac64.m_string;
931 		} else {
932 			struct user32_mac mac32;
933 			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
934 			mac.m_buflen = mac32.m_buflen;
935 			mac.m_string = mac32.m_string;
936 		}
937 		if (error) {
938 			goto out;
939 		}
940 		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
941 		    (mac.m_buflen < 2)) {
942 			error = EINVAL;
943 			goto out;
944 		}
945 		labelsz = mac.m_buflen;
946 		labelstr = kalloc_data(labelsz, Z_WAITOK);
947 		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
948 		if (error) {
949 			goto out;
950 		}
951 		AUDIT_ARG(mac_string, labelstr);
952 	}
953 #endif /* CONFIG_MACF */
954 
955 	AUDIT_ARG(fflags, flags);
956 
957 	if (flags & MNT_UNION) {
958 #if CONFIG_UNION_MOUNTS
959 		mount_t mp = vp->v_mount;
960 		int nested_union_mounts = 0;
961 
962 		name_cache_lock_shared();
963 
964 		/* Walk up the vnodecovered chain and check for nested union mounts. */
965 		mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);
966 		while (mp) {
967 			if (!(mp->mnt_flag & MNT_UNION)) {
968 				break;
969 			}
970 			mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);
971 
972 			/*
973 			 * Limit the max nested unon mounts to prevent stack exhaustion
974 			 * when calling lookup_traverse_union().
975 			 */
976 			if (++nested_union_mounts >= MAX_NESTED_UNION_MOUNTS) {
977 				error = ELOOP;
978 				break;
979 			}
980 		}
981 
982 		name_cache_unlock();
983 		if (error) {
984 			goto out;
985 		}
986 #else
987 		error = EPERM;
988 		goto out;
989 #endif /* CONFIG_UNION_MOUNTS */
990 	}
991 
992 	if ((vp->v_flag & VROOT) &&
993 	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
994 #if CONFIG_UNION_MOUNTS
995 		if (!(flags & MNT_UNION)) {
996 			flags |= MNT_UPDATE;
997 		} else {
998 			/*
999 			 * For a union mount on '/', treat it as fresh
1000 			 * mount instead of update.
1001 			 * Otherwise, union mouting on '/' used to panic the
1002 			 * system before, since mnt_vnodecovered was found to
1003 			 * be NULL for '/' which is required for unionlookup
1004 			 * after it gets ENOENT on union mount.
1005 			 */
1006 			flags = (flags & ~(MNT_UPDATE));
1007 		}
1008 #else
1009 		flags |= MNT_UPDATE;
1010 #endif /* CONFIG_UNION_MOUNTS */
1011 
1012 #if SECURE_KERNEL
1013 		if ((flags & MNT_RDONLY) == 0) {
1014 			/* Release kernels are not allowed to mount "/" as rw */
1015 			error = EPERM;
1016 			goto out;
1017 		}
1018 #endif
1019 
1020 		/*
1021 		 * See 7392553 for more details on why this check exists.
1022 		 * Suffice to say: If this check is ON and something tries
1023 		 * to mount the rootFS RW, we'll turn off the codesign
1024 		 * bitmap optimization.
1025 		 */
1026 #if CHECK_CS_VALIDATION_BITMAP
1027 		if ((flags & MNT_RDONLY) == 0) {
1028 			root_fs_upgrade_try = TRUE;
1029 		}
1030 #endif
1031 	}
1032 
1033 	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
1034 	    labelstr, ctx);
1035 
1036 out:
1037 
1038 #if CONFIG_MACF
1039 	kfree_data(labelstr, labelsz);
1040 #endif /* CONFIG_MACF */
1041 
1042 	if (vp) {
1043 		vnode_put(vp);
1044 		vp = NULLVP;
1045 	}
1046 	if (pvp) {
1047 		vnode_put(pvp);
1048 		pvp = NULLVP;
1049 	}
1050 	if (need_nameidone) {
1051 		nameidone(&nd);
1052 		need_nameidone = 0;
1053 	}
1054 
1055 	if (error == EBUSY) {
1056 		/* Retry the lookup and mount again due to concurrent mounts. */
1057 		if (++num_retries < MAX_MOUNT_RETRIES) {
1058 			goto retry;
1059 		}
1060 	}
1061 
1062 	return error;
1063 }
1064 
/*
 * common mount implementation (final stage of mounting)
 *
 * Arguments:
 *  fstypename	file system type (i.e. its vfs name)
 *  pvp		parent of covered vnode
 *  vp		covered vnode
 *  cnp		component name (i.e. path) of covered vnode
 *  fsmountargs	file system specific data
 *  flags	generic mount flags
 *  internal_flags	internal KERNEL_MOUNT_* flags; KERNEL_MOUNT_KMOUNT
 *		marks mounts initiated from inside the kernel
 *  labelstr	optional MAC label
 *  ctx		caller's context
 */
1079 static int
mount_common(const char * fstypename,vnode_t pvp,vnode_t vp,struct componentname * cnp,user_addr_t fsmountargs,int flags,uint32_t internal_flags,char * labelstr,vfs_context_t ctx)1080 mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
1081     struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
1082     char *labelstr, vfs_context_t ctx)
1083 {
1084 #if !CONFIG_MACF
1085 #pragma unused(labelstr)
1086 #endif
1087 	struct vnode *devvp = NULLVP;
1088 	struct vnode *device_vnode = NULLVP;
1089 #if CONFIG_MACF
1090 	struct vnode *rvp;
1091 #endif
1092 	struct mount *mp = NULL;
1093 	struct vfstable *vfsp = (struct vfstable *)0;
1094 	struct proc *p = vfs_context_proc(ctx);
1095 	int error, flag = 0;
1096 	bool flag_set = false;
1097 	user_addr_t devpath = USER_ADDR_NULL;
1098 	int ronly = 0;
1099 	int mntalloc = 0;
1100 	boolean_t vfsp_ref = FALSE;
1101 	boolean_t is_rwlock_locked = FALSE;
1102 	boolean_t did_rele = FALSE;
1103 	boolean_t have_usecount = FALSE;
1104 	boolean_t did_set_lmount = FALSE;
1105 	boolean_t did_set_vmount = FALSE;
1106 	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
1107 
1108 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
1109 	/* Check for mutually-exclusive flag bits */
1110 	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
1111 	int bitcount = 0;
1112 	while (checkflags != 0) {
1113 		checkflags &= (checkflags - 1);
1114 		bitcount++;
1115 	}
1116 
1117 	if (bitcount > 1) {
1118 		//not allowed to request multiple mount-by-role flags
1119 		error = EINVAL;
1120 		goto out1;
1121 	}
1122 #endif
1123 
1124 	/*
1125 	 * Process an update for an existing mount
1126 	 */
1127 	if (flags & MNT_UPDATE) {
1128 		if ((vp->v_flag & VROOT) == 0) {
1129 			error = EINVAL;
1130 			goto out1;
1131 		}
1132 		mp = vp->v_mount;
1133 
1134 		/* if unmount or mount in progress, return error */
1135 		mount_lock_spin(mp);
1136 		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
1137 			mount_unlock(mp);
1138 			error = EBUSY;
1139 			goto out1;
1140 		}
1141 		mp->mnt_lflag |= MNT_LMOUNT;
1142 		did_set_lmount = TRUE;
1143 		mount_unlock(mp);
1144 		lck_rw_lock_exclusive(&mp->mnt_rwlock);
1145 		is_rwlock_locked = TRUE;
1146 		/*
1147 		 * We only allow the filesystem to be reloaded if it
1148 		 * is currently mounted read-only.
1149 		 */
1150 		if ((flags & MNT_RELOAD) &&
1151 		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1152 			error = ENOTSUP;
1153 			goto out1;
1154 		}
1155 
1156 		/*
1157 		 * If content protection is enabled, update mounts are not
1158 		 * allowed to turn it off.
1159 		 */
1160 		if ((mp->mnt_flag & MNT_CPROTECT) &&
1161 		    ((flags & MNT_CPROTECT) == 0)) {
1162 			error = EINVAL;
1163 			goto out1;
1164 		}
1165 
1166 		/*
1167 		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
1168 		 * failure to return an error for this so we'll just silently
1169 		 * add it if it is not passed in.
1170 		 */
1171 		if ((mp->mnt_flag & MNT_REMOVABLE) &&
1172 		    ((flags & MNT_REMOVABLE) == 0)) {
1173 			flags |= MNT_REMOVABLE;
1174 		}
1175 
1176 		/* Can't downgrade the backer of the root FS */
1177 		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
1178 		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
1179 			error = ENOTSUP;
1180 			goto out1;
1181 		}
1182 
1183 		/*
1184 		 * Only root, or the user that did the original mount is
1185 		 * permitted to update it.
1186 		 */
1187 		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1188 		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
1189 			goto out1;
1190 		}
1191 #if CONFIG_MACF
1192 		error = mac_mount_check_remount(ctx, mp, flags);
1193 		if (error != 0) {
1194 			goto out1;
1195 		}
1196 #endif
1197 		/*
1198 		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
1199 		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
1200 		 */
1201 		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1202 			flags |= MNT_NOSUID | MNT_NODEV;
1203 			if (mp->mnt_flag & MNT_NOEXEC) {
1204 				flags |= MNT_NOEXEC;
1205 			}
1206 		}
1207 		flag = mp->mnt_flag;
1208 		flag_set = true;
1209 
1210 
1211 
1212 		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
1213 
1214 		vfsp = mp->mnt_vtable;
1215 		goto update;
1216 	} // MNT_UPDATE
1217 
1218 	/*
1219 	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
1220 	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
1221 	 */
1222 	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1223 		flags |= MNT_NOSUID | MNT_NODEV;
1224 		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
1225 			flags |= MNT_NOEXEC;
1226 		}
1227 	}
1228 
1229 	/* XXXAUDIT: Should we capture the type on the error path as well? */
1230 	/* XXX cast-away const (audit_arg_text() does not modify its input) */
1231 	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
1232 	mount_list_lock();
1233 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1234 		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
1235 			vfsp->vfc_refcount++;
1236 			vfsp_ref = TRUE;
1237 			break;
1238 		}
1239 	}
1240 	mount_list_unlock();
1241 	if (vfsp == NULL) {
1242 		error = ENODEV;
1243 		goto out1;
1244 	}
1245 
1246 	/*
1247 	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
1248 	 * except in ROSV configs and for the initial BaseSystem root.
1249 	 */
1250 	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
1251 	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
1252 	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
1253 		error = EINVAL;  /* unsupported request */
1254 		goto out1;
1255 	}
1256 
1257 	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
1258 	if (error != 0) {
1259 		goto out1;
1260 	}
1261 
1262 	/*
1263 	 * Upon successful of prepare_coveredvp(), VMOUNT is set for the covered vp.
1264 	 */
1265 	did_set_vmount = TRUE;
1266 
1267 	/*
1268 	 * Allocate and initialize the filesystem (mount_t)
1269 	 */
1270 	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
1271 	mntalloc = 1;
1272 
1273 	/* Initialize the default IO constraints */
1274 	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
1275 	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
1276 	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
1277 	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
1278 	mp->mnt_devblocksize = DEV_BSIZE;
1279 	mp->mnt_alignmentmask = PAGE_MASK;
1280 	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
1281 	mp->mnt_ioscale = 1;
1282 	mp->mnt_ioflags = 0;
1283 	mp->mnt_realrootvp = NULLVP;
1284 	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
1285 
1286 	mp->mnt_lflag |= MNT_LMOUNT;
1287 	did_set_lmount = TRUE;
1288 
1289 	TAILQ_INIT(&mp->mnt_vnodelist);
1290 	TAILQ_INIT(&mp->mnt_workerqueue);
1291 	TAILQ_INIT(&mp->mnt_newvnodes);
1292 	mount_lock_init(mp);
1293 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
1294 	is_rwlock_locked = TRUE;
1295 	mp->mnt_op = vfsp->vfc_vfsops;
1296 	mp->mnt_vtable = vfsp;
1297 	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
1298 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
1299 	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
1300 	do {
1301 		size_t pathlen = MAXPATHLEN;
1302 
1303 		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
1304 			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1305 		}
1306 	} while (0);
1307 	mp->mnt_vnodecovered = vp;
1308 	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
1309 	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
1310 	mp->mnt_devbsdunit = 0;
1311 	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
1312 
1313 	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
1314 	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
1315 
1316 	if (kernelmount) {
1317 		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
1318 	}
1319 	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
1320 		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
1321 	}
1322 
1323 	if (KERNEL_MOUNT_DEVFS & internal_flags) {
1324 		// kernel mounted devfs
1325 		mp->mnt_kern_flag |= MNTK_SYSTEM;
1326 	}
1327 
1328 update:
1329 
1330 	/*
1331 	 * Set the mount level flags.
1332 	 */
1333 	if (flags & MNT_RDONLY) {
1334 		mp->mnt_flag |= MNT_RDONLY;
1335 	} else if (mp->mnt_flag & MNT_RDONLY) {
1336 		// disallow read/write upgrades of file systems that
1337 		// had the TYPENAME_OVERRIDE feature set.
1338 		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
1339 			error = EPERM;
1340 			goto out1;
1341 		}
1342 		mp->mnt_kern_flag |= MNTK_WANTRDWR;
1343 	}
1344 	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1345 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1346 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1347 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1348 	    MNT_QUARANTINE | MNT_CPROTECT);
1349 
1350 #if SECURE_KERNEL
1351 #if !CONFIG_MNT_SUID
1352 	/*
1353 	 * On release builds of iOS based platforms, always enforce NOSUID on
1354 	 * all mounts. We do this here because we can catch update mounts as well as
1355 	 * non-update mounts in this case.
1356 	 */
1357 	mp->mnt_flag |= (MNT_NOSUID);
1358 #endif
1359 #endif
1360 
1361 	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1362 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1363 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1364 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1365 	    MNT_QUARANTINE | MNT_CPROTECT);
1366 
1367 #if CONFIG_MACF
1368 	if (flags & MNT_MULTILABEL) {
1369 		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
1370 			error = EINVAL;
1371 			goto out1;
1372 		}
1373 		mp->mnt_flag |= MNT_MULTILABEL;
1374 	}
1375 #endif
1376 	/*
1377 	 * Process device path for local file systems if requested.
1378 	 *
1379 	 * Snapshot and mount-by-role mounts do not use this path; they are
1380 	 * passing other opaque data in the device path field.
1381 	 *
1382 	 * Basesystemroot mounts pass a device path to be resolved here,
1383 	 * but it's just a char * already inside the kernel, which
1384 	 * kernel_mount() shoved into a user_addr_t to call us. So for such
1385 	 * mounts we must skip copyin (both of the address and of the string
1386 	 * (in NDINIT).
1387 	 */
1388 	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
1389 	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
1390 		boolean_t do_copyin_devpath = true;
1391 #if CONFIG_BASESYSTEMROOT
1392 		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1393 			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worh nothing:
1394 			// We have been passed fsmountargs, which is typed as a user_addr_t,
1395 			// but is actually a char ** pointing to a (kernelspace) string.
1396 			// We manually unpack it with a series of casts and dereferences
1397 			// that reverses what was done just above us on the stack in
1398 			// imageboot_pivot_image().
1399 			// After retrieving the path to the dev node (which we will NDINIT
1400 			// in a moment), we pass NULL fsmountargs on to the filesystem.
1401 			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
1402 			char **devnamepp = (char **)fsmountargs;
1403 			char *devnamep = *devnamepp;
1404 			devpath = CAST_USER_ADDR_T(devnamep);
1405 			do_copyin_devpath = false;
1406 			fsmountargs = USER_ADDR_NULL;
1407 
1408 			//Now that we have a mp, denote that this mount is for the basesystem.
1409 			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
1410 		}
1411 #endif // CONFIG_BASESYSTEMROOT
1412 
1413 		if (do_copyin_devpath) {
1414 			if (vfs_context_is64bit(ctx)) {
1415 				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1416 					goto out1;
1417 				}
1418 				fsmountargs += sizeof(devpath);
1419 			} else {
1420 				user32_addr_t tmp;
1421 				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1422 					goto out1;
1423 				}
1424 				/* munge into LP64 addr */
1425 				devpath = CAST_USER_ADDR_T(tmp);
1426 				fsmountargs += sizeof(tmp);
1427 			}
1428 		}
1429 
1430 		/* Lookup device and authorize access to it */
1431 		if ((devpath)) {
1432 			struct nameidata nd;
1433 
1434 			enum uio_seg seg = UIO_USERSPACE;
1435 #if CONFIG_BASESYSTEMROOT
1436 			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1437 				seg = UIO_SYSSPACE;
1438 			}
1439 #endif // CONFIG_BASESYSTEMROOT
1440 
1441 			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
1442 			if (flags & MNT_NOFOLLOW) {
1443 				nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
1444 			}
1445 			if ((error = namei(&nd))) {
1446 				goto out1;
1447 			}
1448 
1449 			devvp = nd.ni_vp;
1450 
1451 			if (devvp->v_type != VBLK) {
1452 				error = ENOTBLK;
1453 				nameidone(&nd);
1454 				goto out2;
1455 			}
1456 			if (major(devvp->v_rdev) >= nblkdev) {
1457 				error = ENXIO;
1458 				nameidone(&nd);
1459 				goto out2;
1460 			}
1461 			/*
1462 			 * If mount by non-root, then verify that user has necessary
1463 			 * permissions on the device.
1464 			 */
1465 			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1466 				kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;
1467 
1468 				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1469 					accessmode |= KAUTH_VNODE_WRITE_DATA;
1470 				}
1471 				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1472 					nameidone(&nd);
1473 					goto out2;
1474 				}
1475 			}
1476 
1477 			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1478 			nameidone(&nd);
1479 		}
1480 		/* On first mount, preflight and open device */
1481 		if (devpath && ((flags & MNT_UPDATE) == 0)) {
1482 			if ((error = vnode_ref(devvp))) {
1483 				goto out2;
1484 			}
1485 			/*
1486 			 * Disallow multiple mounts of the same device.
1487 			 * Disallow mounting of a device that is currently in use
1488 			 * (except for root, which might share swap device for miniroot).
1489 			 * Flush out any old buffers remaining from a previous use.
1490 			 */
1491 			if ((error = vfs_setmounting(devvp))) {
1492 				vnode_rele(devvp);
1493 				goto out2;
1494 			}
1495 
1496 			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1497 				error = EBUSY;
1498 				goto out3;
1499 			}
1500 			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1501 				error = ENOTBLK;
1502 				goto out3;
1503 			}
1504 			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1505 				goto out3;
1506 			}
1507 
1508 			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1509 #if CONFIG_MACF
1510 			error = mac_vnode_check_open(ctx,
1511 			    devvp,
1512 			    ronly ? FREAD : FREAD | FWRITE);
1513 			if (error) {
1514 				goto out3;
1515 			}
1516 #endif /* MAC */
1517 			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1518 				goto out3;
1519 			}
1520 
1521 			mp->mnt_devvp = devvp;
1522 			device_vnode = devvp;
1523 		} else if ((mp->mnt_flag & MNT_RDONLY) &&
1524 		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1525 		    (device_vnode = mp->mnt_devvp)) {
1526 			dev_t dev;
1527 			int maj;
1528 			/*
1529 			 * If upgrade to read-write by non-root, then verify
1530 			 * that user has necessary permissions on the device.
1531 			 */
1532 			vnode_getalways(device_vnode);
1533 
1534 			if (suser(vfs_context_ucred(ctx), NULL) &&
1535 			    (error = vnode_authorize(device_vnode, NULL,
1536 			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1537 			    ctx)) != 0) {
1538 				vnode_put(device_vnode);
1539 				goto out2;
1540 			}
1541 
1542 			/* Tell the device that we're upgrading */
1543 			dev = (dev_t)device_vnode->v_rdev;
1544 			maj = major(dev);
1545 
1546 			if ((u_int)maj >= (u_int)nblkdev) {
1547 				panic("Volume mounted on a device with invalid major number.");
1548 			}
1549 
1550 			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1551 			vnode_put(device_vnode);
1552 			device_vnode = NULLVP;
1553 			if (error != 0) {
1554 				goto out2;
1555 			}
1556 		}
1557 	} // localargs && !(snapshot | data | vm)
1558 
1559 #if CONFIG_MACF
1560 	if ((flags & MNT_UPDATE) == 0) {
1561 		mac_mount_label_init(mp);
1562 		mac_mount_label_associate(ctx, mp);
1563 	}
1564 	if (labelstr) {
1565 		if ((flags & MNT_UPDATE) != 0) {
1566 			error = mac_mount_check_label_update(ctx, mp);
1567 			if (error != 0) {
1568 				goto out3;
1569 			}
1570 		}
1571 	}
1572 #endif
1573 	/*
1574 	 * Mount the filesystem.  We already asserted that internal_flags
1575 	 * cannot have more than one mount-by-role bit set.
1576 	 */
1577 	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1578 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1579 		    (caddr_t)fsmountargs, 0, ctx);
1580 	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1581 #if CONFIG_ROSV_STARTUP
1582 		struct mount *origin_mp = (struct mount*)fsmountargs;
1583 		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1584 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1585 		if (error) {
1586 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1587 		} else {
1588 			/* Mark volume associated with system volume */
1589 			mp->mnt_kern_flag |= MNTK_SYSTEM;
1590 
1591 			/* Attempt to acquire the mnt_devvp and set it up */
1592 			struct vnode *mp_devvp = NULL;
1593 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1594 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1595 				    0, &mp_devvp, vfs_context_kernel());
1596 				if (!lerr) {
1597 					mp->mnt_devvp = mp_devvp;
1598 					//vnode_lookup took an iocount, need to drop it.
1599 					vnode_put(mp_devvp);
1600 					// now set `device_vnode` to the devvp that was acquired.
1601 					// this is needed in order to ensure vfs_init_io_attributes is invoked.
1602 					// note that though the iocount above was dropped, the mount acquires
1603 					// an implicit reference against the device.
1604 					device_vnode = mp_devvp;
1605 				}
1606 			}
1607 		}
1608 #else
1609 		error = EINVAL;
1610 #endif
1611 	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1612 #if CONFIG_MOUNT_VM
1613 		struct mount *origin_mp = (struct mount*)fsmountargs;
1614 		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1615 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1616 		if (error) {
1617 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1618 		} else {
1619 			/* Mark volume associated with system volume and a swap mount */
1620 			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1621 			/* Attempt to acquire the mnt_devvp and set it up */
1622 			struct vnode *mp_devvp = NULL;
1623 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1624 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1625 				    0, &mp_devvp, vfs_context_kernel());
1626 				if (!lerr) {
1627 					mp->mnt_devvp = mp_devvp;
1628 					//vnode_lookup took an iocount, need to drop it.
1629 					vnode_put(mp_devvp);
1630 
1631 					// now set `device_vnode` to the devvp that was acquired.
1632 					// note that though the iocount above was dropped, the mount acquires
1633 					// an implicit reference against the device.
1634 					device_vnode = mp_devvp;
1635 				}
1636 			}
1637 		}
1638 #else
1639 		error = EINVAL;
1640 #endif
1641 	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1642 #if CONFIG_MOUNT_PREBOOTRECOVERY
1643 		struct mount *origin_mp = (struct mount*)fsmountargs;
1644 		uint32_t mount_role = 0;
1645 		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1646 			mount_role = VFS_PREBOOT_ROLE;
1647 		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1648 			mount_role = VFS_RECOVERY_ROLE;
1649 		}
1650 
1651 		if (mount_role != 0) {
1652 			fs_role_mount_args_t frma = {origin_mp, mount_role};
1653 			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1654 			if (error) {
1655 				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1656 			} else {
1657 				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1658 				/* Mark volume associated with system volume */
1659 				//mp->mnt_kern_flag |= MNTK_SYSTEM;
1660 				/* Attempt to acquire the mnt_devvp and set it up */
1661 				struct vnode *mp_devvp = NULL;
1662 				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1663 					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1664 					    0, &mp_devvp, vfs_context_kernel());
1665 					if (!lerr) {
1666 						mp->mnt_devvp = mp_devvp;
1667 						//vnode_lookup took an iocount, need to drop it.
1668 						vnode_put(mp_devvp);
1669 
1670 						// now set `device_vnode` to the devvp that was acquired.
1671 						// note that though the iocount above was dropped, the mount acquires
1672 						// an implicit reference against the device.
1673 						device_vnode = mp_devvp;
1674 					}
1675 				}
1676 			}
1677 		} else {
1678 			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1679 			error = EINVAL;
1680 		}
1681 #else
1682 		error = EINVAL;
1683 #endif
1684 	} else {
1685 		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1686 	}
1687 
1688 	if (flags & MNT_UPDATE) {
1689 		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1690 			mp->mnt_flag &= ~MNT_RDONLY;
1691 		}
1692 		mp->mnt_flag &= ~
1693 		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1694 		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1695 		if (error) {
1696 			mp->mnt_flag = flag;  /* restore flag value */
1697 		}
1698 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1699 		lck_rw_done(&mp->mnt_rwlock);
1700 		is_rwlock_locked = FALSE;
1701 		if (!error) {
1702 			enablequotas(mp, ctx);
1703 		}
1704 		goto exit;
1705 	}
1706 
1707 	/*
1708 	 * Put the new filesystem on the mount list after root.
1709 	 */
1710 	if (error == 0) {
1711 		struct vfs_attr vfsattr;
1712 		if (device_vnode) {
1713 			/*
1714 			 *   cache the IO attributes for the underlying physical media...
1715 			 *   an error return indicates the underlying driver doesn't
1716 			 *   support all the queries necessary... however, reasonable
1717 			 *   defaults will have been set, so no reason to bail or care
1718 			 *
1719 			 *   Need to do this before calling the MAC hook as it needs
1720 			 *   information from this call.
1721 			 */
1722 			vfs_init_io_attributes(device_vnode, mp);
1723 		}
1724 
1725 #if CONFIG_MACF
1726 		error = mac_mount_check_mount_late(ctx, mp);
1727 		if (error != 0) {
1728 			goto out4;
1729 		}
1730 
1731 		if (vfs_flags(mp) & MNT_MULTILABEL) {
1732 			error = VFS_ROOT(mp, &rvp, ctx);
1733 			if (error) {
1734 				printf("%s() VFS_ROOT returned %d\n", __func__, error);
1735 				goto out4;
1736 			}
1737 			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1738 			/*
1739 			 * drop reference provided by VFS_ROOT
1740 			 */
1741 			vnode_put(rvp);
1742 
1743 			if (error) {
1744 				goto out4;
1745 			}
1746 		}
1747 #endif  /* MAC */
1748 
1749 		vnode_lock_spin(vp);
1750 		CLR(vp->v_flag, VMOUNT);
1751 		vp->v_mountedhere = mp;
1752 		SET(vp->v_flag, VMOUNTEDHERE);
1753 
1754 		/*
1755 		 * Wakeup any waiter(s) in prepare_coveredvp() that is waiting for the
1756 		 * 'v_mountedhere' to be planted.
1757 		 */
1758 		wakeup(&vp->v_flag);
1759 		vnode_unlock(vp);
1760 
1761 		/*
1762 		 * taking the name_cache_lock exclusively will
1763 		 * insure that everyone is out of the fast path who
1764 		 * might be trying to use a now stale copy of
1765 		 * vp->v_mountedhere->mnt_realrootvp
1766 		 * bumping mount_generation causes the cached values
1767 		 * to be invalidated
1768 		 */
1769 		name_cache_lock();
1770 		mount_generation++;
1771 		name_cache_unlock();
1772 
1773 		error = vnode_ref(vp);
1774 		if (error != 0) {
1775 			goto out4;
1776 		}
1777 
1778 		have_usecount = TRUE;
1779 
1780 		error = checkdirs(vp, ctx);
1781 		if (error != 0) {
1782 			/* Unmount the filesystem as cdir/rdirs cannot be updated */
1783 			goto out4;
1784 		}
1785 		/*
1786 		 * there is no cleanup code here so I have made it void
1787 		 * we need to revisit this
1788 		 */
1789 		(void)VFS_START(mp, 0, ctx);
1790 
1791 		if (mount_list_add(mp) != 0) {
1792 			/*
1793 			 * The system is shutting down trying to umount
1794 			 * everything, so fail with a plausible errno.
1795 			 */
1796 			error = EBUSY;
1797 			goto out4;
1798 		}
1799 		lck_rw_done(&mp->mnt_rwlock);
1800 		is_rwlock_locked = FALSE;
1801 
1802 		/* Check if this mounted file system supports EAs or named streams. */
1803 		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1804 		VFSATTR_INIT(&vfsattr);
1805 		VFSATTR_WANTED(&vfsattr, f_capabilities);
1806 		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1807 		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1808 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1809 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1810 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1811 				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1812 			}
1813 #if NAMEDSTREAMS
1814 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1815 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1816 				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1817 			}
1818 #endif
1819 			/* Check if this file system supports path from id lookups. */
1820 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1821 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1822 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1823 			} else if (mp->mnt_flag & MNT_DOVOLFS) {
1824 				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1825 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1826 			}
1827 
1828 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1829 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1830 				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1831 			}
1832 		}
1833 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1834 			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1835 		}
1836 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1837 			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1838 		}
1839 		/* increment the operations count */
1840 		OSAddAtomic(1, &vfs_nummntops);
1841 		enablequotas(mp, ctx);
1842 
1843 		if (device_vnode) {
1844 			vfs_setmountedon(device_vnode);
1845 		}
1846 
1847 		/* Now that mount is setup, notify the listeners */
1848 		vfs_notify_mount(pvp);
1849 		IOBSDMountChange(mp, kIOMountChangeMount);
1850 	} else {
1851 		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1852 		if (mp->mnt_vnodelist.tqh_first != NULL) {
1853 			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1854 			    mp->mnt_vtable->vfc_name, error);
1855 		}
1856 
1857 		vnode_lock_spin(vp);
1858 		CLR(vp->v_flag, VMOUNT);
1859 		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
1860 		wakeup(&vp->v_flag);
1861 		vnode_unlock(vp);
1862 		mount_list_lock();
1863 		mp->mnt_vtable->vfc_refcount--;
1864 		mount_list_unlock();
1865 
1866 		if (device_vnode) {
1867 			vnode_rele(device_vnode);
1868 			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1869 			vfs_clearmounting(device_vnode);
1870 		}
1871 		lck_rw_done(&mp->mnt_rwlock);
1872 		is_rwlock_locked = FALSE;
1873 
1874 		if (nc_smr_enabled) {
1875 			vfs_smr_synchronize();
1876 		}
1877 
1878 		/*
1879 		 * if we get here, we have a mount structure that needs to be freed,
1880 		 * but since the coveredvp hasn't yet been updated to point at it,
1881 		 * no need to worry about other threads holding a crossref on this mp
1882 		 * so it's ok to just free it
1883 		 */
1884 		mount_lock_destroy(mp);
1885 #if CONFIG_MACF
1886 		mac_mount_label_destroy(mp);
1887 #endif
1888 		zfree(mount_zone, mp);
1889 		did_set_lmount = false;
1890 	}
1891 exit:
1892 	/*
1893 	 * drop I/O count on the device vp if there was one
1894 	 */
1895 	if (devpath && devvp) {
1896 		vnode_put(devvp);
1897 	}
1898 
1899 	if (did_set_lmount) {
1900 		mount_lock_spin(mp);
1901 		mp->mnt_lflag &= ~MNT_LMOUNT;
1902 		mount_unlock(mp);
1903 	}
1904 
1905 	return error;
1906 
1907 /* Error condition exits */
1908 out4:
1909 	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1910 
1911 	/*
1912 	 * If the mount has been placed on the covered vp,
1913 	 * it may have been discovered by now, so we have
1914 	 * to treat this just like an unmount
1915 	 */
1916 	mount_lock_spin(mp);
1917 	mp->mnt_lflag |= MNT_LDEAD;
1918 	mount_unlock(mp);
1919 
1920 	if (device_vnode != NULLVP) {
1921 		vnode_rele(device_vnode);
1922 		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1923 		    ctx);
1924 		vfs_clearmounting(device_vnode);
1925 		did_rele = TRUE;
1926 	}
1927 
1928 	vnode_lock_spin(vp);
1929 
1930 	mp->mnt_crossref++;
1931 	CLR(vp->v_flag, VMOUNTEDHERE);
1932 	vp->v_mountedhere = (mount_t) 0;
1933 
1934 	vnode_unlock(vp);
1935 
1936 	if (have_usecount) {
1937 		vnode_rele(vp);
1938 	}
1939 out3:
1940 	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1941 		vnode_rele(devvp);
1942 		vfs_clearmounting(devvp);
1943 	}
1944 out2:
1945 	if (devpath && devvp) {
1946 		vnode_put(devvp);
1947 	}
1948 out1:
1949 	/* Release mnt_rwlock only when it was taken */
1950 	if (is_rwlock_locked == TRUE) {
1951 		if (flag_set) {
1952 			mp->mnt_flag = flag;  /* restore mnt_flag value */
1953 		}
1954 		lck_rw_done(&mp->mnt_rwlock);
1955 	}
1956 
1957 	if (did_set_lmount) {
1958 		mount_lock_spin(mp);
1959 		mp->mnt_lflag &= ~MNT_LMOUNT;
1960 		mount_unlock(mp);
1961 	}
1962 
1963 	if (did_set_vmount) {
1964 		vnode_lock_spin(vp);
1965 		CLR(vp->v_flag, VMOUNT);
1966 		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
1967 		wakeup(&vp->v_flag);
1968 		vnode_unlock(vp);
1969 	}
1970 
1971 	if (mntalloc) {
1972 		if (mp->mnt_crossref) {
1973 			mount_dropcrossref(mp, vp, 0);
1974 		} else {
1975 			if (nc_smr_enabled) {
1976 				vfs_smr_synchronize();
1977 			}
1978 
1979 			mount_lock_destroy(mp);
1980 #if CONFIG_MACF
1981 			mac_mount_label_destroy(mp);
1982 #endif
1983 			zfree(mount_zone, mp);
1984 		}
1985 	}
1986 	if (vfsp_ref) {
1987 		mount_list_lock();
1988 		vfsp->vfc_refcount--;
1989 		mount_list_unlock();
1990 	}
1991 
1992 	return error;
1993 }
1994 
/*
 * Flush in-core data, check for competing mount attempts,
 * and set VMOUNT
 *
 * On success the covered vnode is marked VMOUNT (mount-in-progress);
 * the caller is responsible for clearing it (and waking waiters on
 * &vp->v_flag) on any subsequent failure.
 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	/* Caller-supplied behavior flags (KERNEL_MOUNT_* bits). */
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_kmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Push any dirty data for the covered vnode before it is obscured. */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	/* Write out and invalidate buffer-cache blocks still naming vp. */
	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Can only mount on top of a directory. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	vnode_lock_spin(vp);

	if (is_fmount && (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL))) {
		/* fmount() callers do not retry; fail immediately if busy. */
		error = EBUSY;
	} else if (!is_kmount && (ISSET(vp->v_flag, VMOUNT) ||
	    (vp->v_mountedhere != NULL))) {
		/*
		 * For mount triggered from mount() call, we want to wait for the
		 * current in-progress mount to complete, redo lookup and retry the
		 * mount again. Similarly, we also want to retry if we lost the race
		 * due to concurrent mounts and the 'VMOUNT' flag has been cleared and
		 * 'v_mountedhere' has been planted after initial lookup.
		 */
		if (ISSET(vp->v_flag, VMOUNT)) {
			/* Need the full (non-spin) lock to sleep on v_flag. */
			vnode_lock_convert(vp);
			msleep(&vp->v_flag, &vp->v_lock, PVFS, "vnode_waitformount", NULL);
		}
		error = EBUSY;
	} else if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
		/* Kernel-initiated mount: only refuse when both markers are set. */
		error = EBUSY;
	}

	if (error) {
		vnode_unlock(vp);
		goto out;
	}
	/* Claim the vnode: VMOUNT advertises mount-in-progress to other threads. */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		/* MAC veto: undo the VMOUNT claim and wake any waiters. */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
2083 
2084 #if CONFIG_IMGSRC_ACCESS
2085 
2086 #define DEBUG_IMGSRC 0
2087 
2088 #if DEBUG_IMGSRC
2089 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
2090 #else
2091 #define IMGSRC_DEBUG(args...) do { } while(0)
2092 #endif
2093 
2094 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)2095 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
2096 {
2097 	struct nameidata nd;
2098 	vnode_t vp, realdevvp;
2099 	kauth_action_t accessmode;
2100 	int error;
2101 	enum uio_seg uio = UIO_USERSPACE;
2102 
2103 	if (ctx == vfs_context_kernel()) {
2104 		uio = UIO_SYSSPACE;
2105 	}
2106 
2107 	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
2108 	if ((error = namei(&nd))) {
2109 		IMGSRC_DEBUG("namei() failed with %d\n", error);
2110 		return error;
2111 	}
2112 
2113 	vp = nd.ni_vp;
2114 
2115 	if (!vnode_isblk(vp)) {
2116 		IMGSRC_DEBUG("Not block device.\n");
2117 		error = ENOTBLK;
2118 		goto out;
2119 	}
2120 
2121 	realdevvp = mp->mnt_devvp;
2122 	if (realdevvp == NULLVP) {
2123 		IMGSRC_DEBUG("No device backs the mount.\n");
2124 		error = ENXIO;
2125 		goto out;
2126 	}
2127 
2128 	error = vnode_getwithref(realdevvp);
2129 	if (error != 0) {
2130 		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
2131 		goto out;
2132 	}
2133 
2134 	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
2135 		IMGSRC_DEBUG("Wrong dev_t.\n");
2136 		error = ENXIO;
2137 		goto out1;
2138 	}
2139 
2140 	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
2141 
2142 	/*
2143 	 * If mount by non-root, then verify that user has necessary
2144 	 * permissions on the device.
2145 	 */
2146 	if (!vfs_context_issuser(ctx)) {
2147 		accessmode = KAUTH_VNODE_READ_DATA;
2148 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2149 			accessmode |= KAUTH_VNODE_WRITE_DATA;
2150 		}
2151 		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2152 			IMGSRC_DEBUG("Access denied.\n");
2153 			goto out1;
2154 		}
2155 	}
2156 
2157 	*devvpp = vp;
2158 
2159 out1:
2160 	vnode_put(realdevvp);
2161 
2162 out:
2163 	nameidone(&nd);
2164 
2165 	if (error) {
2166 		vnode_put(vp);
2167 	}
2168 
2169 	return error;
2170 }
2171 
/*
 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
 * and call checkdirs()
 *
 * On success the mount is visible at vp and a usecount is held on vp.
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Plant the mount on the covered vnode and drop the in-progress marker. */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	SET(vp->v_flag, VMOUNTEDHERE);
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Hold a usecount on the covered vnode for the lifetime of the mount. */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	/* Repoint any process cwd/root that referenced vp at the new root. */
	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		/*
		 * NOTE(review): only mnt_vnodecovered is undone here;
		 * vp->v_mountedhere appears to remain set on this path —
		 * verify callers clean it up (cf. undo_place_on_covered_vp).
		 */
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2224 
/*
 * Reverse the effects of place_mount_and_checkdirs(): drop the usecount
 * taken on the covered vnode, clear its mounted-here linkage/flags, and
 * detach the mount from it.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	/* Drop the usecount taken by place_mount_and_checkdirs(). */
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2238 
2239 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2240 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2241 {
2242 	int error;
2243 
2244 	/* unmount in progress return error */
2245 	mount_lock_spin(mp);
2246 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2247 		mount_unlock(mp);
2248 		return EBUSY;
2249 	}
2250 	mount_unlock(mp);
2251 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2252 
2253 	/*
2254 	 * We only allow the filesystem to be reloaded if it
2255 	 * is currently mounted read-only.
2256 	 */
2257 	if ((flags & MNT_RELOAD) &&
2258 	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2259 		error = ENOTSUP;
2260 		goto out;
2261 	}
2262 
2263 	/*
2264 	 * Only root, or the user that did the original mount is
2265 	 * permitted to update it.
2266 	 */
2267 	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2268 	    (!vfs_context_issuser(ctx))) {
2269 		error = EPERM;
2270 		goto out;
2271 	}
2272 #if CONFIG_MACF
2273 	error = mac_mount_check_remount(ctx, mp, flags);
2274 	if (error != 0) {
2275 		goto out;
2276 	}
2277 #endif
2278 
2279 out:
2280 	if (error) {
2281 		lck_rw_done(&mp->mnt_rwlock);
2282 	}
2283 
2284 	return error;
2285 }
2286 
/*
 * Release the exclusive mount rwlock taken by mount_begin_update().
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2292 
2293 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2294 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2295 {
2296 	vnode_t vp;
2297 
2298 	if (height >= MAX_IMAGEBOOT_NESTING) {
2299 		return EINVAL;
2300 	}
2301 
2302 	vp = imgsrc_rootvnodes[height];
2303 	if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2304 		*rvpp = vp;
2305 		return 0;
2306 	} else {
2307 		return ENOENT;
2308 	}
2309 }
2310 
/*
 * Relocate the imageboot source filesystem (the volume backing the
 * root disk image) so that it is mounted at the directory named by
 * vp/cnp instead of its boot-time location.  Invoked from the mount
 * path; 'by_index' selects the newer argument format that carries an
 * explicit nesting height.
 *
 * Returns 0 on success; EINVAL / EPERM / EBUSY / copyin or lookup
 * errors otherwise.  All resources are released on failure via the
 * out-label cleanup chain.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		/* New-style args: explicit height, flags, and devpath. */
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No mi_flags are currently defined; reject anything nonzero. */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* Takes an iocount on rvp that we hold until the end. */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	/* Scratch buffer to preserve f_mntonname in case we must roll back. */
	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once.  Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the validation; drop the iocount it returned. */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Save the old mount-on path so out3 can restore it on failure. */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	/* Roll back the name change and the one-shot move flag. */
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2533 
2534 #endif /* CONFIG_IMGSRC_ACCESS */
2535 
/*
 * Enable disk quotas on 'mp' at mount time if the volume carries the
 * quota trigger files.  Errors are deliberately ignored: quota setup
 * must not cause the mount itself to fail.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Look for the "<mnton>/<opsname>.<extension>" trigger file. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue;           /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Trigger present: turn quotas on using the actual quota file. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2569 
2570 
/*
 * Per-process callback for checkdirs(): if the process's current or
 * root directory is 'olddp' (the vnode just covered by a mount),
 * repoint it at 'newdp' (the new filesystem's root vnode).
 *
 * Always returns PROC_RETURNED so proc_iterate() continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* Assume both new refs go unused; released at the bottom if so. */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL; /* this ref is now owned by fd_cdir */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL; /* this ref is now owned by fd_rdir */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2650 
2651 
2652 
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Only our caller holds a usecount: no process can be using olddp. */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the new filesystem's root vnode (returned with an iocount). */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root itself was covered, repoint rootvnode too. */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2695 
2696 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT        \
2697 	"com.apple.private.vfs.role-account-unmount"
2698 
/*
 * Unmount a file system.
 *
 * Note: unmount takes a path to the vnode mounted on as argument,
 * not special file (as before).
 */
/* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int flags = uap->flags;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		/* Caller asked that no symlink anywhere in the path be followed. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/* Trade the iocount on vp for a mount ref before unmounting. */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, flags, ctx);
}
2757 
2758 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2759 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2760 {
2761 	mount_t mp;
2762 
2763 	mp = mount_list_lookupby_fsid(fsid, 0, 1);
2764 	if (mp == (mount_t)0) {
2765 		return ENOENT;
2766 	}
2767 	mount_ref(mp, 0);
2768 	mount_iterdrop(mp);
2769 	/* safedounmount consumes the mount ref */
2770 	return safedounmount(mp, flags, ctx);
2771 }
2772 
/*
 * The mount struct comes with a mount ref which will be consumed.
 * Do the actual file system unmount, prevent some common foot shooting.
 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_lflag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* dounmount() takes over responsibility for the mount ref. */
	return dounmount(mp, flags, 1, ctx);

out:
	/* Early failure: drop the mount ref the caller handed us. */
	mount_drop(mp, 0);
	return error;
}
2840 
2841 /*
2842  * Do the actual file system unmount.
2843  */
2844 int
dounmount(struct mount * mp,int flags,int withref,vfs_context_t ctx)2845 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
2846 {
2847 	vnode_t coveredvp = (vnode_t)0;
2848 	int error;
2849 	int needwakeup = 0;
2850 	int forcedunmount = 0;
2851 	int lflags = 0;
2852 	struct vnode *devvp = NULLVP;
2853 #if CONFIG_TRIGGERS
2854 	proc_t p = vfs_context_proc(ctx);
2855 	int did_vflush = 0;
2856 	int pflags_save = 0;
2857 #endif /* CONFIG_TRIGGERS */
2858 
2859 #if CONFIG_FSE
2860 	if (!(flags & MNT_FORCE)) {
2861 		fsevent_unmount(mp, ctx);  /* has to come first! */
2862 	}
2863 #endif
2864 
2865 	mount_lock(mp);
2866 
2867 	/*
2868 	 * If already an unmount in progress just return EBUSY.
2869 	 * Even a forced unmount cannot override.
2870 	 */
2871 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2872 		if (withref != 0) {
2873 			mount_drop(mp, 1);
2874 		}
2875 		mount_unlock(mp);
2876 		return EBUSY;
2877 	}
2878 
2879 	if (flags & MNT_FORCE) {
2880 		forcedunmount = 1;
2881 		mp->mnt_lflag |= MNT_LFORCE;
2882 	}
2883 
2884 #if CONFIG_TRIGGERS
2885 	if (flags & MNT_NOBLOCK && p != kernproc) {
2886 		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
2887 	}
2888 #endif
2889 
2890 	mp->mnt_kern_flag |= MNTK_UNMOUNT;
2891 	mp->mnt_lflag |= MNT_LUNMOUNT;
2892 	mp->mnt_flag &= ~MNT_ASYNC;
2893 	/*
2894 	 * anyone currently in the fast path that
2895 	 * trips over the cached rootvp will be
2896 	 * dumped out and forced into the slow path
2897 	 * to regenerate a new cached value
2898 	 */
2899 	mp->mnt_realrootvp = NULLVP;
2900 	mount_unlock(mp);
2901 
2902 	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2903 		/*
2904 		 * Force unmount any mounts in this filesystem.
2905 		 * If any unmounts fail - just leave them dangling.
2906 		 * Avoids recursion.
2907 		 */
2908 		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2909 	}
2910 
2911 	/*
2912 	 * taking the name_cache_lock exclusively will
2913 	 * insure that everyone is out of the fast path who
2914 	 * might be trying to use a now stale copy of
2915 	 * vp->v_mountedhere->mnt_realrootvp
2916 	 * bumping mount_generation causes the cached values
2917 	 * to be invalidated
2918 	 */
2919 	name_cache_lock();
2920 	mount_generation++;
2921 	name_cache_unlock();
2922 
2923 
2924 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2925 	if (withref != 0) {
2926 		mount_drop(mp, 0);
2927 	}
2928 	error = 0;
2929 	if (forcedunmount == 0) {
2930 		ubc_umount(mp); /* release cached vnodes */
2931 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2932 			error = VFS_SYNC(mp, MNT_WAIT, ctx);
2933 			if (error) {
2934 				mount_lock(mp);
2935 				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2936 				mp->mnt_lflag &= ~MNT_LUNMOUNT;
2937 				mp->mnt_lflag &= ~MNT_LFORCE;
2938 				goto out;
2939 			}
2940 		}
2941 	}
2942 
2943 	IOBSDMountChange(mp, kIOMountChangeUnmount);
2944 
2945 #if CONFIG_TRIGGERS
2946 	vfs_nested_trigger_unmounts(mp, flags, ctx);
2947 	did_vflush = 1;
2948 #endif
2949 	if (forcedunmount) {
2950 		lflags |= FORCECLOSE;
2951 	}
2952 	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
2953 	if ((forcedunmount == 0) && error) {
2954 		mount_lock(mp);
2955 		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2956 		mp->mnt_lflag &= ~MNT_LUNMOUNT;
2957 		mp->mnt_lflag &= ~MNT_LFORCE;
2958 		goto out;
2959 	}
2960 
2961 	/* make sure there are no one in the mount iterations or lookup */
2962 	mount_iterdrain(mp);
2963 
2964 	error = VFS_UNMOUNT(mp, flags, ctx);
2965 	if (error) {
2966 		mount_iterreset(mp);
2967 		mount_lock(mp);
2968 		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2969 		mp->mnt_lflag &= ~MNT_LUNMOUNT;
2970 		mp->mnt_lflag &= ~MNT_LFORCE;
2971 		goto out;
2972 	}
2973 
2974 	/* increment the operations count */
2975 	if (!error) {
2976 		OSAddAtomic(1, &vfs_nummntops);
2977 	}
2978 
2979 	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2980 		/* hold an io reference and drop the usecount before close */
2981 		devvp = mp->mnt_devvp;
2982 		vnode_getalways(devvp);
2983 		vnode_rele(devvp);
2984 		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
2985 		    ctx);
2986 		vnode_clearmountedon(devvp);
2987 		vnode_put(devvp);
2988 	}
2989 	lck_rw_done(&mp->mnt_rwlock);
2990 	mount_list_remove(mp);
2991 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2992 
2993 	/* mark the mount point hook in the vp but not drop the ref yet */
2994 	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2995 		/*
2996 		 * The covered vnode needs special handling. Trying to get an
2997 		 * iocount must not block here as this may lead to deadlocks
2998 		 * if the Filesystem to which the covered vnode belongs is
2999 		 * undergoing forced unmounts. Since we hold a usecount, the
3000 		 * vnode cannot be reused (it can, however, still be terminated)
3001 		 */
3002 		vnode_getalways(coveredvp);
3003 		vnode_lock_spin(coveredvp);
3004 
3005 		mp->mnt_crossref++;
3006 		coveredvp->v_mountedhere = (struct mount *)0;
3007 		CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
3008 		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
3009 		wakeup(&coveredvp->v_flag);
3010 		vnode_unlock(coveredvp);
3011 		vnode_put(coveredvp);
3012 	}
3013 
3014 	mount_list_lock();
3015 	mp->mnt_vtable->vfc_refcount--;
3016 	mount_list_unlock();
3017 
3018 	cache_purgevfs(mp);     /* remove cache entries for this file sys */
3019 	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
3020 	mount_lock(mp);
3021 	mp->mnt_lflag |= MNT_LDEAD;
3022 
3023 	if (mp->mnt_lflag & MNT_LWAIT) {
3024 		/*
3025 		 * do the wakeup here
3026 		 * in case we block in mount_refdrain
3027 		 * which will drop the mount lock
3028 		 * and allow anyone blocked in vfs_busy
3029 		 * to wakeup and see the LDEAD state
3030 		 */
3031 		mp->mnt_lflag &= ~MNT_LWAIT;
3032 		wakeup((caddr_t)mp);
3033 	}
3034 	mount_refdrain(mp);
3035 
3036 	/* free disk_conditioner_info structure for this mount */
3037 	disk_conditioner_unmount(mp);
3038 
3039 out:
3040 	if (mp->mnt_lflag & MNT_LWAIT) {
3041 		mp->mnt_lflag &= ~MNT_LWAIT;
3042 		needwakeup = 1;
3043 	}
3044 
3045 #if CONFIG_TRIGGERS
3046 	if (flags & MNT_NOBLOCK && p != kernproc) {
3047 		// Restore P_NOREMOTEHANG bit to its previous value
3048 		if ((pflags_save & P_NOREMOTEHANG) == 0) {
3049 			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
3050 		}
3051 	}
3052 
3053 	/*
3054 	 * Callback and context are set together under the mount lock, and
3055 	 * never cleared, so we're safe to examine them here, drop the lock,
3056 	 * and call out.
3057 	 */
3058 	if (mp->mnt_triggercallback != NULL) {
3059 		mount_unlock(mp);
3060 		if (error == 0) {
3061 			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
3062 		} else if (did_vflush) {
3063 			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
3064 		}
3065 	} else {
3066 		mount_unlock(mp);
3067 	}
3068 #else
3069 	mount_unlock(mp);
3070 #endif /* CONFIG_TRIGGERS */
3071 
3072 	lck_rw_done(&mp->mnt_rwlock);
3073 
3074 	if (needwakeup) {
3075 		wakeup((caddr_t)mp);
3076 	}
3077 
3078 	if (!error) {
3079 		if ((coveredvp != NULLVP)) {
3080 			vnode_t pvp = NULLVP;
3081 
3082 			/*
3083 			 * The covered vnode needs special handling. Trying to
3084 			 * get an iocount must not block here as this may lead
3085 			 * to deadlocks if the Filesystem to which the covered
3086 			 * vnode belongs is undergoing forced unmounts. Since we
3087 			 * hold a usecount, the  vnode cannot be reused
3088 			 * (it can, however, still be terminated).
3089 			 */
3090 			vnode_getalways(coveredvp);
3091 
3092 			mount_dropcrossref(mp, coveredvp, 0);
3093 			/*
3094 			 * We'll _try_ to detect if this really needs to be
3095 			 * done. The coveredvp can only be in termination (or
3096 			 * terminated) if the coveredvp's mount point is in a
3097 			 * forced unmount (or has been) since we still hold the
3098 			 * ref.
3099 			 */
3100 			if (!vnode_isrecycled(coveredvp)) {
3101 				pvp = vnode_getparent(coveredvp);
3102 #if CONFIG_TRIGGERS
3103 				if (coveredvp->v_resolve) {
3104 					vnode_trigger_rearm(coveredvp, ctx);
3105 				}
3106 #endif
3107 			}
3108 
3109 			vnode_rele(coveredvp);
3110 			vnode_put(coveredvp);
3111 			coveredvp = NULLVP;
3112 
3113 			if (pvp) {
3114 				lock_vnode_and_post(pvp, NOTE_WRITE);
3115 				vnode_put(pvp);
3116 			}
3117 		} else if (mp->mnt_flag & MNT_ROOTFS) {
3118 			if (nc_smr_enabled) {
3119 				vfs_smr_synchronize();
3120 			}
3121 
3122 			mount_lock_destroy(mp);
3123 #if CONFIG_MACF
3124 			mac_mount_label_destroy(mp);
3125 #endif
3126 			zfree(mount_zone, mp);
3127 		} else {
3128 			panic("dounmount: no coveredvp");
3129 		}
3130 	}
3131 	return error;
3132 }
3133 
3134 /*
3135  * Unmount any mounts in this filesystem.
3136  */
3137 void
dounmount_submounts(struct mount * mp,int flags,vfs_context_t ctx)3138 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
3139 {
3140 	mount_t smp;
3141 	fsid_t *fsids, fsid;
3142 	int fsids_sz;
3143 	int count = 0, i, m = 0;
3144 	vnode_t vp;
3145 
3146 	mount_list_lock();
3147 
3148 	// Get an array to hold the submounts fsids.
3149 	TAILQ_FOREACH(smp, &mountlist, mnt_list)
3150 	count++;
3151 	fsids_sz = count * sizeof(fsid_t);
3152 	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
3153 	if (fsids == NULL) {
3154 		mount_list_unlock();
3155 		goto out;
3156 	}
3157 	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump
3158 
3159 	/*
3160 	 * Fill the array with submount fsids.
3161 	 * Since mounts are always added to the tail of the mount list, the
3162 	 * list is always in mount order.
3163 	 * For each mount check if the mounted-on vnode belongs to a
3164 	 * mount that's already added to our array of mounts to be unmounted.
3165 	 */
3166 	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
3167 		vp = smp->mnt_vnodecovered;
3168 		if (vp == NULL) {
3169 			continue;
3170 		}
3171 		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
3172 		for (i = 0; i <= m; i++) {
3173 			if (fsids[i].val[0] == fsid.val[0] &&
3174 			    fsids[i].val[1] == fsid.val[1]) {
3175 				fsids[++m] = smp->mnt_vfsstat.f_fsid;
3176 				break;
3177 			}
3178 		}
3179 	}
3180 	mount_list_unlock();
3181 
3182 	// Unmount the submounts in reverse order. Ignore errors.
3183 	for (i = m; i > 0; i--) {
3184 		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
3185 		if (smp) {
3186 			mount_ref(smp, 0);
3187 			mount_iterdrop(smp);
3188 			(void) dounmount(smp, flags, 1, ctx);
3189 		}
3190 	}
3191 out:
3192 	kfree_data(fsids, fsids_sz);
3193 }
3194 
/*
 * Drop one cross reference on mount `mp' held on behalf of covered
 * vnode `dp'.  When the last crossref is gone and `dp' no longer points
 * at this mount (v_mountedhere was cleared during unmount), the mount
 * structure itself is destroyed.  If `need_put' is set, an iocount on
 * `dp' is released as well.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	/* Hold and lock dp so v_mountedhere can be examined safely. */
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/* Last crossref and dp no longer covered by mp: free the mount. */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		/* Synchronize with SMR readers before the mount is zfreed. */
		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3228 
3229 
3230 /*
3231  * Sync each mounted filesystem.
3232  */
#if DIAGNOSTIC
int syncprt = 0;		/* when set, sync paths call vfs_bufstats() */
#endif

int print_vmpage_stat = 0;	/* when set, sync paths call vm_countdirtypages() */
3238 
3239 /*
3240  * sync_callback:	simple wrapper that calls VFS_SYNC() on volumes
3241  *			mounted read-write with the passed waitfor value.
3242  *
3243  * Parameters:	mp	mount-point descriptor per mounted file-system instance.
3244  *		arg	user argument (please see below)
3245  *
3246  * User argument is a pointer to 32 bit unsigned integer which describes the
3247  * type of waitfor value to set for calling VFS_SYNC().  If user argument is
3248  * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3249  * waitfor value.
3250  *
3251  * Returns:		VFS_RETURNED
3252  */
3253 static int
sync_callback(mount_t mp,void * arg)3254 sync_callback(mount_t mp, void *arg)
3255 {
3256 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3257 		int asyncflag = mp->mnt_flag & MNT_ASYNC;
3258 		unsigned waitfor = MNT_NOWAIT;
3259 
3260 		if (arg) {
3261 			waitfor = *(uint32_t*)arg;
3262 		}
3263 
3264 		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
3265 		if (waitfor != MNT_WAIT &&
3266 		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
3267 		    waitfor != MNT_NOWAIT &&
3268 		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3269 		    waitfor != MNT_DWAIT &&
3270 		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3271 			panic("Passed inappropriate waitfor %u to "
3272 			    "sync_callback()", waitfor);
3273 		}
3274 
3275 		mp->mnt_flag &= ~MNT_ASYNC;
3276 		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3277 		if (asyncflag) {
3278 			mp->mnt_flag |= MNT_ASYNC;
3279 		}
3280 	}
3281 
3282 	return VFS_RETURNED;
3283 }
3284 
3285 /* ARGSUSED */
3286 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)3287 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
3288 {
3289 	vfs_iterate(LK_NOWAIT, sync_callback, NULL);
3290 
3291 	if (print_vmpage_stat) {
3292 		vm_countdirtypages();
3293 	}
3294 
3295 #if DIAGNOSTIC
3296 	if (syncprt) {
3297 		vfs_bufstats();
3298 	}
3299 #endif /* DIAGNOSTIC */
3300 	return 0;
3301 }
3302 
/* Selects which class of volumes sync_internal_callback should flush. */
typedef enum {
	SYNC_ALL = 0,                   /* no media filtering */
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* local mounts not on virtual devices */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* everything else */
} sync_type_t;
3308 
3309 static int
sync_internal_callback(mount_t mp,void * arg)3310 sync_internal_callback(mount_t mp, void *arg)
3311 {
3312 	if (arg) {
3313 		int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3314 		    (mp->mnt_flag & MNT_LOCAL);
3315 		sync_type_t sync_type = *((sync_type_t *)arg);
3316 
3317 		if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3318 			return VFS_RETURNED;
3319 		} else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3320 			return VFS_RETURNED;
3321 		}
3322 	}
3323 
3324 	(void)sync_callback(mp, NULL);
3325 
3326 	return VFS_RETURNED;
3327 }
3328 
/* State bits for the async sync thread; manipulated under sync_mtx_lck. */
int sync_thread_state = 0;
/* How long sync_internal() waits for the sync thread before giving up. */
int sync_timeout_seconds = 5;

#define SYNC_THREAD_RUN       0x0001	/* another sync pass has been requested */
#define SYNC_THREAD_RUNNING   0x0002	/* a sync thread currently exists */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;
#endif /* CONFIG_PHYS_WRITE_ACCT */
3338 
/*
 * Body of the kernel thread spawned by sync_internal(): keeps syncing all
 * mounted filesystems while SYNC_THREAD_RUN is re-asserted, then wakes any
 * waiters and clears SYNC_THREAD_RUNNING before exiting.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the request; drop the lock while doing the I/O. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		/* Sync reliable (local, non-virtual) media first, then the rest. */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	/* Optional debug instrumentation (same as sync(2)). */
	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3382 
/* Last time a sync-timeout message was printed (rate-limited to one per 120s). */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3384 
3385 /*
3386  * An in-kernel sync for power management to call.
3387  * This function always returns within sync_timeout seconds.
3388  */
3389 __private_extern__ int
sync_internal(void)3390 sync_internal(void)
3391 {
3392 	thread_t thd = NULL;
3393 	int error;
3394 	int thread_created = FALSE;
3395 	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};
3396 
3397 	lck_mtx_lock(&sync_mtx_lck);
3398 	sync_thread_state |= SYNC_THREAD_RUN;
3399 	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
3400 		int kr;
3401 
3402 		sync_thread_state |= SYNC_THREAD_RUNNING;
3403 		kr = kernel_thread_start(sync_thread, NULL, &thd);
3404 		if (kr != KERN_SUCCESS) {
3405 			sync_thread_state &= ~SYNC_THREAD_RUNNING;
3406 			lck_mtx_unlock(&sync_mtx_lck);
3407 			printf("sync_thread failed\n");
3408 			return 0;
3409 		}
3410 		thread_created = TRUE;
3411 	}
3412 
3413 	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
3414 	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
3415 	if (error) {
3416 		struct timeval now;
3417 
3418 		microtime(&now);
3419 		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
3420 			printf("sync timed out: %d sec\n", sync_timeout_seconds);
3421 			sync_timeout_last_print.tv_sec = now.tv_sec;
3422 		}
3423 	}
3424 
3425 	if (thread_created) {
3426 		thread_deallocate(thd);
3427 	}
3428 
3429 	return 0;
3430 } /* end of sync_internal call */
3431 
3432 /*
3433  * Change filesystem quotas.
3434  */
3435 #if QUOTA
/*
 * Change (or query) filesystem quotas.
 *
 * uap->cmd carries the quota sub-command in its high bits (extracted with
 * SUBCMDSHIFT); uap->arg's interpretation depends on that sub-command (see
 * the copyin switch below).  The target mount is derived from uap->path.
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* Hold a mount reference; the looked-up vnode itself is not needed. */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit callers pass a user_dqblk; munge to kernel layout. */
			struct user_dqblk       my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Copy results back out, and free the Q_QUOTAON path buffer. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk       my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3542 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support is compiled out; the syscall is unsupported. */
	return EOPNOTSUPP;
}
3548 #endif /* QUOTA */
3549 
3550 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3551 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3552 {
3553 	int error;
3554 	vfs_context_t ctx = vfs_context_current();
3555 
3556 #if CONFIG_MACF
3557 	error = mac_mount_check_stat(ctx, mp);
3558 	if (error != 0) {
3559 		return error;
3560 	}
3561 #endif
3562 
3563 	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3564 	if (error != 0) {
3565 		return error;
3566 	}
3567 
3568 	return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3569 }
3570 
3571 /*
3572  * Get filesystem statistics.
3573  *
3574  * Returns:	0			Success
3575  *	namei:???
3576  *	vfs_update_vfsstat:???
3577  *	munge_statfs:EFAULT
3578  */
3579 /* ARGSUSED */
3580 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3581 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3582 {
3583 	int error;
3584 	struct mount *mp;
3585 	struct nameidata nd;
3586 	vfs_context_t ctx = vfs_context_current();
3587 	vnode_t vp;
3588 
3589 	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3590 	    UIO_USERSPACE, uap->path, ctx);
3591 	error = namei(&nd);
3592 	if (error != 0) {
3593 		return error;
3594 	}
3595 	vp = nd.ni_vp;
3596 	mp = vp->v_mount;
3597 	nameidone(&nd);
3598 
3599 	error = statfs_internal(p, mp, uap->buf);
3600 	vnode_put(vp);
3601 
3602 	return error;
3603 }
3604 
3605 /*
3606  * Get filesystem statistics.
3607  */
3608 /* ARGSUSED */
3609 int
fstatfs(proc_t p,struct fstatfs_args * uap,__unused int32_t * retval)3610 fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3611 {
3612 	int error;
3613 	vnode_t vp = NULL;
3614 	struct mount *mp;
3615 
3616 	AUDIT_ARG(fd, uap->fd);
3617 
3618 	if ((error = file_vnode(uap->fd, &vp)) ||
3619 	    (error = vnode_getwithref(vp))) {
3620 		goto out;
3621 	}
3622 
3623 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3624 
3625 	mp = vp->v_mount;
3626 	if (!mp) {
3627 		error = EBADF;
3628 		goto out_vnode;
3629 	}
3630 
3631 	error = statfs_internal(p, mp, uap->buf);
3632 
3633 out_vnode:
3634 	vnode_put(vp);
3635 
3636 out:
3637 	if (vp != NULL) {
3638 		file_drop(uap->fd);
3639 	}
3640 
3641 	return error;
3642 }
3643 
3644 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3645 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3646 {
3647 	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3648 
3649 	bzero(sfs, sizeof(*sfs));
3650 
3651 	sfs->f_bsize = vsfs->f_bsize;
3652 	sfs->f_iosize = (int32_t)vsfs->f_iosize;
3653 	sfs->f_blocks = vsfs->f_blocks;
3654 	sfs->f_bfree = vsfs->f_bfree;
3655 	sfs->f_bavail = vsfs->f_bavail;
3656 	sfs->f_files = vsfs->f_files;
3657 	sfs->f_ffree = vsfs->f_ffree;
3658 	sfs->f_fsid = vsfs->f_fsid;
3659 	sfs->f_owner = vsfs->f_owner;
3660 	sfs->f_type = mp->mnt_vtable->vfc_typenum;
3661 	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3662 	sfs->f_fssubtype = vsfs->f_fssubtype;
3663 	sfs->f_flags_ext = vfs_getextflags(mp);
3664 	vfs_getfstypename(mp, sfs->f_fstypename, MFSTYPENAMELEN);
3665 	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3666 	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3667 }
3668 
3669 /*
3670  * Get file system statistics in 64-bit mode
3671  */
3672 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3673 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3674 {
3675 	struct mount *mp;
3676 	int error;
3677 	struct nameidata *ndp;
3678 	struct statfs64 *sfsp;
3679 	vfs_context_t ctxp = vfs_context_current();
3680 	vnode_t vp;
3681 	struct {
3682 		struct nameidata nd;
3683 		struct statfs64 sfs;
3684 	} *__nameidata_statfs64;
3685 
3686 	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3687 	    Z_WAITOK);
3688 	ndp = &__nameidata_statfs64->nd;
3689 
3690 	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3691 	    UIO_USERSPACE, uap->path, ctxp);
3692 	error = namei(ndp);
3693 	if (error != 0) {
3694 		goto out;
3695 	}
3696 	vp = ndp->ni_vp;
3697 	mp = vp->v_mount;
3698 	nameidone(ndp);
3699 
3700 #if CONFIG_MACF
3701 	error = mac_mount_check_stat(ctxp, mp);
3702 	if (error != 0) {
3703 		vnode_put(vp);
3704 		goto out;
3705 	}
3706 #endif
3707 
3708 	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3709 	if (error != 0) {
3710 		vnode_put(vp);
3711 		goto out;
3712 	}
3713 
3714 	sfsp = &__nameidata_statfs64->sfs;
3715 	vfs_get_statfs64(mp, sfsp);
3716 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3717 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3718 		/* This process does not want to see a seperate data volume mountpoint */
3719 		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3720 	}
3721 	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3722 	vnode_put(vp);
3723 
3724 out:
3725 	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3726 
3727 	return error;
3728 }
3729 
3730 /*
3731  * Get file system statistics in 64-bit mode
3732  */
3733 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3734 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3735 {
3736 	struct vnode *vp;
3737 	struct mount *mp;
3738 	struct statfs64 sfs;
3739 	int error;
3740 
3741 	AUDIT_ARG(fd, uap->fd);
3742 
3743 	if ((error = file_vnode(uap->fd, &vp))) {
3744 		return error;
3745 	}
3746 
3747 	error = vnode_getwithref(vp);
3748 	if (error) {
3749 		file_drop(uap->fd);
3750 		return error;
3751 	}
3752 
3753 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3754 
3755 	mp = vp->v_mount;
3756 	if (!mp) {
3757 		error = EBADF;
3758 		goto out;
3759 	}
3760 
3761 #if CONFIG_MACF
3762 	error = mac_mount_check_stat(vfs_context_current(), mp);
3763 	if (error != 0) {
3764 		goto out;
3765 	}
3766 #endif
3767 
3768 	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3769 		goto out;
3770 	}
3771 
3772 	vfs_get_statfs64(mp, &sfs);
3773 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3774 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3775 		/* This process does not want to see a seperate data volume mountpoint */
3776 		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3777 	}
3778 	error = copyout(&sfs, uap->buf, sizeof(sfs));
3779 
3780 out:
3781 	file_drop(uap->fd);
3782 	vnode_put(vp);
3783 
3784 	return error;
3785 }
3786 
/* State shared between getfsstat()/getfsstat64() and their vfs_iterate callbacks. */
struct getfsstat_struct {
	user_addr_t     sfsp;		/* user-buffer cursor for the next statfs record */
	user_addr_t     *mp;		/* optional array of user MAC-label buffers, or NULL */
	int             count;		/* mounts visited so far (may exceed maxcount) */
	int             maxcount;	/* number of records that fit in the user buffer */
	int             flags;		/* caller's MNT_NOWAIT/MNT_WAIT/MNT_DWAIT flags */
	int             error;		/* first error encountered, if any */
};
3795 
3796 
3797 static int
getfsstat_callback(mount_t mp,void * arg)3798 getfsstat_callback(mount_t mp, void * arg)
3799 {
3800 	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3801 	struct vfsstatfs *sp;
3802 	int error, my_size;
3803 	vfs_context_t ctx = vfs_context_current();
3804 
3805 	if (fstp->sfsp && fstp->count < fstp->maxcount) {
3806 #if CONFIG_MACF
3807 		error = mac_mount_check_stat(ctx, mp);
3808 		if (error != 0) {
3809 			fstp->error = error;
3810 			return VFS_RETURNED_DONE;
3811 		}
3812 #endif
3813 		sp = &mp->mnt_vfsstat;
3814 		/*
3815 		 * If MNT_NOWAIT is specified, do not refresh the
3816 		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
3817 		 */
3818 		if ((mp->mnt_lflag & MNT_LDEAD) ||
3819 		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3820 		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3821 		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
3822 			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3823 			return VFS_RETURNED;
3824 		}
3825 
3826 		/*
3827 		 * Need to handle LP64 version of struct statfs
3828 		 */
3829 		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
3830 		if (error) {
3831 			fstp->error = error;
3832 			return VFS_RETURNED_DONE;
3833 		}
3834 		fstp->sfsp += my_size;
3835 
3836 		if (fstp->mp) {
3837 #if CONFIG_MACF
3838 			error = mac_mount_label_get(mp, *fstp->mp);
3839 			if (error) {
3840 				fstp->error = error;
3841 				return VFS_RETURNED_DONE;
3842 			}
3843 #endif
3844 			fstp->mp++;
3845 		}
3846 	}
3847 	fstp->count++;
3848 	return VFS_RETURNED;
3849 }
3850 
3851 /*
3852  * Get statistics on all filesystems.
3853  */
3854 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3855 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3856 {
3857 	struct __mac_getfsstat_args muap;
3858 
3859 	muap.buf = uap->buf;
3860 	muap.bufsize = uap->bufsize;
3861 	muap.mac = USER_ADDR_NULL;
3862 	muap.macsize = 0;
3863 	muap.flags = uap->flags;
3864 
3865 	return __mac_getfsstat(p, &muap, retval);
3866 }
3867 
3868 /*
3869  * __mac_getfsstat: Get MAC-related file system statistics
3870  *
3871  * Parameters:    p                        (ignored)
3872  *                uap                      User argument descriptor (see below)
3873  *                retval                   Count of file system statistics (N stats)
3874  *
3875  * Indirect:      uap->bufsize             Buffer size
3876  *                uap->macsize             MAC info size
3877  *                uap->buf                 Buffer where information will be returned
3878  *                uap->mac                 MAC info
3879  *                uap->flags               File system flags
3880  *
3881  *
3882  * Returns:        0                       Success
3883  *                !0                       Not success
3884  *
3885  */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject sizes that would overflow the int-based accounting below. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* How many records (of the caller ABI's statfs size) fit in the buffer. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* The MAC-label pointer array must have one entry per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		/* Widen 32-bit user pointers; 64-bit ones copy through as-is. */
		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	/* Walk every mount, including those currently being unmounted. */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* Report the number of records written (capped at buffer capacity). */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3979 
3980 static int
getfsstat64_callback(mount_t mp,void * arg)3981 getfsstat64_callback(mount_t mp, void * arg)
3982 {
3983 	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3984 	struct vfsstatfs *sp;
3985 	struct statfs64 sfs;
3986 	int error;
3987 
3988 	if (fstp->sfsp && fstp->count < fstp->maxcount) {
3989 #if CONFIG_MACF
3990 		error = mac_mount_check_stat(vfs_context_current(), mp);
3991 		if (error != 0) {
3992 			fstp->error = error;
3993 			return VFS_RETURNED_DONE;
3994 		}
3995 #endif
3996 		sp = &mp->mnt_vfsstat;
3997 		/*
3998 		 * If MNT_NOWAIT is specified, do not refresh the fsstat
3999 		 * cache. MNT_WAIT overrides MNT_NOWAIT.
4000 		 *
4001 		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
4002 		 * getfsstat, since the constants are out of the same
4003 		 * namespace.
4004 		 */
4005 		if ((mp->mnt_lflag & MNT_LDEAD) ||
4006 		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
4007 		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
4008 		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
4009 			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
4010 			return VFS_RETURNED;
4011 		}
4012 
4013 		vfs_get_statfs64(mp, &sfs);
4014 		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
4015 		if (error) {
4016 			fstp->error = error;
4017 			return VFS_RETURNED_DONE;
4018 		}
4019 		fstp->sfsp += sizeof(sfs);
4020 	}
4021 	fstp->count++;
4022 	return VFS_RETURNED;
4023 }
4024 
4025 /*
4026  * Get statistics on all file systems in 64 bit mode.
4027  */
4028 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)4029 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
4030 {
4031 	user_addr_t sfsp;
4032 	int count, maxcount;
4033 	struct getfsstat_struct fst;
4034 
4035 	maxcount = uap->bufsize / sizeof(struct statfs64);
4036 
4037 	sfsp = uap->buf;
4038 	count = 0;
4039 
4040 	fst.sfsp = sfsp;
4041 	fst.flags = uap->flags;
4042 	fst.count = 0;
4043 	fst.error = 0;
4044 	fst.maxcount = maxcount;
4045 
4046 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
4047 
4048 	if (fst.error) {
4049 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
4050 		return fst.error;
4051 	}
4052 
4053 	if (fst.sfsp && fst.count > fst.maxcount) {
4054 		*retval = fst.maxcount;
4055 	} else {
4056 		*retval = fst.count;
4057 	}
4058 
4059 	return 0;
4060 }
4061 
4062 /*
4063  * gets the associated vnode with the file descriptor passed.
4064  * as input
4065  *
4066  * INPUT
4067  * ctx - vfs context of caller
4068  * fd - file descriptor for which vnode is required.
4069  * vpp - Pointer to pointer to vnode to be returned.
4070  *
4071  * The vnode is returned with an iocount so any vnode obtained
4072  * by this call needs a vnode_put
4073  *
4074  */
4075 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)4076 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
4077 {
4078 	int error;
4079 	vnode_t vp;
4080 	struct fileproc *fp;
4081 	proc_t p = vfs_context_proc(ctx);
4082 
4083 	*vpp =  NULLVP;
4084 
4085 	error = fp_getfvp(p, fd, &fp, &vp);
4086 	if (error) {
4087 		return error;
4088 	}
4089 
4090 	error = vnode_getwithref(vp);
4091 	if (error) {
4092 		(void)fp_drop(p, fd, fp, 0);
4093 		return error;
4094 	}
4095 
4096 	(void)fp_drop(p, fd, fp, 0);
4097 	*vpp = vp;
4098 	return error;
4099 }
4100 
4101 /*
4102  * Wrapper function around namei to start lookup from a directory
4103  * specified by a file descriptor ni_dirfd.
4104  *
4105  * In addition to all the errors returned by namei, this call can
4106  * return ENOTDIR if the file descriptor does not refer to a directory.
4107  * and EBADF if the file descriptor is not valid.
4108  */
4109 int
nameiat(struct nameidata * ndp,int dirfd)4110 nameiat(struct nameidata *ndp, int dirfd)
4111 {
4112 	if ((dirfd != AT_FDCWD) &&
4113 	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
4114 	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
4115 		int error = 0;
4116 		char c;
4117 
4118 		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4119 			error = copyin(ndp->ni_dirp, &c, sizeof(char));
4120 			if (error) {
4121 				return error;
4122 			}
4123 		} else {
4124 			c = *((char *)(ndp->ni_dirp));
4125 		}
4126 
4127 		if (c != '/') {
4128 			vnode_t dvp_at;
4129 
4130 			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
4131 			    &dvp_at);
4132 			if (error) {
4133 				return error;
4134 			}
4135 
4136 			if (vnode_vtype(dvp_at) != VDIR) {
4137 				vnode_put(dvp_at);
4138 				return ENOTDIR;
4139 			}
4140 
4141 			ndp->ni_dvp = dvp_at;
4142 			ndp->ni_cnd.cn_flags |= USEDVP;
4143 			error = namei(ndp);
4144 			ndp->ni_cnd.cn_flags &= ~USEDVP;
4145 			vnode_put(dvp_at);
4146 			return error;
4147 		}
4148 	}
4149 
4150 	return namei(ndp);
4151 }
4152 
4153 /*
4154  * Change current working directory to a given file descriptor.
4155  */
4156 /* ARGSUSED */
int
fchdir(proc_t p, vfs_context_t ctx, int fd, bool per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;

	AUDIT_ARG(fd, fd);
	if (per_thread && fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding.  The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* The new working directory must be a directory. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs search permission on the target directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If the directory is a mount point, descend to the root of the
	 * mounted filesystem; loop to handle stacked mounts.
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/*
	 * Convert to a long-term usecount before dropping the iocount;
	 * after this point vp is held by the ref, not the iocount.
	 */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* dirs lock before fd lock, matching the other cwd/rdir updates here. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount held by the previous working directory, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(fd);

	return error;
}
4268 
4269 int
sys_fchdir(proc_t p,struct fchdir_args * uap,__unused int32_t * retval)4270 sys_fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
4271 {
4272 	return fchdir(p, vfs_context_current(), uap->fd, false);
4273 }
4274 
4275 int
__pthread_fchdir(proc_t p,struct __pthread_fchdir_args * uap,__unused int32_t * retval)4276 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
4277 {
4278 	return fchdir(p, vfs_context_current(), uap->fd, true);
4279 }
4280 
4281 
4282 /*
4283  * Change current working directory (".").
4284  *
4285  * Returns:	0			Success
4286  *	change_dir:ENOTDIR
4287  *	change_dir:???
4288  *	vnode_ref:ENOENT		No such file or directory
4289  */
4290 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* Look up and authorize the target; returns ni_vp with an iocount. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Take a long-term usecount for the cwd slot. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			/* No thread to attach the cwd to; undo the usecount. */
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* dirs lock before fd lock, matching the other cwd/rdir updates here. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount held by the previous working directory, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4336 
4337 
4338 /*
4339  * Change current working directory (".").
4340  *
4341  * Returns:	0			Success
4342  *	chdir_internal:ENOTDIR
4343  *	chdir_internal:ENOENT		No such file or directory
4344  *	chdir_internal:???
4345  */
4346 /* ARGSUSED */
4347 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4348 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4349 {
4350 	struct nameidata nd;
4351 	vfs_context_t ctx = vfs_context_current();
4352 
4353 	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4354 	    UIO_USERSPACE, uap->path, ctx);
4355 
4356 	return chdir_internal(p, ctx, &nd, per_thread);
4357 }
4358 
4359 
4360 /*
4361  * chdir
4362  *
4363  * Change current working directory (".") for the entire process
4364  *
4365  * Parameters:  p       Process requesting the call
4366  *              uap     User argument descriptor (see below)
4367  *              retval  (ignored)
4368  *
4369  * Indirect parameters:	uap->path	Directory path
4370  *
4371  * Returns:	0			Success
4372  *              common_chdir: ENOTDIR
4373  *              common_chdir: ENOENT	No such file or directory
4374  *              common_chdir: ???
4375  *
4376  */
4377 int
sys_chdir(proc_t p,struct chdir_args * uap,__unused int32_t * retval)4378 sys_chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
4379 {
4380 	return common_chdir(p, (void *)uap, 0);
4381 }
4382 
4383 /*
4384  * __pthread_chdir
4385  *
4386  * Change current working directory (".") for a single thread
4387  *
4388  * Parameters:  p       Process requesting the call
4389  *              uap     User argument descriptor (see below)
4390  *              retval  (ignored)
4391  *
4392  * Indirect parameters:	uap->path	Directory path
4393  *
4394  * Returns:	0			Success
4395  *              common_chdir: ENOTDIR
4396  *		common_chdir: ENOENT	No such file or directory
4397  *		common_chdir: ???
4398  *
4399  */
4400 int
__pthread_chdir(proc_t p,struct __pthread_chdir_args * uap,__unused int32_t * retval)4401 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
4402 {
4403 	return common_chdir(p, (void *)uap, 1);
4404 }
4405 
4406 
4407 /*
4408  * Change notion of root (``/'') directory.
4409  */
4410 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Only the superuser may change the root directory. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* Look up and authorize the target; returns ni_vp with an iocount. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Convert to a long-term usecount, then drop the iocount. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Release the usecount held by the previous root, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4468 
4469 #define PATHSTATICBUFLEN 256
4470 #define PIVOT_ROOT_ENTITLEMENT              \
4471        "com.apple.private.vfs.pivot-root"
4472 
#if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/*
	 * Copy in both paths; try the small on-stack buffer first and fall
	 * back to a MAXPATHLEN heap buffer only when the path is too long.
	 */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Point incoming/outgoing at whichever buffer holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	/* Drop the iocount taken by vnode_lookup, then free any heap paths. */
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
#else
/* pivot_root is not supported on this platform. */
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	return nosys(p, NULL, retval);
}
#endif /* XNU_TARGET_OS_OSX */
4572 
4573 /*
4574  * Common routine for chroot and chdir.
4575  *
4576  * Returns:	0			Success
4577  *		ENOTDIR			Not a directory
4578  *		namei:???		[anything namei can return]
4579  *		vnode_authorize:???	[anything vnode_authorize can return]
4580  */
4581 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4582 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4583 {
4584 	vnode_t vp;
4585 	int error;
4586 
4587 	if ((error = namei(ndp))) {
4588 		return error;
4589 	}
4590 	nameidone(ndp);
4591 	vp = ndp->ni_vp;
4592 
4593 	if (vp->v_type != VDIR) {
4594 		vnode_put(vp);
4595 		return ENOTDIR;
4596 	}
4597 
4598 #if CONFIG_MACF
4599 	error = mac_vnode_check_chdir(ctx, vp);
4600 	if (error) {
4601 		vnode_put(vp);
4602 		return error;
4603 	}
4604 #endif
4605 
4606 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4607 	if (error) {
4608 		vnode_put(vp);
4609 		return error;
4610 	}
4611 
4612 	return error;
4613 }
4614 
/*
 * Allocate the per-fd vnode data (used for directories) that is hung
 * off the file glob, with its mutex initialized.
 */
struct fd_vn_data *
fg_vn_data_alloc(void)
{
	struct fd_vn_data *fvdata;

	/* Allocate per fd vnode data */
	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
	return fvdata;
}
4628 
/*
 * Free the vnode data (for directories) associated with the file glob.
 */
void
fg_vn_data_free(void *fgvndata)
{
	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;

	/* Free the directory-read buffer (if any), using its recorded size. */
	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
	kfree_type(struct fd_vn_data, fvdata);
}
4641 
4642 /*
4643  * Check permissions, allocate an open file structure,
4644  * and call the device open routine if any.
4645  *
4646  * Returns:	0			Success
4647  *		EINVAL
4648  *		EINTR
4649  *	falloc:ENFILE
4650  *	falloc:EMFILE
4651  *	falloc:ENOMEM
4652  *	vn_open_auth:???
4653  *	dupfdopen:???
4654  *	VNOP_ADVLOCK:???
4655  *	vnode_setsize:???
4656  *
4657  * XXX Need to implement uid, gid
4658  */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	kauth_cred_t p_cred = current_cached_proc_cred(PROC_NULL);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	/* Convert open(2) flags to kernel f-flags; encryption bits are cleared. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a file descriptor slot and fileproc before the lookup. */
	if ((error = falloc_withinit(p, p_cred, ctx, &fp, &indx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Resolve the optional authorization fd to a vnode (iocount held). */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/* uu_dupfd >= 0 means fdesc_open redirected us to an existing fd. */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the opened vnode into the fileproc. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* Apply any requested flock(2)-style open locks. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	/*
	 * Decide whether this file's pages are eligible for the secluded
	 * pool, based on writability and on name/path heuristics.
	 */
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer  eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len) ||
			    !strncmp(v_name, "cameracaptured", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "audiomxd", len) ||
			    !strncmp(v_name, "mediaplaybackd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/*
	 * NOTE(review): the iocount is dropped here but vp is still read by
	 * vnode_istty() below; fp_set_data() above keeps the vnode referenced
	 * through the fileglob — confirm this ordering is intentional.
	 */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the fd: clear the reserved marker and release our hold. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Error cleanup: unlock (if we locked), close, and free the fd slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4962 
4963 /*
4964  * While most of the *at syscall handlers can call nameiat() which
4965  * is a wrapper around namei, the use of namei and initialisation
4966  * of nameidata are far removed and in different functions  - namei
4967  * gets called in vn_open_auth for open1. So we'll just do here what
4968  * nameiat() does.
4969  */
4970 static int
open1at(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval,int dirfd,int authfd)4971 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4972     struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
4973     int dirfd, int authfd)
4974 {
4975 	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
4976 		int error;
4977 		char c;
4978 
4979 		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4980 			error = copyin(ndp->ni_dirp, &c, sizeof(char));
4981 			if (error) {
4982 				return error;
4983 			}
4984 		} else {
4985 			c = *((char *)(ndp->ni_dirp));
4986 		}
4987 
4988 		if (c != '/') {
4989 			vnode_t dvp_at;
4990 
4991 			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
4992 			    &dvp_at);
4993 			if (error) {
4994 				return error;
4995 			}
4996 
4997 			if (vnode_vtype(dvp_at) != VDIR) {
4998 				vnode_put(dvp_at);
4999 				return ENOTDIR;
5000 			}
5001 
5002 			ndp->ni_dvp = dvp_at;
5003 			ndp->ni_cnd.cn_flags |= USEDVP;
5004 			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
5005 			    retval, authfd);
5006 			vnode_put(dvp_at);
5007 			return error;
5008 		}
5009 	}
5010 
5011 	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
5012 }
5013 
5014 /*
5015  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
5016  *
5017  * Parameters:	p			Process requesting the open
5018  *		uap			User argument descriptor (see below)
5019  *		retval			Pointer to an area to receive the
5020  *					return calue from the system call
5021  *
5022  * Indirect:	uap->path		Path to open (same as 'open')
5023  *		uap->flags		Flags to open (same as 'open'
5024  *		uap->uid		UID to set, if creating
5025  *		uap->gid		GID to set, if creating
5026  *		uap->mode		File mode, if creating (same as 'open')
5027  *		uap->xsecurity		ACL to set, if creating
5028  *
5029  * Returns:	0			Success
5030  *		!0			errno value
5031  *
5032  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5033  *
5034  * XXX:		We should enummerate the possible errno values here, and where
5035  *		in the code they originated.
5036  */
5037 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)5038 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
5039 {
5040 	int ciferror;
5041 	kauth_filesec_t xsecdst;
5042 	struct vnode_attr va;
5043 	struct nameidata nd;
5044 	int cmode;
5045 
5046 	AUDIT_ARG(owner, uap->uid, uap->gid);
5047 
5048 	xsecdst = NULL;
5049 	if ((uap->xsecurity != USER_ADDR_NULL) &&
5050 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
5051 		return ciferror;
5052 	}
5053 
5054 	VATTR_INIT(&va);
5055 	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
5056 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
5057 	if (uap->uid != KAUTH_UID_NONE) {
5058 		VATTR_SET(&va, va_uid, uap->uid);
5059 	}
5060 	if (uap->gid != KAUTH_GID_NONE) {
5061 		VATTR_SET(&va, va_gid, uap->gid);
5062 	}
5063 	if (xsecdst != NULL) {
5064 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5065 		va.va_vaflags |= VA_FILESEC_ACL;
5066 	}
5067 
5068 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
5069 	    uap->path, vfs_context_current());
5070 
5071 	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
5072 	    NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
5073 	if (xsecdst != NULL) {
5074 		kauth_filesec_free(xsecdst);
5075 	}
5076 
5077 	return ciferror;
5078 }
5079 
5080 /*
5081  * Go through the data-protected atomically controlled open (2)
5082  *
5083  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
5084  */
5085 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)5086 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5087     int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
5088 {
5089 	/*
5090 	 * Follow the same path as normal open(2)
5091 	 * Look up the item if it exists, and acquire the vnode.
5092 	 */
5093 	struct vnode_attr va;
5094 	struct nameidata nd;
5095 	int cmode;
5096 	int error;
5097 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5098 
5099 	VATTR_INIT(&va);
5100 	/* Mask off all but regular access permissions */
5101 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5102 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
5103 
5104 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
5105 	    path, ctx);
5106 
5107 	/*
5108 	 * Initialize the extra fields in vnode_attr to pass down our
5109 	 * extra fields.
5110 	 * 1. target cprotect class.
5111 	 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
5112 	 */
5113 	if (flags & O_CREAT) {
5114 		/* lower level kernel code validates that the class is valid before applying it. */
5115 		if (class != PROTECTION_CLASS_DEFAULT) {
5116 			/*
5117 			 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
5118 			 * file behave the same as open (2)
5119 			 */
5120 			VATTR_SET(&va, va_dataprotect_class, class);
5121 		}
5122 	}
5123 
5124 	if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
5125 		if (flags & (O_RDWR | O_WRONLY)) {
5126 			/*
5127 			 * Not allowed to write raw encrypted bytes or when opening authenticated.
5128 			 */
5129 			return EINVAL;
5130 		}
5131 		if (dpflags & O_DP_GETRAWENCRYPTED) {
5132 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
5133 		}
5134 		if (dpflags & O_DP_GETRAWUNENCRYPTED) {
5135 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
5136 		}
5137 		if (dpflags & O_DP_AUTHENTICATE) {
5138 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
5139 		}
5140 	}
5141 
5142 	error = open1at(vfs_context_current(), &nd, flags, &va,
5143 	    NULL, NULL, retval, fd, authfd);
5144 
5145 	return error;
5146 }
5147 
5148 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)5149 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5150 {
5151 	if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5152 		return EINVAL;
5153 	}
5154 
5155 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5156 	           uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
5157 }
5158 
5159 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)5160 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5161 {
5162 	if (uap->dpflags & O_DP_AUTHENTICATE) {
5163 		return EINVAL;
5164 	}
5165 
5166 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5167 	           uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5168 }
5169 
5170 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval)5171 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5172     int fd, enum uio_seg segflg, int *retval)
5173 {
5174 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5175 	struct {
5176 		struct vnode_attr va;
5177 		struct nameidata nd;
5178 	} *__open_data;
5179 	struct vnode_attr *vap;
5180 	struct nameidata *ndp;
5181 	int cmode;
5182 	int error;
5183 
5184 	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
5185 	vap = &__open_data->va;
5186 	ndp = &__open_data->nd;
5187 
5188 	VATTR_INIT(vap);
5189 	/* Mask off all but regular access permissions */
5190 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5191 	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
5192 
5193 	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
5194 	    segflg, path, ctx);
5195 
5196 	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);
5197 
5198 	kfree_type(typeof(*__open_data), __open_data);
5199 
5200 	return error;
5201 }
5202 
5203 int
open(proc_t p,struct open_args * uap,int32_t * retval)5204 open(proc_t p, struct open_args *uap, int32_t *retval)
5205 {
5206 	__pthread_testcancel(1);
5207 	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
5208 }
5209 
5210 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)5211 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
5212     int32_t *retval)
5213 {
5214 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5215 	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
5216 }
5217 
5218 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)5219 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
5220     int32_t *retval)
5221 {
5222 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5223 	           uap->mode, uap->fd, UIO_USERSPACE, retval);
5224 }
5225 
5226 int
openat(proc_t p,struct openat_args * uap,int32_t * retval)5227 openat(proc_t p, struct openat_args *uap, int32_t *retval)
5228 {
5229 	__pthread_testcancel(1);
5230 	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
5231 }
5232 
5233 #define OPEN_BY_ID_ENTITLEMENT  "com.apple.private.vfs.open-by-id"
5234 
5235 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5236 vfs_context_can_open_by_id(vfs_context_t ctx)
5237 {
5238 	if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5239 		return TRUE;
5240 	}
5241 
5242 	return IOTaskHasEntitlement(vfs_context_task(ctx),
5243 	           OPEN_BY_ID_ENTITLEMENT);
5244 }
5245 
5246 /*
5247  * openbyid_np: open a file given a file system id and a file system object id
 *	the hfs file system object id is an fsobj_id_t {uint32, uint32};
 *	for file systems that don't support object ids, it is a node id (uint64_t).
5250  *
5251  * Parameters:	p			Process requesting the open
5252  *		uap			User argument descriptor (see below)
5253  *		retval			Pointer to an area to receive the
5254  *					return calue from the system call
5255  *
5256  * Indirect:	uap->path		Path to open (same as 'open')
5257  *
5258  *		uap->fsid		id of target file system
5259  *		uap->objid		id of target file system object
5260  *		uap->flags		Flags to open (same as 'open')
5261  *
5262  * Returns:	0			Success
5263  *		!0			errno value
5264  *
5265  *
 * XXX:		We should enumerate the possible errno values here, and where
5267  *		in the code they originated.
5268  */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Open-by-id is restricted to platform binaries / entitled tasks. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*
	 * Resolve a path from (fsid, objid), growing the buffer by
	 * MAXPATHLEN each time fsgetpath_internal() reports ENOSPC.
	 * NOTE(review): growth is unbounded as written; this relies on
	 * fsgetpath_internal() eventually succeeding or failing with a
	 * different error — confirm an upper bound is enforced below it.
	 */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* Defensively NUL-terminate (buffer was allocated buflen + 1 bytes). */
	buf[pathlen] = 0;

	/* The resolved path lives in kernel memory, hence UIO_SYSSPACE. */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5325 
5326 
5327 /*
5328  * Create a special file.
5329  */
5330 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5331     int fd);
5332 
/*
 * Common implementation for mknod(2)/mknodat(2): create a character or
 * block special file (FIFO creation is diverted to mkfifo1()).
 *
 * Parameters:	p	Process making the request (for the suser() check)
 *		upath	User-space path of the node to create
 *		vap	Attributes to create with (va_mode/va_rdev set by caller)
 *		mode	Full mode, including the S_IFMT file-type bits
 *		fd	Directory fd the path is resolved relative to
 *
 * Returns:	0	Success
 *		!0	errno value
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes requires superuser privileges. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block special files are handled here. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Adding a directory entry: break any lease held on the parent. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int     update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5435 
5436 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5437 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5438 {
5439 	struct vnode_attr va;
5440 
5441 	VATTR_INIT(&va);
5442 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5443 	VATTR_SET(&va, va_rdev, uap->dev);
5444 
5445 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5446 }
5447 
5448 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5449 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5450 {
5451 	struct vnode_attr va;
5452 
5453 	VATTR_INIT(&va);
5454 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5455 	VATTR_SET(&va, va_rdev, uap->dev);
5456 
5457 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5458 }
5459 
5460 /*
5461  * Create a named pipe.
5462  *
5463  * Returns:	0			Success
5464  *		EEXIST
5465  *	namei:???
5466  *	vnode_authorize:???
5467  *	vn_create:???
5468  */
/*
 * Shared implementation for mkfifo(2)/mkfifoat(2) and FIFO-mode mknod(2):
 * create a named pipe at 'upath' (resolved relative to 'fd') with the
 * attributes in 'vap'.
 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	/* Look up the target, keeping the parent directory locked. */
	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5511 
5512 
5513 /*
5514  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5515  *
5516  * Parameters:	p			Process requesting the open
5517  *		uap			User argument descriptor (see below)
5518  *		retval			(Ignored)
5519  *
5520  * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
5521  *		uap->uid		UID to set
5522  *		uap->gid		GID to set
5523  *		uap->mode		File mode to set (same as 'mkfifo')
5524  *		uap->xsecurity		ACL to set, if creating
5525  *
5526  * Returns:	0			Success
5527  *		!0			errno value
5528  *
5529  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5530  *
 * XXX:		We should enumerate the possible errno values here, and where
5532  *		in the code they originated.
5533  */
int
mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied filesec (ACL), if any. */
	xsecdst = KAUTH_FILESEC_NONE;
	if (uap->xsecurity != USER_ADDR_NULL) {
		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return ciferror;
		}
	}

	VATTR_INIT(&va);
	/* Requested mode filtered through the process umask. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
	/* Owner/group are only applied when the caller supplied real values. */
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != KAUTH_FILESEC_NONE) {
		/* va_acl points into xsecdst; it must stay live across mkfifo1(). */
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);

	/* Done with our copy of the filesec. */
	if (xsecdst != KAUTH_FILESEC_NONE) {
		kauth_filesec_free(xsecdst);
	}
	return ciferror;
}
5570 
5571 /* ARGSUSED */
5572 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5573 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5574 {
5575 	struct vnode_attr va;
5576 
5577 	VATTR_INIT(&va);
5578 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5579 
5580 	return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5581 }
5582 
5583 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5584 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5585 {
5586 	struct vnode_attr va;
5587 
5588 	VATTR_INIT(&va);
5589 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5590 
5591 	return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5592 }
5593 
5594 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5595 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5596 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5597 
/*
 * Build the full path for 'dvp' (optionally with 'leafname' appended)
 * into 'path', a buffer of '_len' bytes (callers pass MAXPATHLEN).
 * 'firmlink' selects whether firmlinks are followed while building.
 *
 * Returns the length of the string placed in 'path' including the NUL
 * terminator (vn_getpath convention — note the "/" case reports len 2),
 * and sets '*truncated_path' when the result was truncated or only a
 * best-effort ancestor path could be produced.  This function does not
 * fail: on lookup errors it walks up v_parent / the mount point until
 * something fits.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the NUL at path[len-1] with '/' and append the leaf. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path filled the buffer to the brim: report it as truncated. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/* Walk toward the root until some ancestor's path fits. */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5665 
/*
 * safe_getpath(): firmlink-following wrapper around safe_getpath_new().
 */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int follow_firmlinks = 1;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           follow_firmlinks);
}
5671 
/*
 * safe_getpath_no_firmlink(): wrapper around safe_getpath_new() that does
 * not follow firmlinks.
 */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int follow_firmlinks = 0;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           follow_firmlinks);
}
5677 
5678 /*
5679  * Make a hard file link.
5680  *
5681  * Returns:	0			Success
5682  *		EPERM
5683  *		EEXIST
5684  *		EXDEV
5685  *	namei:???
5686  *	vnode_authorize:???
5687  *	VNOP_LINK:???
5688  */
5689 /* ARGSUSED */
5690 static int
linkat_internal(vfs_context_t ctx,int fd1,user_addr_t path,int fd2,user_addr_t link,int flag,enum uio_seg segflg)5691 linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
5692     user_addr_t link, int flag, enum uio_seg segflg)
5693 {
5694 	vnode_t vp, pvp, dvp, lvp;
5695 	struct nameidata nd;
5696 	int follow;
5697 	int error;
5698 #if CONFIG_FSE
5699 	fse_info finfo;
5700 #endif
5701 	char *target_path = NULL;
5702 	char  *no_firmlink_path = NULL;
5703 	vnode_t locked_vp = NULLVP;
5704 	int truncated = 0;
5705 	int truncated_no_firmlink_path = 0;
5706 	int num_retries = 0;
5707 	int need_event, has_listeners, need_kpath2;
5708 	bool do_retry;
5709 
5710 	/* look up the object we are linking to */
5711 	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
5712 
5713 retry:
5714 	do_retry = false;
5715 	vp = dvp = lvp = NULLVP;
5716 	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
5717 	    segflg, path, ctx);
5718 
5719 	error = nameiat(&nd, fd1);
5720 	if (error) {
5721 		return error;
5722 	}
5723 	vp = nd.ni_vp;
5724 
5725 	nameidone(&nd);
5726 
5727 	/*
5728 	 * Normally, linking to directories is not supported.
5729 	 * However, some file systems may have limited support.
5730 	 */
5731 	if (vp->v_type == VDIR) {
5732 		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
5733 			error = EPERM;   /* POSIX */
5734 			goto out;
5735 		}
5736 
5737 		/* Linking to a directory requires ownership. */
5738 		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
5739 			struct vnode_attr dva;
5740 
5741 			VATTR_INIT(&dva);
5742 			VATTR_WANTED(&dva, va_uid);
5743 			if (vnode_getattr(vp, &dva, ctx) != 0 ||
5744 			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
5745 			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
5746 				error = EACCES;
5747 				goto out;
5748 			}
5749 		}
5750 	}
5751 
5752 	/* lookup the target node */
5753 #if CONFIG_TRIGGERS
5754 	nd.ni_op = OP_LINK;
5755 #endif
5756 	nd.ni_cnd.cn_nameiop = CREATE;
5757 	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
5758 	nd.ni_dirp = link;
5759 	error = nameiat(&nd, fd2);
5760 	if (error != 0) {
5761 		goto out;
5762 	}
5763 	dvp = nd.ni_dvp;
5764 	lvp = nd.ni_vp;
5765 
5766 	assert(locked_vp == NULLVP);
5767 	vnode_link_lock(vp);
5768 	locked_vp = vp;
5769 
5770 #if CONFIG_MACF
5771 	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
5772 		goto out2;
5773 	}
5774 #endif
5775 
5776 	/* or to anything that kauth doesn't want us to (eg. immutable items) */
5777 	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
5778 		goto out2;
5779 	}
5780 
5781 	/* target node must not exist */
5782 	if (lvp != NULLVP) {
5783 		error = EEXIST;
5784 		goto out2;
5785 	}
5786 	/* cannot link across mountpoints */
5787 	if (vnode_mount(vp) != vnode_mount(dvp)) {
5788 		error = EXDEV;
5789 		goto out2;
5790 	}
5791 
5792 	/* authorize creation of the target note */
5793 	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
5794 		goto out2;
5795 	}
5796 
5797 #if CONFIG_FILE_LEASES
5798 	vnode_breakdirlease(dvp, false, O_WRONLY);
5799 #endif
5800 
5801 	/* and finally make the link */
5802 	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
5803 	if (error) {
5804 		if (error == ENOENT && num_retries < MAX_LINK_ENOENT_RETRIES) {
5805 			do_retry = true;
5806 		}
5807 		goto out2;
5808 	}
5809 
5810 #if CONFIG_MACF
5811 	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
5812 #endif
5813 
5814 	assert(locked_vp == vp);
5815 	vnode_link_unlock(locked_vp);
5816 	locked_vp = NULLVP;
5817 
5818 #if CONFIG_FSE
5819 	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
5820 #else
5821 	need_event = 0;
5822 #endif
5823 	has_listeners = kauth_authorize_fileop_has_listeners();
5824 
5825 	need_kpath2 = 0;
5826 #if CONFIG_AUDIT
5827 	if (AUDIT_RECORD_EXISTS()) {
5828 		need_kpath2 = 1;
5829 	}
5830 #endif
5831 
5832 	if (need_event || has_listeners || need_kpath2) {
5833 		char *link_to_path = NULL;
5834 		int len, link_name_len;
5835 		int  len_no_firmlink_path = 0;
5836 
5837 		/* build the path to the new link file */
5838 		GET_PATH(target_path);
5839 
5840 		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
5841 		if (no_firmlink_path == NULL) {
5842 			GET_PATH(no_firmlink_path);
5843 		}
5844 		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
5845 
5846 		AUDIT_ARG(kpath, target_path, ARG_KPATH2);
5847 
5848 		if (has_listeners) {
5849 			/* build the path to file we are linking to */
5850 			GET_PATH(link_to_path);
5851 
5852 			link_name_len = MAXPATHLEN;
5853 			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
5854 				/*
5855 				 * Call out to allow 3rd party notification of rename.
5856 				 * Ignore result of kauth_authorize_fileop call.
5857 				 */
5858 				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
5859 				    (uintptr_t)link_to_path,
5860 				    (uintptr_t)target_path);
5861 			}
5862 			if (link_to_path != NULL) {
5863 				RELEASE_PATH(link_to_path);
5864 			}
5865 		}
5866 #if CONFIG_FSE
5867 		if (need_event) {
5868 			/* construct fsevent */
5869 			if (get_fse_info(vp, &finfo, ctx) == 0) {
5870 				if (truncated_no_firmlink_path) {
5871 					finfo.mode |= FSE_TRUNCATED_PATH;
5872 				}
5873 
5874 				// build the path to the destination of the link
5875 				add_fsevent(FSE_CREATE_FILE, ctx,
5876 				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
5877 				    FSE_ARG_FINFO, &finfo,
5878 				    FSE_ARG_DONE);
5879 			}
5880 
5881 			pvp = vp->v_parent;
5882 			// need an iocount on parent vnode in this case
5883 			if (pvp && pvp != dvp) {
5884 				pvp = vnode_getparent_if_different(vp, dvp);
5885 			}
5886 			if (pvp) {
5887 				add_fsevent(FSE_STAT_CHANGED, ctx,
5888 				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
5889 			}
5890 			if (pvp && pvp != dvp) {
5891 				vnode_put(pvp);
5892 			}
5893 		}
5894 #endif
5895 	}
5896 out2:
5897 	/*
5898 	 * nameidone has to happen before we vnode_put(dvp)
5899 	 * since it may need to release the fs_nodelock on the dvp
5900 	 */
5901 	nameidone(&nd);
5902 	if (target_path != NULL) {
5903 		RELEASE_PATH(target_path);
5904 		target_path = NULL;
5905 	}
5906 	if (no_firmlink_path != NULL) {
5907 		RELEASE_PATH(no_firmlink_path);
5908 		no_firmlink_path = NULL;
5909 	}
5910 out:
5911 	if (locked_vp) {
5912 		assert(locked_vp == vp);
5913 		vnode_link_unlock(locked_vp);
5914 		locked_vp = NULLVP;
5915 	}
5916 	if (lvp) {
5917 		vnode_put(lvp);
5918 	}
5919 	if (dvp) {
5920 		vnode_put(dvp);
5921 	}
5922 	vnode_put(vp);
5923 
5924 	if (do_retry) {
5925 		goto retry;
5926 	}
5927 
5928 	return error;
5929 }
5930 
5931 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5932 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5933 {
5934 	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5935 	           AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5936 }
5937 
5938 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5939 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5940 {
5941 	if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5942 		return EINVAL;
5943 	}
5944 
5945 	return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5946 	           uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5947 }
5948 
5949 /*
5950  * Make a symbolic link.
5951  *
5952  * We could add support for ACLs here too...
5953  */
5954 /* ARGSUSED */
/*
 * Shared implementation for symlink(2)/symlinkat(2): create a symbolic
 * link at 'link' (resolved relative to 'fd') whose contents are the
 * string at 'path_data'.
 *
 * 'segflg' applies to both user addresses; when it is a kernel segment,
 * 'path_data' is used directly as a kernel string (no copy is made).
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/* Copy the link contents in from user space, or alias a kernel string. */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	/* Look up the link name, keeping the parent directory locked. */
	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	/* Symlink mode: all access bits filtered through the process umask. */
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The link name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	/* Adding a directory entry: break any lease held on the parent. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int     update_flags = 0;

		/* check if a new vnode was created, else try to get one */
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int             len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Free the copied-in link string (only allocated for user segments). */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
6118 
6119 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)6120 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
6121 {
6122 	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
6123 	           uap->link, UIO_USERSPACE);
6124 }
6125 
6126 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)6127 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
6128     __unused int32_t *retval)
6129 {
6130 	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
6131 	           uap->path2, UIO_USERSPACE);
6132 }
6133 
6134 /*
6135  * Delete a whiteout from the filesystem.
6136  * No longer supported.
6137  */
/*
 * undelete(2): historically removed a whiteout entry; whiteout support
 * has been removed, so this syscall now always fails with ENOTSUP.
 */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
6143 
6144 /*
6145  * Delete a name from the filesystem.
6146  */
6147 /* ARGSUSED */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/*
	 * Lookup state and fsevents scratch are heap-allocated as one chunk
	 * to keep the large nameidata off the kernel stack.
	 */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char  *path = NULL;
	char  *no_firmlink_path = NULL;
	int  len_path = 0;
	int  len_no_firmlink_path = 0;
	int flags;                      /* VNODE_REMOVE_* flags passed down to vn_remove() */
	int need_event;                 /* non-zero if an FSE_DELETE fsevent must be posted */
	int has_listeners;              /* non-zero if kauth fileop listeners are registered */
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;                    /* non-zero when the fs supports compound remove */
	struct vnode_attr *vap;
	vnode_t locked_vp = NULLVP;     /* vp whose link lock we hold, for cleanup at "out" */
	int do_retry;
	int retry_count = 0;            /* bounded by MAX_AUTHORIZE_ENOENT_RETRIES */
	int cn_flags;
	int nofollow_any = 0;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/* NOFOLLOW_ANY is a lookup-time flag; strip it from the VNOP flags. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* Reset per-attempt state; we loop back here on authorization races. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	/* Update speculative telemetry with system discarded use state */
	if (unlink_flags & VNODE_REMOVE_SYSTEM_DISCARDED) {
		flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		/* Only the kernel itself may remove a swap-backing file. */
		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			/*
			 * Non-compound path: authorize up front while holding
			 * the link lock; the lock is released at "out" via
			 * locked_vp if we get past this point.
			 */
			vnode_link_lock(vp);
			locked_vp = vp;
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				if (error == ENOENT) {
					/* raced another lookup; redrive a bounded number of times */
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				vnode_link_unlock(vp);
				locked_vp = NULLVP;
				goto out;
			}
		}
	} else {
		/* No vp from lookup: filesystem must do a compound remove. */
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		/* build both path flavors; fsevents uses the no-firmlink one */
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			/* compound remove wants another lookup pass */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name  cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid.  this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				/* re-fetch: hardlink info was skipped above */
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	if (locked_vp) {
		assert(locked_vp == vp);
		vnode_link_unlock(locked_vp);
		locked_vp = NULLVP;
	}

	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6450 
6451 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6452 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6453     enum uio_seg segflg, int unlink_flags)
6454 {
6455 	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6456 	           unlink_flags);
6457 }
6458 
6459 /*
6460  * Delete a name from the filesystem using Carbon semantics.
6461  */
6462 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6463 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6464 {
6465 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6466 	           uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6467 }
6468 
6469 /*
6470  * Delete a name from the filesystem using POSIX semantics.
6471  */
6472 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6473 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6474 {
6475 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6476 	           uap->path, UIO_USERSPACE, 0);
6477 }
6478 
6479 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6480 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6481 {
6482 	int unlink_flags = 0;
6483 
6484 	if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY | AT_SYSTEM_DISCARDED)) {
6485 		return EINVAL;
6486 	}
6487 
6488 	if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6489 		unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6490 	}
6491 
6492 	if (uap->flag & AT_SYSTEM_DISCARDED) {
6493 		unlink_flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
6494 	}
6495 
6496 	if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6497 		if (uap->flag & AT_REMOVEDIR_DATALESS) {
6498 			unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6499 		}
6500 		return rmdirat_internal(vfs_context_current(), uap->fd,
6501 		           uap->path, UIO_USERSPACE, unlink_flags);
6502 	} else {
6503 		return unlinkat_internal(vfs_context_current(), uap->fd,
6504 		           NULLVP, uap->path, UIO_USERSPACE, unlink_flags);
6505 	}
6506 }
6507 
6508 /*
6509  * Reposition read/write file offset.
6510  */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/* fd exists but is not vnode-backed: report ESPIPE per POSIX */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	/* Seeking a FIFO is always ESPIPE. */
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* lseek(fd, 0, SEEK_CUR) only reads the offset; check that, not change. */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* Compute the candidate absolute offset per the whence mode. */
	switch (uap->whence) {
	case L_INCR:
		/* SEEK_CUR: relative to the current file offset */
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		/* SEEK_END: relative to the current file size */
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		/* SEEK_SET: absolute; offset already holds the value */
		break;
	case SEEK_HOLE:
		/* filesystem resolves the next hole at/after offset */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		/* filesystem resolves the next data region at/after offset */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001.  Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6602 
6603 
6604 /*
6605  * Check access permissions.
6606  *
6607  * Returns:	0			Success
6608  *		vnode_authorize:???
6609  */
6610 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6611 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6612 {
6613 	kauth_action_t action;
6614 	int error;
6615 
6616 	/*
6617 	 * If just the regular access bits, convert them to something
6618 	 * that vnode_authorize will understand.
6619 	 */
6620 	if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6621 		action = 0;
6622 		if (uflags & R_OK) {
6623 			action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
6624 		}
6625 		if (uflags & W_OK) {
6626 			if (vnode_isdir(vp)) {
6627 				action |= KAUTH_VNODE_ADD_FILE |
6628 				    KAUTH_VNODE_ADD_SUBDIRECTORY;
6629 				/* might want delete rights here too */
6630 			} else {
6631 				action |= KAUTH_VNODE_WRITE_DATA;
6632 			}
6633 		}
6634 		if (uflags & X_OK) {
6635 			if (vnode_isdir(vp)) {
6636 				action |= KAUTH_VNODE_SEARCH;
6637 			} else {
6638 				action |= KAUTH_VNODE_EXECUTE;
6639 			}
6640 		}
6641 	} else {
6642 		/* take advantage of definition of uflags */
6643 		action = uflags >> 8;
6644 	}
6645 
6646 #if CONFIG_MACF
6647 	error = mac_vnode_check_access(ctx, vp, uflags);
6648 	if (error) {
6649 		return error;
6650 	}
6651 #endif /* MAC */
6652 
6653 	/* action == 0 means only check for existence */
6654 	if (action != 0) {
6655 		error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6656 	} else {
6657 		error = 0;
6658 	}
6659 
6660 	return error;
6661 }
6662 
6663 
6664 
6665 /*
6666  * access_extended: Check access permissions in bulk.
6667  *
6668  * Description:	uap->entries		Pointer to an array of accessx
6669  *                                      descriptor structs, plus one or
6670  *                                      more NULL terminated strings (see
6671  *                                      "Notes" section below).
6672  *		uap->size		Size of the area pointed to by
6673  *					uap->entries.
6674  *		uap->results		Pointer to the results array.
6675  *
6676  * Returns:	0			Success
6677  *		ENOMEM			Insufficient memory
6678  *		EINVAL			Invalid arguments
6679  *		namei:EFAULT		Bad address
6680  *		namei:ENAMETOOLONG	Filename too long
6681  *		namei:ENOENT		No such file or directory
6682  *		namei:ELOOP		Too many levels of symbolic links
6683  *		namei:EBADF		Bad file descriptor
6684  *		namei:ENOTDIR		Not a directory
6685  *		namei:???
6686  *		access1:
6687  *
6688  * Implicit returns:
6689  *		uap->results		Array contents modified
6690  *
6691  * Notes:	The uap->entries are structured as an arbitrary length array
6692  *		of accessx descriptors, followed by one or more NULL terminated
6693  *		strings
6694  *
6695  *			struct accessx_descriptor[0]
6696  *			...
6697  *			struct accessx_descriptor[n]
6698  *			char name_data[0];
6699  *
6700  *		We determine the entry count by walking the buffer containing
6701  *		the uap->entries argument descriptor.  For each descriptor we
6702  *		see, the valid values for the offset ad_name_offset will be
6703  *		in the byte range:
6704  *
6705  *			[ uap->entries + sizeof(struct accessx_descriptor) ]
6706  *						to
6707  *				[ uap->entries + uap->size - 2 ]
6708  *
6709  *		since we must have at least one string, and the string must
6710  *		be at least one character plus the NULL terminator in length.
6711  *
6712  * XXX:		Need to support the check-as uid argument
6713  */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* NULL marks "no credential allocated yet" for the cleanup path. */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o	There must be sufficient room in the request for at least one
	 *	descriptor and a one yte NUL terminated string.
	 * o	The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* Small requests use the on-stack buffer; larger ones are heap'ed. */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid nami() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			/* per-entry failure: record it and continue */
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6955 
6956 
6957 /*
6958  * Returns:	0			Success
6959  *		namei:EFAULT		Bad address
6960  *		namei:ENAMETOOLONG	Filename too long
6961  *		namei:ENOENT		No such file or directory
6962  *		namei:ELOOP		Too many levels of symbolic links
6963  *		namei:EBADF		Bad file descriptor
6964  *		namei:ENOTDIR		Not a directory
6965  *		namei:???
6966  *		access1:
6967  */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* stricter symlink policy for the whole lookup (see namei) */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* evaluate the requested access modes against the vnode */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		/* WANTPARENT was set above; drop the parent iocount too */
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		/* drop the real-identity credential we allocated above */
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
7049 
7050 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)7051 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
7052 {
7053 	return faccessat_internal(vfs_context_current(), AT_FDCWD,
7054 	           uap->path, uap->flags, 0, UIO_USERSPACE);
7055 }
7056 
7057 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)7058 faccessat(__unused proc_t p, struct faccessat_args *uap,
7059     __unused int32_t *retval)
7060 {
7061 	if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7062 		return EINVAL;
7063 	}
7064 
7065 	return faccessat_internal(vfs_context_current(), uap->fd,
7066 	           uap->path, uap->amode, uap->flag, UIO_USERSPACE);
7067 }
7068 
7069 /*
7070  * Returns:	0			Success
7071  *		EFAULT
7072  *	copyout:EFAULT
7073  *	namei:???
7074  *	vn_stat:???
7075  */
7076 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)7077 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
7078     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
7079     enum uio_seg segflg, int fd, int flag)
7080 {
7081 	struct nameidata *ndp = NULL;
7082 	int follow;
7083 	union {
7084 		struct stat sb;
7085 		struct stat64 sb64;
7086 	} source = {};
7087 	union {
7088 		struct user64_stat user64_sb;
7089 		struct user32_stat user32_sb;
7090 		struct user64_stat64 user64_sb64;
7091 		struct user32_stat64 user32_sb64;
7092 	} dest = {};
7093 	caddr_t sbp;
7094 	int error, my_size;
7095 	kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
7096 	size_t xsecurity_bufsize;
7097 	void * statptr;
7098 	struct fileproc *fp = NULL;
7099 	int needsrealdev = 0;
7100 
7101 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7102 	ndp = kalloc_type(struct nameidata, Z_WAITOK);
7103 	NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
7104 	    segflg, path, ctx);
7105 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7106 		ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
7107 	}
7108 
7109 #if NAMEDRSRCFORK
7110 	int is_namedstream = 0;
7111 	/* stat calls are allowed for resource forks. */
7112 	ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
7113 #endif
7114 
7115 	if (flag & AT_FDONLY) {
7116 		vnode_t fvp;
7117 
7118 		error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
7119 		if (error) {
7120 			goto out;
7121 		}
7122 		if ((error = vnode_getwithref(fvp))) {
7123 			file_drop(fd);
7124 			goto out;
7125 		}
7126 		ndp->ni_vp = fvp;
7127 	} else {
7128 		error = nameiat(ndp, fd);
7129 		if (error) {
7130 			goto out;
7131 		}
7132 	}
7133 
7134 	statptr = (void *)&source;
7135 
7136 #if NAMEDRSRCFORK
7137 	/* Grab reference on the shadow stream file vnode to
7138 	 * force an inactive on release which will mark it
7139 	 * for recycle.
7140 	 */
7141 	if (vnode_isnamedstream(ndp->ni_vp) &&
7142 	    (ndp->ni_vp->v_parent != NULLVP) &&
7143 	    vnode_isshadow(ndp->ni_vp)) {
7144 		is_namedstream = 1;
7145 		vnode_ref(ndp->ni_vp);
7146 	}
7147 #endif
7148 
7149 	needsrealdev = flag & AT_REALDEV ? 1 : 0;
7150 	if (fp && (xsecurity == USER_ADDR_NULL)) {
7151 		/*
7152 		 * If the caller has the file open, and is not
7153 		 * requesting extended security information, we are
7154 		 * going to let them get the basic stat information.
7155 		 */
7156 		error = vn_stat_noauth(ndp->ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
7157 		    fp->fp_glob->fg_cred);
7158 	} else {
7159 		error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
7160 		    isstat64, needsrealdev, ctx);
7161 	}
7162 
7163 #if NAMEDRSRCFORK
7164 	if (is_namedstream) {
7165 		vnode_rele(ndp->ni_vp);
7166 	}
7167 #endif
7168 	vnode_put(ndp->ni_vp);
7169 	nameidone(ndp);
7170 
7171 	if (fp) {
7172 		file_drop(fd);
7173 		fp = NULL;
7174 	}
7175 
7176 	if (error) {
7177 		goto out;
7178 	}
7179 	/* Zap spare fields */
7180 	if (isstat64 != 0) {
7181 		source.sb64.st_lspare = 0;
7182 		source.sb64.st_qspare[0] = 0LL;
7183 		source.sb64.st_qspare[1] = 0LL;
7184 		if (vfs_context_is64bit(ctx)) {
7185 			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
7186 			my_size = sizeof(dest.user64_sb64);
7187 			sbp = (caddr_t)&dest.user64_sb64;
7188 		} else {
7189 			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
7190 			my_size = sizeof(dest.user32_sb64);
7191 			sbp = (caddr_t)&dest.user32_sb64;
7192 		}
7193 		/*
7194 		 * Check if we raced (post lookup) against the last unlink of a file.
7195 		 */
7196 		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7197 			source.sb64.st_nlink = 1;
7198 		}
7199 	} else {
7200 		source.sb.st_lspare = 0;
7201 		source.sb.st_qspare[0] = 0LL;
7202 		source.sb.st_qspare[1] = 0LL;
7203 		if (vfs_context_is64bit(ctx)) {
7204 			munge_user64_stat(&source.sb, &dest.user64_sb);
7205 			my_size = sizeof(dest.user64_sb);
7206 			sbp = (caddr_t)&dest.user64_sb;
7207 		} else {
7208 			munge_user32_stat(&source.sb, &dest.user32_sb);
7209 			my_size = sizeof(dest.user32_sb);
7210 			sbp = (caddr_t)&dest.user32_sb;
7211 		}
7212 
7213 		/*
7214 		 * Check if we raced (post lookup) against the last unlink of a file.
7215 		 */
7216 		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7217 			source.sb.st_nlink = 1;
7218 		}
7219 	}
7220 	if ((error = copyout(sbp, ub, my_size)) != 0) {
7221 		goto out;
7222 	}
7223 
7224 	/* caller wants extended security information? */
7225 	if (xsecurity != USER_ADDR_NULL) {
7226 		/* did we get any? */
7227 		if (fsec == KAUTH_FILESEC_NONE) {
7228 			if (susize(xsecurity_size, 0) != 0) {
7229 				error = EFAULT;
7230 				goto out;
7231 			}
7232 		} else {
7233 			/* find the user buffer size */
7234 			xsecurity_bufsize = fusize(xsecurity_size);
7235 
7236 			/* copy out the actual data size */
7237 			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7238 				error = EFAULT;
7239 				goto out;
7240 			}
7241 
7242 			/* if the caller supplied enough room, copy out to it */
7243 			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7244 				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7245 			}
7246 		}
7247 	}
7248 out:
7249 	if (ndp) {
7250 		kfree_type(struct nameidata, ndp);
7251 	}
7252 	if (fsec != KAUTH_FILESEC_NONE) {
7253 		kauth_filesec_free(fsec);
7254 	}
7255 	return error;
7256 }
7257 
7258 /*
7259  * stat_extended: Get file status; with extended security (ACL).
7260  *
7261  * Parameters:    p                       (ignored)
7262  *                uap                     User argument descriptor (see below)
7263  *                retval                  (ignored)
7264  *
7265  * Indirect:      uap->path               Path of file to get status from
7266  *                uap->ub                 User buffer (holds file status info)
7267  *                uap->xsecurity          ACL to get (extended security)
7268  *                uap->xsecurity_size     Size of ACL
7269  *
7270  * Returns:        0                      Success
7271  *                !0                      errno value
7272  *
7273  */
7274 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7275 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7276     __unused int32_t *retval)
7277 {
7278 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7279 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7280 	           0);
7281 }
7282 
7283 /*
7284  * Returns:	0			Success
7285  *	fstatat_internal:???		[see fstatat_internal() in this file]
7286  */
7287 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7288 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7289 {
7290 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7291 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7292 }
7293 
7294 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7295 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7296 {
7297 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7298 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7299 }
7300 
7301 /*
7302  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7303  *
7304  * Parameters:    p                       (ignored)
7305  *                uap                     User argument descriptor (see below)
7306  *                retval                  (ignored)
7307  *
7308  * Indirect:      uap->path               Path of file to get status from
7309  *                uap->ub                 User buffer (holds file status info)
7310  *                uap->xsecurity          ACL to get (extended security)
7311  *                uap->xsecurity_size     Size of ACL
7312  *
7313  * Returns:        0                      Success
7314  *                !0                      errno value
7315  *
7316  */
7317 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7318 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7319 {
7320 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7321 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7322 	           0);
7323 }
7324 
7325 /*
7326  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7327  *
7328  * Parameters:    p                       (ignored)
7329  *                uap                     User argument descriptor (see below)
7330  *                retval                  (ignored)
7331  *
7332  * Indirect:      uap->path               Path of file to get status from
7333  *                uap->ub                 User buffer (holds file status info)
7334  *                uap->xsecurity          ACL to get (extended security)
7335  *                uap->xsecurity_size     Size of ACL
7336  *
7337  * Returns:        0                      Success
7338  *                !0                      errno value
7339  *
7340  */
7341 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7342 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7343 {
7344 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7345 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7346 	           AT_SYMLINK_NOFOLLOW);
7347 }
7348 
7349 /*
7350  * Get file status; this version does not follow links.
7351  */
7352 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7353 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7354 {
7355 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7356 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7357 }
7358 
7359 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7360 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7361 {
7362 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7363 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7364 }
7365 
7366 /*
7367  * lstat64_extended: Get file status; can handle large inode numbers; does not
7368  * follow links; with extended security (ACL).
7369  *
7370  * Parameters:    p                       (ignored)
7371  *                uap                     User argument descriptor (see below)
7372  *                retval                  (ignored)
7373  *
7374  * Indirect:      uap->path               Path of file to get status from
7375  *                uap->ub                 User buffer (holds file status info)
7376  *                uap->xsecurity          ACL to get (extended security)
7377  *                uap->xsecurity_size     Size of ACL
7378  *
7379  * Returns:        0                      Success
7380  *                !0                      errno value
7381  *
7382  */
7383 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7384 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7385 {
7386 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7387 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7388 	           AT_SYMLINK_NOFOLLOW);
7389 }
7390 
7391 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7392 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7393 {
7394 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7395 		return EINVAL;
7396 	}
7397 
7398 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7399 	           0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7400 }
7401 
7402 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7403 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7404     __unused int32_t *retval)
7405 {
7406 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7407 		return EINVAL;
7408 	}
7409 
7410 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7411 	           0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7412 }
7413 
7414 /*
7415  * Get configurable pathname variables.
7416  *
7417  * Returns:	0			Success
7418  *	namei:???
7419  *	vn_pathconf:???
7420  *
7421  * Notes:	Global implementation  constants are intended to be
7422  *		implemented in this function directly; all other constants
7423  *		are per-FS implementation, and therefore must be handled in
7424  *		each respective FS, instead.
7425  *
7426  * XXX We implement some things globally right now that should actually be
7427  * XXX per-FS; we will need to deal with this at some point.
7428  */
7429 /* ARGSUSED */
int
pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
{
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/* Look up the target object; follow symlinks, audit the path. */
	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/* The pathconf value for uap->name is returned through *retval. */
	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);

	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7450 
7451 /*
7452  * Return target name of a symbolic link.
7453  */
7454 /* ARGSUSED */
7455 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7456 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7457     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7458     int *retval)
7459 {
7460 	vnode_t vp;
7461 	uio_t auio;
7462 	int error;
7463 	struct nameidata nd;
7464 	UIO_STACKBUF(uio_buf, 1);
7465 	bool put_vnode;
7466 
7467 	if (bufsize > INT32_MAX) {
7468 		return EINVAL;
7469 	}
7470 
7471 	if (lnk_vp) {
7472 		vp = lnk_vp;
7473 		put_vnode = false;
7474 	} else {
7475 		NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7476 		    seg, path, ctx);
7477 
7478 		error = nameiat(&nd, fd);
7479 		if (error) {
7480 			return error;
7481 		}
7482 		vp = nd.ni_vp;
7483 		put_vnode = true;
7484 		nameidone(&nd);
7485 	}
7486 
7487 	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7488 	    &uio_buf[0], sizeof(uio_buf));
7489 	uio_addiov(auio, buf, bufsize);
7490 	if (vp->v_type != VLNK) {
7491 		error = EINVAL;
7492 	} else {
7493 #if CONFIG_MACF
7494 		error = mac_vnode_check_readlink(ctx, vp);
7495 #endif
7496 		if (error == 0) {
7497 			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7498 			    ctx);
7499 		}
7500 		if (error == 0) {
7501 			error = VNOP_READLINK(vp, auio, ctx);
7502 		}
7503 	}
7504 
7505 	if (put_vnode) {
7506 		vnode_put(vp);
7507 	}
7508 
7509 	*retval = (int)(bufsize - uio_resid(auio));
7510 	return error;
7511 }
7512 
7513 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7514 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7515 {
7516 	enum uio_seg procseg;
7517 	vnode_t vp;
7518 	int error;
7519 
7520 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7521 
7522 	AUDIT_ARG(fd, uap->fd);
7523 
7524 	if ((error = file_vnode(uap->fd, &vp))) {
7525 		return error;
7526 	}
7527 	if ((error = vnode_getwithref(vp))) {
7528 		file_drop(uap->fd);
7529 		return error;
7530 	}
7531 
7532 	error = readlinkat_internal(vfs_context_current(), -1,
7533 	    vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7534 	    uap->bufsize, procseg, retval);
7535 
7536 	vnode_put(vp);
7537 	file_drop(uap->fd);
7538 	return error;
7539 }
7540 
7541 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7542 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7543 {
7544 	enum uio_seg procseg;
7545 
7546 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7547 	return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7548 	           CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7549 	           uap->count, procseg, retval);
7550 }
7551 
7552 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7553 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7554 {
7555 	enum uio_seg procseg;
7556 
7557 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7558 	return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7559 	           CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7560 	           retval);
7561 }
7562 
7563 /*
7564  * Change file flags, the deep inner layer.
7565  */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	/* MAC policy gets first refusal on the new flag set. */
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	/* Apply the change via the caller-supplied setter (e.g. vnode_setattr). */
	error = (*setattr)(vp, arg, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only once the flags actually changed. */
	if (error == 0) {
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
	}
#endif

out:
	return error;
}
7604 
7605 /*
7606  * Change file flags.
7607  *
7608  * NOTE: this will vnode_put() `vp'
7609  */
static int
chflags1(vnode_t vp, int flags, vfs_context_t ctx)
{
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_flags, flags);

	/* chflags0() authorizes (ignoring immutability) and applies va. */
	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
	/* The iocount is always dropped here, success or failure. */
	vnode_put(vp);

	/*
	 * A filesystem with no flag support reports success without marking
	 * va_flags supported; surface that as ENOTSUP.
	 */
	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
		error = ENOTSUP;
	}

	return error;
}
7628 
7629 /*
7630  * Change flags of a file given a path name.
7631  */
7632 /* ARGSUSED */
int
chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Need the parent directory vnode so its lease can be broken below. */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(fflags, uap->flags);
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

#if CONFIG_FILE_LEASES
	/* Break any directory lease before mutating an entry within it. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	nameidone(&nd);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, ctx);

	return error;
}
7667 
7668 /*
7669  * Change flags of a file given a file descriptor.
7670  */
7671 /* ARGSUSED */
int
fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);
	/* Resolve the descriptor to a vnode and take an iocount on it. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/*
	 * NOTE(review): chflags() passes the parent dvp with `false'; here
	 * only vp is available, hence `true' — presumably directing
	 * vnode_breakdirlease() to locate the parent itself.  Confirm.
	 */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, vfs_context_current());

	file_drop(uap->fd);
	return error;
}
7701 
7702 /*
7703  * Change security information on a filesystem object.
7704  *
7705  * Returns:	0			Success
7706  *		EPERM			Operation not permitted
7707  *		vnode_authattr:???	[anything vnode_authattr can return]
7708  *		vnode_authorize:???	[anything vnode_authorize can return]
7709  *		vnode_setattr:???	[anything vnode_setattr can return]
7710  *
7711  * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
7712  *		translated to EPERM before being returned.
7713  */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* Pre-check each attribute class the caller is actually changing. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		/* -1 marks an id that is not being changed */
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* Permission failures surface as EPERM, not EACCES. */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* Success: notify MAC policies of each change that was requested. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7781 
7782 
7783 /*
7784  * Change mode of a file given a path name.
7785  *
7786  * Returns:	0			Success
7787  *		namei:???		[anything namei can return]
7788  *		chmod_vnode:???		[anything chmod_vnode can return]
7789  */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Need the parent directory vnode so its lease can be broken below. */
	wantparent = WANTPARENT;
#endif

	/* Either NOFOLLOW variant suppresses following a trailing symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* No symlink may be traversed anywhere in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any directory lease before mutating an entry within it. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7822 
/*
 * Common setup for chmod_extended()/fchmod_extended(): populate *pva from
 * mode/uid/gid and the xsecurity user pointer.  On success, *pxsecdst is
 * either NULL or a copied-in filesec the caller must release with
 * kauth_filesec_free().
 */
static int
chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
    gid_t gid, user_addr_t xsecurity)
{
	int error;

	VATTR_INIT(pva);

	if (mode != -1) {
		VATTR_SET(pva, va_mode, mode & ALLPERMS);
	} else {
		/* mode == -1 means "leave the mode alone" (va_mode stays inactive) */
		pva->va_mode = 0;
	}

	if (uid != KAUTH_UID_NONE) {
		VATTR_SET(pva, va_uid, uid);
	}

	if (gid != KAUTH_GID_NONE) {
		VATTR_SET(pva, va_gid, gid);
	}

	*pxsecdst = NULL;
	switch (xsecurity) {
	case USER_ADDR_NULL:
		/* no ACL change requested */
		break;

	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
		/* magic pointer value 1: delete the existing ACL */
		VATTR_SET(pva, va_acl, NULL);
		break;

	default:
		/* copy the caller-supplied filesec in from user space */
		if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
			return error;
		}

		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
		pva->va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
		break;
	}

	return 0;
}
7867 
7868 /*
7869  * chmod_extended: Change the mode of a file given a path name; with extended
7870  * argument list (including extended security (ACL)).
7871  *
7872  * Parameters:	p			Process requesting the open
7873  *		uap			User argument descriptor (see below)
7874  *		retval			(ignored)
7875  *
7876  * Indirect:	uap->path		Path to object (same as 'chmod')
7877  *		uap->uid		UID to set
7878  *		uap->gid		GID to set
7879  *		uap->mode		File mode to set (same as 'chmod')
7880  *		uap->xsecurity		ACL to set (or delete)
7881  *
7882  * Returns:	0			Success
7883  *		!0			errno value
7884  *
7885  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
7886  *
7887  * XXX:		We should enummerate the possible errno values here, and where
7888  *		in the code they originated.
7889  */
7890 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7891 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7892 {
7893 	int error;
7894 	struct vnode_attr va;
7895 	kauth_filesec_t xsecdst = NULL;
7896 
7897 	AUDIT_ARG(owner, uap->uid, uap->gid);
7898 
7899 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7900 	    uap->gid, uap->xsecurity);
7901 
7902 	if (error) {
7903 		return error;
7904 	}
7905 
7906 	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7907 	    UIO_USERSPACE);
7908 
7909 	if (xsecdst != NULL) {
7910 		kauth_filesec_free(xsecdst);
7911 	}
7912 	return error;
7913 }
7914 
7915 /*
7916  * Returns:	0			Success
7917  *		chmodat:???		[anything chmodat can return]
7918  */
7919 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7920 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7921     int flag, enum uio_seg segflg)
7922 {
7923 	struct vnode_attr va;
7924 
7925 	VATTR_INIT(&va);
7926 	VATTR_SET(&va, va_mode, mode & ALLPERMS);
7927 
7928 	return chmodat(ctx, path, &va, fd, flag, segflg);
7929 }
7930 
7931 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7932 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7933 {
7934 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7935 	           AT_FDCWD, 0, UIO_USERSPACE);
7936 }
7937 
7938 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7939 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7940 {
7941 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7942 		return EINVAL;
7943 	}
7944 
7945 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7946 	           uap->fd, uap->flag, UIO_USERSPACE);
7947 }
7948 
7949 /*
7950  * Change mode of a file given a file descriptor.
7951  */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, fd);

	/* Resolve the descriptor to a vnode and take an iocount on it. */
	if ((error = file_vnode(fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* Break any lease before changing attributes (cf. fchflags()). */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* chmod_vnode() performs MAC checks, authorization and the setattr. */
	error = chmod_vnode(vfs_context_current(), vp, vap);
	(void)vnode_put(vp);
	file_drop(fd);

	return error;
}
7979 
7980 /*
7981  * fchmod_extended: Change mode of a file given a file descriptor; with
7982  * extended argument list (including extended security (ACL)).
7983  *
7984  * Parameters:    p                       Process requesting to change file mode
7985  *                uap                     User argument descriptor (see below)
7986  *                retval                  (ignored)
7987  *
7988  * Indirect:      uap->mode               File mode to set (same as 'chmod')
7989  *                uap->uid                UID to set
7990  *                uap->gid                GID to set
7991  *                uap->xsecurity          ACL to set (or delete)
7992  *                uap->fd                 File descriptor of file to change mode
7993  *
7994  * Returns:        0                      Success
7995  *                !0                      errno value
7996  *
7997  */
7998 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)7999 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
8000 {
8001 	int error;
8002 	struct vnode_attr va;
8003 	kauth_filesec_t xsecdst = NULL;
8004 
8005 	AUDIT_ARG(owner, uap->uid, uap->gid);
8006 
8007 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
8008 	    uap->gid, uap->xsecurity);
8009 
8010 	if (error) {
8011 		return error;
8012 	}
8013 
8014 	error = fchmod1(p, uap->fd, &va);
8015 
8016 	if (xsecdst != NULL) {
8017 		kauth_filesec_free(xsecdst);
8018 	}
8019 	return error;
8020 }
8021 
8022 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)8023 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
8024 {
8025 	struct vnode_attr va;
8026 
8027 	VATTR_INIT(&va);
8028 	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
8029 
8030 	return fchmod1(p, uap->fd, &va);
8031 }
8032 
/*
 * Core of the chown() family: authorize and apply an owner and/or group
 * change on vp.  A uid/gid of VNOVAL means "leave unchanged".
 */
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease before the attribute change lands. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only if the ownership actually changed. */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
8094 
8095 /*
8096  * Set ownership given a path name.
8097  */
8098 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	int error;
	struct nameidata nd;
	int follow;

	AUDIT_ARG(owner, uid, gid);

	/* Either NOFOLLOW variant suppresses following a trailing symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* No symlink may be traversed anywhere in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}

	vp = nd.ni_vp;
	/* vn_chown_internal() handles MAC, authorization and the setattr. */
	error = vn_chown_internal(ctx, vp, uid, gid);

	nameidone(&nd);
	vnode_put(vp);
	return error;
}
8128 
8129 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)8130 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
8131 {
8132 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8133 	           uap->uid, uap->gid, 0, UIO_USERSPACE);
8134 }
8135 
8136 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)8137 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
8138 {
8139 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8140 	           uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
8141 }
8142 
8143 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)8144 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
8145 {
8146 	if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
8147 		return EINVAL;
8148 	}
8149 
8150 	return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
8151 	           uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
8152 }
8153 
8154 /*
8155  * Set ownership given a file descriptor.
8156  */
8157 /* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* Resolve the descriptor to a vnode and take an iocount on it. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* vn_chown_internal() handles MAC, authorization and the setattr. */
	error = vn_chown_internal(ctx, vp, uap->uid, uap->gid);

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8184 
/*
 * Fetch the (access, modify) timestamp pair for the *utimes() family into
 * tsp[0]/tsp[1].  A NULL user pointer yields the current time for both;
 * otherwise two struct timevals (32- or 64-bit layout per the calling
 * process's ABI) are copied in and converted to timespecs.
 */
static int
getutimes(user_addr_t usrtvp, struct timespec *tsp)
{
	int error;

	if (usrtvp == USER_ADDR_NULL) {
		struct timeval old_tv;
		/* XXX Y2038 bug because of microtime argument */
		microtime(&old_tv);
		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
		tsp[1] = tsp[0];
	} else {
		if (IS_64BIT_PROCESS(current_proc())) {
			struct user64_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
		} else {
			struct user32_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		}
	}
	return 0;
}
8217 
/*
 * Apply access (ts[0]) and modify (ts[1]) timestamps to vp.  nullflag is
 * set when the user passed no explicit times ("touch to now"): it tags the
 * request with VA_UTIMES_NULL and suppresses the EACCES->EPERM translation
 * performed for explicit-time requests.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* explicit-time requests report EPERM rather than EACCES */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only if the times actually changed. */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8274 
8275 /*
8276  * Set the access and modification times of a file.
8277  */
8278 /* ARGSUSED */
8279 int
utimes(__unused proc_t p,struct utimes_args * uap,__unused int32_t * retval)8280 utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
8281 {
8282 	struct timespec ts[2];
8283 	user_addr_t usrtvp;
8284 	int error;
8285 	struct nameidata nd;
8286 	vfs_context_t ctx = vfs_context_current();
8287 	uint32_t wantparent = 0;
8288 
8289 #if CONFIG_FILE_LEASES
8290 	wantparent = WANTPARENT;
8291 #endif
8292 
8293 	/*
8294 	 * AUDIT: Needed to change the order of operations to do the
8295 	 * name lookup first because auditing wants the path.
8296 	 */
8297 	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
8298 	    UIO_USERSPACE, uap->path, ctx);
8299 	error = namei(&nd);
8300 	if (error) {
8301 		return error;
8302 	}
8303 
8304 	/*
8305 	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
8306 	 * the current time instead.
8307 	 */
8308 	usrtvp = uap->tptr;
8309 	if ((error = getutimes(usrtvp, ts)) != 0) {
8310 		goto out;
8311 	}
8312 
8313 #if CONFIG_FILE_LEASES
8314 	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
8315 #endif
8316 
8317 	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
8318 
8319 out:
8320 #if CONFIG_FILE_LEASES
8321 	vnode_put(nd.ni_dvp);
8322 #endif
8323 	nameidone(&nd);
8324 	vnode_put(nd.ni_vp);
8325 	return error;
8326 }
8327 
8328 /*
8329  * Set the access and modification times of a file.
8330  */
8331 /* ARGSUSED */
8332 int
futimes(__unused proc_t p,struct futimes_args * uap,__unused int32_t * retval)8333 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
8334 {
8335 	struct timespec ts[2];
8336 	vnode_t vp;
8337 	user_addr_t usrtvp;
8338 	int error;
8339 
8340 	AUDIT_ARG(fd, uap->fd);
8341 	usrtvp = uap->tptr;
8342 	if ((error = getutimes(usrtvp, ts)) != 0) {
8343 		return error;
8344 	}
8345 	if ((error = file_vnode(uap->fd, &vp)) != 0) {
8346 		return error;
8347 	}
8348 	if ((error = vnode_getwithref(vp))) {
8349 		file_drop(uap->fd);
8350 		return error;
8351 	}
8352 
8353 #if CONFIG_FILE_LEASES
8354 	vnode_breakdirlease(vp, true, O_WRONLY);
8355 #endif
8356 
8357 	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
8358 
8359 	vnode_put(vp);
8360 	file_drop(uap->fd);
8361 	return error;
8362 }
8363 
8364 static int
truncate_validate_common(proc_t p,off_t length)8365 truncate_validate_common(proc_t p, off_t length)
8366 {
8367 	rlim_t fsize_limit;
8368 
8369 	if (length < 0) {
8370 		return EINVAL;
8371 	}
8372 
8373 	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8374 	if ((rlim_t)length > fsize_limit) {
8375 		psignal(p, SIGXFSZ);
8376 		return EFBIG;
8377 	}
8378 
8379 	return 0;
8380 }
8381 
/*
 * Set vp's data size to 'length' after MAC and (optionally) kauth
 * checks.  'need_auth' is false on the ftruncate path, where the
 * open(2) already performed the effective authorization; 'cred' is
 * the credential to report to MAC (NOCRED on the path-based route).
 * The caller holds an iocount on vp.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open.  We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		/* Authorization may determine nothing needs checking. */
		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only on successful truncation. */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8432 
8433 /*
8434  * Truncate a file given its path name.
8435  */
8436 /* ARGSUSED */
8437 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8438 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8439 {
8440 	vfs_context_t ctx = vfs_context_current();
8441 	vnode_t vp;
8442 	int error;
8443 	struct nameidata nd;
8444 
8445 	if ((error = truncate_validate_common(p, uap->length))) {
8446 		return error;
8447 	}
8448 
8449 	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8450 	    UIO_USERSPACE, uap->path, ctx);
8451 
8452 	if ((error = namei(&nd))) {
8453 		return error;
8454 	}
8455 
8456 	vp = nd.ni_vp;
8457 	nameidone(&nd);
8458 
8459 	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8460 	vnode_put(vp);
8461 
8462 	return error;
8463 }
8464 
8465 /*
8466  * Truncate a file given a file descriptor.
8467  */
8468 /* ARGSUSED */
8469 int
ftruncate(proc_t p,struct ftruncate_args * uap,int32_t * retval)8470 ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
8471 {
8472 	vnode_t vp;
8473 	struct fileproc *fp;
8474 	int error;
8475 
8476 	AUDIT_ARG(fd, uap->fd);
8477 
8478 	if ((error = truncate_validate_common(p, uap->length))) {
8479 		return error;
8480 	}
8481 
8482 	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
8483 		return error;
8484 	}
8485 
8486 	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
8487 	case DTYPE_PSXSHM:
8488 		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
8489 		goto out;
8490 	case DTYPE_VNODE:
8491 		break;
8492 	default:
8493 		error = EINVAL;
8494 		goto out;
8495 	}
8496 
8497 	vp = (vnode_t)fp_get_data(fp);
8498 
8499 	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
8500 		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
8501 		error = EINVAL;
8502 		goto out;
8503 	}
8504 
8505 	if ((error = vnode_getwithref(vp)) != 0) {
8506 		goto out;
8507 	}
8508 
8509 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8510 
8511 	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
8512 	    vfs_context_current(), false);
8513 	vnode_put(vp);
8514 
8515 out:
8516 	file_drop(uap->fd);
8517 	return error;
8518 }
8519 
8520 
8521 /*
8522  * Sync an open file with synchronized I/O _file_ integrity completion
8523  */
8524 /* ARGSUSED */
8525 int
fsync(proc_t p,struct fsync_args * uap,__unused int32_t * retval)8526 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8527 {
8528 	__pthread_testcancel(1);
8529 	return fsync_common(p, uap, MNT_WAIT);
8530 }
8531 
8532 
8533 /*
8534  * Sync an open file with synchronized I/O _file_ integrity completion
8535  *
8536  * Notes:	This is a legacy support function that does not test for
8537  *		thread cancellation points.
8538  */
8539 /* ARGSUSED */
8540 int
fsync_nocancel(proc_t p,struct fsync_nocancel_args * uap,__unused int32_t * retval)8541 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8542 {
8543 	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
8544 }
8545 
8546 
8547 /*
8548  * Sync an open file with synchronized I/O _data_ integrity completion
8549  */
8550 /* ARGSUSED */
8551 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)8552 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8553 {
8554 	__pthread_testcancel(1);
8555 	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
8556 }
8557 
8558 
8559 /*
8560  * fsync_common
8561  *
8562  * Common fsync code to support both synchronized I/O file integrity completion
8563  * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8564  *
8565  * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8566  * will only guarantee that the file data contents are retrievable.  If
8567  * 'flags' is MNT_WAIT, the caller is rewuesting file integrity, which also
8568  * includes additional metadata unnecessary for retrieving the file data
8569  * contents, such as atime, mtime, ctime, etc., also be committed to stable
8570  * storage.
8571  *
8572  * Parameters:	p				The process
8573  *		uap->fd				The descriptor to synchronize
8574  *		flags				The data integrity flags
8575  *
8576  * Returns:	int				Success
8577  *	fp_getfvp:EBADF				Bad file descriptor
8578  *	fp_getfvp:ENOTSUP			fd does not refer to a vnode
8579  *	VNOP_FSYNC:???				unspecified
8580  *
8581  * Notes:	We use struct fsync_args because it is a short name, and all
8582  *		caller argument structures are otherwise identical.
8583  */
8584 static int
fsync_common(proc_t p,struct fsync_args * uap,int flags)8585 fsync_common(proc_t p, struct fsync_args *uap, int flags)
8586 {
8587 	vnode_t vp;
8588 	struct fileproc *fp;
8589 	vfs_context_t ctx = vfs_context_current();
8590 	int error;
8591 
8592 	AUDIT_ARG(fd, uap->fd);
8593 
8594 	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
8595 		return error;
8596 	}
8597 	if ((error = vnode_getwithref(vp))) {
8598 		file_drop(uap->fd);
8599 		return error;
8600 	}
8601 
8602 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8603 
8604 	error = VNOP_FSYNC(vp, flags, ctx);
8605 
8606 #if NAMEDRSRCFORK
8607 	/* Sync resource fork shadow file if necessary. */
8608 	if ((error == 0) &&
8609 	    (vp->v_flag & VISNAMEDSTREAM) &&
8610 	    (vp->v_parent != NULLVP) &&
8611 	    vnode_isshadow(vp) &&
8612 	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
8613 		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
8614 	}
8615 #endif
8616 
8617 	(void)vnode_put(vp);
8618 	file_drop(uap->fd);
8619 	return error;
8620 }
8621 
8622 /*
8623  * Duplicate files.  Source must be a file, target must be a file or
8624  * must not exist.
8625  *
8626  * XXX Copyfile authorisation checking is woefully inadequate, and will not
8627  *     perform inheritance correctly.
8628  */
8629 /* ARGSUSED */
8630 int
copyfile(__unused proc_t p,struct copyfile_args * uap,__unused int32_t * retval)8631 copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
8632 {
8633 	vnode_t tvp, fvp, tdvp, sdvp;
8634 	struct nameidata fromnd, tond;
8635 	int error;
8636 	vfs_context_t ctx = vfs_context_current();
8637 
8638 	/* Check that the flags are valid. */
8639 	if (uap->flags & ~CPF_MASK) {
8640 		return EINVAL;
8641 	}
8642 
8643 	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
8644 	    UIO_USERSPACE, uap->from, ctx);
8645 	if ((error = namei(&fromnd))) {
8646 		return error;
8647 	}
8648 	fvp = fromnd.ni_vp;
8649 
8650 	NDINIT(&tond, CREATE, OP_LINK,
8651 	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8652 	    UIO_USERSPACE, uap->to, ctx);
8653 	if ((error = namei(&tond))) {
8654 		goto out1;
8655 	}
8656 	tdvp = tond.ni_dvp;
8657 	tvp = tond.ni_vp;
8658 
8659 	if (tvp != NULL) {
8660 		if (!(uap->flags & CPF_OVERWRITE)) {
8661 			error = EEXIST;
8662 			goto out;
8663 		}
8664 	}
8665 
8666 	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
8667 		error = EISDIR;
8668 		goto out;
8669 	}
8670 
8671 	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
8672 		error = EOPNOTSUPP;
8673 		goto out;
8674 	}
8675 
8676 #if CONFIG_MACF
8677 	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
8678 		goto out;
8679 	}
8680 #endif /* CONFIG_MACF */
8681 
8682 	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
8683 		goto out;
8684 	}
8685 	if (tvp) {
8686 		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
8687 			goto out;
8688 		}
8689 	}
8690 	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
8691 		goto out;
8692 	}
8693 
8694 	if (fvp == tdvp) {
8695 		error = EINVAL;
8696 	}
8697 	/*
8698 	 * If source is the same as the destination (that is the
8699 	 * same inode number) then there is nothing to do.
8700 	 * (fixed to have POSIX semantics - CSM 3/2/98)
8701 	 */
8702 	if (fvp == tvp) {
8703 		error = -1;
8704 	}
8705 
8706 #if CONFIG_FILE_LEASES
8707 	vnode_breakdirlease(tdvp, false, O_WRONLY);
8708 #endif
8709 
8710 	if (!error) {
8711 		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
8712 	}
8713 out:
8714 	sdvp = tond.ni_startdir;
8715 	/*
8716 	 * nameidone has to happen before we vnode_put(tdvp)
8717 	 * since it may need to release the fs_nodelock on the tdvp
8718 	 */
8719 	nameidone(&tond);
8720 
8721 	if (tvp) {
8722 		vnode_put(tvp);
8723 	}
8724 	vnode_put(tdvp);
8725 	vnode_put(sdvp);
8726 out1:
8727 	vnode_put(fvp);
8728 
8729 	nameidone(&fromnd);
8730 
8731 	if (error == -1) {
8732 		return 0;
8733 	}
8734 	return error;
8735 }
8736 
8737 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8738 
8739 /*
8740  * Helper function for doing clones. The caller is expected to provide an
8741  * iocounted source vnode and release it.
8742  */
8743 static int
clonefile_internal(vnode_t fvp,boolean_t data_read_authorised,int dst_dirfd,user_addr_t dst,uint32_t flags,vfs_context_t ctx)8744 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
8745     user_addr_t dst, uint32_t flags, vfs_context_t ctx)
8746 {
8747 	vnode_t tvp, tdvp;
8748 	struct nameidata *tondp = NULL;
8749 	int error;
8750 	int follow;
8751 	boolean_t free_src_acl;
8752 	boolean_t attr_cleanup;
8753 	enum vtype v_type;
8754 	kauth_action_t action;
8755 	struct componentname *cnp;
8756 	uint32_t defaulted = 0;
8757 	struct {
8758 		struct vnode_attr va[2];
8759 	} *va2p = NULL;
8760 	struct vnode_attr *vap = NULL;
8761 	struct vnode_attr *nvap = NULL;
8762 	uint32_t vnop_flags;
8763 
8764 	v_type = vnode_vtype(fvp);
8765 	switch (v_type) {
8766 	case VLNK:
8767 	/* FALLTHRU */
8768 	case VREG:
8769 		action = KAUTH_VNODE_ADD_FILE;
8770 		break;
8771 	case VDIR:
8772 		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
8773 		    fvp->v_mountedhere) {
8774 			return EINVAL;
8775 		}
8776 		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
8777 		break;
8778 	default:
8779 		return EINVAL;
8780 	}
8781 
8782 	AUDIT_ARG(fd2, dst_dirfd);
8783 	AUDIT_ARG(value32, flags);
8784 
8785 	tondp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8786 	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8787 	NDINIT(tondp, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
8788 	    UIO_USERSPACE, dst, ctx);
8789 	if (flags & CLONE_NOFOLLOW_ANY) {
8790 		tondp->ni_flag |= NAMEI_NOFOLLOW_ANY;
8791 	}
8792 
8793 	if ((error = nameiat(tondp, dst_dirfd))) {
8794 		kfree_type(struct nameidata, tondp);
8795 		return error;
8796 	}
8797 	cnp = &tondp->ni_cnd;
8798 	tdvp = tondp->ni_dvp;
8799 	tvp = tondp->ni_vp;
8800 
8801 	free_src_acl = FALSE;
8802 	attr_cleanup = FALSE;
8803 
8804 	if (tvp != NULL) {
8805 		error = EEXIST;
8806 		goto out;
8807 	}
8808 
8809 	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
8810 		error = EXDEV;
8811 		goto out;
8812 	}
8813 
8814 #if CONFIG_MACF
8815 	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
8816 		goto out;
8817 	}
8818 #endif
8819 	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
8820 		goto out;
8821 	}
8822 
8823 	action = KAUTH_VNODE_GENERIC_READ_BITS;
8824 	if (data_read_authorised) {
8825 		action &= ~KAUTH_VNODE_READ_DATA;
8826 	}
8827 	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
8828 		goto out;
8829 	}
8830 
8831 	va2p = kalloc_type(typeof(*va2p), Z_WAITOK | Z_NOFAIL);
8832 	vap = &va2p->va[0];
8833 	nvap = &va2p->va[1];
8834 
8835 	/*
8836 	 * certain attributes may need to be changed from the source, we ask for
8837 	 * those here with the exception of source file's ACLs unless the CLONE_ACL
8838 	 * flag is specified. By default, the clone file will inherit the target
8839 	 * directory's ACLs unless the the CLONE_ACL flag is specified then it
8840 	 * will inherit the source file's ACLs instead.
8841 	 */
8842 	VATTR_INIT(vap);
8843 	VATTR_WANTED(vap, va_uid);
8844 	VATTR_WANTED(vap, va_gid);
8845 	VATTR_WANTED(vap, va_mode);
8846 	VATTR_WANTED(vap, va_flags);
8847 	if (flags & CLONE_ACL) {
8848 		VATTR_WANTED(vap, va_acl);
8849 	}
8850 
8851 	if ((error = vnode_getattr(fvp, vap, ctx)) != 0) {
8852 		goto out;
8853 	}
8854 
8855 	VATTR_INIT(nvap);
8856 	VATTR_SET(nvap, va_type, v_type);
8857 	if (VATTR_IS_SUPPORTED(vap, va_acl) && vap->va_acl != NULL) {
8858 		VATTR_SET(nvap, va_acl, vap->va_acl);
8859 		free_src_acl = TRUE;
8860 	}
8861 
8862 	/* Handle ACL inheritance, initialize vap. */
8863 	if (v_type == VLNK) {
8864 		error = vnode_authattr_new(tdvp, nvap, 0, ctx);
8865 	} else {
8866 		error = vn_attribute_prepare(tdvp, nvap, &defaulted, ctx);
8867 		if (error) {
8868 			goto out;
8869 		}
8870 		attr_cleanup = TRUE;
8871 	}
8872 
8873 	vnop_flags = VNODE_CLONEFILE_DEFAULT;
8874 	/*
8875 	 * We've got initial values for all security parameters,
8876 	 * If we are superuser, then we can change owners to be the
8877 	 * same as the source. Both superuser and the owner have default
8878 	 * WRITE_SECURITY privileges so all other fields can be taken
8879 	 * from source as well.
8880 	 */
8881 	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
8882 		if (VATTR_IS_SUPPORTED(vap, va_uid)) {
8883 			VATTR_SET(nvap, va_uid, vap->va_uid);
8884 		}
8885 		if (VATTR_IS_SUPPORTED(vap, va_gid)) {
8886 			VATTR_SET(nvap, va_gid, vap->va_gid);
8887 		}
8888 	} else {
8889 		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
8890 	}
8891 
8892 	if (VATTR_IS_SUPPORTED(vap, va_mode)) {
8893 		VATTR_SET(nvap, va_mode, vap->va_mode);
8894 	}
8895 	if (VATTR_IS_SUPPORTED(vap, va_flags)) {
8896 		VATTR_SET(nvap, va_flags,
8897 		    ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
8898 		    (nvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
8899 	}
8900 
8901 #if CONFIG_FILE_LEASES
8902 	vnode_breakdirlease(tdvp, false, O_WRONLY);
8903 #endif
8904 
8905 	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, nvap, vnop_flags, ctx);
8906 
8907 	if (!error && tvp) {
8908 		int     update_flags = 0;
8909 #if CONFIG_FSE
8910 		int fsevent;
8911 #endif /* CONFIG_FSE */
8912 
8913 		/*
8914 		 * If some of the requested attributes weren't handled by the
8915 		 * VNOP, use our fallback code.
8916 		 */
8917 		if (!VATTR_ALL_SUPPORTED(nvap)) {
8918 			(void)vnode_setattr_fallback(tvp, nvap, ctx);
8919 		}
8920 
8921 #if CONFIG_MACF
8922 		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
8923 		    VNODE_LABEL_CREATE, ctx);
8924 #endif
8925 
8926 		// Make sure the name & parent pointers are hooked up
8927 		if (tvp->v_name == NULL) {
8928 			update_flags |= VNODE_UPDATE_NAME;
8929 		}
8930 		if (tvp->v_parent == NULLVP) {
8931 			update_flags |= VNODE_UPDATE_PARENT;
8932 		}
8933 
8934 		if (update_flags) {
8935 			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
8936 			    cnp->cn_namelen, cnp->cn_hash, update_flags);
8937 		}
8938 
8939 #if CONFIG_FSE
8940 		switch (vnode_vtype(tvp)) {
8941 		case VLNK:
8942 		/* FALLTHRU */
8943 		case VREG:
8944 			fsevent = FSE_CREATE_FILE;
8945 			break;
8946 		case VDIR:
8947 			fsevent = FSE_CREATE_DIR;
8948 			break;
8949 		default:
8950 			goto out;
8951 		}
8952 
8953 		if (need_fsevent(fsevent, tvp)) {
8954 			/*
8955 			 * The following is a sequence of three explicit events.
8956 			 * A pair of FSE_CLONE events representing the source and destination
8957 			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
8958 			 * fseventsd may coalesce the destination clone and create events
8959 			 * into a single event resulting in the following sequence for a client
8960 			 * FSE_CLONE (src)
8961 			 * FSE_CLONE | FSE_CREATE (dst)
8962 			 */
8963 			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
8964 			    FSE_ARG_DONE);
8965 			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
8966 			    FSE_ARG_DONE);
8967 		}
8968 #endif /* CONFIG_FSE */
8969 	}
8970 
8971 out:
8972 	if (attr_cleanup) {
8973 		vn_attribute_cleanup(nvap, defaulted);
8974 	}
8975 	if (free_src_acl && vap->va_acl) {
8976 		kauth_acl_free(vap->va_acl);
8977 	}
8978 	if (va2p) {
8979 		kfree_type(typeof(*va2p), va2p);
8980 	}
8981 	nameidone(tondp);
8982 	kfree_type(struct nameidata, tondp);
8983 	if (tvp) {
8984 		vnode_put(tvp);
8985 	}
8986 	vnode_put(tdvp);
8987 	return error;
8988 }
8989 
8990 /*
8991  * clone files or directories, target must not exist.
8992  */
8993 /* ARGSUSED */
8994 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8995 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8996     __unused int32_t *retval)
8997 {
8998 	vnode_t fvp;
8999 	struct nameidata *ndp = NULL;
9000 	int follow;
9001 	int error;
9002 	vfs_context_t ctx = vfs_context_current();
9003 
9004 	/* Check that the flags are valid. */
9005 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
9006 	    CLONE_NOFOLLOW_ANY)) {
9007 		return EINVAL;
9008 	}
9009 
9010 	AUDIT_ARG(fd, uap->src_dirfd);
9011 
9012 	ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9013 
9014 	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
9015 	NDINIT(ndp, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
9016 	    UIO_USERSPACE, uap->src, ctx);
9017 	if (uap->flags & CLONE_NOFOLLOW_ANY) {
9018 		ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
9019 	}
9020 
9021 	if ((error = nameiat(ndp, uap->src_dirfd))) {
9022 		kfree_type(struct nameidata, ndp);
9023 		return error;
9024 	}
9025 
9026 	fvp = ndp->ni_vp;
9027 	nameidone(ndp);
9028 	kfree_type(struct nameidata, ndp);
9029 
9030 	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
9031 	    uap->flags, ctx);
9032 
9033 	vnode_put(fvp);
9034 	return error;
9035 }
9036 
/*
 * Clone from an open (readable) file descriptor to a new path.
 */
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
	    CLONE_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
	if (error) {
		return error;
	}

	/* The source descriptor must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if ((error = vnode_getwithref(fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	/* TRUE: READ_DATA was effectively authorized at open time. */
	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
9078 
9079 static int
rename_submounts_callback(mount_t mp,void * arg)9080 rename_submounts_callback(mount_t mp, void *arg)
9081 {
9082 	int error = 0;
9083 	mount_t pmp = (mount_t)arg;
9084 	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
9085 
9086 	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
9087 		return 0;
9088 	}
9089 
9090 	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
9091 		return 0;
9092 	}
9093 
9094 	if ((error = vfs_busy(mp, LK_NOWAIT))) {
9095 		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
9096 		return -1;
9097 	}
9098 
9099 	size_t pathlen = MAXPATHLEN;
9100 	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
9101 		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
9102 	}
9103 
9104 	vfs_unbusy(mp);
9105 
9106 	return error;
9107 }
9108 
9109 /*
9110  * Rename files.  Source and destination must either both be directories,
9111  * or both not be directories.  If target is a directory, it must be empty.
9112  */
9113 /* ARGSUSED */
9114 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)9115 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
9116     int tofd, user_addr_t to, int segflg, u_int uflags)
9117 {
9118 	vnode_t tvp, tdvp;
9119 	vnode_t fvp, fdvp;
9120 	vnode_t mnt_fvp;
9121 	struct nameidata *fromnd, *tond;
9122 	int error = 0;
9123 	int do_retry;
9124 	int retry_count;
9125 	int mntrename;
9126 	int need_event;
9127 	int need_kpath2;
9128 	int has_listeners;
9129 	const char *oname = NULL;
9130 	char *from_name = NULL, *to_name = NULL;
9131 	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
9132 	int from_len = 0, to_len = 0;
9133 	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
9134 	int holding_mntlock;
9135 	int vn_authorize_skipped;
9136 	mount_t locked_mp = NULL;
9137 	vnode_t oparent = NULLVP;
9138 	vnode_t locked_vp = NULLVP;
9139 #if CONFIG_FSE
9140 	fse_info from_finfo = {}, to_finfo;
9141 #endif
9142 	int from_truncated = 0, to_truncated = 0;
9143 	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
9144 	int batched = 0;
9145 	struct vnode_attr *fvap, *tvap;
9146 	int continuing = 0;
9147 	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
9148 	int32_t nofollow_any = 0;
9149 	/* carving out a chunk for structs that are too big to be on stack. */
9150 	struct {
9151 		struct nameidata from_node, to_node;
9152 		struct vnode_attr fv_attr, tv_attr;
9153 	} * __rename_data;
9154 
9155 	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
9156 	fromnd = &__rename_data->from_node;
9157 	tond = &__rename_data->to_node;
9158 
9159 	holding_mntlock = 0;
9160 	do_retry = 0;
9161 	retry_count = 0;
9162 retry:
9163 	fvp = tvp = NULL;
9164 	fdvp = tdvp = NULL;
9165 	fvap = tvap = NULL;
9166 	mnt_fvp = NULLVP;
9167 	mntrename = FALSE;
9168 	vn_authorize_skipped = FALSE;
9169 
9170 	if (uflags & RENAME_NOFOLLOW_ANY) {
9171 		nofollow_any = NAMEI_NOFOLLOW_ANY;
9172 	}
9173 	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
9174 	    segflg, from, ctx);
9175 	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
9176 
9177 	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
9178 	    segflg, to, ctx);
9179 	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
9180 
9181 continue_lookup:
9182 	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9183 		if ((error = nameiat(fromnd, fromfd))) {
9184 			goto out1;
9185 		}
9186 		fdvp = fromnd->ni_dvp;
9187 		fvp  = fromnd->ni_vp;
9188 
9189 		if (fvp && fvp->v_type == VDIR) {
9190 			tond->ni_cnd.cn_flags |= WILLBEDIR;
9191 		}
9192 	}
9193 
9194 	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9195 		if ((error = nameiat(tond, tofd))) {
9196 			/*
9197 			 * Translate error code for rename("dir1", "dir2/.").
9198 			 */
9199 			if (error == EISDIR && fvp->v_type == VDIR) {
9200 				error = EINVAL;
9201 			}
9202 			goto out1;
9203 		}
9204 		tdvp = tond->ni_dvp;
9205 		tvp  = tond->ni_vp;
9206 	}
9207 
9208 #if DEVELOPMENT || DEBUG
9209 	/*
9210 	 * XXX VSWAP: Check for entitlements or special flag here
9211 	 * so we can restrict access appropriately.
9212 	 */
9213 #else /* DEVELOPMENT || DEBUG */
9214 
9215 	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
9216 		error = EPERM;
9217 		goto out1;
9218 	}
9219 
9220 	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
9221 		error = EPERM;
9222 		goto out1;
9223 	}
9224 #endif /* DEVELOPMENT || DEBUG */
9225 
9226 	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
9227 		error = ENOENT;
9228 		goto out1;
9229 	}
9230 
9231 	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
9232 		int32_t pval = 0;
9233 		int err = 0;
9234 
9235 		/*
9236 		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
9237 		 * has the same name as target iff the following conditions are met:
9238 		 * 1. the target file system is case insensitive
9239 		 * 2. source and target directories are the same
9240 		 * 3. source and target files are the same
9241 		 * 4. name only differs in case (determined by underlying filesystem)
9242 		 */
9243 		if (fvp != tvp || fdvp != tdvp) {
9244 			error = EEXIST;
9245 			goto out1;
9246 		}
9247 
9248 		/*
9249 		 * Assume that the target file system is case sensitive if
9250 		 * _PC_CASE_SENSITIVE selector isn't supported.
9251 		 */
9252 		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
9253 		if (err != 0 || pval != 0) {
9254 			error = EEXIST;
9255 			goto out1;
9256 		}
9257 	}
9258 
9259 	batched = vnode_compound_rename_available(fdvp);
9260 
9261 #if CONFIG_FSE
9262 	need_event = need_fsevent(FSE_RENAME, fdvp);
9263 	if (need_event) {
9264 		if (fvp) {
9265 			get_fse_info(fvp, &from_finfo, ctx);
9266 		} else {
9267 			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
9268 			if (error) {
9269 				goto out1;
9270 			}
9271 
9272 			fvap = &__rename_data->fv_attr;
9273 		}
9274 
9275 		if (tvp) {
9276 			get_fse_info(tvp, &to_finfo, ctx);
9277 		} else if (batched) {
9278 			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
9279 			if (error) {
9280 				goto out1;
9281 			}
9282 
9283 			tvap = &__rename_data->tv_attr;
9284 		}
9285 	}
9286 #else
9287 	need_event = 0;
9288 #endif /* CONFIG_FSE */
9289 
9290 	has_listeners = kauth_authorize_fileop_has_listeners();
9291 
9292 	need_kpath2 = 0;
9293 #if CONFIG_AUDIT
9294 	if (AUDIT_RECORD_EXISTS()) {
9295 		need_kpath2 = 1;
9296 	}
9297 #endif
9298 
9299 	if (need_event || has_listeners) {
9300 		if (from_name == NULL) {
9301 			GET_PATH(from_name);
9302 		}
9303 
9304 		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
9305 
9306 		if (from_name_no_firmlink == NULL) {
9307 			GET_PATH(from_name_no_firmlink);
9308 		}
9309 
9310 		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
9311 	}
9312 
9313 	if (need_event || need_kpath2 || has_listeners) {
9314 		if (to_name == NULL) {
9315 			GET_PATH(to_name);
9316 		}
9317 
9318 		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
9319 
9320 		if (to_name_no_firmlink == NULL) {
9321 			GET_PATH(to_name_no_firmlink);
9322 		}
9323 
9324 		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
9325 		if (to_name && need_kpath2) {
9326 			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9327 		}
9328 	}
9329 	if (!fvp) {
9330 		/*
9331 		 * Claim: this check will never reject a valid rename.
9332 		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9333 		 * Suppose fdvp and tdvp are not on the same mount.
9334 		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
9335 		 *      then you can't move it to within another dir on the same mountpoint.
9336 		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9337 		 *
9338 		 * If this check passes, then we are safe to pass these vnodes to the same FS.
9339 		 */
9340 		if (fdvp->v_mount != tdvp->v_mount) {
9341 			error = EXDEV;
9342 			goto out1;
9343 		}
9344 		goto skipped_lookup;
9345 	}
9346 
9347 	/*
9348 	 * If the source and destination are the same (i.e. they're
9349 	 * links to the same vnode) and the target file system is
9350 	 * case sensitive, then there is nothing to do.
9351 	 *
9352 	 * XXX Come back to this.
9353 	 */
9354 	if (fvp == tvp) {
9355 		int pathconf_val;
9356 
9357 		/*
9358 		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9359 		 * then assume that this file system is case sensitive.
9360 		 */
9361 		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9362 		    pathconf_val != 0) {
9363 			vn_authorize_skipped = TRUE;
9364 			goto out1;
9365 		}
9366 	}
9367 
9368 	/*
9369 	 * Allow the renaming of mount points.
9370 	 * - target must not exist
9371 	 * - target must reside in the same directory as source
9372 	 * - union mounts cannot be renamed
9373 	 * - the root fs, and tightly-linked system volumes, cannot be renamed
9374 	 *
9375 	 * XXX Handle this in VFS after a continued lookup (if we missed
9376 	 * in the cache to start off)
9377 	 *
9378 	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9379 	 * we'll skip past here.  The file system is responsible for
9380 	 * checking that @tvp is not a descendent of @fvp and vice versa
9381 	 * so it should always return EINVAL if either @tvp or @fvp is the
9382 	 * root of a volume.
9383 	 */
9384 	if ((fvp->v_flag & VROOT) &&
9385 	    (fvp->v_type == VDIR) &&
9386 	    (tvp == NULL) &&
9387 	    (fvp->v_mountedhere == NULL) &&
9388 	    (fdvp == tdvp) &&
9389 	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9390 	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9391 	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9392 		vnode_t coveredvp;
9393 
9394 		/* switch fvp to the covered vnode */
9395 		coveredvp = fvp->v_mount->mnt_vnodecovered;
9396 		if ((vnode_getwithref(coveredvp))) {
9397 			error = ENOENT;
9398 			goto out1;
9399 		}
9400 		/*
9401 		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9402 		 * later.
9403 		 */
9404 		mnt_fvp = fvp;
9405 
9406 		fvp = coveredvp;
9407 		mntrename = TRUE;
9408 	}
9409 	/*
9410 	 * Check for cross-device rename.
9411 	 * For rename on mountpoint, we want to also check the source and its parent
9412 	 * belong to the same mountpoint.
9413 	 */
9414 	if ((fvp->v_mount != tdvp->v_mount) ||
9415 	    (fvp->v_mount != fdvp->v_mount) ||
9416 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
9417 		error = EXDEV;
9418 		goto out1;
9419 	}
9420 
9421 	/*
9422 	 * If source is the same as the destination (that is the
9423 	 * same inode number) then there is nothing to do...
9424 	 * EXCEPT if the underlying file system supports case
9425 	 * insensitivity and is case preserving.  In this case
9426 	 * the file system needs to handle the special case of
9427 	 * getting the same vnode as target (fvp) and source (tvp).
9428 	 *
9429 	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9430 	 * and _PC_CASE_PRESERVING can have this exception, and they need to
9431 	 * handle the special case of getting the same vnode as target and
9432 	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
9433 	 * so not to cause locking problems. There is a single reference on tvp.
9434 	 *
9435 	 * NOTE - that fvp == tvp also occurs if they are hard linked and
9436 	 * that correct behaviour then is just to return success without doing
9437 	 * anything.
9438 	 *
9439 	 * XXX filesystem should take care of this itself, perhaps...
9440 	 */
9441 	if (fvp == tvp && fdvp == tdvp) {
9442 		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9443 		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
9444 		    fromnd->ni_cnd.cn_namelen)) {
9445 			vn_authorize_skipped = TRUE;
9446 			goto out1;
9447 		}
9448 	}
9449 
9450 	if (holding_mntlock && fvp->v_mount != locked_mp) {
9451 		/*
9452 		 * we're holding a reference and lock
9453 		 * on locked_mp, but it no longer matches
9454 		 * what we want to do... so drop our hold
9455 		 */
9456 		mount_unlock_renames(locked_mp);
9457 		mount_drop(locked_mp, 0);
9458 		holding_mntlock = 0;
9459 	}
9460 	if (tdvp != fdvp && fvp->v_type == VDIR) {
9461 		/*
9462 		 * serialize renames that re-shape
9463 		 * the tree... if holding_mntlock is
9464 		 * set, then we're ready to go...
9465 		 * otherwise we
9466 		 * first need to drop the iocounts
9467 		 * we picked up, second take the
9468 		 * lock to serialize the access,
9469 		 * then finally start the lookup
9470 		 * process over with the lock held
9471 		 */
9472 		if (!holding_mntlock) {
9473 			/*
9474 			 * need to grab a reference on
9475 			 * the mount point before we
9476 			 * drop all the iocounts... once
9477 			 * the iocounts are gone, the mount
9478 			 * could follow
9479 			 */
9480 			locked_mp = fvp->v_mount;
9481 			mount_ref(locked_mp, 0);
9482 
9483 			/*
9484 			 * nameidone has to happen before we vnode_put(tvp)
9485 			 * since it may need to release the fs_nodelock on the tvp
9486 			 */
9487 			nameidone(tond);
9488 
9489 			if (tvp) {
9490 				vnode_put(tvp);
9491 			}
9492 			vnode_put(tdvp);
9493 
9494 			/*
9495 			 * nameidone has to happen before we vnode_put(fdvp)
9496 			 * since it may need to release the fs_nodelock on the fvp
9497 			 */
9498 			nameidone(fromnd);
9499 
9500 			vnode_put(fvp);
9501 			vnode_put(fdvp);
9502 
9503 			if (mnt_fvp != NULLVP) {
9504 				vnode_put(mnt_fvp);
9505 			}
9506 
9507 			mount_lock_renames(locked_mp);
9508 			holding_mntlock = 1;
9509 
9510 			goto retry;
9511 		}
9512 	} else {
9513 		/*
9514 		 * when we dropped the iocounts to take
9515 		 * the lock, we allowed the identity of
9516 		 * the various vnodes to change... if they did,
9517 		 * we may no longer be dealing with a rename
9518 		 * that reshapes the tree... once we're holding
9519 		 * the iocounts, the vnodes can't change type
9520 		 * so we're free to drop the lock at this point
9521 		 * and continue on
9522 		 */
9523 		if (holding_mntlock) {
9524 			mount_unlock_renames(locked_mp);
9525 			mount_drop(locked_mp, 0);
9526 			holding_mntlock = 0;
9527 		}
9528 	}
9529 
9530 	if (!batched) {
9531 		assert(locked_vp == NULLVP);
9532 		vnode_link_lock(fvp);
9533 		locked_vp = fvp;
9534 		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
9535 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9536 		    flags, NULL);
9537 		if (error) {
9538 			if (error == ENOENT) {
9539 				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9540 					/*
9541 					 * We encountered a race where after doing the namei,
9542 					 * tvp stops being valid. If so, simply re-drive the rename
9543 					 * call from the top.
9544 					 */
9545 					do_retry = 1;
9546 					retry_count += 1;
9547 				}
9548 			}
9549 			vnode_link_unlock(fvp);
9550 			locked_vp = NULLVP;
9551 			goto out1;
9552 		}
9553 	}
9554 
9555 	/* Release the 'mnt_fvp' now that it is no longer needed. */
9556 	if (mnt_fvp != NULLVP) {
9557 		vnode_put(mnt_fvp);
9558 		mnt_fvp = NULLVP;
9559 	}
9560 
9561 	// save these off so we can later verify that fvp is the same
9562 	oname   = fvp->v_name;
9563 	oparent = fvp->v_parent;
9564 
9565 skipped_lookup:
9566 #if CONFIG_FILE_LEASES
9567 	/* Lease break needed for source's parent dir? */
9568 	vnode_breakdirlease(fdvp, false, O_WRONLY);
9569 
9570 	/* Lease break needed for target's parent dir? */
9571 	vnode_breakdirlease(tdvp, false, O_WRONLY);
9572 #endif
9573 
9574 	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
9575 	    tdvp, &tvp, &tond->ni_cnd, tvap,
9576 	    flags, ctx);
9577 
9578 	if (locked_vp) {
9579 		vnode_link_unlock(fvp);
9580 		locked_vp = NULLVP;
9581 	}
9582 
9583 	if (holding_mntlock) {
9584 		/*
9585 		 * we can drop our serialization
9586 		 * lock now
9587 		 */
9588 		mount_unlock_renames(locked_mp);
9589 		mount_drop(locked_mp, 0);
9590 		holding_mntlock = 0;
9591 	}
9592 	if (error) {
9593 		if (error == EDATALESS) {
9594 			/*
9595 			 * If we've been here before, something has gone
9596 			 * horribly wrong and we should just get out lest
9597 			 * we spiral around the drain forever.
9598 			 */
9599 			if (flags & VFS_RENAME_DATALESS) {
9600 				error = EIO;
9601 				goto out1;
9602 			}
9603 
9604 			/*
9605 			 * The object we're renaming is dataless (or has a
9606 			 * dataless descendent) and requires materialization
9607 			 * before the rename occurs.  But we're holding the
9608 			 * mount point's rename lock, so it's not safe to
9609 			 * make the upcall.
9610 			 *
9611 			 * In this case, we release the lock (above), perform
9612 			 * the materialization, and start the whole thing over.
9613 			 */
9614 			error = vfs_materialize_reparent(fvp, tdvp);
9615 			if (error == 0) {
9616 				/*
9617 				 * The next time around we need to tell the
			 * file system that the materialization has
9619 				 * been performed.
9620 				 */
9621 				flags |= VFS_RENAME_DATALESS;
9622 				do_retry = 1;
9623 			}
9624 			goto out1;
9625 		}
9626 		if (error == EKEEPLOOKING) {
9627 			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9628 				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9629 					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
9630 				}
9631 			}
9632 
9633 			fromnd->ni_vp = fvp;
9634 			tond->ni_vp = tvp;
9635 
9636 			goto continue_lookup;
9637 		}
9638 
9639 		/*
9640 		 * We may encounter a race in the VNOP where the destination didn't
9641 		 * exist when we did the namei, but it does by the time we go and
9642 		 * try to create the entry. In this case, we should re-drive this rename
9643 		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
9644 		 * but other filesystems susceptible to this race could return it, too.
9645 		 */
9646 		if (error == ERECYCLE) {
9647 			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
9648 				do_retry = 1;
9649 				retry_count += 1;
9650 			} else {
9651 				printf("rename retry limit due to ERECYCLE reached\n");
9652 				error = ENOENT;
9653 			}
9654 		}
9655 
9656 		/*
9657 		 * For compound VNOPs, the authorization callback may return
9658 		 * ENOENT in case of racing hardlink lookups hitting the name
9659 		 * cache, redrive the lookup.
9660 		 */
9661 		if (batched && error == ENOENT) {
9662 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9663 				do_retry = 1;
9664 				retry_count += 1;
9665 			}
9666 		}
9667 
9668 		goto out1;
9669 	}
9670 
9671 	/* call out to allow 3rd party notification of rename.
9672 	 * Ignore result of kauth_authorize_fileop call.
9673 	 */
9674 	kauth_authorize_fileop(vfs_context_ucred(ctx),
9675 	    KAUTH_FILEOP_RENAME,
9676 	    (uintptr_t)from_name, (uintptr_t)to_name);
9677 	if (flags & VFS_RENAME_SWAP) {
9678 		kauth_authorize_fileop(vfs_context_ucred(ctx),
9679 		    KAUTH_FILEOP_RENAME,
9680 		    (uintptr_t)to_name, (uintptr_t)from_name);
9681 	}
9682 
9683 #if CONFIG_FSE
9684 	if (from_name != NULL && to_name != NULL) {
9685 		if (from_truncated || to_truncated) {
9686 			// set it here since only the from_finfo gets reported up to user space
9687 			from_finfo.mode |= FSE_TRUNCATED_PATH;
9688 		}
9689 
9690 		if (tvap && tvp) {
9691 			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
9692 		}
9693 		if (fvap) {
9694 			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
9695 		}
9696 
9697 		if (tvp) {
9698 			add_fsevent(FSE_RENAME, ctx,
9699 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9700 			    FSE_ARG_FINFO, &from_finfo,
9701 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9702 			    FSE_ARG_FINFO, &to_finfo,
9703 			    FSE_ARG_DONE);
9704 			if (flags & VFS_RENAME_SWAP) {
9705 				/*
9706 				 * Strictly speaking, swap is the equivalent of
9707 				 * *three* renames.  FSEvents clients should only take
9708 				 * the events as a hint, so we only bother reporting
9709 				 * two.
9710 				 */
9711 				add_fsevent(FSE_RENAME, ctx,
9712 				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9713 				    FSE_ARG_FINFO, &to_finfo,
9714 				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9715 				    FSE_ARG_FINFO, &from_finfo,
9716 				    FSE_ARG_DONE);
9717 			}
9718 		} else {
9719 			add_fsevent(FSE_RENAME, ctx,
9720 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9721 			    FSE_ARG_FINFO, &from_finfo,
9722 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9723 			    FSE_ARG_DONE);
9724 		}
9725 	}
9726 #endif /* CONFIG_FSE */
9727 
9728 	/*
9729 	 * update filesystem's mount point data
9730 	 */
9731 	if (mntrename) {
9732 		char *cp, *pathend, *mpname;
9733 		char * tobuf;
9734 		struct mount *mp;
9735 		int maxlen;
9736 		size_t len = 0;
9737 
9738 		mp = fvp->v_mountedhere;
9739 
9740 		if (vfs_busy(mp, LK_NOWAIT)) {
9741 			error = EBUSY;
9742 			goto out1;
9743 		}
9744 		tobuf = zalloc(ZV_NAMEI);
9745 
9746 		if (UIO_SEG_IS_USER_SPACE(segflg)) {
9747 			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
9748 		} else {
9749 			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
9750 		}
9751 		if (!error) {
9752 			/* find current mount point prefix */
9753 			pathend = &mp->mnt_vfsstat.f_mntonname[0];
9754 			for (cp = pathend; *cp != '\0'; ++cp) {
9755 				if (*cp == '/') {
9756 					pathend = cp + 1;
9757 				}
9758 			}
9759 			/* find last component of target name */
9760 			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
9761 				if (*cp == '/') {
9762 					mpname = cp + 1;
9763 				}
9764 			}
9765 
9766 			/* Update f_mntonname of sub mounts */
9767 			vfs_iterate(0, rename_submounts_callback, (void *)mp);
9768 
9769 			/* append name to prefix */
9770 			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
9771 			bzero(pathend, maxlen);
9772 
9773 			strlcpy(pathend, mpname, maxlen);
9774 		}
9775 		zfree(ZV_NAMEI, tobuf);
9776 
9777 		vfs_unbusy(mp);
9778 
9779 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
9780 	}
9781 	/*
9782 	 * fix up name & parent pointers.  note that we first
9783 	 * check that fvp has the same name/parent pointers it
9784 	 * had before the rename call... this is a 'weak' check
9785 	 * at best...
9786 	 *
9787 	 * XXX oparent and oname may not be set in the compound vnop case
9788 	 */
9789 	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
9790 		int update_flags;
9791 
9792 		update_flags = VNODE_UPDATE_NAME;
9793 
9794 		if (fdvp != tdvp) {
9795 			update_flags |= VNODE_UPDATE_PARENT;
9796 		}
9797 
9798 		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
9799 	}
9800 out1:
9801 	/*
9802 	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
9803 	 * skipped earlier as no actual rename was performed.
9804 	 */
9805 	if (vn_authorize_skipped && error == 0) {
9806 		error = vn_authorize_renamex_with_paths(fdvp, fvp,
9807 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9808 		    flags, NULL);
9809 		if (error && error == ENOENT) {
9810 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9811 				do_retry = 1;
9812 				retry_count += 1;
9813 			}
9814 		}
9815 	}
9816 	if (to_name != NULL) {
9817 		RELEASE_PATH(to_name);
9818 		to_name = NULL;
9819 	}
9820 	if (to_name_no_firmlink != NULL) {
9821 		RELEASE_PATH(to_name_no_firmlink);
9822 		to_name_no_firmlink = NULL;
9823 	}
9824 	if (from_name != NULL) {
9825 		RELEASE_PATH(from_name);
9826 		from_name = NULL;
9827 	}
9828 	if (from_name_no_firmlink != NULL) {
9829 		RELEASE_PATH(from_name_no_firmlink);
9830 		from_name_no_firmlink = NULL;
9831 	}
9832 	if (holding_mntlock) {
9833 		mount_unlock_renames(locked_mp);
9834 		mount_drop(locked_mp, 0);
9835 		holding_mntlock = 0;
9836 	}
9837 	if (tdvp) {
9838 		/*
9839 		 * nameidone has to happen before we vnode_put(tdvp)
9840 		 * since it may need to release the fs_nodelock on the tdvp
9841 		 */
9842 		nameidone(tond);
9843 
9844 		if (tvp) {
9845 			vnode_put(tvp);
9846 		}
9847 		vnode_put(tdvp);
9848 	}
9849 	if (fdvp) {
9850 		/*
9851 		 * nameidone has to happen before we vnode_put(fdvp)
9852 		 * since it may need to release the fs_nodelock on the fdvp
9853 		 */
9854 		nameidone(fromnd);
9855 
9856 		if (fvp) {
9857 			vnode_put(fvp);
9858 		}
9859 		vnode_put(fdvp);
9860 	}
9861 	if (mnt_fvp != NULLVP) {
9862 		vnode_put(mnt_fvp);
9863 	}
9864 	/*
9865 	 * If things changed after we did the namei, then we will re-drive
9866 	 * this rename call from the top.
9867 	 */
9868 	if (do_retry) {
9869 		do_retry = 0;
9870 		goto retry;
9871 	}
9872 
9873 	kfree_type(typeof(*__rename_data), __rename_data);
9874 	return error;
9875 }
9876 
9877 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9878 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9879 {
9880 	return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9881 	           AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9882 }
9883 
9884 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9885 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9886 {
9887 	if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9888 		return EINVAL;
9889 	}
9890 
9891 	if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9892 		return EINVAL;
9893 	}
9894 
9895 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9896 	           uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9897 }
9898 
9899 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9900 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9901 {
9902 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9903 	           uap->tofd, uap->to, UIO_USERSPACE, 0);
9904 }
9905 
/*
 * Make a directory file.
 *
 * 'path' is resolved relative to 'fd' (AT_FDCWD for the CWD) in the
 * address space indicated by 'segflg'; 'vap' supplies the attributes
 * for the new directory (va_type is forced to VDIR here).  When the
 * file system supports the compound mkdir VNOP, lookup and creation
 * are combined and authorization is deferred to the file system.
 *
 * Returns:	0			Success
 *		EEXIST
 *	namei:???
 *	vnode_authorize:???
 *	vn_create:???
 */
/* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;	/* VNODE_UPDATE_* bits for vnode_update_identity() */
	int batched;		/* non-zero when the FS handles compound mkdir */
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A vnode for the last component means the target already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				/* Target exists: report EEXIST, not the authorization error. */
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any directory lease on the parent before mutating it. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* FS asked VFS to continue the compound lookup. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
10031 
10032 /*
10033  * mkdir_extended: Create a directory; with extended security (ACL).
10034  *
10035  * Parameters:    p                       Process requesting to create the directory
10036  *                uap                     User argument descriptor (see below)
10037  *                retval                  (ignored)
10038  *
10039  * Indirect:      uap->path               Path of directory to create
10040  *                uap->mode               Access permissions to set
10041  *                uap->xsecurity          ACL to set
10042  *
10043  * Returns:        0                      Success
10044  *                !0                      Not success
10045  *
10046  */
10047 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)10048 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
10049 {
10050 	int ciferror;
10051 	kauth_filesec_t xsecdst;
10052 	struct vnode_attr va;
10053 
10054 	AUDIT_ARG(owner, uap->uid, uap->gid);
10055 
10056 	xsecdst = NULL;
10057 	if ((uap->xsecurity != USER_ADDR_NULL) &&
10058 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
10059 		return ciferror;
10060 	}
10061 
10062 	VATTR_INIT(&va);
10063 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10064 	if (xsecdst != NULL) {
10065 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
10066 		va.va_vaflags |= VA_FILESEC_ACL;
10067 	}
10068 
10069 	ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10070 	    UIO_USERSPACE);
10071 	if (xsecdst != NULL) {
10072 		kauth_filesec_free(xsecdst);
10073 	}
10074 	return ciferror;
10075 }
10076 
10077 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)10078 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
10079 {
10080 	struct vnode_attr va;
10081 
10082 	VATTR_INIT(&va);
10083 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10084 
10085 	return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10086 	           UIO_USERSPACE);
10087 }
10088 
10089 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)10090 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
10091 {
10092 	struct vnode_attr va;
10093 
10094 	VATTR_INIT(&va);
10095 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10096 
10097 	return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
10098 	           UIO_USERSPACE);
10099 }
10100 
/*
 * Remove the directory named by 'dirpath' (resolved relative to 'fd' in
 * the address space indicated by 'segflg').
 *
 * Handles compound rmdir VNOPs, redrives the lookup on ENOENT races
 * with hard-link lookups, retries after removing orphaned AppleDouble
 * files (CONFIG_APPLEDOUBLE), and falls back to VNOP_REMOVE with
 * VNODE_REMOVE_DATALESS_DIR for non-empty dataless directories.
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated scratch: nameidata (and fsevents attrs) kept off the stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char     *path = NULL;
	char     *no_firmlink_path = NULL;
	int       len_path = 0;
	int       len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;	/* bounds ENOENT-race redrives */
	int batched;		/* non-zero when the FS handles compound rmdir */

	int restart_flag;
	int nofollow_any = 0;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/* Translate VNODE_REMOVE_NOFOLLOW_ANY into the namei-level flag. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Only the kernel context may remove a swap-backing vnode. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete.  Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						/* Racing lookup removed the entry: redrive (bounded). */
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp from namei: only legal when the FS does compound rmdir. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info  finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Compound case: ask the FS to fill attrs during the VNOP. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Break any directory lease on the parent before mutating it. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* FS asked VFS to continue the compound lookup. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * Only do this if the directory is actually
			 * marked as DATALESS.
			 */
			struct vnode_attr *lvap =
			    kalloc_type(struct vnode_attr, Z_WAITOK);

			VATTR_INIT(lvap);
			VATTR_WANTED(lvap, va_flags);
			if (vnode_getattr(vp, lvap, ctx) == 0 &&
			    VATTR_IS_SUPPORTED(lvap, va_flags) &&
			    (lvap->va_flags & SF_DATALESS) != 0) {
				/*
				 * If this fails, we want to keep the original
				 * error.
				 */
				if (vn_remove(dvp, &vp, ndp,
				    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
					error = 0;
				}
			}
			kfree_type(struct vnode_attr, lvap);
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		/*
		 * NOTE: vp was released above; from here it is used only as a
		 * sleep/wakeup channel address pairing with the tsleep below.
		 */
		if (restart_flag == 0) {
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10402 
10403 /*
10404  * Remove a directory file.
10405  */
10406 /* ARGSUSED */
10407 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10408 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10409 {
10410 	return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10411 	           CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10412 }
10413 
/*
 * Get direntry length padded to 8 byte alignment.
 *
 * struct direntry reserves MAXPATHLEN bytes for d_name (which is why the
 * formula subtracts MAXPATHLEN-1): the result keeps only namlen name bytes
 * plus the NUL terminator, rounded up to a multiple of 8.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 *
 * Analogous to DIRENT64_LEN, with struct dirent's d_name sized
 * __DARWIN_MAXNAMLEN + 1, rounded up to a multiple of 4.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the address of the last byte of this dirent, per its d_reclen. */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10425 
10426 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10427 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10428     int *numdirent, vfs_context_t ctxp)
10429 {
10430 	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
10431 	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10432 	    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10433 		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10434 	} else {
10435 		size_t bufsize;
10436 		void * bufptr;
10437 		uio_t auio;
10438 		struct direntry *entry64;
10439 		struct dirent *dep;
10440 		size_t bytesread;
10441 		int error;
10442 
10443 		/*
10444 		 * We're here because the underlying file system does not
10445 		 * support direnties or we mounted denying support so we must
10446 		 * fall back to dirents and convert them to direntries.
10447 		 *
10448 		 * Our kernel buffer needs to be smaller since re-packing will
10449 		 * expand each dirent.  The worse case (when the name length
10450 		 * is 3 or less) corresponds to a struct direntry size of 32
10451 		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10452 		 * (4-byte aligned).  So having a buffer that is 3/8 the size
10453 		 * will prevent us from reading more than we can pack.
10454 		 *
10455 		 * Since this buffer is wired memory, we will limit the
10456 		 * buffer size to a maximum of 32K. We would really like to
10457 		 * use 32K in the MIN(), but we use magic number 87371 to
10458 		 * prevent uio_resid() * 3 / 8 from overflowing.
10459 		 */
10460 		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10461 		bufptr = kalloc_data(bufsize, Z_WAITOK);
10462 		if (bufptr == NULL) {
10463 			return ENOMEM;
10464 		}
10465 
10466 		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10467 		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10468 		auio->uio_offset = uio->uio_offset;
10469 
10470 		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10471 
10472 		dep = (struct dirent *)bufptr;
10473 		bytesread = bufsize - uio_resid(auio);
10474 
10475 		entry64 = kalloc_type(struct direntry, Z_WAITOK);
10476 		/*
10477 		 * Convert all the entries and copy them out to user's buffer.
10478 		 */
10479 		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10480 			/* First check that the dirent struct up to d_name is within the buffer */
10481 			if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10482 			    /* Check that the length of the entire dirent is within the buffer */
10483 			    DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10484 			    /* Check that the actual length including the name doesn't exceed d_reclen */
10485 			    DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10486 				printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10487 				    vp->v_mount->mnt_vfsstat.f_mntonname,
10488 				    vp->v_name ? vp->v_name : "<unknown>");
10489 				error = EIO;
10490 				break;
10491 			}
10492 
10493 			size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
10494 
10495 			bzero(entry64, enbufsize);
10496 			/* Convert a dirent to a dirent64. */
10497 			entry64->d_ino = dep->d_ino;
10498 			entry64->d_seekoff = 0;
10499 			entry64->d_reclen = (uint16_t)enbufsize;
10500 			entry64->d_namlen = dep->d_namlen;
10501 			entry64->d_type = dep->d_type;
10502 			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10503 
10504 			/* Move to next entry. */
10505 			dep = (struct dirent *)((char *)dep + dep->d_reclen);
10506 
10507 			/* Copy entry64 to user's buffer. */
10508 			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10509 		}
10510 
10511 		/* Update the real offset using the offset we got from VNOP_READDIR. */
10512 		if (error == 0) {
10513 			uio->uio_offset = auio->uio_offset;
10514 		}
10515 		uio_free(auio);
10516 		kfree_data(bufptr, bufsize);
10517 		kfree_type(struct direntry, entry64);
10518 		return error;
10519 	}
10520 }
10521 
10522 #define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)
10523 
10524 /*
10525  * Read a block of directory entries in a file system independent format.
10526  */
/*
 * getdirentries_common:
 *	Shared implementation for getdirentries(2) and getdirentries64(2).
 *	Reads directory entries from the directory open on 'fd' into the
 *	user buffer 'bufp', advancing the file's offset.
 *
 * Parameters:	fd		Open directory file descriptor
 *		bufp		User buffer for the entries
 *		bufsize		Size of user buffer (clamped to
 *				GETDIRENTRIES_MAXBUFSIZE)
 *		bytesread	Out: bytes placed in the user buffer
 *		offset		Out (may be NULL): file offset at which
 *				this read started
 *		eofflag		Out: non-zero at end of directory
 *		flags		VNODE_READDIR_* flags; VNODE_READDIR_EXTENDED
 *				selects the struct direntry format
 *
 * Returns:	0		Success
 *		EBADF		fd not open for reading
 *		EINVAL		fd does not refer to a directory
 *		...		Errors from fp_getfvp, MAC checks,
 *				vnode_getwithref, or the readdir itself
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	UIO_STACKBUF(uio_buf, 1);

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Take the per-file offset lock; if the vnode backing the fd changed
	 * underneath us (fp_set_data in the union-mount path below can do
	 * this from a concurrent caller), drop everything and retry.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Cap the transfer at the maximum we are willing to perform. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read from the file's current offset directly into the user buffer. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If nothing was transferred and this is a union mount, descend to
	 * the lower layer, replace the fd's backing vnode with it, and
	 * retry the read there from offset 0.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff; /* offset at which this read started */
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10640 
10641 
10642 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10643 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10644 {
10645 	off_t offset;
10646 	ssize_t bytesread;
10647 	int error, eofflag;
10648 
10649 	AUDIT_ARG(fd, uap->fd);
10650 	error = getdirentries_common(uap->fd, uap->buf, uap->count,
10651 	    &bytesread, &offset, &eofflag, 0);
10652 
10653 	if (error == 0) {
10654 		if (proc_is64bit(p)) {
10655 			user64_long_t base = (user64_long_t)offset;
10656 			error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10657 		} else {
10658 			user32_long_t base = (user32_long_t)offset;
10659 			error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10660 		}
10661 		*retval = (int)bytesread;
10662 	}
10663 	return error;
10664 }
10665 
10666 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10667 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10668 {
10669 	off_t offset;
10670 	ssize_t bytesread;
10671 	int error, eofflag;
10672 	user_size_t bufsize;
10673 
10674 	AUDIT_ARG(fd, uap->fd);
10675 
10676 	/*
10677 	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10678 	 * then the kernel carves out the last 4 bytes to return extended
10679 	 * information to userspace (namely whether we reached EOF with this call).
10680 	 */
10681 	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10682 		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10683 	} else {
10684 		bufsize = uap->bufsize;
10685 	}
10686 
10687 	error = getdirentries_common(uap->fd, uap->buf, bufsize,
10688 	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10689 
10690 	if (error == 0) {
10691 		*retval = bytesread;
10692 		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10693 
10694 		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10695 			getdirentries64_flags_t flags = 0;
10696 			if (eofflag) {
10697 				flags |= GETDIRENTRIES64_EOF;
10698 			}
10699 			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10700 			    sizeof(flags));
10701 		}
10702 	}
10703 	return error;
10704 }
10705 
10706 
10707 /*
10708  * Set the mode mask for creation of filesystem nodes.
10709  * XXX implement xsecurity
10710  */
10711 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
10712 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10713 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10714 {
10715 	AUDIT_ARG(mask, newmask);
10716 	proc_fdlock(p);
10717 	*retval = p->p_fd.fd_cmask;
10718 	p->p_fd.fd_cmask = newmask & ALLPERMS;
10719 	proc_fdunlock(p);
10720 	return 0;
10721 }
10722 
10723 /*
10724  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10725  *
10726  * Parameters:    p                       Process requesting to set the umask
10727  *                uap                     User argument descriptor (see below)
10728  *                retval                  umask of the process (parameter p)
10729  *
10730  * Indirect:      uap->newmask            umask to set
10731  *                uap->xsecurity          ACL to set
10732  *
10733  * Returns:        0                      Success
10734  *                !0                      Not success
10735  *
10736  */
10737 int
umask_extended(proc_t p,struct umask_extended_args * uap,int32_t * retval)10738 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
10739 {
10740 	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
10741 }
10742 
10743 int
umask(proc_t p,struct umask_args * uap,int32_t * retval)10744 umask(proc_t p, struct umask_args *uap, int32_t *retval)
10745 {
10746 	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
10747 }
10748 
10749 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT                               \
10750 	"com.apple.private.vfs.revoke-mounted-device"
10751 
10752 /*
10753  * Void all references to file by ripping underlying filesystem
10754  * away from vnode.
10755  */
10756 /* ARGSUSED */
/*
 * revoke(2): Void all references to a device special file by asking the
 * filesystem to revoke it (VNOP_REVOKE with REVOKEALL).
 *
 * Only character or block special files may be revoked (ENOTSUP
 * otherwise); a block device with a filesystem currently mounted on it
 * is refused with EBUSY.  The caller must own the node or pass the
 * superuser check.
 */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Only device special files can be revoked. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to pull a block device out from under a mounted filesystem. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Permission: caller must own the node or be superuser. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only revoke if someone actually holds a use count or an alias. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10809 
10810 
10811 /*
 *  HFS/HFS Plus SPECIFIC SYSTEM CALLS
10813  *  The following system calls are designed to support features
10814  *  which are specific to the HFS & HFS Plus volume formats
10815  */
10816 
10817 
10818 /*
10819  * Obtain attribute information on objects in a directory while enumerating
10820  * the directory.
10821  */
10822 /* ARGSUSED */
/*
 * getdirentriesattr(2): Enumerate entries of the directory open on 'fd',
 * returning attribute information for each entry in a single call via
 * VNOP_READDIRATTR.
 *
 * Indirect:	uap->alist	attrlist describing the requested attributes
 *		uap->buffer	user buffer receiving the packed results
 *		uap->count	in: max entries wanted; out: entries returned
 *		uap->newstate	out: directory state, for change detection
 *		uap->basep	out: directory offset before this call
 *
 * Returns:	*retval is the eof flag (similar to getdirentries);
 *		errno on failure.
 */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	UIO_STACKBUF(uio_buf, 1);
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the original count in case we restart on a union mount. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Take the per-file offset lock; if the vnode backing the fd changed
	 * underneath us (fp_set_data in the union-mount path below can do
	 * this from a concurrent caller), drop everything and retry.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else {                                                // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* Copy the out parameters back to the caller. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* errors returned earlier; retval is 0 or 1 now */
} /* end of getdirentriesattr system call */
10986 
10987 /*
10988  * Exchange data between two files
10989  */
10990 
10991 /* ARGSUSED */
/*
 * exchangedata(2): Exchange the data of two regular files on the same
 * volume via VNOP_EXCHANGE.  On success the vnodes' cached names and
 * parents are swapped to match the exchange, fileop listeners are
 * notified, and an FSE_EXCHANGE fsevent is posted.
 *
 * Indirect:	uap->path1	first file
 *		uap->path2	second file
 *		uap->options	FSOPT_NOFOLLOW to not follow symlinks
 *
 * Returns:	0		Success
 *		EINVAL		Same file, or not regular files
 *		EXDEV		Files on different volumes
 *		...		namei, MAC, authorization or VNOP errors
 */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int   flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs read/write access to both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Only resolve the full paths if someone will consume them
	 * (fsevents watchers or fileop listeners).
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}
		/*
		 * Swap the cached names and parents of the two vnodes so
		 * the namecache stays consistent with the exchanged data.
		 */
		name_cache_lock();

		tmpname     = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp           = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
11142 
11143 /*
11144  * Return (in MB) the amount of freespace on the given vnode's volume.
11145  */
11146 uint32_t freespace_mb(vnode_t vp);
11147 
11148 uint32_t
freespace_mb(vnode_t vp)11149 freespace_mb(vnode_t vp)
11150 {
11151 	vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
11152 	return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
11153 	       vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
11154 }
11155 
11156 #if CONFIG_SEARCHFS
11157 
11158 /* ARGSUSED */
11159 
11160 int
searchfs(proc_t p,struct searchfs_args * uap,__unused int32_t * retval)11161 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
11162 {
11163 	vnode_t vp, tvp;
11164 	int i, error = 0;
11165 	int fserror = 0;
11166 	struct nameidata nd;
11167 	struct user64_fssearchblock searchblock;
11168 	struct searchstate *state;
11169 	struct attrlist *returnattrs;
11170 	struct timeval timelimit;
11171 	void *searchparams1, *searchparams2;
11172 	uio_t auio = NULL;
11173 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11174 	uint32_t nummatches;
11175 	size_t mallocsize;
11176 	uint32_t nameiflags;
11177 	vfs_context_t ctx = vfs_context_current();
11178 	UIO_STACKBUF(uio_buf, 1);
11179 
11180 	/* Start by copying in fsearchblock parameter list */
11181 	if (IS_64BIT_PROCESS(p)) {
11182 		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
11183 		timelimit.tv_sec = searchblock.timelimit.tv_sec;
11184 		timelimit.tv_usec = searchblock.timelimit.tv_usec;
11185 	} else {
11186 		struct user32_fssearchblock tmp_searchblock;
11187 
11188 		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
11189 		// munge into 64-bit version
11190 		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
11191 		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
11192 		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
11193 		searchblock.maxmatches = tmp_searchblock.maxmatches;
11194 		/*
11195 		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
11196 		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
11197 		 */
11198 		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
11199 		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
11200 		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
11201 		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
11202 		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
11203 		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
11204 		searchblock.searchattrs = tmp_searchblock.searchattrs;
11205 	}
11206 	if (error) {
11207 		return error;
11208 	}
11209 
11210 	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
11211 	 */
11212 	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
11213 	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
11214 		return EINVAL;
11215 	}
11216 
11217 	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
11218 	/* It all has to do into local memory and it's not that big so we might as well  put it all together. */
11219 	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
11220 	/* block.                                                                                             */
11221 	/*												      */
11222 	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
11223 	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
11224 	/*       assumes the size is still 556 bytes it will continue to work				      */
11225 
11226 	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
11227 	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
11228 
11229 	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);
11230 
11231 	/* Now set up the various pointers to the correct place in our newly allocated memory */
11232 
11233 	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
11234 	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
11235 	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
11236 
11237 	/* Now copy in the stuff given our local variables. */
11238 
11239 	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
11240 		goto freeandexit;
11241 	}
11242 
11243 	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
11244 		goto freeandexit;
11245 	}
11246 
11247 	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
11248 		goto freeandexit;
11249 	}
11250 
11251 	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
11252 		goto freeandexit;
11253 	}
11254 
11255 	/*
11256 	 * When searching a union mount, need to set the
11257 	 * start flag at the first call on each layer to
11258 	 * reset state for the new volume.
11259 	 */
11260 	if (uap->options & SRCHFS_START) {
11261 		state->ss_union_layer = 0;
11262 	} else {
11263 		uap->options |= state->ss_union_flags;
11264 	}
11265 	state->ss_union_flags = 0;
11266 
11267 	/*
11268 	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
11269 	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
11270 	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
11271 	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
11272 	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
11273 	 */
11274 
11275 	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
11276 		attrreference_t* string_ref;
11277 		u_int32_t* start_length;
11278 		user64_size_t param_length;
11279 
11280 		/* validate searchparams1 */
11281 		param_length = searchblock.sizeofsearchparams1;
11282 		/* skip the word that specifies length of the buffer */
11283 		start_length = (u_int32_t*) searchparams1;
11284 		start_length = start_length + 1;
11285 		string_ref = (attrreference_t*) start_length;
11286 
11287 		/* ensure no negative offsets or too big offsets */
11288 		if (string_ref->attr_dataoffset < 0) {
11289 			error = EINVAL;
11290 			goto freeandexit;
11291 		}
11292 		if (string_ref->attr_length > MAXPATHLEN) {
11293 			error = EINVAL;
11294 			goto freeandexit;
11295 		}
11296 
11297 		/* Check for pointer overflow in the string ref */
11298 		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
11299 			error = EINVAL;
11300 			goto freeandexit;
11301 		}
11302 
11303 		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
11304 			error = EINVAL;
11305 			goto freeandexit;
11306 		}
11307 		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
11308 			error = EINVAL;
11309 			goto freeandexit;
11310 		}
11311 	}
11312 
11313 	/* set up the uio structure which will contain the users return buffer */
11314 	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
11315 	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
11316 
11317 	nameiflags = 0;
11318 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11319 		nameiflags |= FOLLOW;
11320 	}
11321 	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
11322 	    UIO_USERSPACE, uap->path, ctx);
11323 
11324 	error = namei(&nd);
11325 	if (error) {
11326 		goto freeandexit;
11327 	}
11328 	vp = nd.ni_vp;
11329 	nameidone(&nd);
11330 
11331 	/*
11332 	 * Switch to the root vnode for the volume
11333 	 */
11334 	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
11335 	vnode_put(vp);
11336 	if (error) {
11337 		goto freeandexit;
11338 	}
11339 	vp = tvp;
11340 
11341 #if CONFIG_UNION_MOUNTS
11342 	/*
11343 	 * If it's a union mount, the path lookup takes
11344 	 * us to the top layer. But we may need to descend
11345 	 * to a lower layer. For non-union mounts the layer
11346 	 * is always zero.
11347 	 */
11348 	for (i = 0; i < (int) state->ss_union_layer; i++) {
11349 		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
11350 			break;
11351 		}
11352 		tvp = vp;
11353 		vp = vp->v_mount->mnt_vnodecovered;
11354 		if (vp == NULL) {
11355 			vnode_put(tvp);
11356 			error = ENOENT;
11357 			goto freeandexit;
11358 		}
11359 		error = vnode_getwithref(vp);
11360 		vnode_put(tvp);
11361 		if (error) {
11362 			goto freeandexit;
11363 		}
11364 	}
11365 #endif /* CONFIG_UNION_MOUNTS */
11366 
11367 #if CONFIG_MACF
11368 	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
11369 	if (error) {
11370 		vnode_put(vp);
11371 		goto freeandexit;
11372 	}
11373 #endif
11374 
11375 
11376 	/*
11377 	 * If searchblock.maxmatches == 0, then skip the search. This has happened
11378 	 * before and sometimes the underlying code doesnt deal with it well.
11379 	 */
11380 	if (searchblock.maxmatches == 0) {
11381 		nummatches = 0;
11382 		goto saveandexit;
11383 	}
11384 
11385 	/*
11386 	 * Allright, we have everything we need, so lets make that call.
11387 	 *
11388 	 * We keep special track of the return value from the file system:
11389 	 * EAGAIN is an acceptable error condition that shouldn't keep us
11390 	 * from copying out any results...
11391 	 */
11392 
11393 	fserror = VNOP_SEARCHFS(vp,
11394 	    searchparams1,
11395 	    searchparams2,
11396 	    &searchblock.searchattrs,
11397 	    (uint32_t)searchblock.maxmatches,
11398 	    &timelimit,
11399 	    returnattrs,
11400 	    &nummatches,
11401 	    (uint32_t)uap->scriptcode,
11402 	    (uint32_t)uap->options,
11403 	    auio,
11404 	    (struct searchstate *) &state->ss_fsstate,
11405 	    ctx);
11406 
11407 #if CONFIG_UNION_MOUNTS
11408 	/*
11409 	 * If it's a union mount we need to be called again
11410 	 * to search the mounted-on filesystem.
11411 	 */
11412 	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
11413 		state->ss_union_flags = SRCHFS_START;
11414 		state->ss_union_layer++;        // search next layer down
11415 		fserror = EAGAIN;
11416 	}
11417 #endif /* CONFIG_UNION_MOUNTS */
11418 
11419 saveandexit:
11420 
11421 	vnode_put(vp);
11422 
11423 	/* Now copy out the stuff that needs copying out. That means the number of matches, the
11424 	 *  search state.  Everything was already put into he return buffer by the vop call. */
11425 
11426 	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
11427 		goto freeandexit;
11428 	}
11429 
11430 	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
11431 		goto freeandexit;
11432 	}
11433 
11434 	error = fserror;
11435 
11436 freeandexit:
11437 
11438 	kfree_data(searchparams1, mallocsize);
11439 
11440 	return error;
11441 } /* end of searchfs system call */
11442 
11443 #else /* CONFIG_SEARCHFS */
11444 
/*
 * searchfs() stub used when CONFIG_SEARCHFS is compiled out:
 * unconditionally reports that the system call is unsupported.
 */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11450 
11451 #endif /* CONFIG_SEARCHFS */
11452 
11453 
11454 #if CONFIG_DATALESS_FILES
11455 
11456 /*
11457  * === Namespace Resolver Up-call Mechanism ===
11458  *
11459  * When I/O is performed to a dataless file or directory (read, write,
11460  * lookup-in, etc.), the file system performs an upcall to the namespace
11461  * resolver (filecoordinationd) to materialize the object.
11462  *
11463  * We need multiple up-calls to be in flight at once, and we need these
11464  * up-calls to be interruptible, thus the following implementation:
11465  *
11466  * => The nspace_resolver_request represents the in-kernel request state.
11467  *    It contains a request ID, storage space for the errno code returned
11468  *    by filecoordinationd, and flags.
11469  *
11470  * => The request ID is simply a global monotonically incrementing 32-bit
11471  *    number.  Outstanding requests are stored in a hash table, and the
11472  *    hash function is extremely simple.
11473  *
11474  * => When an upcall is to be made to filecoordinationd, a request structure
11475  *    is allocated on the stack (it is small, and needs to live only during
11476  *    the duration of the call to resolve_nspace_item_ext()).  It is
11477  *    initialized and inserted into the table.  Some backpressure from
11478  *    filecoordinationd is applied by limiting the numnber of entries that
11479  *    can be inserted into the table (and thus limiting the number of
11480  *    outstanding requests issued to filecoordinationd); waiting for an
11481  *    available slot is interruptible.
11482  *
11483  * => Once the request has been inserted into the table, the up-call is made
11484  *    to filecoordinationd via a MiG-generated stub.  The up-call returns
11485  *    immediately and filecoordinationd processes the request asynchronously.
11486  *
11487  * => The caller now waits for the request to complete.  Tnis is achieved by
11488  *    sleeping on the address of the request structure and waiting for
11489  *    filecoordinationd to mark the request structure as complete.  This
11490  *    is an interruptible sleep call; if interrupted, the request structure
11491  *    is removed from the table and EINTR is returned to the caller.  If
11492  *    this occurs, an advisory up-call is made to filecoordinationd with
11493  *    the request ID to indicate that the request can be aborted or
11494  *    de-prioritized at the discretion of filecoordinationd.
11495  *
11496  * => When filecoordinationd has completed the request, it signals completion
11497  *    by writing to the vfs.nspace.complete sysctl node.  Only a process
11498  *    decorated as a namespace resolver can write to this sysctl node.  The
11499  *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11500  *    The request ID is looked up in the table, and if the request is found,
11501  *    the error code is stored in the request structure and a wakeup()
11502  *    issued on the address of the request structure.  If the request is not
11503  *    found, we simply drop the completion notification, assuming that the
11504  *    caller was interrupted.
11505  *
11506  * => When the waiting thread wakes up, it extracts the error code from the
11507  *    request structure, removes the request from the table, and returns the
11508  *    error code to the calling function.  Fini!
11509  */
11510 
/*
 * In-kernel state for one outstanding up-call to the namespace
 * resolver (see the life-cycle description in the block comment
 * above).  Instances live on the requesting thread's stack and are
 * linked into a global hash table keyed by r_req_id.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash bucket linkage */
	vnode_t         r_vp;             /* vnode being materialized */
	vnode_t         r_tdvp;           /* destination directory, may be NULL */
	uint32_t        r_req_id;         /* ID echoed back by the resolver */
	int             r_resolver_error; /* errno reported by the resolver */
	int             r_flags;          /* RRF_* flags (NSPACE_REQ_LOCK) */
};
11519 
#define RRF_COMPLETE    0x0001  /* request finished; r_resolver_error is valid */
#define RRF_COMPLETING  0x0002  /* completion handler is still using the request */

/*
 * Completion tuple delivered by the resolver through the
 * vfs.nspace.complete sysctl: the request ID / errno pair, plus
 * optional namespace-shape criteria to validate before completing.
 */
struct nspace_resolver_completion_data {
	uint32_t req_id;          /* ID of the request being completed */
	int32_t  resolver_error;  /* errno result from the resolver */
	uint64_t orig_gencount;   /* expected recursive gencount; 0 == don't check */
	uint64_t orig_syncroot;   /* expected sync-root ID; 0 == don't check */
};
11529 
/*
 * Hand out the next resolver request ID from a global, monotonically
 * incrementing 32-bit counter (per the design notes above, wrap-around
 * is harmless for hashing purposes).
 *
 * NOTE(review): OSAddAtomic returns the value *before* the addition,
 * so the first ID issued is 0 -- confirm no caller treats 0 specially.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11537 
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (back-pressure counter). */
static u_int nspace_resolver_request_count;
/* True when some thread is sleeping for a free request slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Single mutex protecting the table, the counters, and request flags. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket (hashmask from hashinit()). */
#define NSPACE_RESOLVER_HASH(req_id)    \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	 nspace_resolver_request_hashmask])
11558 
11559 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id,bool skip_completing)11560 nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11561 {
11562 	struct nspace_resolver_requesthead *bucket;
11563 	struct nspace_resolver_request *req;
11564 
11565 	bucket = NSPACE_RESOLVER_HASH(req_id);
11566 	LIST_FOREACH(req, bucket, r_hashlink) {
11567 		if (req->r_req_id == req_id) {
11568 			/*
11569 			 * If this request already has a completion
11570 			 * pending, don't return it again.
11571 			 */
11572 			if ((req->r_flags & RRF_COMPLETING) != 0 &&
11573 			    skip_completing) {
11574 				req = NULL;
11575 			}
11576 			return req;
11577 		}
11578 	}
11579 
11580 	return NULL;
11581 }
11582 
/*
 * Insert 'req' into the global request table, waiting (interruptibly)
 * for a slot if the table already holds NSPACE_RESOLVER_MAX_OUTSTANDING
 * entries.  Returns 0 on success, or the msleep() error (e.g. EINTR)
 * if the wait for a slot was interrupted.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	NSPACE_REQ_LOCK();

	/* Back-pressure: wait for the table to drop below its cap. */
	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			NSPACE_REQ_UNLOCK();
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	/* Request IDs must be unique within the table. */
	assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	NSPACE_REQ_UNLOCK();

	return 0;
}
11614 
/*
 * Wait (uninterruptibly) until any in-progress completion handler is
 * finished with 'req'.  Called and returns with NSPACE_REQ_LOCK held
 * (msleep drops and re-takes it).
 */
static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
{
	/*
	 * If a completion is in-progress, we have to wait for the
	 * completion handler to finish because it's still using 'req',
	 * which is allocated on our stack a couple of frames up.
	 */
	while ((req->r_flags & RRF_COMPLETING) != 0) {
		(void) msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS, "nspacecmplt", NULL);
	}
}
11628 
11629 static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request * req)11630 nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
11631 {
11632 	struct nspace_resolver_requesthead *bucket;
11633 
11634 	/* We're called with NSPACE_REQ_LOCK held. */
11635 
11636 	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
11637 #if DIAGNOSTIC
11638 	assert((req->r_flags & RRF_COMPLETING) == 0);
11639 	assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
11640 #endif /* DIAGNOSTIC */
11641 	LIST_REMOVE(req, r_hashlink);
11642 	nspace_resolver_request_count--;
11643 
11644 	if (nspace_resolver_request_wait_slot) {
11645 		nspace_resolver_request_wait_slot = false;
11646 		wakeup(&nspace_resolver_request_count);
11647 	}
11648 
11649 	nspace_resolver_req_wait_pending_completion(req);
11650 
11651 	NSPACE_REQ_UNLOCK();
11652 }
11653 
/*
 * Convenience wrapper: take NSPACE_REQ_LOCK and remove 'req' from the
 * table; the lock is dropped by the callee.
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
11660 
11661 static void
nspace_resolver_req_cancel(uint32_t req_id)11662 nspace_resolver_req_cancel(uint32_t req_id)
11663 {
11664 	kern_return_t kr;
11665 	mach_port_t mp;
11666 
11667 	// Failures here aren't fatal -- the cancellation message
11668 	// sent to the resolver is merely advisory.
11669 
11670 	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11671 	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11672 		return;
11673 	}
11674 
11675 	kr = send_nspace_resolve_cancel(mp, req_id);
11676 	if (kr != KERN_SUCCESS) {
11677 		os_log_error(OS_LOG_DEFAULT,
11678 		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11679 	}
11680 
11681 	ipc_port_release_send(mp);
11682 }
11683 
/*
 * Wait (interruptibly) for the resolver to mark 'req' complete.
 * On signal/timeout the request is failed locally (EINTR/ETIMEDOUT)
 * and an advisory cancellation is sent to the resolver.  The request
 * is removed from the table before returning.  Returns the resolver's
 * errno result.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		/* ERESTART just means "keep waiting"; real errors bail out. */
		if (error && error != ERESTART) {
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11716 
/*
 * Record the resolver's errno in 'req', clear any pending-completion
 * state, mark the request complete, and wake its waiter.  Called with
 * NSPACE_REQ_LOCK held.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
	wakeup(req);
}
11726 
/*
 * Flag 'req' so lookups skip it and it stays alive until the
 * completion handler finishes.  Called with NSPACE_REQ_LOCK held.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
11732 
/*
 * Handle a completion reported by the resolver: find the matching
 * request, optionally validate the namespace-shape criteria
 * (recursive gencount and/or sync-root ID) under the mount rename
 * lock, then mark the request complete and wake the waiter.
 */
static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
{
	struct nspace_resolver_request *req;
	int error;
	struct vnode_attr va;
	vnode_t vp;

	NSPACE_REQ_LOCK();

	req = nspace_resolver_req_lookup(c->req_id, true);
	if (req == NULL) {
		/*
		 * If we don't find the request corresponding to our req_id,
		 * just drop the completion on the floor; it's likely that
		 * the requester interrupted with a signal, or it may already
		 * be completing.
		 */
		NSPACE_REQ_UNLOCK();
		return;
	}

	/*
	 * Get out now if the resolver reported an error.
	 */
	if ((error = c->resolver_error) != 0) {
		goto out;
	}

	/*
	 * If the resolver did not specify any namespace shape criteria
	 * for letting the operation proceed, then get out now.
	 */
	if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
		goto out;
	}

	/*
	 * We're going to have to acquire the mount rename lock and do
	 * some I/O in order to verify the criteria.  Mark the request
	 * as pending so no one else messes with it after we drop the
	 * NSPACE_REQ_LOCK.
	 */
	nspace_resolver_req_mark_completion_pending(req);
	NSPACE_REQ_UNLOCK();

	/*
	 * Lock out renames from changing the shape of the tree while
	 * validate the criteria.
	 */
	mount_t locked_mp = req->r_vp->v_mount;
	mount_ref(locked_mp, 0);
	mount_lock_renames(locked_mp);

	if (c->orig_gencount != 0) {
		vp = req->r_vp;
		/*
		 * NOTE(review): 'error' is always 0 here (a non-zero
		 * resolver_error jumps to 'out' above), so this check
		 * is dead code -- candidate for removal.
		 */
		if (error) {
			goto out_dropmount;
		}

		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_recursive_gencount);
		error = vnode_getattr(vp, &va, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
		    va.va_recursive_gencount != c->orig_gencount) {
			printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
			    c->orig_gencount, va.va_recursive_gencount);
			error = EBUSY;
			goto out_dropmount;
		}
	}

	/*
	 * Ignore orig_syncroot if a destination directory wasn't specified
	 * in the request.
	 */
	if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
		uint64_t syncroot_id;

		/*
		 * NOTE(review): if the gencount branch above ran and
		 * failed, it jumped to out_dropmount, so 'error' is
		 * also always 0 here -- dead check.
		 */
		if (error) {
			goto out_dropmount;
		}

#ifndef APFSIOC_GET_SYNC_ROOT
#define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
#endif

		error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
		    (caddr_t)&syncroot_id, 0, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		if (syncroot_id != c->orig_syncroot) {
			printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
			    c->orig_syncroot, syncroot_id);
			error = EBUSY;
			goto out_dropmount;
		}
	}

out_dropmount:
	mount_unlock_renames(locked_mp);
	mount_drop(locked_mp, 0);
	NSPACE_REQ_LOCK();

out:
	nspace_resolver_req_mark_complete(req, error);
	NSPACE_REQ_UNLOCK();
}
11845 
/* The currently registered namespace resolver process, or NULL. */
static struct proc *nspace_resolver_proc;
11847 
11848 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)11849 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11850 {
11851 	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11852 	    p == nspace_resolver_proc) ? 1 : 0;
11853 	return 0;
11854 }
11855 
11856 static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
11857 
/*
 * Register (is_resolver != 0) or unregister the calling process as the
 * namespace resolver.  Only a root process holding the dataless-resolver
 * entitlement may do this, and at most one resolver can be registered at
 * a time (EBUSY otherwise).
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			/* Another resolver is already registered. */
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11897 
/*
 * Report (via *is_prevented) whether process 'p' is prevented from
 * materializing dataless files: true if 'p' is decorated as a resolver,
 * or if its iopolicy does not allow materialization.  Always returns 0.
 */
static int
nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
{
	if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
	    (p->p_vfs_iopolicy &
	    P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
		*is_prevented = 1;
	} else {
		*is_prevented = 0;
	}
	return 0;
}
11910 
11911 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)11912 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
11913 {
11914 	if (p->p_lflag & P_LNSPACE_RESOLVER) {
11915 		return is_prevented ? 0 : EBUSY;
11916 	}
11917 
11918 	if (is_prevented) {
11919 		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
11920 	} else {
11921 		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
11922 	}
11923 	return 0;
11924 }
11925 
11926 static int
nspace_materialization_get_thread_state(int * is_prevented)11927 nspace_materialization_get_thread_state(int *is_prevented)
11928 {
11929 	uthread_t ut = current_uthread();
11930 
11931 	*is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11932 	return 0;
11933 }
11934 
11935 static int
nspace_materialization_set_thread_state(int is_prevented)11936 nspace_materialization_set_thread_state(int is_prevented)
11937 {
11938 	uthread_t ut = current_uthread();
11939 
11940 	if (is_prevented) {
11941 		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11942 	} else {
11943 		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11944 	}
11945 	return 0;
11946 }
11947 
/* The vfs.nspace sysctl branch: namespace-resolver control knobs. */
SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11950 
11951 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11952 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11953     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11954 {
11955 	struct proc *p = req->p;
11956 	int new_value, old_value, changed = 0;
11957 	int error;
11958 
11959 	error = nspace_resolver_get_proc_state(p, &old_value);
11960 	if (error) {
11961 		return error;
11962 	}
11963 
11964 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11965 	    &changed);
11966 	if (error == 0 && changed) {
11967 		error = nspace_resolver_set_proc_state(p, new_value);
11968 	}
11969 	return error;
11970 }
11971 
/* vfs.nspace.resolver: decorate this process as the dataless file resolver */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_resolver, "I", "");
11976 
11977 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11978 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11979     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11980 {
11981 	struct proc *p = req->p;
11982 	int new_value, old_value, changed = 0;
11983 	int error;
11984 
11985 	error = nspace_materialization_get_proc_state(p, &old_value);
11986 	if (error) {
11987 		return error;
11988 	}
11989 
11990 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11991 	    &changed);
11992 	if (error == 0 && changed) {
11993 		error = nspace_materialization_set_proc_state(p, new_value);
11994 	}
11995 	return error;
11996 }
11997 
/* vfs.nspace.prevent_materialization: decorate this process as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_prevent_materialization, "I", "");
12002 
12003 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12004 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
12005     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12006 {
12007 	int new_value, old_value, changed = 0;
12008 	int error;
12009 
12010 	error = nspace_materialization_get_thread_state(&old_value);
12011 	if (error) {
12012 		return error;
12013 	}
12014 
12015 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12016 	    &changed);
12017 	if (error == 0 && changed) {
12018 		error = nspace_materialization_set_thread_state(new_value);
12019 	}
12020 	return error;
12021 }
12022 
/* vfs.nspace.thread_prevent_materialization: decorate this thread as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
12027 
/*
 * Handler for vfs.nspace.complete: the registered resolver writes a
 * req_id/errno tuple (two uint32_t's), optionally followed by a
 * gencount and a sync-root ID (each a uint64_t), to complete an
 * outstanding request.  Only the registered resolver may write here.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	/* First tuple: request ID and resolver errno (required). */
	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed.  Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID.
	 */
	error = sysctl_io_opaque(req, &syncroot, sizeof(syncroot),
	    &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(&cd);
	}
	return error;
}
12090 
/* vfs.nspace.complete: the resolver reports completed requests here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
12095 
12096 #endif /* CONFIG_DATALESS_FILES */
12097 
/*
 * Parameters referenced only under CONFIG_DATALESS_FILES are marked
 * __unused when that config is compiled out.
 */
#if CONFIG_DATALESS_FILES
#define __no_dataless_unused    /* nothing */
#else
#define __no_dataless_unused    __unused
#endif
12103 
/*
 * Decide whether the given context may materialize dataless objects.
 *
 * Returns:
 *   0           - materialization may proceed
 *   EDEADLK     - materialization is prevented (the default)
 *   EJUSTRETURN - prevented, but the operation proceeds as if the
 *                 object were not dataless (entitled manipulators)
 *
 * Check order is policy: thread decorations override process-wide
 * decorations, which override the entitlement.
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
12160 
/*
 * One-time initialization: allocate the resolver request hash table.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
12170 
/*
 * Called when a process exits (or explicitly unregisters via the
 * resolver sysctl).  If 'p' is the registered resolver, fail all
 * outstanding requests with ETIMEDOUT and clear the registration so
 * another resolver may take over.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Walk every hash bucket and fail each pending request. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
12197 
/* Entitlements gating the dataless resolver / manipulator roles. */
#define DATALESS_RESOLVER_ENTITLEMENT     \
	"com.apple.private.vfs.dataless-resolver"
#define DATALESS_MANIPULATION_ENTITLEMENT \
	"com.apple.private.vfs.dataless-manipulation"
12202 
#if CONFIG_DATALESS_FILES
/*
 * Return TRUE if the vfs context is associated with the dataless
 * resolver (i.e. its task holds the dataless-resolver entitlement).
 */
static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
{
	return IOTaskHasEntitlement(vfs_context_task(ctx),
	           DATALESS_RESOLVER_ENTITLEMENT);
}
#endif /* CONFIG_DATALESS_FILES */
12215 
12216 /*
12217  * Return TRUE if the vfs context is associated with a process entitled
12218  * for dataless manipulation.
12219  *
12220  * XXX Arguably belongs in vfs_subr.c, but is here because of the
12221  * complication around CONFIG_DATALESS_FILES.
12222  */
12223 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)12224 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
12225 {
12226 #if CONFIG_DATALESS_FILES
12227 	task_t task = vfs_context_task(ctx);
12228 	return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
12229 	       IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
12230 #else
12231 	return false;
12232 #endif /* CONFIG_DATALESS_FILES */
12233 }
12234 
12235 #if CONFIG_DATALESS_FILES
/*
 * Log (at debug level) that the current process was prevented from
 * materializing a dataless item because it is decorated as
 * no-materialization.  On DEVELOPMENT kernels the vnode's fsid/fileid
 * are included when they can be fetched.
 */
static void
log_materialization_prevented(vnode_t vp, uint64_t op)
{
	char p_name[MAXCOMLEN + 1];
	char *vntype;
	proc_selfname(&p_name[0], sizeof(p_name));

	/* Human-readable vnode type for the log message. */
	if (vp->v_type == VREG) {
		vntype = "File";
	} else if (vp->v_type == VDIR) {
		vntype = "Dir";
	} else if (vp->v_type == VLNK) {
		vntype = "SymLink";
	} else {
		vntype = "Other";
	}

#if DEVELOPMENT
	struct vnode_attr *vap = kalloc_type(struct vnode_attr, Z_WAITOK);

	VATTR_INIT(vap);
	VATTR_WANTED(vap, va_fsid);
	VATTR_WANTED(vap, va_fileid);
	if (vnode_getattr(vp, vap, vfs_context_current()) == 0) {
		/*
		 * va_fsid is consumed twice by the format string
		 * (hex and decimal forms of the same value).
		 */
		os_log_debug(OS_LOG_DEFAULT,
		    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) fsid 0x%08x/%u fileid=%llu",
		    p_name, proc_selfpid(), op, vntype,
		    vap->va_fsid, vap->va_fsid, vap->va_fileid);
	} else
	/* On failure (or non-DEVELOPMENT), fall through to the short form. */
#endif
	{
		os_log_debug(OS_LOG_DEFAULT,
		    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
		    p_name, proc_selfpid(), op, vntype);
	}
#if DEVELOPMENT
	kfree_type(struct vnode_attr, vap);
#endif
}
12275 #endif /* CONFIG_DATALESS_FILES */
12276 
/*
 * vfs_materialize_item: common implementation behind
 * vfs_materialize_file(), vfs_materialize_dir() and
 * vfs_materialize_reparent().
 *
 * Builds a resolution request, sends it to the file coordination
 * daemon over the resolver Mach port, and blocks (interruptibly) until
 * the user-space resolver completes the request.
 *
 * Returns 0 when the caller may proceed with the operation, ENOTSUP if
 * dataless files are not configured or the op is unsupported,
 * ETIMEDOUT when the resolver cannot be reached, or another errno
 * reported by the resolver / supporting infrastructure.
 */
static int
vfs_materialize_item(
	vnode_t vp __no_dataless_unused,
	uint32_t op __no_dataless_unused,
	int64_t offset __no_dataless_unused,
	int64_t size __no_dataless_unused,
	char *lookup_name __no_dataless_unused,
	size_t const namelen __no_dataless_unused,
	vnode_t tdvp __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	kern_return_t kern_ret;
	mach_port_t mach_port;
	char *path = NULL;
	vfs_context_t context;
	int path_len;
	int error;
	audit_token_t atoken;
	enum vtype vp_vtype;

	/* Swap files are special; ignore them */
	if (vnode_isswap(vp)) {
		return 0;
	}

	/*
	 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
	 * are no longer used nor supported.
	 */
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}
	if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
		return ENOTSUP;
	}

	/* Normalize 'op': strip the event-type bits, keep the op code. */
	op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;

	/*
	 * To-directory is only meaningful for rename operations;
	 * ignore it if someone handed one to us unexpectedly.
	 */
	if (op != NAMESPACE_HANDLER_RENAME_OP) {
		tdvp = NULL;
	}

	context = vfs_context_current();

	/* Remember this for later (used by the EJUSTRETURN checks below). */
	vp_vtype = vnode_vtype(vp);

	/* A process may be decorated to forbid materialization entirely. */
	error = vfs_context_dataless_materialization_is_prevented(context);
	if (error) {
		log_materialization_prevented(vp, op);
		goto out_check_errors;
	}

	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
	    &mach_port);
	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		/*
		 * Treat this like being unable to access the backing store
		 * server.
		 */
		return ETIMEDOUT;
	}

	/*
	 * Fetch the vnode's path, growing the buffer MAXPATHLEN at a
	 * time (up to FSGETPATH_MAXBUFLEN) until vn_getpath() fits.
	 *
	 * NOTE(review): if the loop exits because path_alloc_len exceeded
	 * FSGETPATH_MAXBUFLEN, 'path' is NULL here and the pending ENOSPC
	 * in 'error' is overwritten by the next call -- confirm that a
	 * NULL path is acceptable to the send_vfs_resolve_* routines in
	 * that (unlikely) case.
	 */
	int path_alloc_len = MAXPATHLEN;
	do {
		path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
		if (path == NULL) {
			return ENOMEM;
		}

		path_len = path_alloc_len;
		error = vn_getpath(vp, path, &path_len);
		if (error == 0) {
			break;
		} else if (error == ENOSPC) {
			kfree_data(path, path_alloc_len);
			path = NULL;
		} else {
			goto out_release_port;
		}
	} while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);

	/* The resolver authenticates the requester via this token. */
	error = vfs_context_copy_audit_token(context, &atoken);
	if (error) {
		goto out_release_port;
	}

	/* Stack-allocated request; tracked in the resolver hash table. */
	struct nspace_resolver_request req = {
		.r_req_id = next_nspace_req_id(),
		.r_vp = vp,
		.r_tdvp = tdvp,
	};

	error = nspace_resolver_req_add(&req);
	if (error) {
		goto out_release_port;
	}

	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");

	if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
		char *dest_path = NULL;
		int dest_path_len;

		dest_path = zalloc(ZV_NAMEI);
		dest_path_len = MAXPATHLEN;

		/*
		 * NOTE(review): this error path jumps to out_release_port
		 * without removing 'req' (added above) from the hash table
		 * -- compare with the out_req_remove path used below;
		 * verify this is intentional.
		 */
		error = vn_getpath(tdvp, dest_path, &dest_path_len);
		if (error) {
			zfree(ZV_NAMEI, dest_path);
			goto out_release_port;
		}

		/*
		 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
		 * compatibility with existing agents in user-space
		 * who get passed this value.
		 */
		kern_ret = send_vfs_resolve_reparent_with_audit_token(mach_port,
		    req.r_req_id,
		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
		    path, dest_path, atoken);

		zfree(ZV_NAMEI, dest_path);
	} else if (vp_vtype == VDIR) {
		char *tmpname = NULL;

		/*
		 * If the caller provided a lookup_name *and* a name length,
		 * then we assume the lookup_name is not NUL-terminated.
		 * Allocate a temporary buffer in this case to provide
		 * a NUL-terminated path name to the IPC call.
		 */
		if (lookup_name != NULL && namelen != 0) {
			if (namelen >= PATH_MAX) {
				error = EINVAL;
				goto out_req_remove;
			}
			tmpname = zalloc(ZV_NAMEI);
			strlcpy(tmpname, lookup_name, namelen + 1);
			lookup_name = tmpname;
		} else if (lookup_name != NULL) {
			/*
			 * If the caller provided a lookup_name with a
			 * zero name length, then we assume it's NUL-
			 * terminated.  Verify it has a valid length.
			 */
			if (strlen(lookup_name) >= PATH_MAX) {
				error = EINVAL;
				goto out_req_remove;
			}
		}

		/* (See above.) */
		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
		    req.r_req_id,
		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
		    lookup_name == NULL ? "" : lookup_name, path, atoken);

		if (tmpname != NULL) {
			zfree(ZV_NAMEI, tmpname);

			/*
			 * Poison lookup_name rather than reference
			 * freed memory.
			 */
			lookup_name = NULL;
		}
	} else {
		/* (See above.) */
		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
		    req.r_req_id,
		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
		    offset, size, path, atoken);
	}
	if (kern_ret != KERN_SUCCESS) {
		/*
		 * Also treat this like being unable to access the backing
		 * store server.
		 */
		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
		    kern_ret);
		error = ETIMEDOUT;
		goto out_req_remove;
	}

	/*
	 * Give back the memory we allocated earlier while we wait; we
	 * no longer need it.
	 */
	kfree_data(path, path_alloc_len);
	path = NULL;

	/*
	 * Request has been submitted to the resolver. Now (interruptibly)
	 * wait for completion. Upon return, the request will have been
	 * removed from the lookup table.
	 */
	error = nspace_resolver_req_wait(&req);

out_release_port:
	if (path != NULL) {
		kfree_data(path, path_alloc_len);
		path = NULL;
	}
	ipc_port_release_send(mach_port);

out_check_errors:
	/*
	 * The file resolver owns the logic about what error to return
	 * to the caller.  We only need to handle a couple of special
	 * cases here:
	 */
	if (error == EJUSTRETURN) {
		/*
		 * The requesting process is allowed to interact with
		 * dataless objects.  Make a couple of sanity-checks
		 * here to ensure the action makes sense.
		 */
		switch (op) {
		case NAMESPACE_HANDLER_WRITE_OP:
		case NAMESPACE_HANDLER_TRUNCATE_OP:
		case NAMESPACE_HANDLER_RENAME_OP:
			/*
			 * This handles the case of the resolver itself
			 * writing data to the file (or throwing it
			 * away).
			 */
			error = 0;
			break;
		case NAMESPACE_HANDLER_READ_OP:
		case NAMESPACE_HANDLER_LOOKUP_OP:
			/*
			 * This handles the case of the resolver needing
			 * to look up inside of a dataless directory while
			 * it's in the process of materializing it (for
			 * example, creating files or directories).
			 */
			error = (vp_vtype == VDIR) ? 0 : EBADF;
			break;
		default:
			error = EBADF;
			break;
		}
	}

	return error;

out_req_remove:
	nspace_resolver_req_remove(&req);
	goto out_release_port;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
12540 
12541 /*
12542  * vfs_materialize_file: Materialize a regular file.
12543  *
12544  * Inputs:
12545  * vp		The dataless file to be materialized.
12546  *
12547  * op		What kind of operation is being performed:
12548  *		-> NAMESPACE_HANDLER_READ_OP
12549  *		-> NAMESPACE_HANDLER_WRITE_OP
12550  *		-> NAMESPACE_HANDLER_LINK_CREATE
12551  *		-> NAMESPACE_HANDLER_DELETE_OP
12552  *		-> NAMESPACE_HANDLER_TRUNCATE_OP
12553  *		-> NAMESPACE_HANDLER_RENAME_OP
12554  *
12555  * offset	offset of I/O for READ or WRITE.  Ignored for
12556  *		other ops.
12557  *
12558  * size		size of I/O for READ or WRITE  Ignored for
12559  *		other ops.
12560  *
12561  * If offset or size are -1 for a READ or WRITE, then the resolver should
12562  * consider the range to be unknown.
12563  *
12564  * Upon successful return, the caller may proceed with the operation.
12565  * N.B. the file may still be "dataless" in this case.
12566  */
12567 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12568 vfs_materialize_file(
12569 	struct vnode *vp,
12570 	uint64_t op,
12571 	int64_t offset,
12572 	int64_t size)
12573 {
12574 	if (vp->v_type != VREG) {
12575 		return EFTYPE;
12576 	}
12577 	return vfs_materialize_item(vp, (uint32_t)op, offset, size, NULL, 0,
12578 	           NULL);
12579 }
12580 
12581 /*
12582  * vfs_materialize_dir:
12583  *
12584  * Inputs:
12585  * vp		The dataless directory to be materialized.
12586  *
12587  * op		What kind of operation is being performed:
12588  *		-> NAMESPACE_HANDLER_READ_OP
12589  *		-> NAMESPACE_HANDLER_WRITE_OP
12590  *		-> NAMESPACE_HANDLER_DELETE_OP
12591  *		-> NAMESPACE_HANDLER_RENAME_OP
12592  *		-> NAMESPACE_HANDLER_LOOKUP_OP
12593  *
12594  * lookup_name	Name being looked up for a LOOKUP op.  Ignored for
12595  *		other ops.  May or may not be NUL-terminated; see below.
12596  *
12597  * namelen	If non-zero, then lookup_name is assumed to not be NUL-
12598  *		terminated and namelen is the number of valid bytes in
12599  *		lookup_name. If zero, then lookup_name is assumed to be
12600  *		NUL-terminated.
12601  *
12602  * Upon successful return, the caller may proceed with the operation.
12603  * N.B. the directory may still be "dataless" in this case.
12604  */
12605 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12606 vfs_materialize_dir(
12607 	struct vnode *vp,
12608 	uint64_t op,
12609 	char *lookup_name,
12610 	size_t namelen)
12611 {
12612 	if (vp->v_type != VDIR) {
12613 		return EFTYPE;
12614 	}
12615 	if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12616 		return EINVAL;
12617 	}
12618 	return vfs_materialize_item(vp, (uint32_t)op, 0, 0, lookup_name,
12619 	           namelen, NULL);
12620 }
12621 
12622 /*
12623  * vfs_materialize_reparent:
12624  *
12625  * Inputs:
12626  * vp		The dataless file or directory to be materialized.
12627  *
12628  * tdvp		The new parent directory for the dataless file.
12629  *
12630  * Upon successful return, the caller may proceed with the operation.
12631  * N.B. the item may still be "dataless" in this case.
12632  */
12633 int
vfs_materialize_reparent(vnode_t vp,vnode_t tdvp)12634 vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
12635 {
12636 	if (vp->v_type != VDIR && vp->v_type != VREG) {
12637 		return EFTYPE;
12638 	}
12639 	return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
12640 	           0, 0, NULL, 0, tdvp);
12641 }
12642 
#if 0
/*
 * build_volfs_path: construct a "/.vol/<fsid>/<fileid>" path for a
 * vnode.  Compiled out (#if 0); kept for reference only.
 *
 * Returns 0 on success, -1 if the vnode's attributes could not be
 * fetched (a placeholder path is produced in that case).  *len is
 * updated to the length of the generated string including the NUL.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12665 
12666 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12667 fsctl_bogus_command_compat(unsigned long cmd)
12668 {
12669 	switch (cmd) {
12670 	case IOCBASECMD(FSIOC_SYNC_VOLUME):
12671 		return FSIOC_SYNC_VOLUME;
12672 	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12673 		return FSIOC_ROUTEFS_SETROUTEID;
12674 	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12675 		return FSIOC_SET_PACKAGE_EXTS;
12676 	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12677 		return FSIOC_SET_FSTYPENAME_OVERRIDE;
12678 	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12679 		return DISK_CONDITIONER_IOC_GET;
12680 	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12681 		return DISK_CONDITIONER_IOC_SET;
12682 	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12683 		return FSIOC_FIOSEEKHOLE;
12684 	case IOCBASECMD(FSIOC_FIOSEEKDATA):
12685 		return FSIOC_FIOSEEKDATA;
12686 	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12687 		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12688 	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12689 		return SPOTLIGHT_IOC_GET_LAST_MTIME;
12690 	}
12691 
12692 	return cmd;
12693 }
12694 
12695 static int
cas_bsdflags_setattr(vnode_t vp,void * arg,vfs_context_t ctx)12696 cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
12697 {
12698 	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
12699 }
12700 
/*
 * handle_sync_volume: FSIOC_SYNC_VOLUME helper for fsctl_internal().
 *
 * Syncs the mount containing 'vp'.  The caller's iocount on vp is
 * dropped here (after taking a mount iter-ref so the mount stays
 * usable), and *arg_vp is set to NULL so the caller knows the vnode
 * was released.  A holdcount is kept across the sync so vp can be
 * re-acquired by vid afterwards for the optional F_FULLFSYNC ioctl.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	/* Keep a holdcount so the vnode can't be recycled out from under us. */
	vnode_hold(vp);
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (for e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests FSCTL_SYNC_FULLSYNC against 'arg',
	 * which at this point holds MNT_* flags (possibly updated by
	 * sync_callback), not the user-supplied flags word in 'data' --
	 * confirm this is the intended behavior.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12765 
12766 #if ROUTEFS
12767 static int __attribute__((noinline))
handle_routes(user_addr_t udata)12768 handle_routes(user_addr_t udata)
12769 {
12770 	char routepath[MAXPATHLEN];
12771 	size_t len = 0;
12772 	int error;
12773 
12774 	if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
12775 		return error;
12776 	}
12777 	bzero(routepath, MAXPATHLEN);
12778 	error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
12779 	if (error) {
12780 		return error;
12781 	}
12782 	error = routefs_kernel_mount(routepath);
12783 	return error;
12784 }
12785 #endif
12786 
12787 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12788 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12789 {
12790 	struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12791 	struct vnode_attr va;
12792 	int error;
12793 
12794 	VATTR_INIT(&va);
12795 	VATTR_SET(&va, va_flags, cas->new_flags);
12796 
12797 	error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12798 
12799 #if CONFIG_FSE
12800 	if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12801 		add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12802 	}
12803 #endif
12804 
12805 	return error;
12806 }
12807 
12808 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12809 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12810 {
12811 	struct mount *mp = NULL;
12812 	errno_t rootauth = 0;
12813 
12814 	mp = vp->v_mount;
12815 
12816 	/*
12817 	 * query the underlying FS and see if it reports something
12818 	 * sane for this vnode. If volume is authenticated via
12819 	 * chunklist, leave that for the caller to determine.
12820 	 */
12821 	rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12822 
12823 	return rootauth;
12824 }
12825 
12826 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12827 	"com.apple.private.kernel.set-package-extensions"
12828 
12829 /*
12830  * Make a filesystem-specific control call:
12831  */
12832 /* ARGSUSED */
/*
 * fsctl_internal: common implementation for fsctl(2) and ffsctl(2).
 *
 * Stages the ioctl argument in a kernel buffer (on-stack for commands
 * whose parameter fits in STK_PARAMS bytes, heap otherwise), handles
 * the generic FSIOC_* selectors inline or via their helpers, and
 * forwards everything else to the filesystem through VNOP_IOCTL.
 * On success, IOC_OUT data is copied back out to 'udata'.
 *
 * NOTE: FSIOC_SYNC_VOLUME (via handle_sync_volume) releases the
 * caller's iocount on *arg_vp and sets *arg_vp to NULL; callers must
 * re-check *arg_vp afterwards.
 */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* fsctl is not for devices. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* Map legacy "base" command values onto their modern equivalents. */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Choose stack or heap staging buffer based on parameter size. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			/* Copy the input parameter in from user space. */
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			/* No sized parameter: pass the raw udata value. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* May drop the iocount on vp and NULL out *arg_vp. */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
		/* Silently succeeds (error stays 0) when ROUTEFS is off. */
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t    num_entries;
		uint32_t    max_width;

		/* Requires the private set-package-extensions entitlement. */
		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct.  otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		/* Unpack the 64-bit or 32-bit user structure as appropriate. */
		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width   = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width   = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		mount_t mp;

		/* Superuser only. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if ((mp = vp->v_mount) != NULL) {
			mount_lock(mp);
			if (data[0] != 0) {
				/* Require a NUL within the first MFSTYPENAMELEN bytes. */
				for (int i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data
				 * string which has no NULL termination in
				 * its first MFSTYPENAMELEN bytes.  This is
				 * bogus, let's avoid strlcpy-ing the read
				 * data and return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				vfs_setfstypename_locked(mp, data);
				/* "mtmfs" read-only mounts get special security flags. */
				if (vfs_isrdonly(mp) &&
				    strcmp(data, "mtmfs") == 0) {
					mp->mnt_kern_flag |=
					    MNTK_EXTENDED_SECURITY;
					mp->mnt_kern_flag &=
					    ~MNTK_AUTH_OPAQUE;
				}
			} else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
				/* Empty name: clear an existing override. */
				const char *name =
				    vfs_getfstypenameref_locked(mp, NULL);
				if (strcmp(name, "mtmfs") == 0) {
					mp->mnt_kern_flag &=
					    ~MNTK_EXTENDED_SECURITY;
				}
				vfs_setfstypename_locked(mp, NULL);
			}
unlock:
			mount_unlock(mp);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/* Report EBUSY if anyone else has the vnode in use. */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
		break;

#if CONFIG_EXCLAVES
	case FSIOC_EXCLAVE_FS_REGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_register(((fsioc_exclave_fs_register_t *)data)->fs_tag, vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_UNREGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_unregister(vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: {
		exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data);
		exclave_fs_base_dir_t *dirs = NULL;
		if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = EPERM;
			break;
		}
		if (get_base_dirs->base_dirs) {
			/* Bound the caller-supplied count before allocating. */
			if ((get_base_dirs->count == 0) || (get_base_dirs->count > EXCLAVE_FS_GET_BASE_DIRS_MAX_COUNT)) {
				error = EINVAL;
				break;
			}
			dirs = kalloc_type(exclave_fs_base_dir_t, get_base_dirs->count, Z_WAITOK | Z_ZERO);
			if (!dirs) {
				error = ENOSPC;
				break;
			}
		}
		error = vfs_exclave_fs_get_base_dirs(dirs, &get_base_dirs->count);
		if (!error && dirs) {
			error = copyout(dirs, (user_addr_t)get_base_dirs->base_dirs,
			    get_base_dirs->count * sizeof(exclave_fs_base_dir_t));
		}
		if (dirs) {
			kfree_type(exclave_fs_base_dir_t, get_base_dirs->count, dirs);
		}
	}
	break;
#endif

	default: {
		/*
		 * Other, known commands shouldn't be passed down here.
		 * (When adding a selector to this list, it may be prudent
		 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
		 */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
		case F_SPECULATIVE_READ:
		case F_ATTRIBUTION_TAG:
		case F_TRANSFEREXTENTS:
		case F_ASSERT_BG_ACCESS:
		case F_RELEASE_BG_ACCESS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
13141 
13142 /* ARGSUSED */
/*
 * fsctl: path-based fsctl(2) system call.
 *
 * Looks up uap->path (honoring FSOPT_NOFOLLOW and the firmlink
 * special-case for FSIOC_FIRMLINK_CTL), runs the MACF mount check,
 * and hands off to fsctl_internal().  The vnode iocount obtained from
 * namei() is dropped here unless fsctl_internal() already released it
 * (in which case vp was reset to NULL).
 */
int
fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	uint32_t nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on:  */
	nameiflags = 0;
	//
	// if we come through fsctl() then the file is by definition not open.
	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
	// lest the caller mistakenly thinks the only open is their own (but in
	// reality it's someone elses).
	//
	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
		return EINVAL;
	}
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
		/* Operate on the firmlink itself, bypassing the name cache. */
		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
	}
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		goto done;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	/* fsctl_internal() may release the iocount and NULL out vp. */
	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	if (vp) {
		vnode_put(vp);
	}
	return error;
}
13194 /* ARGSUSED */
/*
 * ffsctl: file-descriptor flavor of fsctl(2).
 *
 * Resolves uap->fd to a vnode, takes an iocount, runs the MACF mount
 * check, and hands off to fsctl_internal().  The fd reference and the
 * vnode iocount are both released on every path; the iocount drop is
 * skipped when fsctl_internal() already released it (vp == NULL).
 */
int
ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();
	int fd = -1;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);

	/* Get the vnode for the file we are getting info on:  */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	fd = uap->fd;
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

#if CONFIG_MACF
	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
		file_drop(fd);
		vnode_put(vp);
		return error;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

	file_drop(fd);

	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
	if (vp) {
		vnode_put(vp);
	}

	return error;
}
13236 /* end of fsctl system call */
13237 
13238 #define FILESEC_ACCESS_ENTITLEMENT              \
13239 	"com.apple.private.vfs.filesec-access"
13240 
13241 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)13242 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
13243 {
13244 	if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
13245 		/*
13246 		 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
13247 		 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
13248 		 */
13249 		if ((!setting && vfs_context_issuser(ctx)) ||
13250 		    IOTaskHasEntitlement(vfs_context_task(ctx),
13251 		    FILESEC_ACCESS_ENTITLEMENT)) {
13252 			return 0;
13253 		}
13254 	}
13255 
13256 	return EPERM;
13257 }
13258 
/*
 *  Retrieve the data of an extended attribute.
 *
 *  Path-based getxattr(2): looks up uap->path (optionally without
 *  following a trailing symlink), validates the attribute name, and
 *  reads the value via vn_getxattr().  *retval reports either the bytes
 *  copied to the user buffer or, for a size-only query, the attribute's
 *  full size.
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;	/* holds an iocount; released via vnode_put() at "out" */
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected names need root (get) or the filesec entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned...  in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app.  we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;	/* size-only query: leave auio NULL */
	}

	if (uap->value) {
		/* Clamp the request to bound wired kernel allocations. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* With a uio, report bytes copied; otherwise the attribute size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
13348 
/*
 * Retrieve the data of an extended attribute.
 *
 * fd-based variant of getxattr(); path-lookup option bits are rejected
 * since there is no lookup.
 */
int
fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* Follow/no-follow options make no sense on an already-open fd. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
	    XATTR_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	/* file_vnode() reference is paired with file_drop() on all paths. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	/* Take an iocount; released via vnode_put() at "out". */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected names need root (get) or the filesec entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	if (uap->value && uap->size > 0) {
		/* Clamp the request to bound wired kernel allocations. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}

	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
out:
	(void)vnode_put(vp);
	file_drop(uap->fd);

	/* With a uio, report bytes copied; otherwise the attribute size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13407 
/*
 * Scratch state for setxattr(): the nameidata, copied-in attribute name,
 * and uio buffer are bundled here and allocated with kalloc_type rather
 * than placed on the kernel stack (see setxattr()).
 */
struct setxattr_ctx {
	struct nameidata nd;                    /* path lookup state */
	char attrname[XATTR_MAXNAMELEN + 1];    /* NUL-terminated attr name */
	UIO_STACKBUF(uio_buf, 1);               /* backing store for the uio */
};
13414 
/*
 * Set the data of an extended attribute.
 *
 * Path-based setxattr(2): validates the attribute name and size, looks up
 * uap->path (optionally without following a trailing symlink), and writes
 * the value via vn_setxattr().  Protected names require the filesec
 * entitlement (see xattr_entitlement_check()).
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* Heap-allocate the lookup/name/uio scratch (see struct setxattr_ctx). */
	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* Setting a protected attribute always requires the entitlement. */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	/* A non-zero size with no value buffer is contradictory. */
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Need the parent vnode so its directory lease can be broken below. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		sactx->nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;	/* holds an iocount; released via vnode_put() below */
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	/* Notify fseventsd watchers of the modified attribute. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13498 
/*
 * Set the data of an extended attribute.
 *
 * fd-based variant of setxattr(); path-lookup option bits are rejected
 * since there is no lookup.
 */
int
fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* Follow/no-follow options make no sense on an already-open fd. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
	    XATTR_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			return ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		return error;
	}
	/* Setting a protected attribute always requires the entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
		return error;
	}
	/* A non-zero size with no value buffer is contradictory. */
	if (uap->size != 0 && uap->value == 0) {
		return EINVAL;
	}
	if (uap->size > INT_MAX) {
		return E2BIG;
	}
	/* file_vnode() reference is paired with file_drop() on all paths. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	/* Take an iocount; released via vnode_put() below. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
#if CONFIG_FSE
	/* Notify fseventsd watchers of the modified attribute. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
13567 
/*
 * Remove an extended attribute.
 * XXX Code duplication here.
 *
 * Path-based removexattr(2): protected attribute names are never
 * removable through this interface.
 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* No entitlement path here: protected names simply cannot be removed. */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Need the parent vnode so its directory lease can be broken below. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;	/* holds an iocount; released via vnode_put() below */
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	/* Notify fseventsd watchers of the removed attribute. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
13626 
/*
 * Remove an extended attribute.
 * XXX Code duplication here.
 *
 * fd-based variant of removexattr(); path-lookup option bits are
 * rejected since there is no lookup.
 */
int
fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	size_t namelen;
	int error;
#if CONFIG_FSE
	vfs_context_t ctx = vfs_context_current();
#endif

	/* Follow/no-follow options make no sense on an already-open fd. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
	    XATTR_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* No entitlement path here: protected names simply cannot be removed. */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	/* file_vnode() reference is paired with file_drop() on all paths. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	/* Take an iocount; released via vnode_put() below. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
#if CONFIG_FSE
	/* Notify fseventsd watchers of the removed attribute. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
13679 
13680 /*
13681  * Retrieve the list of extended attribute names.
13682  * XXX Code duplication here.
13683  */
13684 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)13685 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
13686 {
13687 	vnode_t vp;
13688 	struct nameidata nd;
13689 	vfs_context_t ctx = vfs_context_current();
13690 	uio_t auio = NULL;
13691 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13692 	size_t attrsize = 0;
13693 	u_int32_t nameiflags;
13694 	int error;
13695 	UIO_STACKBUF(uio_buf, 1);
13696 
13697 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13698 		return EINVAL;
13699 	}
13700 
13701 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13702 	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
13703 	if (uap->options & XATTR_NOFOLLOW_ANY) {
13704 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
13705 	}
13706 
13707 	if ((error = namei(&nd))) {
13708 		return error;
13709 	}
13710 	vp = nd.ni_vp;
13711 	nameidone(&nd);
13712 	if (uap->namebuf != 0 && uap->bufsize > 0) {
13713 		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
13714 		    &uio_buf[0], sizeof(uio_buf));
13715 		uio_addiov(auio, uap->namebuf, uap->bufsize);
13716 	}
13717 
13718 	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
13719 
13720 	vnode_put(vp);
13721 	if (auio) {
13722 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13723 	} else {
13724 		*retval = (user_ssize_t)attrsize;
13725 	}
13726 	return error;
13727 }
13728 
13729 /*
13730  * Retrieve the list of extended attribute names.
13731  * XXX Code duplication here.
13732  */
13733 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)13734 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13735 {
13736 	vnode_t vp;
13737 	uio_t auio = NULL;
13738 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13739 	size_t attrsize = 0;
13740 	int error;
13741 	UIO_STACKBUF(uio_buf, 1);
13742 
13743 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13744 	    XATTR_NOFOLLOW_ANY)) {
13745 		return EINVAL;
13746 	}
13747 
13748 	if ((error = file_vnode(uap->fd, &vp))) {
13749 		return error;
13750 	}
13751 	if ((error = vnode_getwithref(vp))) {
13752 		file_drop(uap->fd);
13753 		return error;
13754 	}
13755 	if (uap->namebuf != 0 && uap->bufsize > 0) {
13756 		auio = uio_createwithbuffer(1, 0, spacetype,
13757 		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
13758 		uio_addiov(auio, uap->namebuf, uap->bufsize);
13759 	}
13760 
13761 	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13762 
13763 	vnode_put(vp);
13764 	file_drop(uap->fd);
13765 	if (auio) {
13766 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13767 	} else {
13768 		*retval = (user_ssize_t)attrsize;
13769 	}
13770 	return error;
13771 }
13772 
13773 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13774 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13775     vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13776 {
13777 	int error;
13778 	struct mount *mp = NULL;
13779 	vnode_t vp;
13780 	int length;
13781 	int bpflags;
13782 	/* maximum number of times to retry build_path */
13783 	unsigned int retries = 0x10;
13784 
13785 	if (bufsize > FSGETPATH_MAXBUFLEN) {
13786 		return EINVAL;
13787 	}
13788 
13789 	if (buf == NULL) {
13790 		return ENOMEM;
13791 	}
13792 
13793 retry:
13794 	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13795 		error = ENOTSUP;  /* unexpected failure */
13796 		return ENOTSUP;
13797 	}
13798 
13799 #if CONFIG_UNION_MOUNTS
13800 unionget:
13801 #endif /* CONFIG_UNION_MOUNTS */
13802 	if (objid == 2) {
13803 		struct vfs_attr vfsattr;
13804 		int use_vfs_root = TRUE;
13805 
13806 		VFSATTR_INIT(&vfsattr);
13807 		VFSATTR_WANTED(&vfsattr, f_capabilities);
13808 		if (!(options & FSOPT_ISREALFSID) &&
13809 		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13810 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13811 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13812 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13813 				use_vfs_root = FALSE;
13814 			}
13815 		}
13816 
13817 		if (use_vfs_root) {
13818 			error = VFS_ROOT(mp, &vp, ctx);
13819 		} else {
13820 			error = VFS_VGET(mp, objid, &vp, ctx);
13821 		}
13822 	} else {
13823 		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13824 	}
13825 
13826 #if CONFIG_UNION_MOUNTS
13827 	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13828 		/*
13829 		 * If the fileid isn't found and we're in a union
13830 		 * mount volume, then see if the fileid is in the
13831 		 * mounted-on volume.
13832 		 */
13833 		struct mount *tmp = mp;
13834 		mp = vnode_mount(tmp->mnt_vnodecovered);
13835 		vfs_unbusy(tmp);
13836 		if (vfs_busy(mp, LK_NOWAIT) == 0) {
13837 			goto unionget;
13838 		}
13839 	} else {
13840 		vfs_unbusy(mp);
13841 	}
13842 #else
13843 	vfs_unbusy(mp);
13844 #endif /* CONFIG_UNION_MOUNTS */
13845 
13846 	if (error) {
13847 		return error;
13848 	}
13849 
13850 #if CONFIG_MACF
13851 	error = mac_vnode_check_fsgetpath(ctx, vp);
13852 	if (error) {
13853 		vnode_put(vp);
13854 		return error;
13855 	}
13856 #endif
13857 
13858 	/* Obtain the absolute path to this vnode. */
13859 	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13860 	if (options & FSOPT_NOFIRMLINKPATH) {
13861 		bpflags |= BUILDPATH_NO_FIRMLINK;
13862 	}
13863 	bpflags |= BUILDPATH_CHECK_MOVED;
13864 	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13865 	vnode_put(vp);
13866 
13867 	if (error) {
13868 		/* there was a race building the path, try a few more times */
13869 		if (error == EAGAIN) {
13870 			--retries;
13871 			if (retries > 0) {
13872 				goto retry;
13873 			}
13874 
13875 			error = ENOENT;
13876 		}
13877 		goto out;
13878 	}
13879 
13880 	AUDIT_ARG(text, buf);
13881 
13882 	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13883 		unsigned long path_words[NUMPARMS];
13884 		size_t path_len = sizeof(path_words);
13885 
13886 		if ((size_t)length < path_len) {
13887 			memcpy((char *)path_words, buf, length);
13888 			memset((char *)path_words + length, 0, path_len - length);
13889 
13890 			path_len = length;
13891 		} else {
13892 			memcpy((char *)path_words, buf + (length - path_len), path_len);
13893 		}
13894 
13895 		kdebug_vfs_lookup(path_words, (int)path_len, vp,
13896 		    KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13897 	}
13898 
13899 	*pathlen = length; /* may be superseded by error */
13900 
13901 out:
13902 	return error;
13903 }
13904 
13905 /*
13906  * Obtain the full pathname of a file system object by id.
13907  */
13908 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)13909 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
13910     uint32_t options, user_ssize_t *retval)
13911 {
13912 	vfs_context_t ctx = vfs_context_current();
13913 	fsid_t fsid;
13914 	char *realpath;
13915 	int length;
13916 	int error;
13917 
13918 	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
13919 		return EINVAL;
13920 	}
13921 
13922 	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
13923 		return error;
13924 	}
13925 	AUDIT_ARG(value32, fsid.val[0]);
13926 	AUDIT_ARG(value64, objid);
13927 	/* Restrict output buffer size for now. */
13928 
13929 	if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
13930 		return EINVAL;
13931 	}
13932 	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
13933 	if (realpath == NULL) {
13934 		return ENOMEM;
13935 	}
13936 
13937 	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
13938 	    options, &length);
13939 
13940 	if (error) {
13941 		goto out;
13942 	}
13943 
13944 	error = copyout((caddr_t)realpath, buf, length);
13945 
13946 	*retval = (user_ssize_t)length; /* may be superseded by error */
13947 out:
13948 	kfree_data(realpath, bufsize);
13949 	return error;
13950 }
13951 
13952 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)13953 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
13954 {
13955 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13956 	           0, retval);
13957 }
13958 
13959 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)13960 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
13961 {
13962 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13963 	           uap->options, retval);
13964 }
13965 
/*
 * Common routine to handle various flavors of statfs data heading out
 *	to user space.
 *
 * Fills a user32/user64 statfs image from the mount's vfsstatfs and
 * copies it out to bufp.  With partial_copy set, the trailing reserved
 * fields are omitted from the copyout (the legacy pre-f_reserved ABI).
 * If sizep is non-NULL it receives the full (non-partial) struct size.
 *
 * Returns:	0			Success
 *		EFAULT
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int             error;
	int             my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Clip the trailing reserved fields from the copyout. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case.   We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int             shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large.  At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				/* Stop early if the inflated blocksize itself would overflow. */
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Clip the trailing reserved fields from the copyout. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	if (sizep != NULL) {
		/* Always report the full struct size, even for a partial copy. */
		*sizep = my_size;
	}
	return error;
}
14086 
/*
 * copy stat structure into user_stat structure.
 *
 * Field-by-field copy of the kernel's struct stat into the 64-bit user
 * layout.  The destination is zeroed first so any padding and unused
 * fields read as 0.  The timestamp field names differ depending on
 * whether _POSIX_C_SOURCE is defined at build time.
 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14126 
/*
 * Copy the kernel's struct stat into the 32-bit user layout.
 *
 * Like munge_user64_stat() but with explicit narrowing casts on the
 * timestamp fields, which shrink to user32_time_t/user32_long_t.  The
 * destination is zeroed first so padding reads as 0.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14163 
/*
 * copy stat64 structure into user_stat64 structure.
 *
 * The destination is zeroed first so structure padding never leaks
 * kernel memory to user space.  No narrowing casts are applied here
 * (unlike the user32 variants); timestamps, birthtime included, and
 * all other fields copy across as-is.
 */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14207 
/*
 * Copy the in-kernel stat64 structure into the 32-bit-process visible
 * user32_stat64 layout.
 *
 * The destination is zeroed first so structure padding never leaks
 * kernel memory to user space.  In the default (!_POSIX_C_SOURCE)
 * build the timestamps (birthtime included) are explicitly narrowed
 * to the 32-bit user32_time_t / user32_long_t fields.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Explicit narrowing to the 32-bit user time fields. */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14248 
14249 /*
14250  * Purge buffer cache for simulating cold starts
14251  */
14252 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)14253 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
14254 {
14255 	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
14256 
14257 	return VNODE_RETURNED;
14258 }
14259 
14260 static int
vfs_purge_callback(mount_t mp,__unused void * arg)14261 vfs_purge_callback(mount_t mp, __unused void * arg)
14262 {
14263 	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
14264 
14265 	return VFS_RETURNED;
14266 }
14267 
/* When true (the default), vfs_purge() also purges file-backed VM pagers. */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
14270 
14271 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)14272 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
14273 {
14274 	if (!kauth_cred_issuser(kauth_cred_get())) {
14275 		return EPERM;
14276 	}
14277 
14278 	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
14279 
14280 	/* also flush any VM pagers backed by files */
14281 	if (vfs_purge_vm_pagers) {
14282 		vm_purge_filebacked_pagers();
14283 	}
14284 
14285 	return 0;
14286 }
14287 
14288 /*
14289  * gets the vnode associated with the (unnamed) snapshot directory
14290  * for a Filesystem. The snapshot directory vnode is returned with
14291  * an iocount on it.
14292  */
14293 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)14294 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
14295 {
14296 	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
14297 }
14298 
/*
 * Get the snapshot vnode.
 *
 * If successful, the call returns with an iocount on *rvpp, *sdvpp and
 * needs nameidone() on ndp.
 *
 * If the snapshot vnode exists it is returned in ndp->ni_vp.
 *
 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
 * not needed.
 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Clear outputs so the error path can tell what needs releasing. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* Snapshot operations are only accepted on a filesystem root. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Get the (unnamed) snapshot directory vnode, with an iocount. */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; if one occurs before the end, reject the name. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC policy check matching the operation being performed. */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On error, drop both iocounts; the caller then skips nameidone(). */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
14412 
/*
 * create a filesystem snapshot (for supporting filesystems)
 *
 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
 * We get to the (unnamed) snapshot directory vnode and create the vnode
 * for the snapshot in it.
 *
 * Restrictions:
 *
 *    a) Passed in name for snapshot cannot have slashes.
 *    b) name can't be "." or ".."
 *
 * Since this requires superuser privileges, vnode_authorize calls are not
 * made.
 */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* struct nameidata is too large to live on the kernel stack. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* On success: iocounts held on rvp and snapdvp, namei state in ndp. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* The lookup found an existing snapshot with this name. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		/* Snapshots are created as mode-0 regular files. */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* No authorization/inheritance: see the header comment. */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14474 
/*
 * Delete a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * delete the snapshot.
 */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* struct nameidata is too large to live on the kernel stack. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* On success: iocounts held on rvp, snapdvp and ndp->ni_vp. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Remove the snapshot; namespace events are deliberately skipped. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14509 
14510 /*
14511  * Revert a filesystem to a snapshot
14512  *
14513  * Marks the filesystem to revert to the given snapshot on next mount.
14514  */
14515 static int __attribute__((noinline))
snapshot_revert(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14516 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
14517     vfs_context_t ctx)
14518 {
14519 	int error;
14520 	vnode_t rvp;
14521 	mount_t mp;
14522 	struct fs_snapshot_revert_args revert_data;
14523 	struct componentname cnp;
14524 	caddr_t name_buf;
14525 	size_t name_len;
14526 
14527 	error = vnode_getfromfd(ctx, dirfd, &rvp);
14528 	if (error) {
14529 		return error;
14530 	}
14531 	mp = vnode_mount(rvp);
14532 
14533 	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14534 	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14535 	if (error) {
14536 		zfree(ZV_NAMEI, name_buf);
14537 		vnode_put(rvp);
14538 		return error;
14539 	}
14540 
14541 #if CONFIG_MACF
14542 	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
14543 	if (error) {
14544 		zfree(ZV_NAMEI, name_buf);
14545 		vnode_put(rvp);
14546 		return error;
14547 	}
14548 #endif
14549 
14550 	/*
14551 	 * Grab mount_iterref so that we can release the vnode,
14552 	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
14553 	 */
14554 	error = mount_iterref(mp, 0);
14555 	vnode_put(rvp);
14556 	if (error) {
14557 		zfree(ZV_NAMEI, name_buf);
14558 		return error;
14559 	}
14560 
14561 	memset(&cnp, 0, sizeof(cnp));
14562 	cnp.cn_pnbuf = (char *)name_buf;
14563 	cnp.cn_nameiop = LOOKUP;
14564 	cnp.cn_flags = ISLASTCN | HASBUF;
14565 	cnp.cn_pnlen = MAXPATHLEN;
14566 	cnp.cn_nameptr = cnp.cn_pnbuf;
14567 	cnp.cn_namelen = (int)name_len;
14568 	revert_data.sr_cnp = &cnp;
14569 
14570 	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
14571 	mount_iterdrop(mp);
14572 	zfree(ZV_NAMEI, name_buf);
14573 
14574 	if (error) {
14575 		/* If there was any error, try again using VNOP_IOCTL */
14576 
14577 		vnode_t snapdvp;
14578 		struct nameidata namend;
14579 
14580 		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
14581 		    OP_LOOKUP, ctx);
14582 		if (error) {
14583 			return error;
14584 		}
14585 
14586 
14587 		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
14588 		    0, ctx);
14589 
14590 		vnode_put(namend.ni_vp);
14591 		nameidone(&namend);
14592 		vnode_put(snapdvp);
14593 		vnode_put(rvp);
14594 	}
14595 
14596 	return error;
14597 }
14598 
/*
 * rename a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * rename the snapshot. This is a very specialised (and simple) case of
 * rename(2) (which has to deal with a lot more complications). It differs
 * slightly from rename(2) in that EEXIST is returned if the new name exists.
 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/* Look up the source snapshot; returns iocounts on rvp and snapdvp. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp  = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; if one occurs before the end, reject the name. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Renaming to a new name effectively creates that snapshot name. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Look up the destination name in the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14701 
/*
 * Mount a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * mount the snapshot.
 */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error, mount_flags = 0;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/* Look up the snapshot; returns iocounts on rvp, snapdvp, ni_vp. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp  = snapndp->ni_vp;
	/* Bail out if the source filesystem is gone (dead mount). */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Convert snapshot_mount flags to mount flags */
	if (flags & SNAPSHOT_MNT_DONTBROWSE) {
		mount_flags |= MNT_DONTBROWSE;
	}
	if (flags & SNAPSHOT_MNT_IGNORE_OWNERSHIP) {
		mount_flags |= MNT_IGNORE_OWNERSHIP;
	}
	if (flags & SNAPSHOT_MNT_NOFOLLOW) {
		mount_flags |= MNT_NOFOLLOW;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	if (mount_flags & MNT_NOFOLLOW) {
		/* Refuse to traverse symlinks anywhere in the mount path. */
		dirndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* Disallow covering the root of the root filesystem. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/* Hand the snapshot identity to mount_common() as KERNEL_MOUNT_SNAPSHOT data. */
	smnt_data.sm_mp  = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), mount_flags,
	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);

out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
	return error;
}
14798 
/*
 * Root from a snapshot of the filesystem
 *
 * Marks the filesystem to root from the given snapshot on next boot.
 */
static int __attribute__((noinline))
snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy in the snapshot name (name_len includes the terminating NUL). */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Hand the snapshot name to the filesystem as a componentname. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	root_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	return error;
}
14859 
14860 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14861 vfs_context_can_snapshot(vfs_context_t ctx)
14862 {
14863 	static const char * const snapshot_entitlements[] = {
14864 		"com.apple.private.vfs.snapshot",
14865 		"com.apple.developer.vfs.snapshot",
14866 		"com.apple.private.apfs.arv.limited.snapshot",
14867 	};
14868 	static const size_t nentitlements =
14869 	    sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14870 	size_t i;
14871 
14872 	task_t task = vfs_context_task(ctx);
14873 	for (i = 0; i < nentitlements; i++) {
14874 		if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14875 			return TRUE;
14876 		}
14877 	}
14878 	return FALSE;
14879 }
14880 
/*
 * FS snapshot operations dispatcher
 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* The calling task must hold one of the snapshot entitlements. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which aren't block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Permitted when the caller is the superuser, may write the
		 * underlying device, or holds the user snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14972