xref: /xnu-12377.1.9/bsd/vfs/vfs_syscalls.c (revision f6217f891ac0bb64f3d375211650a4c1ff8ca1ea)
1 /*
2  * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1989, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  * (c) UNIX System Laboratories, Inc.
32  * All or some portions of this file are derived from material licensed
33  * to the University of California by American Telephone and Telegraph
34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35  * the permission of UNIX System Laboratories, Inc.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.41 (Berkeley) 6/15/95
66  */
67 /*
68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69  * support for mandatory and extensible security protections.  This notice
70  * is included in support of clause 2.2 (b) of the Apple Public License,
71  * Version 2.0.
72  */
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/syslimits.h> /* For MAXLONGPATHLEN */
77 #include <sys/namei.h>
78 #include <sys/filedesc.h>
79 #include <sys/kernel.h>
80 #include <sys/file_internal.h>
81 #include <sys/stat.h>
82 #include <sys/vnode_internal.h>
83 #include <sys/mount_internal.h>
84 #include <sys/proc_internal.h>
85 #include <sys/kauth.h>
86 #include <sys/uio_internal.h>
87 #include <kern/kalloc.h>
88 #include <sys/mman.h>
89 #include <sys/dirent.h>
90 #include <sys/attr.h>
91 #include <sys/sysctl.h>
92 #include <sys/ubc.h>
93 #include <sys/quota.h>
94 #include <sys/kdebug.h>
95 #include <sys/fsevents.h>
96 #include <sys/imgsrc.h>
97 #include <sys/sysproto.h>
98 #include <sys/sysctl.h>
99 #include <sys/xattr.h>
100 #include <sys/fcntl.h>
101 #include <sys/stdio.h>
102 #include <sys/fsctl.h>
103 #include <sys/ubc_internal.h>
104 #include <sys/disk.h>
105 #include <sys/content_protection.h>
106 #include <sys/clonefile.h>
107 #include <sys/snapshot.h>
108 #include <sys/priv.h>
109 #include <sys/fsgetpath.h>
110 #include <machine/cons.h>
111 #include <machine/limits.h>
112 #include <miscfs/specfs/specdev.h>
113 
114 #include <vfs/vfs_disk_conditioner.h>
115 #if CONFIG_EXCLAVES
116 #include <vfs/vfs_exclave_fs.h>
117 #endif
118 
119 #include <security/audit/audit.h>
120 #include <bsm/audit_kevents.h>
121 
122 #include <mach/mach_types.h>
123 #include <kern/kern_types.h>
124 #include <kern/kalloc.h>
125 #include <kern/task.h>
126 
127 #include <vm/vm_pageout.h>
128 #include <vm/vm_protos.h>
129 #include <vm/memory_object_xnu.h>
130 
131 #include <libkern/OSAtomic.h>
132 #include <os/atomic_private.h>
133 #include <pexpert/pexpert.h>
134 #include <IOKit/IOBSD.h>
135 
136 // deps for MIG call
137 #include <kern/host.h>
138 #include <kern/ipc_misc.h>
139 #include <mach/host_priv.h>
140 #include <mach/vfs_nspace.h>
141 #include <os/log.h>
142 
143 #include <nfs/nfs_conf.h>
144 
145 #if ROUTEFS
146 #include <miscfs/routefs/routefs.h>
147 #endif /* ROUTEFS */
148 
149 #if CONFIG_MACF
150 #include <security/mac.h>
151 #include <security/mac_framework.h>
152 #endif
153 
154 #if CONFIG_FSE
155 #define GET_PATH(x) \
156 	((x) = get_pathbuff())
157 #define RELEASE_PATH(x) \
158 	release_pathbuff(x)
159 #else
160 #define GET_PATH(x)     \
161 	((x) = zalloc(ZV_NAMEI))
162 #define RELEASE_PATH(x) \
163 	zfree(ZV_NAMEI, x)
164 #endif /* CONFIG_FSE */
165 
166 #ifndef HFS_GET_BOOT_INFO
167 #define HFS_GET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00004)
168 #endif
169 
170 #ifndef HFS_SET_BOOT_INFO
171 #define HFS_SET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00005)
172 #endif
173 
174 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
175 #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
176 #endif
177 
178 extern void disk_conditioner_unmount(mount_t mp);
179 
180 /* struct for checkdirs iteration */
181 struct cdirargs {
182 	vnode_t olddp;
183 	vnode_t newdp;
184 };
185 /* callback  for checkdirs iteration */
186 static int checkdirs_callback(proc_t p, void * arg);
187 
188 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
189 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
190 void enablequotas(struct mount *mp, vfs_context_t ctx);
191 static int getfsstat_callback(mount_t mp, void * arg);
192 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
193 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
194 static int sync_callback(mount_t, void *);
195 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
196     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
197     boolean_t partial_copy);
198 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
199 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
200     struct componentname *cnp, user_addr_t fsmountargs,
201     int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
202 void vfs_notify_mount(vnode_t pdvp);
203 
204 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
205 
206 struct fd_vn_data * fg_vn_data_alloc(void);
207 
208 /*
209  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
210  * Concurrent lookups (or lookups by ids) on hard links can cause the
211  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
212  * does) to return ENOENT as the path cannot be returned from the name cache
213  * alone. We have no option but to retry and hope to get one namei->reverse path
214  * generation done without an intervening lookup, lookup by id on the hard link
215  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
216  * which currently are the MAC hooks for rename, unlink and rmdir.
217  */
218 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
219 
220 /* Max retry limit for rename due to vnode recycling. */
221 #define MAX_RENAME_ERECYCLE_RETRIES 1024
222 
223 #define MAX_LINK_ENOENT_RETRIES 1024
224 
225 /* Max retries for concurrent mounts on the same covered vnode. */
226 #define MAX_MOUNT_RETRIES       10
227 
228 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
229     int unlink_flags);
230 
231 #ifdef CONFIG_IMGSRC_ACCESS
232 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
233 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
234 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
235 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
236 static void mount_end_update(mount_t mp);
237 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
238 #endif /* CONFIG_IMGSRC_ACCESS */
239 
240 //snapshot functions
241 #if CONFIG_MNT_ROOTSNAP
242 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
243 #else
244 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
245 #endif
246 
247 __private_extern__
248 int sync_internal(void);
249 
250 __private_extern__
251 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
252 
253 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
254 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
255 
256 /* vars for sync mutex */
257 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
258 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
259 
260 extern lck_rw_t rootvnode_rw_lock;
261 
262 VFS_SMR_DECLARE;
263 extern uint32_t nc_smr_enabled;
264 
265 /*
266  * incremented each time a mount or unmount operation occurs
267  * used to invalidate the cached value of the rootvp in the
268  * mount structure utilized by cache_lookup_path
269  */
270 uint32_t mount_generation = 0;
271 
272 /* counts number of mount and unmount operations */
273 unsigned int vfs_nummntops = 0;
274 
275 /* system-wide, per-boot unique mount ID */
276 static _Atomic uint64_t mount_unique_id = 1;
277 
278 extern const struct fileops vnops;
279 #if CONFIG_APPLEDOUBLE
280 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
281 #endif /* CONFIG_APPLEDOUBLE */
282 
283 
284 /*
285  * Virtual File System System Calls
286  */
287 
288 /*
289  * Private in-kernel mounting spi (specific use-cases only)
290  */
291 boolean_t
vfs_iskernelmount(mount_t mp)292 vfs_iskernelmount(mount_t mp)
293 {
294 	return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
295 }
296 
/*
 * In-kernel mount entry point (not reachable directly from user space).
 *
 * Mounts a filesystem of type `fstype` at `path`.  Callers may either
 * supply the covered vnode `vp` and its parent `pvp` directly (both with
 * iocounts held by the caller), or pass NULLVP for `vp`, in which case
 * `path` is resolved here via namei() and the resulting iocounts are
 * released before returning.
 *
 * `kern_flags` is sanitized against KERNEL_MOUNT_SANITIZE_MASK and always
 * has KERNEL_MOUNT_KMOUNT set before mount_common() runs, so lower layers
 * can tell this is a kernel-initiated mount.
 *
 * Returns 0 on success or an errno value.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
	/* Honor MNT_NOFOLLOW: refuse to traverse any symlink in the path. */
	if (syscall_flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	/* Drop kernel-mount flag bits callers are not permitted to set. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the vnodes; still populate the component
		 * name buffer from `path` for mount_common()'s benefit.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Only release iocounts we took ourselves (via namei above). */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
349 
350 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)351 vfs_mount_at_path(const char *fstype, const char *path,
352     vnode_t pvp, vnode_t vp, void *data, size_t datalen,
353     int mnt_flags, int flags)
354 {
355 	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
356 	int error, km_flags = 0;
357 	vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
358 
359 	/*
360 	 * This call is currently restricted to specific use cases.
361 	 */
362 	if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
363 		return ENOTSUP;
364 	}
365 
366 #if !defined(XNU_TARGET_OS_OSX)
367 	if (strcmp(fstype, "lifs") == 0) {
368 		syscall_flags |= MNT_NOEXEC;
369 	}
370 #endif
371 
372 	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
373 		km_flags |= KERNEL_MOUNT_NOAUTH;
374 	}
375 	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
376 		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
377 	}
378 
379 	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
380 	    syscall_flags, km_flags, ctx);
381 	if (error) {
382 		printf("%s: mount on %s failed, error %d\n", __func__, path,
383 		    error);
384 	}
385 
386 	return error;
387 }
388 
389 /*
390  * Mount a file system.
391  */
392 /* ARGSUSED */
393 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)394 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
395 {
396 	struct __mac_mount_args muap;
397 
398 	muap.type = uap->type;
399 	muap.path = uap->path;
400 	muap.flags = uap->flags;
401 	muap.data = uap->data;
402 	muap.mac_p = USER_ADDR_NULL;
403 	return __mac_mount(p, &muap, retval);
404 }
405 
406 int
fmount(__unused proc_t p,struct fmount_args * uap,__unused int32_t * retval)407 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
408 {
409 	struct componentname    cn;
410 	vfs_context_t           ctx = vfs_context_current();
411 	size_t                  dummy = 0;
412 	int                     error;
413 	int                     flags = uap->flags;
414 	char                    fstypename[MFSNAMELEN];
415 	char                    *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
416 	vnode_t                 pvp;
417 	vnode_t                 vp;
418 
419 	AUDIT_ARG(fd, uap->fd);
420 	AUDIT_ARG(fflags, flags);
421 	/* fstypename will get audited by mount_common */
422 
423 	/* Sanity check the flags */
424 	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
425 		return ENOTSUP;
426 	}
427 
428 	if (flags & MNT_UNION) {
429 		return EPERM;
430 	}
431 
432 	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
433 	if (error) {
434 		return error;
435 	}
436 
437 	if ((error = file_vnode(uap->fd, &vp)) != 0) {
438 		return error;
439 	}
440 
441 	if ((error = vnode_getwithref(vp)) != 0) {
442 		file_drop(uap->fd);
443 		return error;
444 	}
445 
446 	pvp = vnode_getparent(vp);
447 	if (pvp == NULL) {
448 		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
449 			error = EBUSY;
450 		} else {
451 			error = EINVAL;
452 		}
453 		vnode_put(vp);
454 		file_drop(uap->fd);
455 		return error;
456 	}
457 
458 	memset(&cn, 0, sizeof(struct componentname));
459 	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
460 	cn.cn_pnlen = MAXPATHLEN;
461 
462 	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
463 		zfree(ZV_NAMEI, cn.cn_pnbuf);
464 		vnode_put(pvp);
465 		vnode_put(vp);
466 		file_drop(uap->fd);
467 		return error;
468 	}
469 
470 	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);
471 
472 	zfree(ZV_NAMEI, cn.cn_pnbuf);
473 	vnode_put(pvp);
474 	vnode_put(vp);
475 	file_drop(uap->fd);
476 
477 	return error;
478 }
479 
480 #define MAX_GRAFT_METADATA_SIZE             16384 /* bytes */
481 
482 /*
483  * Get the size of a graft file (a manifest or payload file).
484  * The vp should be an iocounted vnode.
485  */
486 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)487 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
488 {
489 	struct stat64 sb = {};
490 	int error;
491 
492 	*size = 0;
493 
494 	error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
495 	if (error) {
496 		return error;
497 	}
498 
499 	if (sb.st_size == 0) {
500 		error = ENODATA;
501 	} else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
502 		error = EFBIG;
503 	} else {
504 		*size = (size_t) sb.st_size;
505 	}
506 
507 	return error;
508 }
509 
510 /*
511  * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
512  * `size` must already be validated.
513  */
514 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)515 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
516 {
517 	return vn_rdwr(UIO_READ, graft_vp,
518 	           (caddr_t) buf, (int) size, /* offset */ 0,
519 	           UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
520 	           vfs_context_ucred(vctx), /* resid */ NULL,
521 	           vfs_context_proc(vctx));
522 }
523 
/*
 * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
 * and read it into `buf`.
 * If `path_prefix` is non-NULL, verify that the file path has that prefix.
 * `buf` must be at least MAX_GRAFT_METADATA_SIZE bytes; the size check in
 * get_and_verify_graft_metadata_vp_size() keeps the read within that bound.
 */
static int
graft_secureboot_read_fd(int fd, vfs_context_t vctx, const char *path_prefix, size_t *size, void *buf)
{
	vnode_t metadata_vp = NULLVP;
	char *path = NULL;
	int error;

	// Convert this graft fd to a vnode.
	if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
		goto out;
	}

	// Verify that the vnode path starts with `path_prefix` if it was passed.
	if (path_prefix) {
		int len = MAXPATHLEN;
		path = zalloc(ZV_NAMEI);
		if ((error = vn_getpath(metadata_vp, path, &len))) {
			goto out;
		}
		// A prefix mismatch means the file lives outside the required
		// directory (e.g. the Mobile Asset data vault) -- reject it.
		if (strncmp(path, path_prefix, strlen(path_prefix))) {
			error = EINVAL;
			goto out;
		}
	}

	// Get (and validate) size information.
	if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
		goto out;
	}

	// Read each file into the provided buffer - we must get the expected amount of bytes.
	if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
		goto out;
	}

out:
	// Release the path buffer and the iocount taken by vnode_getfromfd().
	if (path) {
		zfree(ZV_NAMEI, path);
	}
	if (metadata_vp) {
		vnode_put(metadata_vp);
		metadata_vp = NULLVP;
	}

	return error;
}
575 
576 #if XNU_TARGET_OS_OSX
577 #if defined(__arm64e__)
578 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/manifests/"
579 #else /* x86_64 */
580 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/"
581 #endif /* x86_64 */
582 #else /* !XNU_TARGET_OS_OSX */
583 #define MOBILE_ASSET_DATA_VAULT_PATH "/private/var/MobileAsset/AssetsV2/manifests/"
584 #endif /* !XNU_TARGET_OS_OSX */
585 
586 /*
587  * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
588  * provided in `gfs`, saving the size of data read in `gfs`.
589  */
590 static int
graft_secureboot_read_metadata(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)591 graft_secureboot_read_metadata(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
592     vfs_context_t vctx, fsioc_graft_fs_t *gfs)
593 {
594 	const char *manifest_path_prefix = NULL;
595 	int error;
596 
597 	// For Mobile Asset, make sure that the manifest comes from a data vault.
598 	if (graft_type == GRAFTDMG_CRYPTEX_MOBILE_ASSET) {
599 		manifest_path_prefix = MOBILE_ASSET_DATA_VAULT_PATH;
600 	}
601 
602 	// Read the authentic manifest.
603 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
604 	    manifest_path_prefix, &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
605 		return error;
606 	}
607 
608 	// The user manifest is currently unused, but set its size.
609 	gfs->user_manifest_size = 0;
610 
611 	// Read the payload.
612 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
613 	    NULL, &gfs->payload_size, gfs->payload))) {
614 		return error;
615 	}
616 
617 	return 0;
618 }
619 
/*
 * Call into the filesystem to verify and graft a cryptex.
 *
 * `cryptex_vp` is the (iocounted) cryptex disk image vnode; `mounton_vp`,
 * if non-NULL, is the (iocounted) directory the graft is placed upon.
 * The manifest and payload fds in `sbc_args` are staged into bounded
 * kernel buffers and handed to the filesystem via FSIOC_GRAFT_FS, which
 * performs the actual validation and graft.  Returns 0 or an errno.
 */
static int
graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
{
	fsioc_graft_fs_t gfs = {};
	uint64_t graft_dir_ino = 0;
	struct stat64 sb = {};
	int error;

	// Pre-flight arguments.
	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
		// Make sure that this graft version matches what we support.
		return ENOTSUP;
	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
		// For this type, cryptex VP must live on same volume as the target of graft.
		return EXDEV;
	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
		// We cannot graft upon non-directories.
		return ENOTDIR;
	} else if (cryptex_vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) {
		// We do not allow grafts inside disk images.
		return ENODEV;
	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
	    sbc_args->sbc_payload_fd < 0) {
		// We cannot graft without a manifest and payload.
		return EINVAL;
	}

	if (mounton_vp) {
		// Get the mounton's inode number.
		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
		if (error) {
			return error;
		}
		graft_dir_ino = (uint64_t) sb.st_ino;
	}

	// Create buffers (of our maximum-defined size) to store authentication info.
	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);

	if (!gfs.authentic_manifest || !gfs.payload) {
		error = ENOMEM;
		goto out;
	}

	// Read our fd's into our buffers.
	// (Note that this will set the buffer size fields in `gfs`.)
	error = graft_secureboot_read_metadata(graft_type, sbc_args, vctx, &gfs);
	if (error) {
		goto out;
	}

	// Translate the caller-visible SBC_* flags into FSCTL_GRAFT_* flags.
	gfs.graft_version = FSIOC_GRAFT_VERSION;
	gfs.graft_type = graft_type;
	gfs.graft_4cc = sbc_args->sbc_4cc;
	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
	}
	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
	}
	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
	}
	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
	}
	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
	}
	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
	}
	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)

	// Call into the FS to perform the graft (and validation).
	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);

out:
	// Release the staging buffers regardless of outcome.
	if (gfs.authentic_manifest) {
		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
		gfs.authentic_manifest = NULL;
	}
	if (gfs.payload) {
		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
		gfs.payload = NULL;
	}

	return error;
}
714 
715 #define GRAFTDMG_ENTITLEMENT  "com.apple.private.vfs.graftdmg"
716 
/*
 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
 *
 * When `mountdir` is NULL the graft target is the parent directory of the
 * cryptex file itself; otherwise `mountdir` is resolved via namei().
 * Requires the com.apple.private.vfs.graftdmg entitlement.
 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;
	// NULL mountdir means "graft onto the cryptex file's parent directory".
	bool graft_on_parent = (ua_mountdir == USER_ADDR_NULL);

	vnode_t cryptex_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();
#if CONFIG_MACF
	vnode_t parent_vp = NULLVP;
#endif

	// Grafting is restricted to entitled tasks.
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	// Copy graftargs in, if provided.
	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		return error;
	}

	// The cryptex must be a regular file, not a directory.
	if (vnode_isdir(cryptex_vp)) {
		error = EISDIR;
		goto graftout;
	}

#if CONFIG_MACF
	if (graft_on_parent) {
		// Grafting on Cryptex file parent directory, need to get its vp for MAC check.
		parent_vp = vnode_getparent(cryptex_vp);
		if (parent_vp == NULLVP) {
			error = ENOENT;
			goto graftout;
		}
	}
#endif

	// Resolve the explicit mount-on directory when one was supplied.
	if (!graft_on_parent) {
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(&nd);
		if (error) {
			goto graftout;
		}
	}

#if CONFIG_MACF
	// MAC check runs against whichever directory will receive the graft.
	vnode_t macf_vp = graft_on_parent ? parent_vp : nd.ni_vp;
	error = mac_graft_check_graft(ctx, macf_vp);
	if (error) {
		goto graftout;
	}
#endif

	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx,
		    cryptex_vp, graft_on_parent ? NULLVP : nd.ni_vp);
	}

#if CONFIG_MACF
	if (!error) {
		mac_graft_notify_graft(ctx, macf_vp);
	}
#endif

graftout:
	// Release every reference taken above; nd.ni_vp is only set when
	// namei() succeeded for an explicit mountdir.
#if CONFIG_MACF
	if (parent_vp != NULLVP) {
		vnode_put(parent_vp);
		parent_vp = NULLVP;
	}
#endif
	if (cryptex_vp != NULLVP) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (nd.ni_vp != NULLVP) {
		vnode_put(nd.ni_vp);
		nameidone(&nd);
	}

	return error;
}
823 
824 /*
825  * Ungraft a cryptex disk image (via mount dir FD)
826  * { int ungraftdmg(const char *mountdir, uint64_t flags); }
827  */
828 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)829 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
830 {
831 	int error = 0;
832 	user_addr_t ua_mountdir = uap->mountdir;
833 	fsioc_ungraft_fs_t ugfs = {};
834 	struct nameidata nd = {};
835 	vfs_context_t ctx = vfs_context_current();
836 
837 	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
838 		return EPERM;
839 	}
840 
841 	if (ua_mountdir == USER_ADDR_NULL) {
842 		return EINVAL;
843 	}
844 
845 	if (uap->flags & UNGRAFTDMG_NOFORCE) {
846 		ugfs.ungraft_flags |= FSCTL_UNGRAFT_NOFORCE;
847 	}
848 
849 	// Acquire vnode for mount-on path
850 	NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
851 	    UIO_USERSPACE, ua_mountdir, ctx);
852 
853 	error = namei(&nd);
854 	if (error) {
855 		return error;
856 	}
857 
858 	if (!vnode_isdir(nd.ni_vp)) {
859 		error = ENOTDIR;
860 		goto ungraftout;
861 	}
862 
863 #if CONFIG_MACF
864 	error = mac_graft_check_ungraft(ctx, nd.ni_vp);
865 	if (error) {
866 		goto ungraftout;
867 	}
868 #endif
869 
870 	// Call into the FS to perform the ungraft
871 	error = VNOP_IOCTL(nd.ni_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
872 
873 #if CONFIG_MACF
874 	if (!error) {
875 		mac_graft_notify_ungraft(ctx, nd.ni_vp);
876 	}
877 #endif
878 
879 ungraftout:
880 	vnode_put(nd.ni_vp);
881 	nameidone(&nd);
882 
883 	return error;
884 }
885 
886 
/*
 * Notify interested parties that a mount has occurred: broadcast a
 * VQ_MOUNT vfs event, then post a NOTE_WRITE knote on `pdvp`, the
 * parent directory of the covered vnode.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
893 
894 /*
895  * __mac_mount:
896  *	Mount a file system taking into account MAC label behavior.
897  *	See mount(2) man page for more information
898  *
899  * Parameters:    p                        Process requesting the mount
900  *                uap                      User argument descriptor (see below)
901  *                retval                   (ignored)
902  *
903  * Indirect:      uap->type                Filesystem type
904  *                uap->path                Path to mount
905  *                uap->data                Mount arguments
906  *                uap->mac_p               MAC info
907  *                uap->flags               Mount flags
908  *
909  *
910  * Returns:        0                       Success
911  *                !0                       Not success
912  */
913 boolean_t root_fs_upgrade_try = FALSE;
914 
915 #define MAX_NESTED_UNION_MOUNTS  10
916 
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULLVP;           /* parent of the covered vnode (from WANTPARENT) */
	vnode_t vp = NULLVP;            /* vnode to be covered by the new mount */
	int need_nameidone = 0;         /* nonzero while nameidone(&nd) is still owed */
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;          /* MAC label string copied in from user space */
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
	int num_retries = 0;            /* EBUSY retries, bounded by MAX_MOUNT_RETRIES */
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

retry:
	/*
	 * Get the vnode to be covered, plus its parent (WANTPARENT).  Both
	 * come back with iocounts that must be released on every exit path
	 * (see the 'out:' label).
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		/* Caller requested that no symlink anywhere in the path be followed. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/*
	 * Mounting image source cannot be batched with other operations.
	 *
	 * NOTE(review): this is an exact-equality test, so no other mount
	 * flags may accompany MNT_IMGSRC_BY_INDEX, and the final argument
	 * passed below is trivially TRUE inside this branch.  Also note the
	 * #ifdef here versus the #if used for the declarations above —
	 * confirm CONFIG_IMGSRC_ACCESS is only ever defined when enabled.
	 */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space.  The user_mac
	 * structure is munged from the 32- or 64-bit user layout as needed.
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Sanity-bound the label: at least one char plus NUL, capped above. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		/*
		 * NOTE(review): the kalloc_data() result is not NULL-checked
		 * before being passed to copyinstr — confirm a Z_WAITOK
		 * allocation of <= MAC_MAX_LABEL_BUF_LEN cannot fail here.
		 */
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

	if (flags & MNT_UNION) {
#if CONFIG_UNION_MOUNTS
		mount_t mp = vp->v_mount;
		int nested_union_mounts = 0;

		name_cache_lock_shared();

		/* Walk up the vnodecovered chain and check for nested union mounts. */
		mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);
		while (mp) {
			if (!(mp->mnt_flag & MNT_UNION)) {
				break;
			}
			mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);

			/*
			 * Limit the max nested union mounts to prevent stack exhaustion
			 * when calling lookup_traverse_union().
			 */
			if (++nested_union_mounts >= MAX_NESTED_UNION_MOUNTS) {
				error = ELOOP;
				break;
			}
		}

		name_cache_unlock();
		/* error is still 0 from namei() unless the loop above set ELOOP. */
		if (error) {
			goto out;
		}
#else
		/* Union mounts are not compiled into this kernel. */
		error = EPERM;
		goto out;
#endif /* CONFIG_UNION_MOUNTS */
	}

	/* Mounting over "/" is normally treated as an update of the root fs. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mounting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	/* Hand off to the common mount path (internal_flags == 0: user mount). */
	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/*
	 * NOTE(review): presumably kfree_data() also clears labelstr, so the
	 * EBUSY retry below starts clean rather than double-freeing — confirm
	 * the macro's semantics.
	 */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	/* Drop the iocounts taken by namei() before any retry or return. */
	if (vp) {
		vnode_put(vp);
		vp = NULLVP;
	}
	if (pvp) {
		vnode_put(pvp);
		pvp = NULLVP;
	}
	if (need_nameidone) {
		nameidone(&nd);
		need_nameidone = 0;
	}

	if (error == EBUSY) {
		/* Retry the lookup and mount again due to concurrent mounts. */
		if (++num_retries < MAX_MOUNT_RETRIES) {
			goto retry;
		}
	}

	return error;
}
1117 
1118 /*
1119  * common mount implementation (final stage of mounting)
1120  *
1121  * Arguments:
1122  *  fstypename	file system type (ie it's vfs name)
1123  *  pvp		parent of covered vnode
1124  *  vp		covered vnode
1125  *  cnp		component name (ie path) of covered vnode
1126  *  flags	generic mount flags
1127  *  fsmountargs	file system specific data
1128  *  labelstr	optional MAC label
1129  *  kernelmount	TRUE for mounts initiated from inside the kernel
1130  *  ctx		caller's context
1131  */
1132 static int
mount_common(const char * fstypename,vnode_t pvp,vnode_t vp,struct componentname * cnp,user_addr_t fsmountargs,int flags,uint32_t internal_flags,char * labelstr,vfs_context_t ctx)1133 mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
1134     struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
1135     char *labelstr, vfs_context_t ctx)
1136 {
1137 #if !CONFIG_MACF
1138 #pragma unused(labelstr)
1139 #endif
1140 	struct vnode *devvp = NULLVP;
1141 	struct vnode *device_vnode = NULLVP;
1142 #if CONFIG_MACF
1143 	struct vnode *rvp;
1144 #endif
1145 	struct mount *mp = NULL;
1146 	struct vfstable *vfsp = (struct vfstable *)0;
1147 	struct proc *p = vfs_context_proc(ctx);
1148 	int error, flag = 0;
1149 	bool flag_set = false;
1150 	user_addr_t devpath = USER_ADDR_NULL;
1151 	int ronly = 0;
1152 	int mntalloc = 0;
1153 	boolean_t vfsp_ref = FALSE;
1154 	boolean_t is_rwlock_locked = FALSE;
1155 	boolean_t did_rele = FALSE;
1156 	boolean_t have_usecount = FALSE;
1157 	boolean_t did_set_lmount = FALSE;
1158 	boolean_t did_set_vmount = FALSE;
1159 	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
1160 
1161 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
1162 	/* Check for mutually-exclusive flag bits */
1163 	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
1164 	int bitcount = 0;
1165 	while (checkflags != 0) {
1166 		checkflags &= (checkflags - 1);
1167 		bitcount++;
1168 	}
1169 
1170 	if (bitcount > 1) {
1171 		//not allowed to request multiple mount-by-role flags
1172 		error = EINVAL;
1173 		goto out1;
1174 	}
1175 #endif
1176 
1177 	/*
1178 	 * Process an update for an existing mount
1179 	 */
1180 	if (flags & MNT_UPDATE) {
1181 		if ((vp->v_flag & VROOT) == 0) {
1182 			error = EINVAL;
1183 			goto out1;
1184 		}
1185 		mp = vp->v_mount;
1186 
1187 		/* if unmount or mount in progress, return error */
1188 		mount_lock_spin(mp);
1189 		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
1190 			mount_unlock(mp);
1191 			error = EBUSY;
1192 			goto out1;
1193 		}
1194 		mp->mnt_lflag |= MNT_LMOUNT;
1195 		did_set_lmount = TRUE;
1196 		mount_unlock(mp);
1197 		lck_rw_lock_exclusive(&mp->mnt_rwlock);
1198 		is_rwlock_locked = TRUE;
1199 		/*
1200 		 * We only allow the filesystem to be reloaded if it
1201 		 * is currently mounted read-only.
1202 		 */
1203 		if ((flags & MNT_RELOAD) &&
1204 		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1205 			error = ENOTSUP;
1206 			goto out1;
1207 		}
1208 
1209 		/*
1210 		 * If content protection is enabled, update mounts are not
1211 		 * allowed to turn it off.
1212 		 */
1213 		if ((mp->mnt_flag & MNT_CPROTECT) &&
1214 		    ((flags & MNT_CPROTECT) == 0)) {
1215 			error = EINVAL;
1216 			goto out1;
1217 		}
1218 
1219 		/*
1220 		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
1221 		 * failure to return an error for this so we'll just silently
1222 		 * add it if it is not passed in.
1223 		 */
1224 		if ((mp->mnt_flag & MNT_REMOVABLE) &&
1225 		    ((flags & MNT_REMOVABLE) == 0)) {
1226 			flags |= MNT_REMOVABLE;
1227 		}
1228 
1229 		/* Can't downgrade the backer of the root FS */
1230 		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
1231 		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
1232 			error = ENOTSUP;
1233 			goto out1;
1234 		}
1235 
1236 		/*
1237 		 * Only root, or the user that did the original mount is
1238 		 * permitted to update it.
1239 		 */
1240 		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1241 		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
1242 			goto out1;
1243 		}
1244 #if CONFIG_MACF
1245 		error = mac_mount_check_remount(ctx, mp, flags);
1246 		if (error != 0) {
1247 			goto out1;
1248 		}
1249 #endif
1250 		/*
1251 		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
1252 		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
1253 		 */
1254 		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1255 			flags |= MNT_NOSUID | MNT_NODEV;
1256 			if (mp->mnt_flag & MNT_NOEXEC) {
1257 				flags |= MNT_NOEXEC;
1258 			}
1259 		}
1260 		flag = mp->mnt_flag;
1261 		flag_set = true;
1262 
1263 
1264 
1265 		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
1266 
1267 		vfsp = mp->mnt_vtable;
1268 		goto update;
1269 	} // MNT_UPDATE
1270 
1271 	/*
1272 	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
1273 	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
1274 	 */
1275 	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1276 		flags |= MNT_NOSUID | MNT_NODEV;
1277 		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
1278 			flags |= MNT_NOEXEC;
1279 		}
1280 	}
1281 
1282 	/* XXXAUDIT: Should we capture the type on the error path as well? */
1283 	/* XXX cast-away const (audit_arg_text() does not modify its input) */
1284 	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
1285 	mount_list_lock();
1286 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1287 		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
1288 			vfsp->vfc_refcount++;
1289 			vfsp_ref = TRUE;
1290 			break;
1291 		}
1292 	}
1293 	mount_list_unlock();
1294 	if (vfsp == NULL) {
1295 		error = ENODEV;
1296 		goto out1;
1297 	}
1298 
1299 	/*
1300 	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
1301 	 * except in ROSV configs and for the initial BaseSystem root.
1302 	 */
1303 	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
1304 	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
1305 	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
1306 		error = EINVAL;  /* unsupported request */
1307 		goto out1;
1308 	}
1309 
1310 	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
1311 	if (error != 0) {
1312 		goto out1;
1313 	}
1314 
1315 	/*
1316 	 * Upon successful return of prepare_coveredvp(), VMOUNT is set for the covered vp.
1317 	 */
1318 	did_set_vmount = TRUE;
1319 
1320 	/*
1321 	 * Allocate and initialize the filesystem (mount_t)
1322 	 */
1323 	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
1324 	mntalloc = 1;
1325 
1326 	/* Initialize the default IO constraints */
1327 	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
1328 	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
1329 	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
1330 	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
1331 	mp->mnt_devblocksize = DEV_BSIZE;
1332 	mp->mnt_alignmentmask = PAGE_MASK;
1333 	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
1334 	mp->mnt_ioscale = 1;
1335 	mp->mnt_ioflags = 0;
1336 	mp->mnt_realrootvp = NULLVP;
1337 	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
1338 
1339 	mp->mnt_lflag |= MNT_LMOUNT;
1340 	did_set_lmount = TRUE;
1341 
1342 	TAILQ_INIT(&mp->mnt_vnodelist);
1343 	TAILQ_INIT(&mp->mnt_workerqueue);
1344 	TAILQ_INIT(&mp->mnt_newvnodes);
1345 	mount_lock_init(mp);
1346 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
1347 	is_rwlock_locked = TRUE;
1348 	mp->mnt_op = vfsp->vfc_vfsops;
1349 	mp->mnt_vtable = vfsp;
1350 	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
1351 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
1352 	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
1353 	do {
1354 		size_t pathlen = MAXPATHLEN;
1355 
1356 		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
1357 			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1358 		}
1359 	} while (0);
1360 	mp->mnt_vnodecovered = vp;
1361 	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
1362 	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
1363 	mp->mnt_devbsdunit = 0;
1364 	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
1365 
1366 	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
1367 	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
1368 
1369 	if (kernelmount) {
1370 		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
1371 	}
1372 	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
1373 		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
1374 	}
1375 
1376 	if (KERNEL_MOUNT_DEVFS & internal_flags) {
1377 		// kernel mounted devfs
1378 		mp->mnt_kern_flag |= MNTK_SYSTEM;
1379 	}
1380 
1381 update:
1382 
1383 	/*
1384 	 * Set the mount level flags.
1385 	 */
1386 	if (flags & MNT_RDONLY) {
1387 		mp->mnt_flag |= MNT_RDONLY;
1388 	} else if (mp->mnt_flag & MNT_RDONLY) {
1389 		// disallow read/write upgrades of file systems that
1390 		// had the TYPENAME_OVERRIDE feature set.
1391 		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
1392 			error = EPERM;
1393 			goto out1;
1394 		}
1395 		mp->mnt_kern_flag |= MNTK_WANTRDWR;
1396 	}
1397 	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1398 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1399 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1400 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1401 	    MNT_QUARANTINE | MNT_CPROTECT);
1402 
1403 #if SECURE_KERNEL
1404 #if !CONFIG_MNT_SUID
1405 	/*
1406 	 * On release builds of iOS based platforms, always enforce NOSUID on
1407 	 * all mounts. We do this here because we can catch update mounts as well as
1408 	 * non-update mounts in this case.
1409 	 */
1410 	mp->mnt_flag |= (MNT_NOSUID);
1411 #endif
1412 #endif
1413 
1414 	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1415 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1416 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1417 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1418 	    MNT_QUARANTINE | MNT_CPROTECT);
1419 
1420 #if CONFIG_MACF
1421 	if (flags & MNT_MULTILABEL) {
1422 		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
1423 			error = EINVAL;
1424 			goto out1;
1425 		}
1426 		mp->mnt_flag |= MNT_MULTILABEL;
1427 	}
1428 #endif
1429 	/*
1430 	 * Process device path for local file systems if requested.
1431 	 *
1432 	 * Snapshot and mount-by-role mounts do not use this path; they are
1433 	 * passing other opaque data in the device path field.
1434 	 *
1435 	 * Basesystemroot mounts pass a device path to be resolved here,
1436 	 * but it's just a char * already inside the kernel, which
1437 	 * kernel_mount() shoved into a user_addr_t to call us. So for such
1438 	 * mounts we must skip copyin (both of the address and of the string
1439 	 * (in NDINIT).
1440 	 */
1441 	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
1442 	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
1443 		boolean_t do_copyin_devpath = true;
1444 #if CONFIG_BASESYSTEMROOT
1445 		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1446 			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
1447 			// We have been passed fsmountargs, which is typed as a user_addr_t,
1448 			// but is actually a char ** pointing to a (kernelspace) string.
1449 			// We manually unpack it with a series of casts and dereferences
1450 			// that reverses what was done just above us on the stack in
1451 			// imageboot_pivot_image().
1452 			// After retrieving the path to the dev node (which we will NDINIT
1453 			// in a moment), we pass NULL fsmountargs on to the filesystem.
1454 			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
1455 			char **devnamepp = (char **)fsmountargs;
1456 			char *devnamep = *devnamepp;
1457 			devpath = CAST_USER_ADDR_T(devnamep);
1458 			do_copyin_devpath = false;
1459 			fsmountargs = USER_ADDR_NULL;
1460 
1461 			//Now that we have a mp, denote that this mount is for the basesystem.
1462 			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
1463 		}
1464 #endif // CONFIG_BASESYSTEMROOT
1465 
1466 		if (do_copyin_devpath) {
1467 			if (vfs_context_is64bit(ctx)) {
1468 				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1469 					goto out1;
1470 				}
1471 				fsmountargs += sizeof(devpath);
1472 			} else {
1473 				user32_addr_t tmp;
1474 				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1475 					goto out1;
1476 				}
1477 				/* munge into LP64 addr */
1478 				devpath = CAST_USER_ADDR_T(tmp);
1479 				fsmountargs += sizeof(tmp);
1480 			}
1481 		}
1482 
1483 		/* Lookup device and authorize access to it */
1484 		if ((devpath)) {
1485 			struct nameidata nd;
1486 
1487 			enum uio_seg seg = UIO_USERSPACE;
1488 #if CONFIG_BASESYSTEMROOT
1489 			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1490 				seg = UIO_SYSSPACE;
1491 			}
1492 #endif // CONFIG_BASESYSTEMROOT
1493 
1494 			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
1495 			if (flags & MNT_NOFOLLOW) {
1496 				nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
1497 			}
1498 			if ((error = namei(&nd))) {
1499 				goto out1;
1500 			}
1501 
1502 			devvp = nd.ni_vp;
1503 
1504 			if (devvp->v_type != VBLK) {
1505 				error = ENOTBLK;
1506 				nameidone(&nd);
1507 				goto out2;
1508 			}
1509 			if (major(devvp->v_rdev) >= nblkdev) {
1510 				error = ENXIO;
1511 				nameidone(&nd);
1512 				goto out2;
1513 			}
1514 			/*
1515 			 * If mount by non-root, then verify that user has necessary
1516 			 * permissions on the device.
1517 			 */
1518 			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1519 				kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;
1520 
1521 				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1522 					accessmode |= KAUTH_VNODE_WRITE_DATA;
1523 				}
1524 				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1525 					nameidone(&nd);
1526 					goto out2;
1527 				}
1528 			}
1529 
1530 			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1531 			nameidone(&nd);
1532 		}
1533 		/* On first mount, preflight and open device */
1534 		if (devpath && ((flags & MNT_UPDATE) == 0)) {
1535 			if ((error = vnode_ref(devvp))) {
1536 				goto out2;
1537 			}
1538 			/*
1539 			 * Disallow multiple mounts of the same device.
1540 			 * Disallow mounting of a device that is currently in use
1541 			 * (except for root, which might share swap device for miniroot).
1542 			 * Flush out any old buffers remaining from a previous use.
1543 			 */
1544 			if ((error = vfs_setmounting(devvp))) {
1545 				vnode_rele(devvp);
1546 				goto out2;
1547 			}
1548 
1549 			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1550 				error = EBUSY;
1551 				goto out3;
1552 			}
1553 			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1554 				error = ENOTBLK;
1555 				goto out3;
1556 			}
1557 			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1558 				goto out3;
1559 			}
1560 
1561 			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1562 #if CONFIG_MACF
1563 			error = mac_vnode_check_open(ctx,
1564 			    devvp,
1565 			    ronly ? FREAD : FREAD | FWRITE);
1566 			if (error) {
1567 				goto out3;
1568 			}
1569 #endif /* MAC */
1570 			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1571 				goto out3;
1572 			}
1573 
1574 			mp->mnt_devvp = devvp;
1575 			device_vnode = devvp;
1576 		} else if ((mp->mnt_flag & MNT_RDONLY) &&
1577 		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1578 		    (device_vnode = mp->mnt_devvp)) {
1579 			dev_t dev;
1580 			int maj;
1581 			/*
1582 			 * If upgrade to read-write by non-root, then verify
1583 			 * that user has necessary permissions on the device.
1584 			 */
1585 			vnode_getalways(device_vnode);
1586 
1587 			if (suser(vfs_context_ucred(ctx), NULL) &&
1588 			    (error = vnode_authorize(device_vnode, NULL,
1589 			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1590 			    ctx)) != 0) {
1591 				vnode_put(device_vnode);
1592 				goto out2;
1593 			}
1594 
1595 			/* Tell the device that we're upgrading */
1596 			dev = (dev_t)device_vnode->v_rdev;
1597 			maj = major(dev);
1598 
1599 			if ((u_int)maj >= (u_int)nblkdev) {
1600 				panic("Volume mounted on a device with invalid major number.");
1601 			}
1602 
1603 			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1604 			vnode_put(device_vnode);
1605 			device_vnode = NULLVP;
1606 			if (error != 0) {
1607 				goto out2;
1608 			}
1609 		}
1610 	} // localargs && !(snapshot | data | vm)
1611 
1612 #if CONFIG_MACF
1613 	if ((flags & MNT_UPDATE) == 0) {
1614 		mac_mount_label_init(mp);
1615 		mac_mount_label_associate(ctx, mp);
1616 	}
1617 	if (labelstr) {
1618 		if ((flags & MNT_UPDATE) != 0) {
1619 			error = mac_mount_check_label_update(ctx, mp);
1620 			if (error != 0) {
1621 				goto out3;
1622 			}
1623 		}
1624 	}
1625 #endif
1626 	/*
1627 	 * Mount the filesystem.  We already asserted that internal_flags
1628 	 * cannot have more than one mount-by-role bit set.
1629 	 */
1630 	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1631 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1632 		    (caddr_t)fsmountargs, 0, ctx);
1633 	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1634 #if CONFIG_ROSV_STARTUP
1635 		struct mount *origin_mp = (struct mount*)fsmountargs;
1636 		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1637 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1638 		if (error) {
1639 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1640 		} else {
1641 			/* Mark volume associated with system volume */
1642 			mp->mnt_kern_flag |= MNTK_SYSTEM;
1643 
1644 			/* Attempt to acquire the mnt_devvp and set it up */
1645 			struct vnode *mp_devvp = NULL;
1646 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1647 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1648 				    0, &mp_devvp, vfs_context_kernel());
1649 				if (!lerr) {
1650 					mp->mnt_devvp = mp_devvp;
1651 					//vnode_lookup took an iocount, need to drop it.
1652 					vnode_put(mp_devvp);
1653 					// now set `device_vnode` to the devvp that was acquired.
1654 					// this is needed in order to ensure vfs_init_io_attributes is invoked.
1655 					// note that though the iocount above was dropped, the mount acquires
1656 					// an implicit reference against the device.
1657 					device_vnode = mp_devvp;
1658 				}
1659 			}
1660 		}
1661 #else
1662 		error = EINVAL;
1663 #endif
1664 	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1665 #if CONFIG_MOUNT_VM
1666 		struct mount *origin_mp = (struct mount*)fsmountargs;
1667 		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1668 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1669 		if (error) {
1670 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1671 		} else {
1672 			/* Mark volume associated with system volume and a swap mount */
1673 			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1674 			/* Attempt to acquire the mnt_devvp and set it up */
1675 			struct vnode *mp_devvp = NULL;
1676 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1677 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1678 				    0, &mp_devvp, vfs_context_kernel());
1679 				if (!lerr) {
1680 					mp->mnt_devvp = mp_devvp;
1681 					//vnode_lookup took an iocount, need to drop it.
1682 					vnode_put(mp_devvp);
1683 
1684 					// now set `device_vnode` to the devvp that was acquired.
1685 					// note that though the iocount above was dropped, the mount acquires
1686 					// an implicit reference against the device.
1687 					device_vnode = mp_devvp;
1688 				}
1689 			}
1690 		}
1691 #else
1692 		error = EINVAL;
1693 #endif
1694 	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1695 #if CONFIG_MOUNT_PREBOOTRECOVERY
1696 		struct mount *origin_mp = (struct mount*)fsmountargs;
1697 		uint32_t mount_role = 0;
1698 		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1699 			mount_role = VFS_PREBOOT_ROLE;
1700 		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1701 			mount_role = VFS_RECOVERY_ROLE;
1702 		}
1703 
1704 		if (mount_role != 0) {
1705 			fs_role_mount_args_t frma = {origin_mp, mount_role};
1706 			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1707 			if (error) {
1708 				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1709 			} else {
1710 				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1711 				/* Mark volume associated with system volume */
1712 				//mp->mnt_kern_flag |= MNTK_SYSTEM;
1713 				/* Attempt to acquire the mnt_devvp and set it up */
1714 				struct vnode *mp_devvp = NULL;
1715 				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1716 					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1717 					    0, &mp_devvp, vfs_context_kernel());
1718 					if (!lerr) {
1719 						mp->mnt_devvp = mp_devvp;
1720 						//vnode_lookup took an iocount, need to drop it.
1721 						vnode_put(mp_devvp);
1722 
1723 						// now set `device_vnode` to the devvp that was acquired.
1724 						// note that though the iocount above was dropped, the mount acquires
1725 						// an implicit reference against the device.
1726 						device_vnode = mp_devvp;
1727 					}
1728 				}
1729 			}
1730 		} else {
1731 			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1732 			error = EINVAL;
1733 		}
1734 #else
1735 		error = EINVAL;
1736 #endif
1737 	} else {
1738 		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1739 	}
1740 
1741 	if (flags & MNT_UPDATE) {
1742 		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1743 			mp->mnt_flag &= ~MNT_RDONLY;
1744 		}
1745 		mp->mnt_flag &= ~
1746 		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1747 		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1748 		if (error) {
1749 			mp->mnt_flag = flag;  /* restore flag value */
1750 		}
1751 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1752 		lck_rw_done(&mp->mnt_rwlock);
1753 		is_rwlock_locked = FALSE;
1754 		if (!error) {
1755 			enablequotas(mp, ctx);
1756 		}
1757 		goto exit;
1758 	}
1759 
1760 	/*
1761 	 * Put the new filesystem on the mount list after root.
1762 	 */
1763 	if (error == 0) {
1764 		struct vfs_attr vfsattr;
1765 		if (device_vnode) {
1766 			/*
1767 			 *   cache the IO attributes for the underlying physical media...
1768 			 *   an error return indicates the underlying driver doesn't
1769 			 *   support all the queries necessary... however, reasonable
1770 			 *   defaults will have been set, so no reason to bail or care
1771 			 *
1772 			 *   Need to do this before calling the MAC hook as it needs
1773 			 *   information from this call.
1774 			 */
1775 			vfs_init_io_attributes(device_vnode, mp);
1776 		}
1777 
1778 #if CONFIG_MACF
1779 		error = mac_mount_check_mount_late(ctx, mp);
1780 		if (error != 0) {
1781 			goto out4;
1782 		}
1783 
1784 		if (vfs_flags(mp) & MNT_MULTILABEL) {
1785 			error = VFS_ROOT(mp, &rvp, ctx);
1786 			if (error) {
1787 				printf("%s() VFS_ROOT returned %d\n", __func__, error);
1788 				goto out4;
1789 			}
1790 			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1791 			/*
1792 			 * drop reference provided by VFS_ROOT
1793 			 */
1794 			vnode_put(rvp);
1795 
1796 			if (error) {
1797 				goto out4;
1798 			}
1799 		}
1800 #endif  /* MAC */
1801 
1802 		vnode_lock_spin(vp);
1803 		CLR(vp->v_flag, VMOUNT);
1804 		vp->v_mountedhere = mp;
1805 		SET(vp->v_flag, VMOUNTEDHERE);
1806 
1807 		/*
1808 		 * Wakeup any waiter(s) in prepare_coveredvp() that is waiting for the
1809 		 * 'v_mountedhere' to be planted.
1810 		 */
1811 		wakeup(&vp->v_flag);
1812 		vnode_unlock(vp);
1813 
1814 		/*
1815 		 * taking the name_cache_lock exclusively will
1816 		 * ensure that everyone is out of the fast path who
1817 		 * might be trying to use a now stale copy of
1818 		 * vp->v_mountedhere->mnt_realrootvp
1819 		 * bumping mount_generation causes the cached values
1820 		 * to be invalidated
1821 		 */
1822 		name_cache_lock();
1823 		mount_generation++;
1824 		name_cache_unlock();
1825 
1826 		error = vnode_ref(vp);
1827 		if (error != 0) {
1828 			goto out4;
1829 		}
1830 
1831 		have_usecount = TRUE;
1832 
1833 		error = checkdirs(vp, ctx);
1834 		if (error != 0) {
1835 			/* Unmount the filesystem as cdir/rdirs cannot be updated */
1836 			goto out4;
1837 		}
1838 		/*
1839 		 * there is no cleanup code here so I have made it void
1840 		 * we need to revisit this
1841 		 */
1842 		(void)VFS_START(mp, 0, ctx);
1843 
1844 		if (mount_list_add(mp) != 0) {
1845 			/*
1846 			 * The system is shutting down trying to umount
1847 			 * everything, so fail with a plausible errno.
1848 			 */
1849 			error = EBUSY;
1850 			goto out4;
1851 		}
1852 		lck_rw_done(&mp->mnt_rwlock);
1853 		is_rwlock_locked = FALSE;
1854 
1855 		/* Check if this mounted file system supports EAs or named streams. */
1856 		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1857 		VFSATTR_INIT(&vfsattr);
1858 		VFSATTR_WANTED(&vfsattr, f_capabilities);
1859 		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1860 		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1861 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1862 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1863 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1864 				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1865 			}
1866 #if NAMEDSTREAMS
1867 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1868 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1869 				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1870 			}
1871 #endif
1872 			/* Check if this file system supports path from id lookups. */
1873 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1874 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1875 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1876 			} else if (mp->mnt_flag & MNT_DOVOLFS) {
1877 				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1878 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1879 			}
1880 
1881 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1882 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1883 				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1884 			}
1885 		}
1886 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1887 			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1888 		}
1889 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1890 			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1891 		}
1892 		/* Get subtype if supported to cache it */
1893 		VFSATTR_INIT(&vfsattr);
1894 		VFSATTR_WANTED(&vfsattr, f_fssubtype);
1895 		if (vfs_getattr(mp, &vfsattr, ctx) == 0 && VFSATTR_IS_SUPPORTED(&vfsattr, f_fssubtype)) {
1896 			mp->mnt_vfsstat.f_fssubtype = vfsattr.f_fssubtype;
1897 		}
1898 
1899 		/* increment the operations count */
1900 		OSAddAtomic(1, &vfs_nummntops);
1901 		enablequotas(mp, ctx);
1902 
1903 		if (device_vnode) {
1904 			vfs_setmountedon(device_vnode);
1905 		}
1906 
1907 		/* Now that mount is setup, notify the listeners */
1908 		vfs_notify_mount(pvp);
1909 		IOBSDMountChange(mp, kIOMountChangeMount);
1910 #if CONFIG_MACF
1911 		mac_mount_notify_mount(ctx, mp);
1912 #endif /* CONFIG_MACF */
1913 	} else {
1914 		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1915 		if (mp->mnt_vnodelist.tqh_first != NULL) {
1916 			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1917 			    mp->mnt_vtable->vfc_name, error);
1918 		}
1919 
1920 		vnode_lock_spin(vp);
1921 		CLR(vp->v_flag, VMOUNT);
1922 		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
1923 		wakeup(&vp->v_flag);
1924 		vnode_unlock(vp);
1925 		mount_list_lock();
1926 		mp->mnt_vtable->vfc_refcount--;
1927 		mount_list_unlock();
1928 
1929 		if (device_vnode) {
1930 			vnode_rele(device_vnode);
1931 			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1932 			vfs_clearmounting(device_vnode);
1933 		}
1934 		lck_rw_done(&mp->mnt_rwlock);
1935 		is_rwlock_locked = FALSE;
1936 
1937 		if (nc_smr_enabled) {
1938 			vfs_smr_synchronize();
1939 		}
1940 
1941 		/*
1942 		 * if we get here, we have a mount structure that needs to be freed,
1943 		 * but since the coveredvp hasn't yet been updated to point at it,
1944 		 * no need to worry about other threads holding a crossref on this mp
1945 		 * so it's ok to just free it
1946 		 */
1947 		mount_lock_destroy(mp);
1948 #if CONFIG_MACF
1949 		mac_mount_label_destroy(mp);
1950 #endif
1951 		zfree(mount_zone, mp);
1952 		did_set_lmount = false;
1953 	}
1954 exit:
1955 	/*
1956 	 * drop I/O count on the device vp if there was one
1957 	 */
1958 	if (devpath && devvp) {
1959 		vnode_put(devvp);
1960 	}
1961 
1962 	if (did_set_lmount) {
1963 		mount_lock_spin(mp);
1964 		mp->mnt_lflag &= ~MNT_LMOUNT;
1965 		mount_unlock(mp);
1966 	}
1967 
1968 	return error;
1969 
1970 /* Error condition exits */
1971 out4:
1972 	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1973 
1974 	/*
1975 	 * If the mount has been placed on the covered vp,
1976 	 * it may have been discovered by now, so we have
1977 	 * to treat this just like an unmount
1978 	 */
1979 	mount_lock_spin(mp);
1980 	mp->mnt_lflag |= MNT_LDEAD;
1981 	mount_unlock(mp);
1982 
1983 	if (device_vnode != NULLVP) {
1984 		vnode_rele(device_vnode);
1985 		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1986 		    ctx);
1987 		vfs_clearmounting(device_vnode);
1988 		did_rele = TRUE;
1989 	}
1990 
1991 	vnode_lock_spin(vp);
1992 
1993 	mp->mnt_crossref++;
1994 	CLR(vp->v_flag, VMOUNTEDHERE);
1995 	vp->v_mountedhere = (mount_t) 0;
1996 
1997 	vnode_unlock(vp);
1998 
1999 	if (have_usecount) {
2000 		vnode_rele(vp);
2001 	}
2002 out3:
2003 	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
2004 		vnode_rele(devvp);
2005 		vfs_clearmounting(devvp);
2006 	}
2007 out2:
2008 	if (devpath && devvp) {
2009 		vnode_put(devvp);
2010 	}
2011 out1:
2012 	/* Release mnt_rwlock only when it was taken */
2013 	if (is_rwlock_locked == TRUE) {
2014 		if (flag_set) {
2015 			mp->mnt_flag = flag;  /* restore mnt_flag value */
2016 		}
2017 		lck_rw_done(&mp->mnt_rwlock);
2018 	}
2019 
2020 	if (did_set_lmount) {
2021 		mount_lock_spin(mp);
2022 		mp->mnt_lflag &= ~MNT_LMOUNT;
2023 		mount_unlock(mp);
2024 	}
2025 
2026 	if (did_set_vmount) {
2027 		vnode_lock_spin(vp);
2028 		CLR(vp->v_flag, VMOUNT);
2029 		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
2030 		wakeup(&vp->v_flag);
2031 		vnode_unlock(vp);
2032 	}
2033 
2034 	if (mntalloc) {
2035 		if (mp->mnt_crossref) {
2036 			mount_dropcrossref(mp, vp, 0);
2037 		} else {
2038 			if (nc_smr_enabled) {
2039 				vfs_smr_synchronize();
2040 			}
2041 
2042 			mount_lock_destroy(mp);
2043 #if CONFIG_MACF
2044 			mac_mount_label_destroy(mp);
2045 #endif
2046 			zfree(mount_zone, mp);
2047 		}
2048 	}
2049 	if (vfsp_ref) {
2050 		mount_list_lock();
2051 		vfsp->vfc_refcount--;
2052 		mount_list_unlock();
2053 	}
2054 
2055 	return error;
2056 }
2057 
/*
 * Flush in-core data, check for competing mount attempts,
 * and set VMOUNT
 *
 * Validates and claims 'vp' as the covered vnode for an impending mount:
 * verifies caller ownership of the directory (unless KERNEL_MOUNT_NOAUTH),
 * syncs and invalidates its buffers, rejects non-directories, then marks
 * the vnode with VMOUNT under the vnode lock.
 *
 * Returns 0 with VMOUNT set on success; on failure returns an errno and
 * leaves VMOUNT clear.
 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	/* Decode caller intent from the internal kernel-mount flags. */
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_kmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 * NOTE(review): a vnode_getattr() failure is also reported as
		 * EPERM here, masking the underlying error code.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Push dirty data on the covered vnode to its backing store. */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	/* Invalidate buffer-cache blocks, writing out dirty ones first. */
	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Only directories can be covered by a mount. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	vnode_lock_spin(vp);

	if (is_fmount && (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL))) {
		/* fmount(2) callers never wait: fail immediately if busy. */
		error = EBUSY;
	} else if (!is_kmount && (ISSET(vp->v_flag, VMOUNT) ||
	    (vp->v_mountedhere != NULL))) {
		/*
		 * For mount triggered from mount() call, we want to wait for the
		 * current in-progress mount to complete, redo lookup and retry the
		 * mount again. Similarly, we also want to retry if we lost the race
		 * due to concurrent mounts and the 'VMOUNT' flag has been cleared and
		 * 'v_mountedhere' has been planted after initial lookup.
		 */
		if (ISSET(vp->v_flag, VMOUNT)) {
			/* msleep() requires the full (non-spin) vnode lock. */
			vnode_lock_convert(vp);
			msleep(&vp->v_flag, &vp->v_lock, PVFS, "vnode_waitformount", NULL);
		}
		error = EBUSY;
	} else if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
		/* Kernel-initiated mount racing another in-progress mount. */
		error = EBUSY;
	}

	if (error) {
		vnode_unlock(vp);
		goto out;
	}
	/* Claim the vnode: VMOUNT marks a mount-in-progress on 'vp'. */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		/* MAC veto: undo the VMOUNT claim and wake racing mounters. */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
2146 
2147 #if CONFIG_IMGSRC_ACCESS
2148 
2149 #define DEBUG_IMGSRC 0
2150 
2151 #if DEBUG_IMGSRC
2152 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
2153 #else
2154 #define IMGSRC_DEBUG(args...) do { } while(0)
2155 #endif
2156 
2157 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)2158 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
2159 {
2160 	struct nameidata nd;
2161 	vnode_t vp, realdevvp;
2162 	kauth_action_t accessmode;
2163 	int error;
2164 	enum uio_seg uio = UIO_USERSPACE;
2165 
2166 	if (ctx == vfs_context_kernel()) {
2167 		uio = UIO_SYSSPACE;
2168 	}
2169 
2170 	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
2171 	if ((error = namei(&nd))) {
2172 		IMGSRC_DEBUG("namei() failed with %d\n", error);
2173 		return error;
2174 	}
2175 
2176 	vp = nd.ni_vp;
2177 
2178 	if (!vnode_isblk(vp)) {
2179 		IMGSRC_DEBUG("Not block device.\n");
2180 		error = ENOTBLK;
2181 		goto out;
2182 	}
2183 
2184 	realdevvp = mp->mnt_devvp;
2185 	if (realdevvp == NULLVP) {
2186 		IMGSRC_DEBUG("No device backs the mount.\n");
2187 		error = ENXIO;
2188 		goto out;
2189 	}
2190 
2191 	error = vnode_getwithref(realdevvp);
2192 	if (error != 0) {
2193 		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
2194 		goto out;
2195 	}
2196 
2197 	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
2198 		IMGSRC_DEBUG("Wrong dev_t.\n");
2199 		error = ENXIO;
2200 		goto out1;
2201 	}
2202 
2203 	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
2204 
2205 	/*
2206 	 * If mount by non-root, then verify that user has necessary
2207 	 * permissions on the device.
2208 	 */
2209 	if (!vfs_context_issuser(ctx)) {
2210 		accessmode = KAUTH_VNODE_READ_DATA;
2211 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2212 			accessmode |= KAUTH_VNODE_WRITE_DATA;
2213 		}
2214 		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2215 			IMGSRC_DEBUG("Access denied.\n");
2216 			goto out1;
2217 		}
2218 	}
2219 
2220 	*devvpp = vp;
2221 
2222 out1:
2223 	vnode_put(realdevvp);
2224 
2225 out:
2226 	nameidone(&nd);
2227 
2228 	if (error) {
2229 		vnode_put(vp);
2230 	}
2231 
2232 	return error;
2233 }
2234 
/*
 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
 * and call checkdirs()
 *
 * Plants the already-mounted filesystem 'mp' on top of the covered vnode
 * 'vp'.  On success the vnode carries a usecount reference and points at
 * the mount.  On failure mnt_vnodecovered is reset to NULLVP; note that
 * VMOUNT has already been cleared on 'vp' by then.
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Publish the mount on the vnode and wake racing mount attempts. */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	SET(vp->v_flag, VMOUNTEDHERE);
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Hold a usecount on the covered vnode for the mount's lifetime. */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	/* Repoint any process cwd/root that referenced 'vp' at the new root. */
	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2287 
/*
 * Reverse the effects of place_mount_and_checkdirs(): drop the usecount
 * taken on the covered vnode, clear the mount markers under the vnode
 * lock, wake anyone waiting on the in-progress mount, and detach the
 * mount from its covered vnode.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2301 
2302 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2303 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2304 {
2305 	int error;
2306 
2307 	/* unmount in progress return error */
2308 	mount_lock_spin(mp);
2309 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2310 		mount_unlock(mp);
2311 		return EBUSY;
2312 	}
2313 	mount_unlock(mp);
2314 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2315 
2316 	/*
2317 	 * We only allow the filesystem to be reloaded if it
2318 	 * is currently mounted read-only.
2319 	 */
2320 	if ((flags & MNT_RELOAD) &&
2321 	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2322 		error = ENOTSUP;
2323 		goto out;
2324 	}
2325 
2326 	/*
2327 	 * Only root, or the user that did the original mount is
2328 	 * permitted to update it.
2329 	 */
2330 	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2331 	    (!vfs_context_issuser(ctx))) {
2332 		error = EPERM;
2333 		goto out;
2334 	}
2335 #if CONFIG_MACF
2336 	error = mac_mount_check_remount(ctx, mp, flags);
2337 	if (error != 0) {
2338 		goto out;
2339 	}
2340 #endif
2341 
2342 out:
2343 	if (error) {
2344 		lck_rw_done(&mp->mnt_rwlock);
2345 	}
2346 
2347 	return error;
2348 }
2349 
/*
 * Release the exclusive mount rwlock acquired by mount_begin_update().
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2355 
2356 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2357 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2358 {
2359 	vnode_t vp;
2360 
2361 	if (height >= MAX_IMAGEBOOT_NESTING) {
2362 		return EINVAL;
2363 	}
2364 
2365 	vp = imgsrc_rootvnodes[height];
2366 	if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2367 		*rvpp = vp;
2368 		return 0;
2369 	} else {
2370 		return ENOENT;
2371 	}
2372 }
2373 
2374 static int
relocate_imageboot_source(vnode_t pvp,vnode_t vp,struct componentname * cnp,const char * fsname,vfs_context_t ctx,boolean_t is64bit,user_addr_t fsmountargs,boolean_t by_index)2375 relocate_imageboot_source(vnode_t pvp, vnode_t vp,
2376     struct componentname *cnp, const char *fsname, vfs_context_t ctx,
2377     boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
2378 {
2379 	int error;
2380 	mount_t mp;
2381 	boolean_t placed = FALSE;
2382 	struct vfstable *vfsp;
2383 	user_addr_t devpath;
2384 	char *old_mntonname;
2385 	vnode_t rvp;
2386 	vnode_t devvp;
2387 	uint32_t height;
2388 	uint32_t flags;
2389 
2390 	/* If we didn't imageboot, nothing to move */
2391 	if (imgsrc_rootvnodes[0] == NULLVP) {
2392 		return EINVAL;
2393 	}
2394 
2395 	/* Only root can do this */
2396 	if (!vfs_context_issuser(ctx)) {
2397 		return EPERM;
2398 	}
2399 
2400 	IMGSRC_DEBUG("looking for root vnode.\n");
2401 
2402 	/*
2403 	 * Get root vnode of filesystem we're moving.
2404 	 */
2405 	if (by_index) {
2406 		if (is64bit) {
2407 			struct user64_mnt_imgsrc_args mia64;
2408 			error = copyin(fsmountargs, &mia64, sizeof(mia64));
2409 			if (error != 0) {
2410 				IMGSRC_DEBUG("Failed to copy in arguments.\n");
2411 				return error;
2412 			}
2413 
2414 			height = mia64.mi_height;
2415 			flags = mia64.mi_flags;
2416 			devpath = (user_addr_t)mia64.mi_devpath;
2417 		} else {
2418 			struct user32_mnt_imgsrc_args mia32;
2419 			error = copyin(fsmountargs, &mia32, sizeof(mia32));
2420 			if (error != 0) {
2421 				IMGSRC_DEBUG("Failed to copy in arguments.\n");
2422 				return error;
2423 			}
2424 
2425 			height = mia32.mi_height;
2426 			flags = mia32.mi_flags;
2427 			devpath = mia32.mi_devpath;
2428 		}
2429 	} else {
2430 		/*
2431 		 * For binary compatibility--assumes one level of nesting.
2432 		 */
2433 		if (is64bit) {
2434 			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
2435 				return error;
2436 			}
2437 		} else {
2438 			user32_addr_t tmp;
2439 			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
2440 				return error;
2441 			}
2442 
2443 			/* munge into LP64 addr */
2444 			devpath = CAST_USER_ADDR_T(tmp);
2445 		}
2446 
2447 		height = 0;
2448 		flags = 0;
2449 	}
2450 
2451 	if (flags != 0) {
2452 		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
2453 		return EINVAL;
2454 	}
2455 
2456 	error = get_imgsrc_rootvnode(height, &rvp);
2457 	if (error != 0) {
2458 		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
2459 		return error;
2460 	}
2461 
2462 	IMGSRC_DEBUG("got old root vnode\n");
2463 
2464 	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);
2465 
2466 	/* Can only move once */
2467 	mp = vnode_mount(rvp);
2468 	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
2469 		IMGSRC_DEBUG("Already moved.\n");
2470 		error = EBUSY;
2471 		goto out0;
2472 	}
2473 
2474 	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
2475 	IMGSRC_DEBUG("Starting updated.\n");
2476 
2477 	/* Get exclusive rwlock on mount, authorize update on mp */
2478 	error = mount_begin_update(mp, ctx, 0);
2479 	if (error != 0) {
2480 		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
2481 		goto out0;
2482 	}
2483 
2484 	/*
2485 	 * It can only be moved once.  Flag is set under the rwlock,
2486 	 * so we're now safe to proceed.
2487 	 */
2488 	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
2489 		IMGSRC_DEBUG("Already moved [2]\n");
2490 		goto out1;
2491 	}
2492 
2493 	IMGSRC_DEBUG("Preparing coveredvp.\n");
2494 
2495 	/* Mark covered vnode as mount in progress, authorize placing mount on top */
2496 	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
2497 	if (error != 0) {
2498 		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
2499 		goto out1;
2500 	}
2501 
2502 	IMGSRC_DEBUG("Covered vp OK.\n");
2503 
2504 	/* Sanity check the name caller has provided */
2505 	vfsp = mp->mnt_vtable;
2506 	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
2507 		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
2508 		    vfsp->vfc_name, fsname);
2509 		error = EINVAL;
2510 		goto out2;
2511 	}
2512 
2513 	/* Check the device vnode and update mount-from name, for local filesystems */
2514 	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
2515 		IMGSRC_DEBUG("Local, doing device validation.\n");
2516 
2517 		if (devpath != USER_ADDR_NULL) {
2518 			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
2519 			if (error) {
2520 				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
2521 				goto out2;
2522 			}
2523 
2524 			vnode_put(devvp);
2525 		}
2526 	}
2527 
2528 	/*
2529 	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
2530 	 * and increment the name cache's mount generation
2531 	 */
2532 
2533 	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
2534 	error = place_mount_and_checkdirs(mp, vp, ctx);
2535 	if (error != 0) {
2536 		goto out2;
2537 	}
2538 
2539 	placed = TRUE;
2540 
2541 	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
2542 	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
2543 
2544 	/* Forbid future moves */
2545 	mount_lock(mp);
2546 	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
2547 	mount_unlock(mp);
2548 
2549 	/* Finally, add to mount list, completely ready to go */
2550 	if (mount_list_add(mp) != 0) {
2551 		/*
2552 		 * The system is shutting down trying to umount
2553 		 * everything, so fail with a plausible errno.
2554 		 */
2555 		error = EBUSY;
2556 		goto out3;
2557 	}
2558 
2559 	mount_end_update(mp);
2560 	vnode_put(rvp);
2561 	zfree(ZV_NAMEI, old_mntonname);
2562 
2563 	vfs_notify_mount(pvp);
2564 #if CONFIG_MACF
2565 	mac_mount_notify_mount(ctx, mp);
2566 #endif /* CONFIG_MACF */
2567 
2568 	return 0;
2569 out3:
2570 	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
2571 
2572 	mount_lock(mp);
2573 	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
2574 	mount_unlock(mp);
2575 
2576 out2:
2577 	/*
2578 	 * Placing the mp on the vnode clears VMOUNT,
2579 	 * so cleanup is different after that point
2580 	 */
2581 	if (placed) {
2582 		/* Rele the vp, clear VMOUNT and v_mountedhere */
2583 		undo_place_on_covered_vp(mp, vp);
2584 	} else {
2585 		vnode_lock_spin(vp);
2586 		CLR(vp->v_flag, VMOUNT);
2587 		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
2588 		wakeup(&vp->v_flag);
2589 		vnode_unlock(vp);
2590 	}
2591 out1:
2592 	mount_end_update(mp);
2593 
2594 out0:
2595 	vnode_put(rvp);
2596 	zfree(ZV_NAMEI, old_mntonname);
2597 	return error;
2598 }
2599 
2600 #endif /* CONFIG_IMGSRC_ACCESS */
2601 
/*
 * Enable disk quotas on a freshly mounted HFS filesystem, for each quota
 * type whose trigger ("ops") file exists at the mount's root.  Errors are
 * deliberately ignored so they cannot interfere with a successful mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Probe for the per-type quota trigger file. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue;           /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Trigger file exists: turn quotas on with the real quota file. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2635 
2636 
/*
 * Per-process callback for checkdirs(), invoked via proc_iterate(): if
 * the process's current or root directory is 'olddp' (the newly covered
 * vnode), repoint it at 'newdp' (the root of the covering mount),
 * transferring usecount references accordingly.
 *
 * Always returns PROC_RETURNED so the iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* new_*: refs on newdp we still own; NULLed once installed. */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	/* old_*: displaced directories whose refs must be dropped. */
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2716 
2717 
2718 
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 *
 * 'olddp' is the covered vnode; its v_mountedhere must already point at
 * the new mount.  Also swaps the global rootvnode if it was 'olddp'.
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Only our caller holds a usecount: nobody has it as cwd or root. */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the root of the newly mounted filesystem (with an iocount). */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root itself was covered, swap in the new root. */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2761 
2762 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT        \
2763 	"com.apple.private.vfs.role-account-unmount"
2764 #define SYSTEM_VOLUME_UNMOUNT_ENTITLEMENT       \
2765 	"com.apple.private.vfs.system-volume-unmount"
2766 
/*
 * Unmount a file system.
 *
 * Note: unmount takes a path to the vnode mounted on as argument,
 * not special file (as before).
 *
 * The actual work is done by safedounmount(), which consumes the mount
 * reference taken here.
 */
/* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int flags = uap->flags;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* MNT_NOFOLLOW: refuse to traverse symlinks anywhere in the path. */
	if (flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/* Hold the mount across the vnode_put(); safedounmount drops it. */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, flags, ctx);
}
2825 
/*
 * funmount: fd-based analogue of unmount(2).  Unmounts the filesystem
 * containing the vnode referenced by 'uap->fd'; the vnode must be the
 * root of its filesystem.  safedounmount() consumes the mount reference
 * taken here.
 */
int
funmount(__unused proc_t p, struct funmount_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp;
	struct mount *mp;
	vfs_context_t ctx;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	error = vnode_getfromfd(ctx, uap->fd, &vp);
	if (error) {
		return error;
	}

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
	mp = vnode_mount(vp);

#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/* Hold the mount across the vnode_put(); safedounmount drops it. */
	mount_ref(mp, 0);
	vnode_put(vp);

	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2872 
2873 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2874 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2875 {
2876 	mount_t mp;
2877 
2878 	mp = mount_list_lookupby_fsid(fsid, 0, 1);
2879 	if (mp == (mount_t)0) {
2880 		return ENOENT;
2881 	}
2882 	mount_ref(mp, 0);
2883 	mount_iterdrop(mp);
2884 	/* safedounmount consumes the mount ref */
2885 	return safedounmount(mp, flags, ctx);
2886 }
2887 
/*
 * The mount struct comes with a mount ref which will be consumed.
 * Do the actual file system unmount, prevent some common foot shooting.
 *
 * Performs policy checks (ownership/entitlements, root and system volume
 * protection) before handing off to dounmount().  On any failure the
 * incoming mount ref is dropped here; on success dounmount() owns it.
 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_lflag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}

	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if (mp->mnt_flag & MNT_ROOTFS) {
		error = EBUSY; /* the root is always busy */
		goto out;
	}
	if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !IOCurrentTaskHasEntitlement(SYSTEM_VOLUME_UNMOUNT_ENTITLEMENT)) {
		printf("attempt to unmount a system mount (%s), will return EBUSY\n",
		    mp->mnt_vfsstat.f_mntonname);
		error = EBUSY; /* root-associated volumes are always busy unless caller is entitled */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* Hand off: dounmount() takes ownership of the mount ref. */
	return dounmount(mp, flags, 1, ctx);

out:
	/* Error path: drop the mount ref supplied by the caller. */
	mount_drop(mp, 0);
	return error;
}
2958 
2959 /*
2960  * Do the actual file system unmount.
2961  */
/*
 * dounmount: do the actual file system unmount.
 *
 * Parameters:	mp	mount point to unmount
 *		flags	MNT_* unmount flags (MNT_FORCE forces the unmount,
 *			MNT_LNOSUB suppresses the submount sweep,
 *			MNT_NOBLOCK sets P_NOREMOTEHANG for the duration)
 *		withref	non-zero if the caller holds a mount ref that this
 *			routine must drop
 *		ctx	caller's VFS context
 *
 * Returns:	0	success (the mount may already be freed on return)
 *		EBUSY	a mount or unmount is already in progress on mp
 *		???	errors from VFS_SYNC, vflush or VFS_UNMOUNT
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx);  /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* keep trigger resolution from hanging on dead remote mounts */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	/* from here on the mount is committed to unmounting */
	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/*
	 * Make sure no one is in the mount iterations or lookup.
	 * Drain makes 'mnt_iterref' -ve so on error exit we need to ensure that
	 * 'mnt_iterref' is reset back to 0 by calling mount_iterreset().
	 */
	mount_iterdrain(mp);

	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		/* graceful path: flush cached vnodes and sync dirty data first */
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* back out the unmount-in-progress state */
				mount_iterreset(mp);
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	/* reclaim this mount's vnodes (swap/system/root vnodes excluded) */
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* back out the unmount-in-progress state */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* let the filesystem do its own teardown */
	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	/* drop mnt_rwlock while removing from the mount list, then retake it */
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&coveredvp->v_flag);
		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);     /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* common exit: runs for both the success and the error paths */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the  vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			/* may free mp if this was the last crossref */
			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			if (pvp) {
				/* notify watchers of the parent directory */
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* root mount has no covered vnode; free mp directly */
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
3256 
3257 /*
3258  * Unmount any mounts in this filesystem.
3259  */
/*
 * dounmount_submounts: unmount every mount nested below 'mp'.
 * Called from dounmount() during a forced unmount (with MNT_LNOSUB set in
 * 'flags' to prevent recursion); unmount failures are ignored, leaving any
 * stubborn submounts dangling.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		// allocation failed; kfree_data() below tolerates NULL
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				// covered by a known (sub)mount: record it too
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			// take a ref for dounmount (withref=1) and drop the iter ref
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
3317 
/*
 * mount_dropcrossref: drop one crossref taken across the covered-vnode /
 * mount linkage.  When the last crossref goes away and 'dp' no longer
 * points back at this mount (v_mountedhere was unhooked by the unmount),
 * the mount structure itself is destroyed.
 *
 * 'need_put' requests that the caller's iocount on 'dp' also be released
 * while the vnode lock is still held.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	/* the hold keeps dp's memory alive across the put/unlock below */
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		/* last crossref and the mount is fully unhooked: free it */
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		if (nc_smr_enabled) {
			/* wait out SMR readers before tearing the mount down */
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3351 
3352 
3353 /*
3354  * Sync each mounted filesystem.
3355  */
#if DIAGNOSTIC
/* When set (DIAGNOSTIC kernels only), sync()/sync_thread() dump buffer stats. */
int syncprt = 0;
#endif

/* When set, sync()/sync_thread() also print dirty VM page counts. */
int print_vmpage_stat = 0;
3361 
3362 /*
3363  * sync_callback:	simple wrapper that calls VFS_SYNC() on volumes
3364  *			mounted read-write with the passed waitfor value.
3365  *
3366  * Parameters:	mp	mount-point descriptor per mounted file-system instance.
3367  *		arg	user argument (please see below)
3368  *
3369  * User argument is a pointer to 32 bit unsigned integer which describes the
3370  * type of waitfor value to set for calling VFS_SYNC().  If user argument is
3371  * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3372  * waitfor value.
3373  *
3374  * Returns:		VFS_RETURNED
3375  */
3376 static int
sync_callback(mount_t mp,void * arg)3377 sync_callback(mount_t mp, void *arg)
3378 {
3379 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3380 		int asyncflag = mp->mnt_flag & MNT_ASYNC;
3381 		unsigned waitfor = MNT_NOWAIT;
3382 
3383 		if (arg) {
3384 			waitfor = *(uint32_t*)arg;
3385 		}
3386 
3387 		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
3388 		if (waitfor != MNT_WAIT &&
3389 		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
3390 		    waitfor != MNT_NOWAIT &&
3391 		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3392 		    waitfor != MNT_DWAIT &&
3393 		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3394 			panic("Passed inappropriate waitfor %u to "
3395 			    "sync_callback()", waitfor);
3396 		}
3397 
3398 		mp->mnt_flag &= ~MNT_ASYNC;
3399 		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3400 		if (asyncflag) {
3401 			mp->mnt_flag |= MNT_ASYNC;
3402 		}
3403 	}
3404 
3405 	return VFS_RETURNED;
3406 }
3407 
3408 /* ARGSUSED */
3409 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)3410 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
3411 {
3412 	vfs_iterate(LK_NOWAIT, sync_callback, NULL);
3413 
3414 	if (print_vmpage_stat) {
3415 		vm_countdirtypages();
3416 	}
3417 
3418 #if DIAGNOSTIC
3419 	if (syncprt) {
3420 		vfs_bufstats();
3421 	}
3422 #endif /* DIAGNOSTIC */
3423 	return 0;
3424 }
3425 
/*
 * Media selector handed to sync_internal_callback() by sync_thread();
 * "reliable" means local and not on a virtual device (see the callback).
 */
typedef enum {
	SYNC_ALL = 0,                   /* sync every read-write mount */
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* only local, non-virtual-device mounts */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* only virtual-device or remote mounts */
} sync_type_t;
3431 
3432 static int
sync_internal_callback(mount_t mp,void * arg)3433 sync_internal_callback(mount_t mp, void *arg)
3434 {
3435 	if (arg) {
3436 		int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3437 		    (mp->mnt_flag & MNT_LOCAL);
3438 		sync_type_t sync_type = *((sync_type_t *)arg);
3439 
3440 		if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3441 			return VFS_RETURNED;
3442 		} else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3443 			return VFS_RETURNED;
3444 		}
3445 	}
3446 
3447 	(void)sync_callback(mp, NULL);
3448 
3449 	return VFS_RETURNED;
3450 }
3451 
/* SYNC_THREAD_* flags; manipulated only while holding sync_mtx_lck */
int sync_thread_state = 0;
/* upper bound on how long sync_internal() waits for the sync thread */
int sync_timeout_seconds = 5;

#define SYNC_THREAD_RUN       0x0001    /* another sync pass has been requested */
#define SYNC_THREAD_RUNNING   0x0002    /* a sync_thread instance is alive */

#if CONFIG_PHYS_WRITE_ACCT
/* set to the running sync_thread while it executes, NULL otherwise */
thread_t pm_sync_thread;
#endif /* CONFIG_PHYS_WRITE_ACCT */
3461 
/*
 * sync_thread: body of the kernel thread spawned by sync_internal().
 * Performs full sync passes (reliable media first, then unreliable) for as
 * long as SYNC_THREAD_RUN keeps being re-asserted, then wakes any waiters
 * and exits.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* consume the request; drop the lock while doing the passes */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		/* sync reliable media first, then the unreliable media */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3505 
/* Last time a sync timeout was logged; rate-limits the message in sync_internal(). */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3507 
3508 /*
3509  * An in-kernel sync for power management to call.
 * This function always returns within sync_timeout_seconds seconds.
3511  */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* request (at least) one more sync pass from the sync thread */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		/* no sync thread active: spawn one to service the request */
		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/*
	 * Wait (bounded by 'ts') for the sync thread's wakeup; PDROP releases
	 * sync_mtx_lck on return, PCATCH allows signals to interrupt the wait.
	 */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* rate-limit the timeout message to one every 120 seconds */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
3554 
3555 /*
3556  * Change filesystem quotas.
3557  */
3558 #if QUOTA
/*
 * quotactl: manipulate quotas on the filesystem containing uap->path.
 * uap->cmd carries the Q_* sub-command in its upper bits (SUBCMDSHIFT);
 * the meaning of uap->arg depends on that sub-command (see the copyin
 * switch below).
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* take a mount ref so mp stays valid after the vnode is released */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit callers pass a user_dqblk; munge it down */
			struct user_dqblk       my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* copyout results and release buffers, per sub-command */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk       my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3665 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support is not compiled into this kernel (QUOTA is off). */
	return EOPNOTSUPP;
}
3671 #endif /* QUOTA */
3672 
3673 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3674 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3675 {
3676 	int error;
3677 	vfs_context_t ctx = vfs_context_current();
3678 
3679 #if CONFIG_MACF
3680 	error = mac_mount_check_stat(ctx, mp);
3681 	if (error != 0) {
3682 		return error;
3683 	}
3684 #endif
3685 
3686 	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3687 	if (error != 0) {
3688 		return error;
3689 	}
3690 
3691 	return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3692 }
3693 
3694 /*
3695  * Get filesystem statistics.
3696  *
3697  * Returns:	0			Success
3698  *	namei:???
3699  *	vfs_update_vfsstat:???
3700  *	munge_statfs:EFAULT
3701  */
3702 /* ARGSUSED */
3703 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3704 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3705 {
3706 	int error;
3707 	struct mount *mp;
3708 	struct nameidata nd;
3709 	vfs_context_t ctx = vfs_context_current();
3710 	vnode_t vp;
3711 
3712 	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3713 	    UIO_USERSPACE, uap->path, ctx);
3714 	error = namei(&nd);
3715 	if (error != 0) {
3716 		return error;
3717 	}
3718 	vp = nd.ni_vp;
3719 	mp = vp->v_mount;
3720 	nameidone(&nd);
3721 
3722 	error = statfs_internal(p, mp, uap->buf);
3723 	vnode_put(vp);
3724 
3725 	return error;
3726 }
3727 
3728 /*
3729  * Get filesystem statistics.
3730  */
3731 /* ARGSUSED */
/*
 * fstatfs() system call: return filesystem statistics for the mount
 * backing the open file descriptor uap->fd.
 */
int
fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	struct mount *mp;

	AUDIT_ARG(fd, uap->fd);

	/*
	 * file_vnode() holds the fileproc; vnode_getwithref() takes an
	 * iocount.  If vnode_getwithref() fails we jump straight to 'out'
	 * (no iocount to put); file_drop() only happens when vp was set.
	 */
	if ((error = file_vnode(uap->fd, &vp)) ||
	    (error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		/* vnode not attached to a mount (e.g. being reclaimed) */
		error = EBADF;
		goto out_vnode;
	}

	error = statfs_internal(p, mp, uap->buf);

out_vnode:
	vnode_put(vp);

out:
	if (vp != NULL) {
		file_drop(uap->fd);
	}

	return error;
}
3766 
3767 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3768 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3769 {
3770 	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3771 
3772 	bzero(sfs, sizeof(*sfs));
3773 
3774 	sfs->f_bsize = vsfs->f_bsize;
3775 	sfs->f_iosize = (int32_t)vsfs->f_iosize;
3776 	sfs->f_blocks = vsfs->f_blocks;
3777 	sfs->f_bfree = vsfs->f_bfree;
3778 	sfs->f_bavail = vsfs->f_bavail;
3779 	sfs->f_files = vsfs->f_files;
3780 	sfs->f_ffree = vsfs->f_ffree;
3781 	sfs->f_fsid = vsfs->f_fsid;
3782 	sfs->f_owner = vsfs->f_owner;
3783 	sfs->f_type = mp->mnt_vtable->vfc_typenum;
3784 	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3785 	sfs->f_fssubtype = vsfs->f_fssubtype;
3786 	sfs->f_flags_ext = vfs_getextflags(mp);
3787 	vfs_getfstypename(mp, sfs->f_fstypename, MFSTYPENAMELEN);
3788 	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3789 	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3790 }
3791 
3792 /*
3793  * Get file system statistics in 64-bit mode
3794  */
3795 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3796 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3797 {
3798 	struct mount *mp;
3799 	int error;
3800 	struct nameidata *ndp;
3801 	struct statfs64 *sfsp;
3802 	vfs_context_t ctxp = vfs_context_current();
3803 	vnode_t vp;
3804 	struct {
3805 		struct nameidata nd;
3806 		struct statfs64 sfs;
3807 	} *__nameidata_statfs64;
3808 
3809 	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3810 	    Z_WAITOK);
3811 	ndp = &__nameidata_statfs64->nd;
3812 
3813 	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3814 	    UIO_USERSPACE, uap->path, ctxp);
3815 	error = namei(ndp);
3816 	if (error != 0) {
3817 		goto out;
3818 	}
3819 	vp = ndp->ni_vp;
3820 	mp = vp->v_mount;
3821 	nameidone(ndp);
3822 
3823 #if CONFIG_MACF
3824 	error = mac_mount_check_stat(ctxp, mp);
3825 	if (error != 0) {
3826 		vnode_put(vp);
3827 		goto out;
3828 	}
3829 #endif
3830 
3831 	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3832 	if (error != 0) {
3833 		vnode_put(vp);
3834 		goto out;
3835 	}
3836 
3837 	sfsp = &__nameidata_statfs64->sfs;
3838 	vfs_get_statfs64(mp, sfsp);
3839 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3840 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3841 		/* This process does not want to see a seperate data volume mountpoint */
3842 		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3843 	}
3844 	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3845 	vnode_put(vp);
3846 
3847 out:
3848 	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3849 
3850 	return error;
3851 }
3852 
3853 /*
3854  * Get file system statistics in 64-bit mode
3855  */
3856 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3857 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3858 {
3859 	struct vnode *vp;
3860 	struct mount *mp;
3861 	struct statfs64 sfs;
3862 	int error;
3863 
3864 	AUDIT_ARG(fd, uap->fd);
3865 
3866 	if ((error = file_vnode(uap->fd, &vp))) {
3867 		return error;
3868 	}
3869 
3870 	error = vnode_getwithref(vp);
3871 	if (error) {
3872 		file_drop(uap->fd);
3873 		return error;
3874 	}
3875 
3876 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3877 
3878 	mp = vp->v_mount;
3879 	if (!mp) {
3880 		error = EBADF;
3881 		goto out;
3882 	}
3883 
3884 #if CONFIG_MACF
3885 	error = mac_mount_check_stat(vfs_context_current(), mp);
3886 	if (error != 0) {
3887 		goto out;
3888 	}
3889 #endif
3890 
3891 	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3892 		goto out;
3893 	}
3894 
3895 	vfs_get_statfs64(mp, &sfs);
3896 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3897 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3898 		/* This process does not want to see a seperate data volume mountpoint */
3899 		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3900 	}
3901 	error = copyout(&sfs, uap->buf, sizeof(sfs));
3902 
3903 out:
3904 	file_drop(uap->fd);
3905 	vnode_put(vp);
3906 
3907 	return error;
3908 }
3909 
/*
 * Iteration context shared by getfsstat_callback()/getfsstat64_callback(),
 * threaded through vfs_iterate() as the opaque argument.
 */
struct getfsstat_struct {
	user_addr_t     sfsp;           /* user-buffer cursor; advanced per record copied */
	user_addr_t     *mp;            /* optional per-mount MAC label user addresses, or NULL */
	int             count;          /* mounts visited (counted even when the buffer is full) */
	int             maxcount;       /* capacity of the user buffer, in records */
	int             flags;          /* caller's MNT_WAIT/MNT_NOWAIT/MNT_DWAIT selection */
	int             error;          /* first error hit by a callback, 0 if none */
};
3918 
3919 
/*
 * getfsstat_callback: per-mount worker for __mac_getfsstat().  Copies one
 * (optionally refreshed) statfs record to the user buffer tracked in the
 * getfsstat_struct, advances the cursor, and always bumps the visit count.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	/* only copy out while a buffer was supplied and has room left */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* mount is dead or refresh failed: skip it, keep iterating */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* advance by the record size munge_statfs actually wrote */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			/* copy this mount's MAC label to the caller-supplied address */
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	/* count every mount, even those not copied out */
	fstp->count++;
	return VFS_RETURNED;
}
3973 
3974 /*
3975  * Get statistics on all filesystems.
3976  */
3977 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3978 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3979 {
3980 	struct __mac_getfsstat_args muap;
3981 
3982 	muap.buf = uap->buf;
3983 	muap.bufsize = uap->bufsize;
3984 	muap.mac = USER_ADDR_NULL;
3985 	muap.macsize = 0;
3986 	muap.flags = uap->flags;
3987 
3988 	return __mac_getfsstat(p, &muap, retval);
3989 }
3990 
3991 /*
3992  * __mac_getfsstat: Get MAC-related file system statistics
3993  *
3994  * Parameters:    p                        (ignored)
3995  *                uap                      User argument descriptor (see below)
3996  *                retval                   Count of file system statistics (N stats)
3997  *
3998  * Indirect:      uap->bufsize             Buffer size
3999  *                uap->macsize             MAC info size
4000  *                uap->buf                 Buffer where information will be returned
4001  *                uap->mac                 MAC info
4002  *                uap->flags               File system flags
4003  *
4004  *
4005  * Returns:        0                       Success
4006  *                !0                       Not success
4007  *
4008  */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* reject sizes that would overflow the int arithmetic below */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* how many statfs records fit in the caller's buffer */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* the label array must hold exactly one pointer per record */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		/* widen 32-bit user pointers; copy 64-bit ones through */
		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	/* walk every mount, including those mid-unmount */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/*
	 * Report the number of records copied, capped at the buffer's
	 * capacity; with no buffer this is the total mount count.
	 */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
4102 
4103 static int
getfsstat64_callback(mount_t mp,void * arg)4104 getfsstat64_callback(mount_t mp, void * arg)
4105 {
4106 	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
4107 	struct vfsstatfs *sp;
4108 	struct statfs64 sfs;
4109 	int error;
4110 
4111 	if (fstp->sfsp && fstp->count < fstp->maxcount) {
4112 #if CONFIG_MACF
4113 		error = mac_mount_check_stat(vfs_context_current(), mp);
4114 		if (error != 0) {
4115 			fstp->error = error;
4116 			return VFS_RETURNED_DONE;
4117 		}
4118 #endif
4119 		sp = &mp->mnt_vfsstat;
4120 		/*
4121 		 * If MNT_NOWAIT is specified, do not refresh the fsstat
4122 		 * cache. MNT_WAIT overrides MNT_NOWAIT.
4123 		 *
4124 		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
4125 		 * getfsstat, since the constants are out of the same
4126 		 * namespace.
4127 		 */
4128 		if ((mp->mnt_lflag & MNT_LDEAD) ||
4129 		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
4130 		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
4131 		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
4132 			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
4133 			return VFS_RETURNED;
4134 		}
4135 
4136 		vfs_get_statfs64(mp, &sfs);
4137 		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
4138 		if (error) {
4139 			fstp->error = error;
4140 			return VFS_RETURNED_DONE;
4141 		}
4142 		fstp->sfsp += sizeof(sfs);
4143 	}
4144 	fstp->count++;
4145 	return VFS_RETURNED;
4146 }
4147 
4148 /*
4149  * Get statistics on all file systems in 64 bit mode.
4150  */
4151 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)4152 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
4153 {
4154 	user_addr_t sfsp;
4155 	int count, maxcount;
4156 	struct getfsstat_struct fst;
4157 
4158 	maxcount = uap->bufsize / sizeof(struct statfs64);
4159 
4160 	sfsp = uap->buf;
4161 	count = 0;
4162 
4163 	fst.sfsp = sfsp;
4164 	fst.flags = uap->flags;
4165 	fst.count = 0;
4166 	fst.error = 0;
4167 	fst.maxcount = maxcount;
4168 
4169 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
4170 
4171 	if (fst.error) {
4172 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
4173 		return fst.error;
4174 	}
4175 
4176 	if (fst.sfsp && fst.count > fst.maxcount) {
4177 		*retval = fst.maxcount;
4178 	} else {
4179 		*retval = fst.count;
4180 	}
4181 
4182 	return 0;
4183 }
4184 
4185 /*
4186  * gets the associated vnode with the file descriptor passed.
4187  * as input
4188  *
4189  * INPUT
4190  * ctx - vfs context of caller
4191  * fd - file descriptor for which vnode is required.
4192  * vpp - Pointer to pointer to vnode to be returned.
4193  *
4194  * The vnode is returned with an iocount so any vnode obtained
4195  * by this call needs a vnode_put
4196  *
4197  */
4198 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)4199 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
4200 {
4201 	int error;
4202 	vnode_t vp;
4203 	struct fileproc *fp;
4204 	proc_t p = vfs_context_proc(ctx);
4205 
4206 	*vpp =  NULLVP;
4207 
4208 	error = fp_getfvp(p, fd, &fp, &vp);
4209 	if (error) {
4210 		return error;
4211 	}
4212 
4213 	error = vnode_getwithref(vp);
4214 	if (error) {
4215 		(void)fp_drop(p, fd, fp, 0);
4216 		return error;
4217 	}
4218 
4219 	(void)fp_drop(p, fd, fp, 0);
4220 	*vpp = vp;
4221 	return error;
4222 }
4223 
/*
 * Resolve a (volfs id, object id) pair to a vnode with an iocount held.
 *
 * volfs_id - volume identifier used to find the mount.
 * objid    - file/directory id on that volume; objid == 2 conventionally
 *            names the volume root here.
 * realfsid - when non-zero, skip the volume-group heuristic and always
 *            use VFS_ROOT for objid 2.
 * vpp      - out: the resolved vnode (only written on success).
 *
 * On success the caller owns an iocount on *vpp and must vnode_put() it.
 */
int
vnode_getfromid(int volfs_id, uint64_t objid, vfs_context_t ctx, int realfsid, vnode_t *vpp)
{
	int error = 0;
	vnode_t vp = NULLVP;
	struct mount *mp = NULL;

	/*
	 * Look up the mount by volfs id; the '1' argument presumably asks
	 * for the mount to come back busied (it is vfs_unbusy'd below) —
	 * NOTE(review): confirm against mount_lookupby_volfsid().
	 */
	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
		error = ENOTSUP; /* unexpected failure */
		return ENOTSUP;
	}

#if CONFIG_UNION_MOUNTS
unionget:
#endif /* CONFIG_UNION_MOUNTS */
	if (objid == 2) {
		struct vfs_attr vfsattr;
		int use_vfs_root = TRUE;

		/*
		 * For the conventional root id, prefer VFS_ROOT unless the
		 * volume is part of a volume group, in which case id 2 may
		 * not be this volume's root and VFS_VGET is used instead.
		 */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (!realfsid &&
		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
				use_vfs_root = FALSE;
			}
		}

		if (use_vfs_root) {
			error = VFS_ROOT(mp, &vp, ctx);
		} else {
			error = VFS_VGET(mp, objid, &vp, ctx);
		}
	} else {
		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
	}

#if CONFIG_UNION_MOUNTS
	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
		/*
		 * If the fileid isn't found and we're in a union
		 * mount volume, then see if the fileid is in the
		 * mounted-on volume.
		 */
		struct mount *tmp = mp;
		mp = vnode_mount(tmp->mnt_vnodecovered);
		vfs_unbusy(tmp);
		/* Only retry if the covering mount can be busied right away. */
		if (vfs_busy(mp, LK_NOWAIT) == 0) {
			goto unionget;
		}
	} else {
		vfs_unbusy(mp);
	}
#else
	vfs_unbusy(mp);
#endif /* CONFIG_UNION_MOUNTS */

	if (!error) {
		*vpp = vp;
	}

	return error;
}
4289 
4290 /*
4291  * Wrapper function around namei to start lookup from a directory
4292  * specified by a file descriptor ni_dirfd.
4293  *
4294  * In addition to all the errors returned by namei, this call can
4295  * return ENOTDIR if the file descriptor does not refer to a directory.
4296  * and EBADF if the file descriptor is not valid.
4297  */
4298 int
nameiat(struct nameidata * ndp,int dirfd)4299 nameiat(struct nameidata *ndp, int dirfd)
4300 {
4301 	if ((dirfd != AT_FDCWD) &&
4302 	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
4303 	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
4304 		int error = 0;
4305 		char c;
4306 
4307 		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4308 			error = copyin(ndp->ni_dirp, &c, sizeof(char));
4309 			if (error) {
4310 				return error;
4311 			}
4312 		} else {
4313 			c = *((char *)(ndp->ni_dirp));
4314 		}
4315 
4316 		if (c != '/') {
4317 			vnode_t dvp_at;
4318 
4319 			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
4320 			    &dvp_at);
4321 			if (error) {
4322 				return error;
4323 			}
4324 
4325 			if (vnode_vtype(dvp_at) != VDIR) {
4326 				vnode_put(dvp_at);
4327 				return ENOTDIR;
4328 			}
4329 
4330 			ndp->ni_dvp = dvp_at;
4331 			ndp->ni_cnd.cn_flags |= USEDVP;
4332 			error = namei(ndp);
4333 			ndp->ni_cnd.cn_flags &= ~USEDVP;
4334 			vnode_put(dvp_at);
4335 			return error;
4336 		}
4337 	}
4338 
4339 	return namei(ndp);
4340 }
4341 
4342 /*
4343  * Change current working directory to a given file descriptor.
4344  */
4345 /* ARGSUSED */
4346 int
fchdir(proc_t p,vfs_context_t ctx,int fd,bool per_thread)4347 fchdir(proc_t p, vfs_context_t ctx, int fd, bool per_thread)
4348 {
4349 	vnode_t vp;
4350 	vnode_t tdp;
4351 	vnode_t tvp;
4352 	struct mount *mp;
4353 	int error, should_put = 1;
4354 
4355 	AUDIT_ARG(fd, fd);
4356 	if (per_thread && fd == -1) {
4357 		/*
4358 		 * Switching back from per-thread to per process CWD; verify we
4359 		 * in fact have one before proceeding.  The only success case
4360 		 * for this code path is to return 0 preemptively after zapping
4361 		 * the thread structure contents.
4362 		 */
4363 		thread_t th = vfs_context_thread(ctx);
4364 		if (th) {
4365 			uthread_t uth = get_bsdthread_info(th);
4366 			tvp = uth->uu_cdir;
4367 			uth->uu_cdir = NULLVP;
4368 			if (tvp != NULLVP) {
4369 				vnode_rele(tvp);
4370 				return 0;
4371 			}
4372 		}
4373 		return EBADF;
4374 	}
4375 
4376 	if ((error = file_vnode(fd, &vp))) {
4377 		return error;
4378 	}
4379 	if ((error = vnode_getwithref(vp))) {
4380 		file_drop(fd);
4381 		return error;
4382 	}
4383 
4384 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
4385 
4386 	if (vp->v_type != VDIR) {
4387 		error = ENOTDIR;
4388 		goto out;
4389 	}
4390 
4391 #if CONFIG_MACF
4392 	error = mac_vnode_check_chdir(ctx, vp);
4393 	if (error) {
4394 		goto out;
4395 	}
4396 #endif
4397 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4398 	if (error) {
4399 		goto out;
4400 	}
4401 
4402 	while (!error && (mp = vp->v_mountedhere) != NULL) {
4403 		if (vfs_busy(mp, LK_NOWAIT)) {
4404 			error = EACCES;
4405 			goto out;
4406 		}
4407 		error = VFS_ROOT(mp, &tdp, ctx);
4408 		vfs_unbusy(mp);
4409 		if (error) {
4410 			break;
4411 		}
4412 		vnode_put(vp);
4413 		vp = tdp;
4414 	}
4415 	if (error) {
4416 		goto out;
4417 	}
4418 	if ((error = vnode_ref(vp))) {
4419 		goto out;
4420 	}
4421 	vnode_put(vp);
4422 	should_put = 0;
4423 
4424 	if (per_thread) {
4425 		thread_t th = vfs_context_thread(ctx);
4426 		if (th) {
4427 			uthread_t uth = get_bsdthread_info(th);
4428 			tvp = uth->uu_cdir;
4429 			uth->uu_cdir = vp;
4430 			OSBitOrAtomic(P_THCWD, &p->p_flag);
4431 		} else {
4432 			vnode_rele(vp);
4433 			error = ENOENT;
4434 			goto out;
4435 		}
4436 	} else {
4437 		proc_dirs_lock_exclusive(p);
4438 		proc_fdlock(p);
4439 		tvp = p->p_fd.fd_cdir;
4440 		p->p_fd.fd_cdir = vp;
4441 		proc_fdunlock(p);
4442 		proc_dirs_unlock_exclusive(p);
4443 	}
4444 
4445 	if (tvp) {
4446 		vnode_rele(tvp);
4447 	}
4448 
4449 out:
4450 	if (should_put) {
4451 		vnode_put(vp);
4452 	}
4453 	file_drop(fd);
4454 
4455 	return error;
4456 }
4457 
/*
 * fchdir() system call: change the process-wide current working directory
 * to the directory referenced by uap->fd.
 */
int
sys_fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	return fchdir(p, vfs_context_current(), uap->fd, false);
}
4463 
/*
 * __pthread_fchdir() system call: change the calling thread's working
 * directory to uap->fd (fd == -1 reverts to the per-process cwd).
 */
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	return fchdir(p, vfs_context_current(), uap->fd, true);
}
4469 
4470 
4471 /*
4472  * Change current working directory (".").
4473  *
4474  * Returns:	0			Success
4475  *	change_dir:ENOTDIR
4476  *	change_dir:???
4477  *	vnode_ref:ENOENT		No such file or directory
4478  */
4479 /* ARGSUSED */
4480 int
chdir_internal(proc_t p,vfs_context_t ctx,struct nameidata * ndp,int per_thread)4481 chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
4482 {
4483 	int error;
4484 	vnode_t tvp;
4485 
4486 	error = change_dir(ndp, ctx);
4487 	if (error) {
4488 		return error;
4489 	}
4490 	if ((error = vnode_ref(ndp->ni_vp))) {
4491 		vnode_put(ndp->ni_vp);
4492 		return error;
4493 	}
4494 	/*
4495 	 * drop the iocount we picked up in change_dir
4496 	 */
4497 	vnode_put(ndp->ni_vp);
4498 
4499 	if (per_thread) {
4500 		thread_t th = vfs_context_thread(ctx);
4501 		if (th) {
4502 			uthread_t uth = get_bsdthread_info(th);
4503 			tvp = uth->uu_cdir;
4504 			uth->uu_cdir = ndp->ni_vp;
4505 			OSBitOrAtomic(P_THCWD, &p->p_flag);
4506 		} else {
4507 			vnode_rele(ndp->ni_vp);
4508 			return ENOENT;
4509 		}
4510 	} else {
4511 		proc_dirs_lock_exclusive(p);
4512 		proc_fdlock(p);
4513 		tvp = p->p_fd.fd_cdir;
4514 		p->p_fd.fd_cdir = ndp->ni_vp;
4515 		proc_fdunlock(p);
4516 		proc_dirs_unlock_exclusive(p);
4517 	}
4518 
4519 	if (tvp) {
4520 		vnode_rele(tvp);
4521 	}
4522 
4523 	return 0;
4524 }
4525 
4526 
4527 /*
4528  * Change current working directory (".").
4529  *
4530  * Returns:	0			Success
4531  *	chdir_internal:ENOTDIR
4532  *	chdir_internal:ENOENT		No such file or directory
4533  *	chdir_internal:???
4534  */
4535 /* ARGSUSED */
4536 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4537 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4538 {
4539 	struct nameidata nd;
4540 	vfs_context_t ctx = vfs_context_current();
4541 
4542 	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4543 	    UIO_USERSPACE, uap->path, ctx);
4544 
4545 	return chdir_internal(p, ctx, &nd, per_thread);
4546 }
4547 
4548 
4549 /*
4550  * chdir
4551  *
4552  * Change current working directory (".") for the entire process
4553  *
4554  * Parameters:  p       Process requesting the call
4555  *              uap     User argument descriptor (see below)
4556  *              retval  (ignored)
4557  *
4558  * Indirect parameters:	uap->path	Directory path
4559  *
4560  * Returns:	0			Success
4561  *              common_chdir: ENOTDIR
4562  *              common_chdir: ENOENT	No such file or directory
4563  *              common_chdir: ???
4564  *
4565  */
4566 int
sys_chdir(proc_t p,struct chdir_args * uap,__unused int32_t * retval)4567 sys_chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
4568 {
4569 	return common_chdir(p, (void *)uap, 0);
4570 }
4571 
4572 /*
4573  * __pthread_chdir
4574  *
4575  * Change current working directory (".") for a single thread
4576  *
4577  * Parameters:  p       Process requesting the call
4578  *              uap     User argument descriptor (see below)
4579  *              retval  (ignored)
4580  *
4581  * Indirect parameters:	uap->path	Directory path
4582  *
4583  * Returns:	0			Success
4584  *              common_chdir: ENOTDIR
4585  *		common_chdir: ENOENT	No such file or directory
4586  *		common_chdir: ???
4587  *
4588  */
4589 int
__pthread_chdir(proc_t p,struct __pthread_chdir_args * uap,__unused int32_t * retval)4590 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
4591 {
4592 	return common_chdir(p, (void *)uap, 1);
4593 }
4594 
4595 
4596 /*
4597  * Change notion of root (``/'') directory.
4598  */
4599 /* ARGSUSED */
4600 int
chroot(proc_t p,struct chroot_args * uap,__unused int32_t * retval)4601 chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
4602 {
4603 	struct filedesc *fdp = &p->p_fd;
4604 	int error;
4605 	struct nameidata nd;
4606 	vnode_t tvp;
4607 	vfs_context_t ctx = vfs_context_current();
4608 
4609 	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
4610 		return error;
4611 	}
4612 
4613 	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
4614 	    UIO_USERSPACE, uap->path, ctx);
4615 	error = change_dir(&nd, ctx);
4616 	if (error) {
4617 		return error;
4618 	}
4619 
4620 #if CONFIG_MACF
4621 	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
4622 	    &nd.ni_cnd);
4623 	if (error) {
4624 		vnode_put(nd.ni_vp);
4625 		return error;
4626 	}
4627 #endif
4628 
4629 	if ((error = vnode_ref(nd.ni_vp))) {
4630 		vnode_put(nd.ni_vp);
4631 		return error;
4632 	}
4633 	vnode_put(nd.ni_vp);
4634 
4635 	/*
4636 	 * This lock provides the guarantee that as long as you hold the lock
4637 	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
4638 	 * on a referenced vnode in namei when determining the rootvnode for
4639 	 * a process.
4640 	 */
4641 	/* needed for synchronization with lookup */
4642 	proc_dirs_lock_exclusive(p);
4643 	/* needed for setting the flag and other activities on the fd itself */
4644 	proc_fdlock(p);
4645 	tvp = fdp->fd_rdir;
4646 	fdp->fd_rdir = nd.ni_vp;
4647 	fdt_flag_set(fdp, FD_CHROOT);
4648 	proc_fdunlock(p);
4649 	proc_dirs_unlock_exclusive(p);
4650 
4651 	if (tvp != NULL) {
4652 		vnode_rele(tvp);
4653 	}
4654 
4655 	return 0;
4656 }
4657 
4658 #define PATHSTATICBUFLEN 256
4659 #define PIVOT_ROOT_ENTITLEMENT              \
4660        "com.apple.private.vfs.pivot-root"
4661 
#if defined(XNU_TARGET_OS_OSX)
/*
 * pivot_root() system call (macOS only): switch the system root file
 * system to 'new_rootfs_path_before', remounting the old root at
 * 'old_rootfs_path_after'.  Restricted to launchd (pid 1) holding the
 * pivot-root entitlement; the incoming root must pass kernel
 * authentication (no pivoting to unauthenticated images).
 */
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/*
	 * Copy in the new-root path; fall back to a full MAXPATHLEN heap
	 * buffer if it does not fit in the small stack buffer.
	 */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same two-step copyin for the old-root destination path. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Select whichever buffer actually holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	/* Perform the actual root switch; virtual devices are disallowed. */
	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
#else
/* pivot_root() is unsupported off macOS: fail as an unimplemented syscall. */
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	return nosys(p, NULL, retval);
}
#endif /* XNU_TARGET_OS_OSX */
4761 
4762 /*
4763  * Common routine for chroot and chdir.
4764  *
4765  * Returns:	0			Success
4766  *		ENOTDIR			Not a directory
4767  *		namei:???		[anything namei can return]
4768  *		vnode_authorize:???	[anything vnode_authorize can return]
4769  */
4770 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4771 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4772 {
4773 	vnode_t vp;
4774 	int error;
4775 
4776 	if ((error = namei(ndp))) {
4777 		return error;
4778 	}
4779 	nameidone(ndp);
4780 	vp = ndp->ni_vp;
4781 
4782 	if (vp->v_type != VDIR) {
4783 		vnode_put(vp);
4784 		return ENOTDIR;
4785 	}
4786 
4787 #if CONFIG_MACF
4788 	error = mac_vnode_check_chdir(ctx, vp);
4789 	if (error) {
4790 		vnode_put(vp);
4791 		return error;
4792 	}
4793 #endif
4794 
4795 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4796 	if (error) {
4797 		vnode_put(vp);
4798 		return error;
4799 	}
4800 
4801 	return error;
4802 }
4803 
4804 /*
4805  * Free the vnode data (for directories) associated with the file glob.
4806  */
4807 struct fd_vn_data *
fg_vn_data_alloc(void)4808 fg_vn_data_alloc(void)
4809 {
4810 	struct fd_vn_data *fvdata;
4811 
4812 	/* Allocate per fd vnode data */
4813 	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4814 	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4815 	return fvdata;
4816 }
4817 
4818 /*
4819  * Free the vnode data (for directories) associated with the file glob.
4820  */
4821 void
fg_vn_data_free(void * fgvndata)4822 fg_vn_data_free(void *fgvndata)
4823 {
4824 	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4825 
4826 	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4827 	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4828 	kfree_type(struct fd_vn_data, fvdata);
4829 }
4830 
4831 /*
4832  * Check permissions, allocate an open file structure,
4833  * and call the device open routine if any.
4834  *
4835  * Returns:	0			Success
4836  *		EINVAL
4837  *		EINTR
4838  *	falloc:ENFILE
4839  *	falloc:EMFILE
4840  *	falloc:ENOMEM
4841  *	vn_open_auth:???
4842  *	dupfdopen:???
4843  *	VNOP_ADVLOCK:???
4844  *	vnode_setsize:???
4845  *
4846  * XXX Need to implement uid, gid
4847  */
4848 int
open1(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval,int authfd)4849 open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4850     struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
4851 {
4852 	proc_t p = vfs_context_proc(ctx);
4853 	kauth_cred_t p_cred = current_cached_proc_cred(PROC_NULL);
4854 	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
4855 	struct fileproc *fp;
4856 	vnode_t vp;
4857 	int flags, oflags, amode;
4858 	int type, indx, error;
4859 	struct vfs_context context;
4860 	vnode_t authvp = NULLVP;
4861 
4862 	oflags = uflags;
4863 
4864 	amode = oflags & O_ACCMODE;
4865 	/*
4866 	 * Because O_RDONLY is 0, it is not possible to distinguish between
4867 	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
4868 	 * with FREAD/FWRITE.
4869 	 */
4870 	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
4871 		return EINVAL;
4872 	}
4873 
4874 	flags = FFLAGS(uflags);
4875 	CLR(flags, FENCRYPTED);
4876 	CLR(flags, FUNENCRYPTED);
4877 
4878 	AUDIT_ARG(fflags, oflags);
4879 	AUDIT_ARG(mode, vap->va_mode);
4880 
4881 	if ((error = falloc_withinit(p, p_cred, ctx, &fp, &indx, fp_init, initarg)) != 0) {
4882 		return error;
4883 	}
4884 	if (flags & O_CLOEXEC) {
4885 		fp->fp_flags |= FP_CLOEXEC;
4886 	}
4887 	if (flags & O_CLOFORK) {
4888 		fp->fp_flags |= FP_CLOFORK;
4889 	}
4890 
4891 	/* setup state to recognize when fdesc_open was called */
4892 	uu->uu_dupfd = -1;
4893 
4894 	/*
4895 	 * Disable read/write access if file is opened with O_EVTONLY and
4896 	 * the process has requested to deny read/write access.
4897 	 */
4898 	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
4899 		flags &= ~(FREAD | FWRITE);
4900 	}
4901 
4902 	if (authfd != AUTH_OPEN_NOAUTHFD) {
4903 		error = vnode_getfromfd(ctx, authfd, &authvp);
4904 		if (error) {
4905 			fp_free(p, indx, fp);
4906 			return error;
4907 		}
4908 	}
4909 
4910 	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
4911 		if (authvp != NULLVP) {
4912 			vnode_put(authvp);
4913 		}
4914 		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
4915 			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
4916 				*retval = indx;
4917 				return 0;
4918 			}
4919 		}
4920 		if (error == ERESTART) {
4921 			error = EINTR;
4922 		}
4923 		fp_free(p, indx, fp);
4924 		return error;
4925 	}
4926 
4927 	if (authvp != NULLVP) {
4928 		vnode_put(authvp);
4929 	}
4930 
4931 	uu->uu_dupfd = 0;
4932 	vp = ndp->ni_vp;
4933 
4934 	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
4935 	fp->fp_glob->fg_ops = &vnops;
4936 	fp_set_data(fp, vp);
4937 
4938 #if CONFIG_FILE_LEASES
4939 	/*
4940 	 * If we are creating a file or open with truncate, we need to break the
4941 	 * lease if there is a read lease placed on the parent dir.
4942 	 */
4943 	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
4944 		vnode_breakdirlease(vp, true, oflags);
4945 	}
4946 	/* Now check if there is a lease placed on the file itself. */
4947 	error = vnode_breaklease(vp, oflags, ctx);
4948 	if (error) {
4949 		goto bad;
4950 	}
4951 #endif /* CONFIG_FILE_LEASES */
4952 
4953 	if (flags & (O_EXLOCK | O_SHLOCK)) {
4954 		struct flock lf = {
4955 			.l_whence = SEEK_SET,
4956 		};
4957 
4958 		if (flags & O_EXLOCK) {
4959 			lf.l_type = F_WRLCK;
4960 		} else {
4961 			lf.l_type = F_RDLCK;
4962 		}
4963 		type = F_FLOCK;
4964 		if ((flags & FNONBLOCK) == 0) {
4965 			type |= F_WAIT;
4966 		}
4967 #if CONFIG_MACF
4968 		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
4969 		    F_SETLK, &lf);
4970 		if (error) {
4971 			goto bad;
4972 		}
4973 #endif
4974 		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
4975 			goto bad;
4976 		}
4977 		fp->fp_glob->fg_flag |= FWASLOCKED;
4978 	}
4979 
4980 	/* try to truncate by setting the size attribute */
4981 	if (flags & O_TRUNC) {
4982 		if ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0) {
4983 			goto bad;
4984 		}
4985 		fp->fp_glob->fg_flag |= FWASWRITTEN;
4986 	}
4987 
4988 	/*
4989 	 * For directories we hold some additional information in the fd.
4990 	 */
4991 	if (vnode_vtype(vp) == VDIR) {
4992 		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
4993 	} else {
4994 		fp->fp_glob->fg_vn_data = NULL;
4995 	}
4996 
4997 #if CONFIG_SECLUDED_MEMORY
4998 	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
4999 		memory_object_control_t moc;
5000 		const char *v_name;
5001 
5002 		moc = ubc_getobject(vp, UBC_FLAGS_NONE);
5003 
5004 		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
5005 			/* nothing to do... */
5006 		} else if (fp->fp_glob->fg_flag & FWRITE) {
5007 			/* writable -> no longer  eligible for secluded pages */
5008 			memory_object_mark_eligible_for_secluded(moc,
5009 			    FALSE);
5010 		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
5011 			char pathname[32] = { 0, };
5012 			size_t copied;
5013 			/* XXX FBDP: better way to detect /Applications/ ? */
5014 			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
5015 				(void)copyinstr(ndp->ni_dirp,
5016 				    pathname,
5017 				    sizeof(pathname),
5018 				    &copied);
5019 			} else {
5020 				copystr(CAST_DOWN(void *, ndp->ni_dirp),
5021 				    pathname,
5022 				    sizeof(pathname),
5023 				    &copied);
5024 			}
5025 			pathname[sizeof(pathname) - 1] = '\0';
5026 			if (strncmp(pathname,
5027 			    "/Applications/",
5028 			    strlen("/Applications/")) == 0 &&
5029 			    strncmp(pathname,
5030 			    "/Applications/Camera.app/",
5031 			    strlen("/Applications/Camera.app/")) != 0) {
5032 				/*
5033 				 * not writable
5034 				 * AND from "/Applications/"
5035 				 * AND not from "/Applications/Camera.app/"
5036 				 * ==> eligible for secluded
5037 				 */
5038 				memory_object_mark_eligible_for_secluded(moc,
5039 				    TRUE);
5040 			}
5041 		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
5042 		    (v_name = vnode_getname(vp))) {
5043 			size_t len = strlen(v_name);
5044 
5045 			if (!strncmp(v_name, "dyld", len) ||
5046 			    !strncmp(v_name, "launchd", len) ||
5047 			    !strncmp(v_name, "Camera", len) ||
5048 			    !strncmp(v_name, "SpringBoard", len) ||
5049 			    !strncmp(v_name, "backboardd", len) ||
5050 			    !strncmp(v_name, "cameracaptured", len)) {
5051 				/*
5052 				 * This file matters when launching Camera:
5053 				 * do not store its contents in the secluded
5054 				 * pool that will be drained on Camera launch.
5055 				 */
5056 				memory_object_mark_eligible_for_secluded(moc,
5057 				    FALSE);
5058 			} else if (!strncmp(v_name, "audiomxd", len) ||
5059 			    !strncmp(v_name, "mediaplaybackd", len)) {
5060 				memory_object_mark_eligible_for_secluded(moc,
5061 				    FALSE);
5062 				memory_object_mark_for_realtime(moc,
5063 				    true);
5064 			} else if (!strncmp(v_name, "bluetoothd", len)) {
5065 				/*
5066 				 * bluetoothd might be needed for realtime audio
5067 				 * playback.
5068 				 */
5069 				memory_object_mark_eligible_for_secluded(moc,
5070 				    FALSE);
5071 				memory_object_mark_for_realtime(moc,
5072 				    true);
5073 			} else {
5074 				char pathname[64] = { 0, };
5075 				size_t copied;
5076 				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
5077 					(void)copyinstr(ndp->ni_dirp,
5078 					    pathname,
5079 					    sizeof(pathname),
5080 					    &copied);
5081 				} else {
5082 					copystr(CAST_DOWN(void *, ndp->ni_dirp),
5083 					    pathname,
5084 					    sizeof(pathname),
5085 					    &copied);
5086 				}
5087 				pathname[sizeof(pathname) - 1] = '\0';
5088 				if (strncmp(pathname,
5089 				    "/Library/Audio/Plug-Ins/",
5090 				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
5091 				    strncmp(pathname,
5092 				    "/System/Library/Audio/Plug-Ins/",
5093 				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
5094 					/*
5095 					 * This may be an audio plugin required
5096 					 * for realtime playback.
5097 					 * ==> NOT eligible for secluded.
5098 					 */
5099 					memory_object_mark_eligible_for_secluded(moc,
5100 					    FALSE);
5101 					memory_object_mark_for_realtime(moc,
5102 					    true);
5103 				}
5104 			}
5105 			vnode_putname(v_name);
5106 		}
5107 	}
5108 #endif /* CONFIG_SECLUDED_MEMORY */
5109 
5110 	vnode_put(vp);
5111 
5112 	/*
5113 	 * The first terminal open (without a O_NOCTTY) by a session leader
5114 	 * results in it being set as the controlling terminal.
5115 	 */
5116 	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
5117 	    !(flags & O_NOCTTY)) {
5118 		int tmp = 0;
5119 
5120 		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
5121 		    (caddr_t)&tmp, ctx);
5122 	}
5123 
5124 	proc_fdlock(p);
5125 	procfdtbl_releasefd(p, indx, NULL);
5126 
5127 	fp_drop(p, indx, fp, 1);
5128 	proc_fdunlock(p);
5129 
5130 	*retval = indx;
5131 
5132 	return 0;
5133 bad:
5134 	context = *vfs_context_current();
5135 	context.vc_ucred = fp->fp_glob->fg_cred;
5136 
5137 	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
5138 	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
5139 		struct flock lf = {
5140 			.l_whence = SEEK_SET,
5141 			.l_type = F_UNLCK,
5142 		};
5143 
5144 		(void)VNOP_ADVLOCK(
5145 			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
5146 	}
5147 
5148 	vn_close(vp, fp->fp_glob->fg_flag, &context);
5149 	vnode_put(vp);
5150 	fp_free(p, indx, fp);
5151 
5152 	return error;
5153 }
5154 
5155 /*
5156  * While most of the *at syscall handlers can call nameiat() which
5157  * is a wrapper around namei, the use of namei and initialisation
5158  * of nameidata are far removed and in different functions  - namei
5159  * gets called in vn_open_auth for open1. So we'll just do here what
5160  * nameiat() does.
5161  */
5162 static int
open1at(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval,int dirfd,int authfd)5163 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
5164     struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
5165     int dirfd, int authfd)
5166 {
5167 	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
5168 		int error;
5169 		char c;
5170 
5171 		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
5172 			error = copyin(ndp->ni_dirp, &c, sizeof(char));
5173 			if (error) {
5174 				return error;
5175 			}
5176 		} else {
5177 			c = *((char *)(ndp->ni_dirp));
5178 		}
5179 
5180 		if (c != '/') {
5181 			vnode_t dvp_at;
5182 
5183 			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
5184 			    &dvp_at);
5185 			if (error) {
5186 				return error;
5187 			}
5188 
5189 			if (vnode_vtype(dvp_at) != VDIR) {
5190 				vnode_put(dvp_at);
5191 				return ENOTDIR;
5192 			}
5193 
5194 			ndp->ni_dvp = dvp_at;
5195 			ndp->ni_cnd.cn_flags |= USEDVP;
5196 			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
5197 			    retval, authfd);
5198 			vnode_put(dvp_at);
5199 			return error;
5200 		}
5201 	}
5202 
5203 	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
5204 }
5205 
5206 /*
5207  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
5208  *
5209  * Parameters:	p			Process requesting the open
5210  *		uap			User argument descriptor (see below)
5211  *		retval			Pointer to an area to receive the
 *					return value from the system call
5213  *
5214  * Indirect:	uap->path		Path to open (same as 'open')
5215  *		uap->flags		Flags to open (same as 'open'
5216  *		uap->uid		UID to set, if creating
5217  *		uap->gid		GID to set, if creating
5218  *		uap->mode		File mode, if creating (same as 'open')
5219  *		uap->xsecurity		ACL to set, if creating
5220  *
5221  * Returns:	0			Success
5222  *		!0			errno value
5223  *
5224  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5225  *
 * XXX:		We should enumerate the possible errno values here, and where
5227  *		in the code they originated.
5228  */
5229 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)5230 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
5231 {
5232 	int ciferror;
5233 	kauth_filesec_t xsecdst;
5234 	struct vnode_attr va;
5235 	struct nameidata nd;
5236 	int cmode;
5237 
5238 	AUDIT_ARG(owner, uap->uid, uap->gid);
5239 
5240 	xsecdst = NULL;
5241 	if ((uap->xsecurity != USER_ADDR_NULL) &&
5242 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
5243 		return ciferror;
5244 	}
5245 
5246 	VATTR_INIT(&va);
5247 	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
5248 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
5249 	if (uap->uid != KAUTH_UID_NONE) {
5250 		VATTR_SET(&va, va_uid, uap->uid);
5251 	}
5252 	if (uap->gid != KAUTH_GID_NONE) {
5253 		VATTR_SET(&va, va_gid, uap->gid);
5254 	}
5255 	if (xsecdst != NULL) {
5256 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5257 		va.va_vaflags |= VA_FILESEC_ACL;
5258 	}
5259 
5260 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
5261 	    uap->path, vfs_context_current());
5262 
5263 	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
5264 	    NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
5265 	if (xsecdst != NULL) {
5266 		kauth_filesec_free(xsecdst);
5267 	}
5268 
5269 	return ciferror;
5270 }
5271 
5272 /*
5273  * Go through the data-protected atomically controlled open (2)
5274  *
5275  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
5276  */
5277 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)5278 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5279     int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
5280 {
5281 	/*
5282 	 * Follow the same path as normal open(2)
5283 	 * Look up the item if it exists, and acquire the vnode.
5284 	 */
5285 	struct vnode_attr va;
5286 	struct nameidata nd;
5287 	int cmode;
5288 	int error;
5289 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5290 
5291 	VATTR_INIT(&va);
5292 	/* Mask off all but regular access permissions */
5293 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5294 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
5295 
5296 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
5297 	    path, ctx);
5298 
5299 	/*
5300 	 * Initialize the extra fields in vnode_attr to pass down our
5301 	 * extra fields.
5302 	 * 1. target cprotect class.
5303 	 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
5304 	 */
5305 	if (flags & O_CREAT) {
5306 		/* lower level kernel code validates that the class is valid before applying it. */
5307 		if (class != PROTECTION_CLASS_DEFAULT) {
5308 			/*
5309 			 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
5310 			 * file behave the same as open (2)
5311 			 */
5312 			VATTR_SET(&va, va_dataprotect_class, class);
5313 		}
5314 	}
5315 
5316 	if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
5317 		if (flags & (O_RDWR | O_WRONLY)) {
5318 			/*
5319 			 * Not allowed to write raw encrypted bytes or when opening authenticated.
5320 			 */
5321 			return EINVAL;
5322 		}
5323 		if (dpflags & O_DP_GETRAWENCRYPTED) {
5324 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
5325 		}
5326 		if (dpflags & O_DP_GETRAWUNENCRYPTED) {
5327 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
5328 		}
5329 		if (dpflags & O_DP_AUTHENTICATE) {
5330 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
5331 		}
5332 	}
5333 
5334 	error = open1at(vfs_context_current(), &nd, flags, &va,
5335 	    NULL, NULL, retval, fd, authfd);
5336 
5337 	return error;
5338 }
5339 
5340 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)5341 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5342 {
5343 	if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5344 		return EINVAL;
5345 	}
5346 
5347 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5348 	           uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
5349 }
5350 
5351 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)5352 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5353 {
5354 	if (uap->dpflags & O_DP_AUTHENTICATE) {
5355 		return EINVAL;
5356 	}
5357 
5358 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5359 	           uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5360 }
5361 
5362 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval,uint64_t * objidp,fsid_t * fsidp)5363 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5364     int fd, enum uio_seg segflg, int *retval, uint64_t *objidp, fsid_t *fsidp)
5365 {
5366 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5367 	struct {
5368 		struct vnode_attr va;
5369 		struct nameidata nd;
5370 	} *__open_data;
5371 	struct vnode_attr *vap;
5372 	struct nameidata *ndp;
5373 	int cmode;
5374 	int error;
5375 
5376 	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
5377 	vap = &__open_data->va;
5378 	ndp = &__open_data->nd;
5379 
5380 	VATTR_INIT(vap);
5381 	/* Mask off all but regular access permissions */
5382 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5383 	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
5384 
5385 	/* Check for fileid and fsid authentication */
5386 	if (objidp || fsidp) {
5387 		if (!objidp || !fsidp) {
5388 			error = EINVAL;
5389 			goto out;
5390 		}
5391 		VATTR_SET(vap, va_flags, VA_VAFILEID);
5392 		VATTR_SET(vap, va_fileid, *objidp);
5393 		VATTR_SET(vap, va_fsid64, *fsidp);
5394 	}
5395 
5396 	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
5397 	    segflg, path, ctx);
5398 
5399 	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);
5400 
5401 out:
5402 	kfree_type(typeof(*__open_data), __open_data);
5403 
5404 	return error;
5405 }
5406 
5407 int
open(proc_t p,struct open_args * uap,int32_t * retval)5408 open(proc_t p, struct open_args *uap, int32_t *retval)
5409 {
5410 	__pthread_testcancel(1);
5411 	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
5412 }
5413 
5414 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)5415 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
5416     int32_t *retval)
5417 {
5418 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5419 	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval, NULL, NULL);
5420 }
5421 
5422 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)5423 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
5424     int32_t *retval)
5425 {
5426 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5427 	           uap->mode, uap->fd, UIO_USERSPACE, retval, NULL, NULL);
5428 }
5429 
5430 int
openat(proc_t p,struct openat_args * uap,int32_t * retval)5431 openat(proc_t p, struct openat_args *uap, int32_t *retval)
5432 {
5433 	__pthread_testcancel(1);
5434 	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
5435 }
5436 
5437 #define OPEN_BY_ID_ENTITLEMENT  "com.apple.private.vfs.open-by-id"
5438 
5439 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5440 vfs_context_can_open_by_id(vfs_context_t ctx)
5441 {
5442 	if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5443 		return TRUE;
5444 	}
5445 
5446 	return IOTaskHasEntitlement(vfs_context_task(ctx),
5447 	           OPEN_BY_ID_ENTITLEMENT);
5448 }
5449 
5450 #define MAX_OPENBYID_NP_RETRIES 10
5451 
5452 /*
5453  * openbyid_np: open a file given a file system id and a file system object id
5454  *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
5455  *	file systems that don't support object ids it is a node id (uint64_t).
5456  *
5457  * Parameters:	p			Process requesting the open
5458  *		uap			User argument descriptor (see below)
5459  *		retval			Pointer to an area to receive the
 *					return value from the system call
5461  *
5462  * Indirect:	uap->path		Path to open (same as 'open')
5463  *
5464  *		uap->fsid		id of target file system
5465  *		uap->objid		id of target file system object
5466  *		uap->flags		Flags to open (same as 'open')
5467  *
5468  * Returns:	0			Success
5469  *		!0			errno value
5470  *
5471  *
 * XXX:		We should enumerate the possible errno values here, and where
5473  *		in the code they originated.
5474  */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int fd;                          /* descriptor produced by the open */
	int error;
	int retry_count = 0;             /* ERECYCLE retries performed so far */
	char *buf = NULL;                /* holds the resolved path */
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Open-by-id is restricted to platform binaries / entitled tasks. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

retry:
	/* Reset per-attempt state; the vnode may have been recycled under us. */
	fd = -1;
	error = 0;
	buf = NULL;
	pathlen = 0;
	buflen = MAXPATHLEN;

	/*resolve path from fsid, objid*/
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		/* On ENOSPC the path did not fit: free, grow, and retry. */
		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	buf[pathlen] = 0;

	/*
	 * Open by the resolved path, passing (objid, fsid) down so the open
	 * can verify it reached the same file system object it started from.
	 */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, &fd, &objid, &fsid);

	kfree_data(buf, buflen + 1);

	/* Ensure the correct file is opened */
	if (error == ERECYCLE) {
		if (retry_count < MAX_OPENBYID_NP_RETRIES) {
			retry_count += 1;
			goto retry;
		} else {
			printf("openbyid_np() retry limit due to ERECYCLE reached\n");
			error = ENOENT;
		}
	}

	if (!error) {
		*retval = fd;
	}

	return error;
}
5555 
5556 
5557 /*
5558  * Create a special file.
5559  */
5560 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5561     int fd);
5562 
/*
 * Common implementation of mknod(2)/mknodat(2): create a character or
 * block special file (FIFOs are routed to mkfifo1()).  'vap' carries the
 * creation attributes (mode, rdev); 'fd' anchors relative lookups.
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes requires superuser privileges. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block special files are handled here. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	/* Authorize adding an entry to the parent directory. */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry breaks any directory lease on the parent. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int     update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		/* Publish a creation fsevent for interested listeners. */
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5665 
5666 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5667 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5668 {
5669 	struct vnode_attr va;
5670 
5671 	VATTR_INIT(&va);
5672 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5673 	VATTR_SET(&va, va_rdev, uap->dev);
5674 
5675 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5676 }
5677 
5678 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5679 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5680 {
5681 	struct vnode_attr va;
5682 
5683 	VATTR_INIT(&va);
5684 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5685 	VATTR_SET(&va, va_rdev, uap->dev);
5686 
5687 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5688 }
5689 
5690 /*
5691  * Create a named pipe.
5692  *
5693  * Returns:	0			Success
5694  *		EEXIST
5695  *	namei:???
5696  *	vnode_authorize:???
5697  *	vn_create:???
5698  */
5699 static int
mkfifo1(vfs_context_t ctx,user_addr_t upath,struct vnode_attr * vap,int fd)5700 mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
5701 {
5702 	vnode_t vp, dvp;
5703 	int error;
5704 	struct nameidata nd;
5705 
5706 	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
5707 	    UIO_USERSPACE, upath, ctx);
5708 	error = nameiat(&nd, fd);
5709 	if (error) {
5710 		return error;
5711 	}
5712 	dvp = nd.ni_dvp;
5713 	vp = nd.ni_vp;
5714 
5715 	/* check that this is a new file and authorize addition */
5716 	if (vp != NULL) {
5717 		error = EEXIST;
5718 		goto out;
5719 	}
5720 	VATTR_SET(vap, va_type, VFIFO);
5721 
5722 	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
5723 		goto out;
5724 	}
5725 
5726 	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
5727 out:
5728 	/*
5729 	 * nameidone has to happen before we vnode_put(dvp)
5730 	 * since it may need to release the fs_nodelock on the dvp
5731 	 */
5732 	nameidone(&nd);
5733 
5734 	if (vp) {
5735 		vnode_put(vp);
5736 	}
5737 	vnode_put(dvp);
5738 
5739 	return error;
5740 }
5741 
5742 
5743 /*
5744  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5745  *
5746  * Parameters:	p			Process requesting the open
5747  *		uap			User argument descriptor (see below)
5748  *		retval			(Ignored)
5749  *
5750  * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
5751  *		uap->uid		UID to set
5752  *		uap->gid		GID to set
5753  *		uap->mode		File mode to set (same as 'mkfifo')
5754  *		uap->xsecurity		ACL to set, if creating
5755  *
5756  * Returns:	0			Success
5757  *		!0			errno value
5758  *
5759  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5760  *
 * XXX:		We should enumerate the possible errno values here, and where
5762  *		in the code they originated.
5763  */
5764 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5765 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5766 {
5767 	int ciferror;
5768 	kauth_filesec_t xsecdst;
5769 	struct vnode_attr va;
5770 
5771 	AUDIT_ARG(owner, uap->uid, uap->gid);
5772 
5773 	xsecdst = KAUTH_FILESEC_NONE;
5774 	if (uap->xsecurity != USER_ADDR_NULL) {
5775 		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5776 			return ciferror;
5777 		}
5778 	}
5779 
5780 	VATTR_INIT(&va);
5781 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5782 	if (uap->uid != KAUTH_UID_NONE) {
5783 		VATTR_SET(&va, va_uid, uap->uid);
5784 	}
5785 	if (uap->gid != KAUTH_GID_NONE) {
5786 		VATTR_SET(&va, va_gid, uap->gid);
5787 	}
5788 	if (xsecdst != KAUTH_FILESEC_NONE) {
5789 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5790 		va.va_vaflags |= VA_FILESEC_ACL;
5791 	}
5792 
5793 	ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5794 
5795 	if (xsecdst != KAUTH_FILESEC_NONE) {
5796 		kauth_filesec_free(xsecdst);
5797 	}
5798 	return ciferror;
5799 }
5800 
5801 /* ARGSUSED */
5802 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5803 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5804 {
5805 	struct vnode_attr va;
5806 
5807 	VATTR_INIT(&va);
5808 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5809 
5810 	return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5811 }
5812 
5813 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5814 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5815 {
5816 	struct vnode_attr va;
5817 
5818 	VATTR_INIT(&va);
5819 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5820 
5821 	return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5822 }
5823 
5824 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5825 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5826 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5827 
/*
 * Build a path for 'dvp' (optionally with 'leafname' appended) into 'path',
 * never failing outright: on lookup errors it walks up the parent chain and
 * falls back to the mount point or "/", setting *truncated_path whenever the
 * result is not the full exact path.  Returns the length of the produced
 * string including the terminating NUL.
 *
 * NOTE(review): the append/truncation checks below use MAXPATHLEN rather
 * than '_len'; callers appear to always pass MAXPATHLEN-sized buffers —
 * confirm before reusing with a smaller '_len'.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	/* Resolve the directory vnode to a path, firmlink-aware or not. */
	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the NUL with '/' and append the leaf name. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path resolved but no room remains to append a leaf name. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Walk up the parent chain until some ancestor's path fits,
		 * falling back to the mount point, and finally to "/".
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5895 
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Default variant: firmlink-following path construction. */
	const int follow_firmlinks = 1;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, follow_firmlinks);
}
5901 
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Variant that resolves the path without traversing firmlinks. */
	const int follow_firmlinks = 0;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, follow_firmlinks);
}
5907 
5908 /*
5909  * Make a hard file link.
5910  *
5911  * Returns:	0			Success
5912  *		EPERM
5913  *		EEXIST
5914  *		EXDEV
5915  *	namei:???
5916  *	vnode_authorize:???
5917  *	VNOP_LINK:???
5918  */
5919 /* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	char *target_path = NULL;
	char  *no_firmlink_path = NULL;
	vnode_t locked_vp = NULLVP;     /* vp we hold the link lock on, if any */
	int truncated = 0;
	int truncated_no_firmlink_path = 0;
	int num_retries = 0;            /* ENOENT retries of VNOP_LINK so far */
	int need_event, has_listeners, need_kpath2;
	bool do_retry;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;

retry:
	do_retry = false;
	vp = dvp = lvp = NULLVP;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (flag & AT_RESOLVE_BENEATH) {
		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	/* Reuse nd for the second lookup: create-mode on the link name. */
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

	/* Serialize against concurrent link creation on the same vp. */
	assert(locked_vp == NULLVP);
	vnode_link_lock(vp);
	locked_vp = vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry breaks any directory lease on the parent. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		/* Retry a bounded number of times if the source raced away. */
		if (error == ENOENT && num_retries < MAX_LINK_ENOENT_RETRIES) {
			do_retry = true;
			num_retries += 1;
		}
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

	/* vp now has an additional name, so clear its "not a hardlink" hint. */
	vnode_lock_spin(vp);
	vp->v_ext_flag &= ~VE_NOT_HARDLINK;
	vnode_unlock(vp);

	assert(locked_vp == vp);
	vnode_link_unlock(locked_vp);
	locked_vp = NULLVP;

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Notification/audit work is only done when someone is listening. */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int  len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
		target_path = NULL;
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	/* Drop the link lock if an error path left it held. */
	if (locked_vp) {
		assert(locked_vp == vp);
		vnode_link_unlock(locked_vp);
		locked_vp = NULLVP;
	}
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);

	if (do_retry) {
		goto retry;
	}

	return error;
}
6171 
6172 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)6173 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
6174 {
6175 	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6176 	           AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
6177 }
6178 
6179 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)6180 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
6181 {
6182 	if (uap->flag & ~(AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH)) {
6183 		return EINVAL;
6184 	}
6185 
6186 	return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
6187 	           uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
6188 }
6189 
6190 /*
6191  * Make a symbolic link.
6192  *
6193  * We could add support for ACLs here too...
6194  */
6195 /* ARGSUSED */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/*
	 * 'path_data' is the link *target* string.  When it comes from user
	 * space, copy it into a kernel ZV_NAMEI buffer; kernel callers pass
	 * a kernel pointer that is used directly (and must not be freed
	 * here -- see the 'out' label).
	 */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	/* Look up the new symlink's name, holding its parent locked. */
	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* New symlink gets VLNK type and mode ACCESSPERMS masked by umask. */
	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry modifies the directory; break any dir lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int     update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			/*
			 * VNOP_SYMLINK is not required to return the new
			 * vnode; re-drive the lookup to obtain it for the
			 * identity update and fsevent below.
			 */
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int             len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Free the copyin buffer only if we allocated it above. */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
6359 
int
symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
{
	/* symlink(2): create a symbolic link relative to the cwd. */
	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
	           uap->link, UIO_USERSPACE);
}
6366 
int
symlinkat(__unused proc_t p, struct symlinkat_args *uap,
    __unused int32_t *retval)
{
	/* symlinkat(2): like symlink(2), but path2 resolves relative to fd. */
	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
	           uap->path2, UIO_USERSPACE);
}
6374 
6375 /*
6376  * Delete a whiteout from the filesystem.
6377  * No longer supported.
6378  */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	/* Whiteout undeletion is no longer supported; always fail. */
	return ENOTSUP;
}
6384 
6385 /*
6386  * Delete a name from the filesystem.
6387  */
6388 /* ARGSUSED */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/*
	 * Bundle the large locals (nameidata plus fsevent state) into a
	 * single heap allocation to keep kernel stack usage down.
	 */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char  *path = NULL;
	char  *no_firmlink_path = NULL;
	int  len_path = 0;
	int  len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	vnode_t locked_vp = NULLVP;
	int do_retry;
	int retry_count = 0;
	int cn_flags;
	int nofollow_any = 0;
	int resolve_beneath = 0;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/*
	 * Translate the lookup-affecting VNODE_REMOVE_* flags into namei
	 * flags and strip them so they are not passed down to vn_remove().
	 */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}
	if (unlink_flags & VNODE_REMOVE_RESOLVE_BENEATH) {
		resolve_beneath = NAMEI_RESOLVE_BENEATH;
		unlink_flags &= ~VNODE_REMOVE_RESOLVE_BENEATH;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* Reset all per-attempt state; we may loop on ENOENT races. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any | resolve_beneath;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		/* No vnodes held yet; skip the put/cleanup path. */
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	/* Update speculative telemetry with system discarded use state */
	if (unlink_flags & VNODE_REMOVE_SYSTEM_DISCARDED) {
		flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
	}

	if (vp) {
		/*
		 * 'batched' means the filesystem handles authorization and
		 * removal in one compound VNOP; otherwise we authorize here.
		 */
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			/*
			 * Hold the link lock across authorization and removal
			 * so the link count can't change underneath us.
			 */
			vnode_link_lock(vp);
			locked_vp = vp;
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				if (error == ENOENT) {
					/* Racing lookup; redrive a bounded number of times. */
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				vnode_link_unlock(vp);
				locked_vp = NULLVP;
				goto out;
			}
		}
	} else {
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		/* Build both the regular and firmlink-free paths for notification. */
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		/* Removing an entry modifies the directory; break any dir lease. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			/* Compound VNOP asked us to continue the lookup. */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name  cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid.  this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	/* Drop the link lock taken in the non-batched authorization path. */
	if (locked_vp) {
		assert(locked_vp == vp);
		vnode_link_unlock(locked_vp);
		locked_vp = NULLVP;
	}

	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6696 
int
unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
    enum uio_seg segflg, int unlink_flags)
{
	/* In-kernel unlink entry point; an explicit start_dvp overrides the fd. */
	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
	           unlink_flags);
}
6704 
6705 /*
6706  * Delete a name from the filesystem using Carbon semantics.
6707  */
int
delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
{
	/* Carbon semantics: deleting a busy file fails (NODELETEBUSY). */
	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
	           uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
}
6714 
6715 /*
6716  * Delete a name from the filesystem using POSIX semantics.
6717  */
int
unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
{
	/* unlink(2): POSIX semantics, path relative to the cwd. */
	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
	           uap->path, UIO_USERSPACE, 0);
}
6724 
6725 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6726 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6727 {
6728 	int unlink_flags = 0;
6729 
6730 	if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY | AT_SYSTEM_DISCARDED | AT_RESOLVE_BENEATH | AT_NODELETEBUSY)) {
6731 		return EINVAL;
6732 	}
6733 
6734 	if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6735 		unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6736 	}
6737 	if (uap->flag & AT_RESOLVE_BENEATH) {
6738 		unlink_flags |= VNODE_REMOVE_RESOLVE_BENEATH;
6739 	}
6740 
6741 	if (uap->flag & AT_SYSTEM_DISCARDED) {
6742 		unlink_flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
6743 	}
6744 
6745 	if (uap->flag & AT_NODELETEBUSY) {
6746 		unlink_flags |= VNODE_REMOVE_NODELETEBUSY;
6747 	}
6748 
6749 	if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6750 		if (uap->flag & AT_REMOVEDIR_DATALESS) {
6751 			unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6752 		}
6753 		return rmdirat_internal(vfs_context_current(), uap->fd,
6754 		           uap->path, UIO_USERSPACE, unlink_flags);
6755 	} else {
6756 		return unlinkat_internal(vfs_context_current(), uap->fd,
6757 		           NULLVP, uap->path, UIO_USERSPACE, unlink_flags);
6758 	}
6759 }
6760 
6761 /*
6762  * Reposition read/write file offset.
6763  */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	/* fp_getfvp fails with ENOTSUP for non-vnode fds; report ESPIPE. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (
		// rdar://3837316: Seeking a pipe is disallowed by POSIX.
		vnode_isfifo(vp)
		// rdar://120750171: Seeking a TTY is undefined and should be denied.
		|| vnode_istty(vp)
		) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* SEEK_CUR with offset 0 is a pure query; check "get" not "change". */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* Compute the absolute target offset according to 'whence'. */
	switch (uap->whence) {
	case L_INCR:
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001.  Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6860 
6861 
6862 /*
6863  * Check access permissions.
6864  *
6865  * Returns:	0			Success
6866  *		vnode_authorize:???
6867  */
6868 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6869 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6870 {
6871 	kauth_action_t action;
6872 	int error;
6873 
6874 	/*
6875 	 * If just the regular access bits, convert them to something
6876 	 * that vnode_authorize will understand.
6877 	 */
6878 	if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6879 		action = 0;
6880 		if (uflags & R_OK) {
6881 			action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
6882 		}
6883 		if (uflags & W_OK) {
6884 			if (vnode_isdir(vp)) {
6885 				action |= KAUTH_VNODE_ADD_FILE |
6886 				    KAUTH_VNODE_ADD_SUBDIRECTORY;
6887 				/* might want delete rights here too */
6888 			} else {
6889 				action |= KAUTH_VNODE_WRITE_DATA;
6890 			}
6891 		}
6892 		if (uflags & X_OK) {
6893 			if (vnode_isdir(vp)) {
6894 				action |= KAUTH_VNODE_SEARCH;
6895 			} else {
6896 				action |= KAUTH_VNODE_EXECUTE;
6897 			}
6898 		}
6899 	} else {
6900 		/* take advantage of definition of uflags */
6901 		action = uflags >> 8;
6902 	}
6903 
6904 #if CONFIG_MACF
6905 	error = mac_vnode_check_access(ctx, vp, uflags);
6906 	if (error) {
6907 		return error;
6908 	}
6909 #endif /* MAC */
6910 
6911 	/* action == 0 means only check for existence */
6912 	if (action != 0) {
6913 		error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6914 	} else {
6915 		error = 0;
6916 	}
6917 
6918 	return error;
6919 }
6920 
6921 
6922 
6923 /*
6924  * access_extended: Check access permissions in bulk.
6925  *
6926  * Description:	uap->entries		Pointer to an array of accessx
6927  *                                      descriptor structs, plus one or
6928  *                                      more NULL terminated strings (see
6929  *                                      "Notes" section below).
6930  *		uap->size		Size of the area pointed to by
6931  *					uap->entries.
6932  *		uap->results		Pointer to the results array.
6933  *
6934  * Returns:	0			Success
6935  *		ENOMEM			Insufficient memory
6936  *		EINVAL			Invalid arguments
6937  *		namei:EFAULT		Bad address
6938  *		namei:ENAMETOOLONG	Filename too long
6939  *		namei:ENOENT		No such file or directory
6940  *		namei:ELOOP		Too many levels of symbolic links
6941  *		namei:EBADF		Bad file descriptor
6942  *		namei:ENOTDIR		Not a directory
6943  *		namei:???
6944  *		access1:
6945  *
6946  * Implicit returns:
6947  *		uap->results		Array contents modified
6948  *
6949  * Notes:	The uap->entries are structured as an arbitrary length array
6950  *		of accessx descriptors, followed by one or more NULL terminated
6951  *		strings
6952  *
6953  *			struct accessx_descriptor[0]
6954  *			...
6955  *			struct accessx_descriptor[n]
6956  *			char name_data[0];
6957  *
6958  *		We determine the entry count by walking the buffer containing
6959  *		the uap->entries argument descriptor.  For each descriptor we
6960  *		see, the valid values for the offset ad_name_offset will be
6961  *		in the byte range:
6962  *
6963  *			[ uap->entries + sizeof(struct accessx_descriptor) ]
6964  *						to
6965  *				[ uap->entries + uap->size - 2 ]
6966  *
6967  *		since we must have at least one string, and the string must
6968  *		be at least one character plus the NULL terminator in length.
6969  *
6970  * XXX:		Need to support the check-as uid argument
6971  */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o	There must be sufficient room in the request for at least one
	 *	descriptor and a one byte NUL terminated string.
	 * o	The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* Small requests use the stack buffer; larger ones are heap-allocated. */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			/* Per-entry failure: record it and keep going. */
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
7213 
7214 
7215 /*
7216  * Returns:	0			Success
7217  *		namei:EFAULT		Bad address
7218  *		namei:ENAMETOOLONG	Filename too long
7219  *		namei:ENOENT		No such file or directory
7220  *		namei:ELOOP		Too many levels of symbolic links
7221  *		namei:EBADF		Bad file descriptor
7222  *		namei:ENOTDIR		Not a directory
7223  *		namei:???
7224  *		access1:
7225  */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (flag & AT_RESOLVE_BENEATH) {
		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* Run the actual permission check against the chosen credential. */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	/* dvp is only held when WANTPARENT was requested above. */
	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* Only unref the credential we copied; AT_EACCESS borrowed the caller's. */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
7310 
7311 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)7312 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
7313 {
7314 	return faccessat_internal(vfs_context_current(), AT_FDCWD,
7315 	           uap->path, uap->flags, 0, UIO_USERSPACE);
7316 }
7317 
7318 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)7319 faccessat(__unused proc_t p, struct faccessat_args *uap,
7320     __unused int32_t *retval)
7321 {
7322 	if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH)) {
7323 		return EINVAL;
7324 	}
7325 
7326 	return faccessat_internal(vfs_context_current(), uap->fd,
7327 	           uap->path, uap->amode, uap->flag, UIO_USERSPACE);
7328 }
7329 
7330 /*
7331  * Returns:	0			Success
7332  *		EFAULT
7333  *	copyout:EFAULT
7334  *	namei:???
7335  *	vn_stat:???
7336  */
7337 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)7338 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
7339     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
7340     enum uio_seg segflg, int fd, int flag)
7341 {
7342 	struct nameidata *ndp = NULL;
7343 	int follow;
7344 	union {
7345 		struct stat sb;
7346 		struct stat64 sb64;
7347 	} source = {};
7348 	union {
7349 		struct user64_stat user64_sb;
7350 		struct user32_stat user32_sb;
7351 		struct user64_stat64 user64_sb64;
7352 		struct user32_stat64 user32_sb64;
7353 	} dest = {};
7354 	caddr_t sbp;
7355 	int error, my_size;
7356 	kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
7357 	size_t xsecurity_bufsize;
7358 	void * statptr;
7359 	struct fileproc *fp = NULL;
7360 	int needsrealdev = 0;
7361 
7362 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7363 	ndp = kalloc_type(struct nameidata, Z_WAITOK);
7364 	NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
7365 	    segflg, path, ctx);
7366 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7367 		ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
7368 	}
7369 	if (flag & AT_RESOLVE_BENEATH) {
7370 		ndp->ni_flag |= NAMEI_RESOLVE_BENEATH;
7371 	}
7372 
7373 #if NAMEDRSRCFORK
7374 	int is_namedstream = 0;
7375 	/* stat calls are allowed for resource forks. */
7376 	ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
7377 #endif
7378 
7379 	if (flag & AT_FDONLY) {
7380 		vnode_t fvp;
7381 
7382 		error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
7383 		if (error) {
7384 			goto out;
7385 		}
7386 		if ((error = vnode_getwithref(fvp))) {
7387 			file_drop(fd);
7388 			goto out;
7389 		}
7390 		ndp->ni_vp = fvp;
7391 	} else {
7392 		error = nameiat(ndp, fd);
7393 		if (error) {
7394 			goto out;
7395 		}
7396 	}
7397 
7398 	statptr = (void *)&source;
7399 
7400 #if NAMEDRSRCFORK
7401 	/* Grab reference on the shadow stream file vnode to
7402 	 * force an inactive on release which will mark it
7403 	 * for recycle.
7404 	 */
7405 	if (vnode_isnamedstream(ndp->ni_vp) &&
7406 	    (ndp->ni_vp->v_parent != NULLVP) &&
7407 	    vnode_isshadow(ndp->ni_vp)) {
7408 		is_namedstream = 1;
7409 		vnode_ref(ndp->ni_vp);
7410 	}
7411 #endif
7412 
7413 	needsrealdev = flag & AT_REALDEV ? 1 : 0;
7414 	if (fp && (xsecurity == USER_ADDR_NULL)) {
7415 		/*
7416 		 * If the caller has the file open, and is not
7417 		 * requesting extended security information, we are
7418 		 * going to let them get the basic stat information.
7419 		 */
7420 		error = vn_stat_noauth(ndp->ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
7421 		    fp->fp_glob->fg_cred);
7422 	} else {
7423 		error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
7424 		    isstat64, needsrealdev, ctx);
7425 	}
7426 
7427 #if NAMEDRSRCFORK
7428 	if (is_namedstream) {
7429 		vnode_rele(ndp->ni_vp);
7430 	}
7431 #endif
7432 	vnode_put(ndp->ni_vp);
7433 	nameidone(ndp);
7434 
7435 	if (fp) {
7436 		file_drop(fd);
7437 		fp = NULL;
7438 	}
7439 
7440 	if (error) {
7441 		goto out;
7442 	}
7443 	/* Zap spare fields */
7444 	if (isstat64 != 0) {
7445 		source.sb64.st_lspare = 0;
7446 		source.sb64.st_qspare[0] = 0LL;
7447 		source.sb64.st_qspare[1] = 0LL;
7448 		if (vfs_context_is64bit(ctx)) {
7449 			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
7450 			my_size = sizeof(dest.user64_sb64);
7451 			sbp = (caddr_t)&dest.user64_sb64;
7452 		} else {
7453 			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
7454 			my_size = sizeof(dest.user32_sb64);
7455 			sbp = (caddr_t)&dest.user32_sb64;
7456 		}
7457 		/*
7458 		 * Check if we raced (post lookup) against the last unlink of a file.
7459 		 */
7460 		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7461 			source.sb64.st_nlink = 1;
7462 		}
7463 	} else {
7464 		source.sb.st_lspare = 0;
7465 		source.sb.st_qspare[0] = 0LL;
7466 		source.sb.st_qspare[1] = 0LL;
7467 		if (vfs_context_is64bit(ctx)) {
7468 			munge_user64_stat(&source.sb, &dest.user64_sb);
7469 			my_size = sizeof(dest.user64_sb);
7470 			sbp = (caddr_t)&dest.user64_sb;
7471 		} else {
7472 			munge_user32_stat(&source.sb, &dest.user32_sb);
7473 			my_size = sizeof(dest.user32_sb);
7474 			sbp = (caddr_t)&dest.user32_sb;
7475 		}
7476 
7477 		/*
7478 		 * Check if we raced (post lookup) against the last unlink of a file.
7479 		 */
7480 		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7481 			source.sb.st_nlink = 1;
7482 		}
7483 	}
7484 	if ((error = copyout(sbp, ub, my_size)) != 0) {
7485 		goto out;
7486 	}
7487 
7488 	/* caller wants extended security information? */
7489 	if (xsecurity != USER_ADDR_NULL) {
7490 		/* did we get any? */
7491 		if (fsec == KAUTH_FILESEC_NONE) {
7492 			if (susize(xsecurity_size, 0) != 0) {
7493 				error = EFAULT;
7494 				goto out;
7495 			}
7496 		} else {
7497 			/* find the user buffer size */
7498 			xsecurity_bufsize = fusize(xsecurity_size);
7499 
7500 			/* copy out the actual data size */
7501 			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7502 				error = EFAULT;
7503 				goto out;
7504 			}
7505 
7506 			/* if the caller supplied enough room, copy out to it */
7507 			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7508 				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7509 			}
7510 		}
7511 	}
7512 out:
7513 	if (ndp) {
7514 		kfree_type(struct nameidata, ndp);
7515 	}
7516 	if (fsec != KAUTH_FILESEC_NONE) {
7517 		kauth_filesec_free(fsec);
7518 	}
7519 	return error;
7520 }
7521 
7522 /*
7523  * stat_extended: Get file status; with extended security (ACL).
7524  *
7525  * Parameters:    p                       (ignored)
7526  *                uap                     User argument descriptor (see below)
7527  *                retval                  (ignored)
7528  *
7529  * Indirect:      uap->path               Path of file to get status from
7530  *                uap->ub                 User buffer (holds file status info)
7531  *                uap->xsecurity          ACL to get (extended security)
7532  *                uap->xsecurity_size     Size of ACL
7533  *
7534  * Returns:        0                      Success
7535  *                !0                      errno value
7536  *
7537  */
7538 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7539 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7540     __unused int32_t *retval)
7541 {
7542 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7543 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7544 	           0);
7545 }
7546 
7547 /*
7548  * Returns:	0			Success
7549  *	fstatat_internal:???		[see fstatat_internal() in this file]
7550  */
7551 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7552 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7553 {
7554 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7555 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7556 }
7557 
7558 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7559 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7560 {
7561 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7562 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7563 }
7564 
7565 /*
7566  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7567  *
7568  * Parameters:    p                       (ignored)
7569  *                uap                     User argument descriptor (see below)
7570  *                retval                  (ignored)
7571  *
7572  * Indirect:      uap->path               Path of file to get status from
7573  *                uap->ub                 User buffer (holds file status info)
7574  *                uap->xsecurity          ACL to get (extended security)
7575  *                uap->xsecurity_size     Size of ACL
7576  *
7577  * Returns:        0                      Success
7578  *                !0                      errno value
7579  *
7580  */
7581 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7582 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7583 {
7584 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7585 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7586 	           0);
7587 }
7588 
7589 /*
7590  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7591  *
7592  * Parameters:    p                       (ignored)
7593  *                uap                     User argument descriptor (see below)
7594  *                retval                  (ignored)
7595  *
7596  * Indirect:      uap->path               Path of file to get status from
7597  *                uap->ub                 User buffer (holds file status info)
7598  *                uap->xsecurity          ACL to get (extended security)
7599  *                uap->xsecurity_size     Size of ACL
7600  *
7601  * Returns:        0                      Success
7602  *                !0                      errno value
7603  *
7604  */
7605 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7606 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7607 {
7608 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7609 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7610 	           AT_SYMLINK_NOFOLLOW);
7611 }
7612 
7613 /*
7614  * Get file status; this version does not follow links.
7615  */
7616 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7617 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7618 {
7619 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7620 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7621 }
7622 
7623 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7624 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7625 {
7626 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7627 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7628 }
7629 
7630 /*
7631  * lstat64_extended: Get file status; can handle large inode numbers; does not
7632  * follow links; with extended security (ACL).
7633  *
7634  * Parameters:    p                       (ignored)
7635  *                uap                     User argument descriptor (see below)
7636  *                retval                  (ignored)
7637  *
7638  * Indirect:      uap->path               Path of file to get status from
7639  *                uap->ub                 User buffer (holds file status info)
7640  *                uap->xsecurity          ACL to get (extended security)
7641  *                uap->xsecurity_size     Size of ACL
7642  *
7643  * Returns:        0                      Success
7644  *                !0                      errno value
7645  *
7646  */
7647 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7648 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7649 {
7650 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7651 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7652 	           AT_SYMLINK_NOFOLLOW);
7653 }
7654 
7655 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7656 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7657 {
7658 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH)) {
7659 		return EINVAL;
7660 	}
7661 
7662 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7663 	           0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7664 }
7665 
7666 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7667 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7668     __unused int32_t *retval)
7669 {
7670 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH)) {
7671 		return EINVAL;
7672 	}
7673 
7674 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7675 	           0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7676 }
7677 
7678 /*
7679  * Get configurable pathname variables.
7680  *
7681  * Returns:	0			Success
7682  *	namei:???
7683  *	vn_pathconf:???
7684  *
7685  * Notes:	Global implementation  constants are intended to be
7686  *		implemented in this function directly; all other constants
7687  *		are per-FS implementation, and therefore must be handled in
7688  *		each respective FS, instead.
7689  *
7690  * XXX We implement some things globally right now that should actually be
7691  * XXX per-FS; we will need to deal with this at some point.
7692  */
7693 /* ARGSUSED */
7694 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7695 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7696 {
7697 	int error;
7698 	struct nameidata nd;
7699 	vfs_context_t ctx = vfs_context_current();
7700 
7701 	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7702 	    UIO_USERSPACE, uap->path, ctx);
7703 	error = namei(&nd);
7704 	if (error) {
7705 		return error;
7706 	}
7707 
7708 	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7709 
7710 	vnode_put(nd.ni_vp);
7711 	nameidone(&nd);
7712 	return error;
7713 }
7714 
7715 /*
7716  * Return target name of a symbolic link.
7717  */
7718 /* ARGSUSED */
7719 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7720 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7721     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7722     int *retval)
7723 {
7724 	vnode_t vp;
7725 	uio_t auio;
7726 	int error;
7727 	struct nameidata nd;
7728 	UIO_STACKBUF(uio_buf, 1);
7729 	bool put_vnode;
7730 
7731 	if (bufsize > INT32_MAX) {
7732 		return EINVAL;
7733 	}
7734 
7735 	if (lnk_vp) {
7736 		vp = lnk_vp;
7737 		put_vnode = false;
7738 	} else {
7739 		NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7740 		    seg, path, ctx);
7741 
7742 		error = nameiat(&nd, fd);
7743 		if (error) {
7744 			return error;
7745 		}
7746 		vp = nd.ni_vp;
7747 		put_vnode = true;
7748 		nameidone(&nd);
7749 	}
7750 
7751 	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7752 	    &uio_buf[0], sizeof(uio_buf));
7753 	uio_addiov(auio, buf, bufsize);
7754 	if (vp->v_type != VLNK) {
7755 		error = EINVAL;
7756 	} else {
7757 #if CONFIG_MACF
7758 		error = mac_vnode_check_readlink(ctx, vp);
7759 #endif
7760 		if (error == 0) {
7761 			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7762 			    ctx);
7763 		}
7764 		if (error == 0) {
7765 			error = VNOP_READLINK(vp, auio, ctx);
7766 		}
7767 	}
7768 
7769 	if (put_vnode) {
7770 		vnode_put(vp);
7771 	}
7772 
7773 	*retval = (int)(bufsize - uio_resid(auio));
7774 	return error;
7775 }
7776 
7777 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7778 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7779 {
7780 	enum uio_seg procseg;
7781 	vnode_t vp;
7782 	int error;
7783 
7784 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7785 
7786 	AUDIT_ARG(fd, uap->fd);
7787 
7788 	if ((error = file_vnode(uap->fd, &vp))) {
7789 		return error;
7790 	}
7791 	if ((error = vnode_getwithref(vp))) {
7792 		file_drop(uap->fd);
7793 		return error;
7794 	}
7795 
7796 	error = readlinkat_internal(vfs_context_current(), -1,
7797 	    vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7798 	    uap->bufsize, procseg, retval);
7799 
7800 	vnode_put(vp);
7801 	file_drop(uap->fd);
7802 	return error;
7803 }
7804 
7805 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7806 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7807 {
7808 	enum uio_seg procseg;
7809 
7810 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7811 	return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7812 	           CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7813 	           uap->count, procseg, retval);
7814 }
7815 
7816 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7817 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7818 {
7819 	enum uio_seg procseg;
7820 
7821 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7822 	return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7823 	           CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7824 	           retval);
7825 }
7826 
7827 /*
7828  * Change file flags, the deep inner layer.
7829  */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	/* MAC policy check first; a veto here skips authorization entirely. */
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	/* Apply the change through the caller-supplied setter. */
	error = (*setattr)(vp, arg, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only after a successful change. */
	if (error == 0) {
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
	}
#endif

out:
	return error;
}
7868 
7869 /*
7870  * Change file flags.
7871  *
7872  * NOTE: this will vnode_put() `vp'
7873  */
7874 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)7875 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
7876 {
7877 	struct vnode_attr va;
7878 	int error;
7879 
7880 	VATTR_INIT(&va);
7881 	VATTR_SET(&va, va_flags, flags);
7882 
7883 	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
7884 	vnode_put(vp);
7885 
7886 	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
7887 		error = ENOTSUP;
7888 	}
7889 
7890 	return error;
7891 }
7892 
7893 /*
7894  * Change flags of a file given a path name.
7895  */
7896 /* ARGSUSED */
int
chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Need the parent directory so its lease can be broken below. */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(fflags, uap->flags);
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

#if CONFIG_FILE_LEASES
	/* Break any directory lease before mutating an entry beneath it,
	 * then drop the parent iocount we asked for with WANTPARENT. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	nameidone(&nd);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, ctx);

	return error;
}
7931 
7932 /*
7933  * Change flags of a file given a file descriptor.
7934  */
7935 /* ARGSUSED */
int
fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);
	/* Get the vnode backing the descriptor. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* NOTE(review): the `true' here contrasts with chflags()'s
	 * vnode_breakdirlease(nd.ni_dvp, false, ...) — it appears to make the
	 * callee locate vp's parent directory itself; confirm against the
	 * vnode_breakdirlease() definition. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, vfs_context_current());

	file_drop(uap->fd);
	return error;
}
7965 
7966 /*
7967  * Change security information on a filesystem object.
7968  *
7969  * Returns:	0			Success
7970  *		EPERM			Operation not permitted
7971  *		vnode_authattr:???	[anything vnode_authattr can return]
7972  *		vnode_authorize:???	[anything vnode_authorize can return]
7973  *		vnode_setattr:???	[anything vnode_setattr can return]
7974  *
7975  * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
7976  *		translated to EPERM before being returned.
7977  */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC checks, one per attribute class the caller is changing. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		/* -1 signals "not changing" for whichever id is inactive. */
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* For this operation EACCES is reported as EPERM. */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* The change succeeded; deliver the matching MAC notifications. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
8045 
8046 
8047 /*
8048  * Change mode of a file given a path name.
8049  *
8050  * Returns:	0			Success
8051  *		namei:???		[anything namei can return]
8052  *		chmod_vnode:???		[anything chmod_vnode can return]
8053  */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Need the parent so its directory lease can be broken below. */
	wantparent = WANTPARENT;
#endif

	/* Either NOFOLLOW flavor suppresses following a trailing symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (flag & AT_RESOLVE_BENEATH) {
		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break the parent's lease, then drop the WANTPARENT iocount. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	/* MAC + authorization checks and the attribute change itself. */
	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
8089 
/*
 * Translate the mode/uid/gid/xsecurity arguments shared by chmod_extended()
 * and fchmod_extended() into *pva.  On success, if *pxsecdst was set the
 * caller owns it and must release it with kauth_filesec_free() after use
 * (va_acl points into it).
 */
static int
chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
    gid_t gid, user_addr_t xsecurity)
{
	int error;

	VATTR_INIT(pva);

	/* A mode of -1 means "do not change the mode". */
	if (mode != -1) {
		VATTR_SET(pva, va_mode, mode & ALLPERMS);
	} else {
		pva->va_mode = 0;
	}

	if (uid != KAUTH_UID_NONE) {
		VATTR_SET(pva, va_uid, uid);
	}

	if (gid != KAUTH_GID_NONE) {
		VATTR_SET(pva, va_gid, gid);
	}

	*pxsecdst = NULL;
	switch (xsecurity) {
	case USER_ADDR_NULL:
		/* No ACL change requested. */
		break;

	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
		/* Sentinel address 1: delete the existing ACL. */
		VATTR_SET(pva, va_acl, NULL);
		break;

	default:
		/* Copy the caller's filesec in from user space. */
		if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
			return error;
		}

		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
		pva->va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
		break;
	}

	return 0;
}
8134 
8135 /*
8136  * chmod_extended: Change the mode of a file given a path name; with extended
8137  * argument list (including extended security (ACL)).
8138  *
8139  * Parameters:	p			Process requesting the open
8140  *		uap			User argument descriptor (see below)
8141  *		retval			(ignored)
8142  *
8143  * Indirect:	uap->path		Path to object (same as 'chmod')
8144  *		uap->uid		UID to set
8145  *		uap->gid		GID to set
8146  *		uap->mode		File mode to set (same as 'chmod')
8147  *		uap->xsecurity		ACL to set (or delete)
8148  *
8149  * Returns:	0			Success
8150  *		!0			errno value
8151  *
8152  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
8153  *
8154  * XXX:		We should enummerate the possible errno values here, and where
8155  *		in the code they originated.
8156  */
8157 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)8158 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
8159 {
8160 	int error;
8161 	struct vnode_attr va;
8162 	kauth_filesec_t xsecdst = NULL;
8163 
8164 	AUDIT_ARG(owner, uap->uid, uap->gid);
8165 
8166 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
8167 	    uap->gid, uap->xsecurity);
8168 
8169 	if (error) {
8170 		return error;
8171 	}
8172 
8173 	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
8174 	    UIO_USERSPACE);
8175 
8176 	if (xsecdst != NULL) {
8177 		kauth_filesec_free(xsecdst);
8178 	}
8179 	return error;
8180 }
8181 
8182 /*
8183  * Returns:	0			Success
8184  *		chmodat:???		[anything chmodat can return]
8185  */
8186 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)8187 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
8188     int flag, enum uio_seg segflg)
8189 {
8190 	struct vnode_attr va;
8191 
8192 	VATTR_INIT(&va);
8193 	VATTR_SET(&va, va_mode, mode & ALLPERMS);
8194 
8195 	return chmodat(ctx, path, &va, fd, flag, segflg);
8196 }
8197 
8198 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)8199 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
8200 {
8201 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
8202 	           AT_FDCWD, 0, UIO_USERSPACE);
8203 }
8204 
8205 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)8206 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
8207 {
8208 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH)) {
8209 		return EINVAL;
8210 	}
8211 
8212 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
8213 	           uap->fd, uap->flag, UIO_USERSPACE);
8214 }
8215 
8216 /*
8217  * Change mode of a file given a file descriptor.
8218  */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, fd);

	/* Get the vnode backing the descriptor and take an iocount. */
	if ((error = file_vnode(fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* NOTE(review): `true' appears to make the callee locate vp's parent
	 * directory itself (cf. chmodat()'s (nd.ni_dvp, false, ...) call) —
	 * confirm against vnode_breakdirlease(). */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = chmod_vnode(vfs_context_current(), vp, vap);
	(void)vnode_put(vp);
	file_drop(fd);

	return error;
}
8246 
8247 /*
8248  * fchmod_extended: Change mode of a file given a file descriptor; with
8249  * extended argument list (including extended security (ACL)).
8250  *
8251  * Parameters:    p                       Process requesting to change file mode
8252  *                uap                     User argument descriptor (see below)
8253  *                retval                  (ignored)
8254  *
8255  * Indirect:      uap->mode               File mode to set (same as 'chmod')
8256  *                uap->uid                UID to set
8257  *                uap->gid                GID to set
8258  *                uap->xsecurity          ACL to set (or delete)
8259  *                uap->fd                 File descriptor of file to change mode
8260  *
8261  * Returns:        0                      Success
8262  *                !0                      errno value
8263  *
8264  */
8265 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)8266 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
8267 {
8268 	int error;
8269 	struct vnode_attr va;
8270 	kauth_filesec_t xsecdst = NULL;
8271 
8272 	AUDIT_ARG(owner, uap->uid, uap->gid);
8273 
8274 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
8275 	    uap->gid, uap->xsecurity);
8276 
8277 	if (error) {
8278 		return error;
8279 	}
8280 
8281 	error = fchmod1(p, uap->fd, &va);
8282 
8283 	if (xsecdst != NULL) {
8284 		kauth_filesec_free(xsecdst);
8285 	}
8286 	return error;
8287 }
8288 
8289 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)8290 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
8291 {
8292 	struct vnode_attr va;
8293 
8294 	VATTR_INIT(&va);
8295 	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
8296 
8297 	return fchmod1(p, uap->fd, &va);
8298 }
8299 
/*
 * Common code to change the ownership (uid/gid) of a vnode.  The caller
 * supplies an iocounted vnode; VNOVAL for either id means "leave unchanged".
 */
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	/* VNOVAL for either id means that id is not being changed. */
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	/* MAC policy check happens before any authorization work. */
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory before applying the change. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Only notify MAC modules once the ownership change has succeeded. */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
8361 
/*
 * Set ownership given a path name.
 *
 * Resolves 'path' (relative to 'fd' unless absolute or fd == AT_FDCWD) and
 * applies the uid/gid change via vn_chown_internal().  'flag' accepts the
 * AT_SYMLINK_NOFOLLOW / AT_SYMLINK_NOFOLLOW_ANY / AT_RESOLVE_BENEATH bits.
 */
/* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	int error;
	struct nameidata nd;
	int follow;

	AUDIT_ARG(owner, uid, gid);

	/* Either no-follow flag suppresses following a trailing symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (flag & AT_RESOLVE_BENEATH) {
		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}

	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}

	vp = nd.ni_vp;
	error = vn_chown_internal(ctx, vp, uid, gid);

	/* nameidone before dropping the iocount namei gave us on vp. */
	nameidone(&nd);
	vnode_put(vp);
	return error;
}
8398 
8399 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)8400 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
8401 {
8402 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8403 	           uap->uid, uap->gid, 0, UIO_USERSPACE);
8404 }
8405 
8406 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)8407 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
8408 {
8409 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8410 	           uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
8411 }
8412 
8413 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)8414 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
8415 {
8416 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH)) {
8417 		return EINVAL;
8418 	}
8419 
8420 	return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
8421 	           uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
8422 }
8423 
/*
 * Set ownership given a file descriptor.
 */
/* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* Resolve the descriptor to a vnode (takes a file reference on fd). */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* Take an iocount on the vnode; drop the file reference on failure. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = vn_chown_internal(ctx, vp, uap->uid, uap->gid);

	/* Release in reverse order of acquisition: iocount, then fd ref. */
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8454 
8455 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)8456 getutimes(user_addr_t usrtvp, struct timespec *tsp)
8457 {
8458 	int error;
8459 
8460 	if (usrtvp == USER_ADDR_NULL) {
8461 		struct timeval old_tv;
8462 		/* XXX Y2038 bug because of microtime argument */
8463 		microtime(&old_tv);
8464 		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
8465 		tsp[1] = tsp[0];
8466 	} else {
8467 		if (IS_64BIT_PROCESS(current_proc())) {
8468 			struct user64_timeval tv[2];
8469 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
8470 			if (error) {
8471 				return error;
8472 			}
8473 			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
8474 			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
8475 		} else {
8476 			struct user32_timeval tv[2];
8477 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
8478 			if (error) {
8479 				return error;
8480 			}
8481 			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
8482 			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
8483 		}
8484 	}
8485 	return 0;
8486 }
8487 
/*
 * Apply access/modify times ts[0]/ts[1] to an iocounted vnode.  'nullflag'
 * is non-zero when the caller supplied no explicit times (utimes(NULL)),
 * which relaxes the permission model via VA_UTIMES_NULL.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	/* Explicit times that fail authorization surface as EPERM, not EACCES. */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Only notify MAC modules once the time change has succeeded. */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8544 
/*
 * Set the access and modification times of a file.
 */
/* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Need the parent directory vnode to break a directory lease on it. */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* ni_dvp was only requested (WANTPARENT) when leases are configured. */
#if CONFIG_FILE_LEASES
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8597 
/*
 * Set the access and modification times of a file.
 */
/* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	/* Copy in (or synthesize) the timestamps before touching the fd. */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	/* Resolve the descriptor to a vnode (takes a file reference on fd). */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease placed on the parent directory before the change. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);

	/* Release in reverse order of acquisition: iocount, then fd ref. */
	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8633 
8634 static int
truncate_validate_common(proc_t p,off_t length)8635 truncate_validate_common(proc_t p, off_t length)
8636 {
8637 	rlim_t fsize_limit;
8638 
8639 	if (length < 0) {
8640 		return EINVAL;
8641 	}
8642 
8643 	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8644 	if ((rlim_t)length > fsize_limit) {
8645 		psignal(p, SIGXFSZ);
8646 		return EFBIG;
8647 	}
8648 
8649 	return 0;
8650 }
8651 
/*
 * Set the data size of an iocounted vnode to 'length'.  'need_auth' selects
 * whether a fresh vnode_authorize pass is required (true for the path-based
 * truncate(2); false for ftruncate(2), which was authorized at open time).
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open.  We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Only notify MAC modules once the truncate has succeeded. */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8702 
8703 /*
8704  * Truncate a file given its path name.
8705  */
8706 /* ARGSUSED */
8707 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8708 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8709 {
8710 	vfs_context_t ctx = vfs_context_current();
8711 	vnode_t vp;
8712 	int error;
8713 	struct nameidata nd;
8714 
8715 	if ((error = truncate_validate_common(p, uap->length))) {
8716 		return error;
8717 	}
8718 
8719 	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8720 	    UIO_USERSPACE, uap->path, ctx);
8721 
8722 	if ((error = namei(&nd))) {
8723 		return error;
8724 	}
8725 
8726 	vp = nd.ni_vp;
8727 	nameidone(&nd);
8728 
8729 	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8730 	vnode_put(vp);
8731 
8732 	return error;
8733 }
8734 
/*
 * Truncate a file given a file descriptor.
 */
/* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	struct vnode_attr va;
	vnode_t vp = NULLVP;
	struct fileproc *fp;
	bool need_vnode_put = false;    /* true once vnode_getwithref succeeds */
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	/* Take a reference on the fileproc; dropped via file_drop at 'out'. */
	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	/* POSIX shared memory objects are truncated by pshm, not the VFS. */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* ftruncate requires the descriptor to have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}
	need_vnode_put = true;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_flags);

	error = vnode_getattr(vp, &va, vfs_context_current());
	if (error) {
		goto out;
	}

	/* Don't allow ftruncate if the file has append-only flag set. */
	if (va.va_flags & APPEND) {
		error = EPERM;
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* need_auth == false: write access was authorized at open time. */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	if (!error) {
		fp->fp_glob->fg_flag |= FWASWRITTEN;
	}

out:
	if (vp && need_vnode_put) {
		vnode_put(vp);
	}

	file_drop(uap->fd);
	return error;
}
8812 
8813 
8814 /*
8815  * Sync an open file with synchronized I/O _file_ integrity completion
8816  */
8817 /* ARGSUSED */
8818 int
fsync(proc_t p,struct fsync_args * uap,__unused int32_t * retval)8819 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8820 {
8821 	__pthread_testcancel(1);
8822 	return fsync_common(p, uap, MNT_WAIT);
8823 }
8824 
8825 
8826 /*
8827  * Sync an open file with synchronized I/O _file_ integrity completion
8828  *
8829  * Notes:	This is a legacy support function that does not test for
8830  *		thread cancellation points.
8831  */
8832 /* ARGSUSED */
8833 int
fsync_nocancel(proc_t p,struct fsync_nocancel_args * uap,__unused int32_t * retval)8834 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8835 {
8836 	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
8837 }
8838 
8839 
8840 /*
8841  * Sync an open file with synchronized I/O _data_ integrity completion
8842  */
8843 /* ARGSUSED */
8844 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)8845 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8846 {
8847 	__pthread_testcancel(1);
8848 	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
8849 }
8850 
8851 
/*
 * fsync_common
 *
 * Common fsync code to support both synchronized I/O file integrity completion
 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
 *
 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
 * will only guarantee that the file data contents are retrievable.  If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
 * includes additional metadata unnecessary for retrieving the file data
 * contents, such as atime, mtime, ctime, etc., also be committed to stable
 * storage.
 *
 * Parameters:	p				The process
 *		uap->fd				The descriptor to synchronize
 *		flags				The data integrity flags
 *
 * Returns:	int				Success
 *	fp_getfvp:EBADF				Bad file descriptor
 *	fp_getfvp:ENOTSUP			fd does not refer to a vnode
 *	VNOP_FSYNC:???				unspecified
 *
 * Notes:	We use struct fsync_args because it is a short name, and all
 *		caller argument structures are otherwise identical.
 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve the descriptor to fileproc + vnode (takes a file ref). */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	/* Release in reverse order of acquisition: iocount, then fd ref. */
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8914 
/*
 * Duplicate files.  Source must be a file, target must be a file or
 * must not exist.
 *
 * XXX Copyfile authorisation checking is woefully inadequate, and will not
 *     perform inheritance correctly.
 */
/* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source first; auditing wants its path. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/* Destination lookup keeps the parent (and start dir via SAVESTART). */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing destination is only acceptable with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 *
	 * error == -1 is an internal "success, no-op" sentinel translated to
	 * 0 at the bottom of this function.
	 */
	if (fvp == tvp) {
		error = -1;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	if (error == -1) {
		return 0;
	}
	return error;
}
9029 
9030 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
9031 
/*
 * Helper function for doing clones. The caller is expected to provide an
 * iocounted source vnode and release it.
 *
 * 'data_read_authorised' is true when the caller (fclonefileat) already
 * holds read rights on the source via its open descriptor, in which case
 * KAUTH_VNODE_READ_DATA is not re-checked here.
 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata *tondp = NULL;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted = 0;
	/* Both vnode_attrs live in a single heap allocation (va2p). */
	struct {
		struct vnode_attr va[2];
	} *va2p = NULL;
	struct vnode_attr *vap = NULL;   /* attributes read from the source */
	struct vnode_attr *nvap = NULL;  /* attributes applied to the clone */
	uint32_t vnop_flags;

	/* Only regular files, symlinks and (non-root) directories clone. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
	/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	tondp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(tondp, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if (flags & CLONE_NOFOLLOW_ANY) {
		tondp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (flags & CLONE_RESOLVE_BENEATH) {
		tondp->ni_flag |= NAMEI_RESOLVE_BENEATH;
	}

	if ((error = nameiat(tondp, dst_dirfd))) {
		kfree_type(struct nameidata, tondp);
		return error;
	}
	cnp = &tondp->ni_cnd;
	tdvp = tondp->ni_dvp;
	tvp = tondp->ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* The destination must not already exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Cloning is only supported within a single mount. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	va2p = kalloc_type(typeof(*va2p), Z_WAITOK | Z_NOFAIL);
	vap = &va2p->va[0];
	nvap = &va2p->va[1];

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACLs unless the CLONE_ACL
	 * flag is specified. By default, the clone file will inherit the target
	 * directory's ACLs unless the CLONE_ACL flag is specified then it
	 * will inherit the source file's ACLs instead.
	 */
	VATTR_INIT(vap);
	VATTR_WANTED(vap, va_uid);
	VATTR_WANTED(vap, va_gid);
	VATTR_WANTED(vap, va_mode);
	VATTR_WANTED(vap, va_flags);
	if (flags & CLONE_ACL) {
		VATTR_WANTED(vap, va_acl);
	}

	if ((error = vnode_getattr(fvp, vap, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(nvap);
	VATTR_SET(nvap, va_type, v_type);
	if (VATTR_IS_SUPPORTED(vap, va_acl) && vap->va_acl != NULL) {
		VATTR_SET(nvap, va_acl, vap->va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, nvap, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, nvap, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(vap, va_uid)) {
			VATTR_SET(nvap, va_uid, vap->va_uid);
		}
		if (VATTR_IS_SUPPORTED(vap, va_gid)) {
			VATTR_SET(nvap, va_gid, vap->va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(vap, va_mode)) {
		VATTR_SET(nvap, va_mode, vap->va_mode);
	}
	if (VATTR_IS_SUPPORTED(vap, va_flags)) {
		VATTR_SET(nvap, va_flags,
		    ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, nvap, vnop_flags, ctx);

	if (!error && tvp) {
		int     update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(nvap)) {
			(void)vnode_setattr_fallback(tvp, nvap, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
		/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(nvap, defaulted);
	}
	if (free_src_acl && vap->va_acl) {
		kauth_acl_free(vap->va_acl);
	}
	if (va2p) {
		kfree_type(typeof(*va2p), va2p);
	}
	/* nameidone before dropping the iocounts namei gave us. */
	nameidone(tondp);
	kfree_type(struct nameidata, tondp);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
9285 
/*
 * clone files or directories, target must not exist.
 */
/* ARGSUSED */
int
clonefileat(__unused proc_t p, struct clonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct nameidata *ndp = NULL;
	int follow;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
	    CLONE_NOFOLLOW_ANY | CLONE_RESOLVE_BENEATH)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_dirfd);

	/* nameidata is large; heap-allocate it rather than using the stack. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(ndp, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
	    UIO_USERSPACE, uap->src, ctx);
	if (uap->flags & CLONE_NOFOLLOW_ANY) {
		ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (uap->flags & CLONE_RESOLVE_BENEATH) {
		ndp->ni_flag |= NAMEI_RESOLVE_BENEATH;
	}

	if ((error = nameiat(ndp, uap->src_dirfd))) {
		kfree_type(struct nameidata, ndp);
		return error;
	}

	/* Keep the iocount on fvp; lookup state can be released right away. */
	fvp = ndp->ni_vp;
	nameidone(ndp);
	kfree_type(struct nameidata, ndp);

	/* FALSE: read access on the source has not been pre-authorized. */
	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
	return error;
}
9335 
/*
 * Clone the file referenced by an open (readable) descriptor to a new
 * path relative to dst_dirfd; target must not exist.
 */
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
	    CLONE_NOFOLLOW_ANY | CLONE_RESOLVE_BENEATH)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	/* Resolve the descriptor (takes a file reference on src_fd). */
	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
	if (error) {
		return error;
	}

	/* The source descriptor must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if ((error = vnode_getwithref(fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	/* TRUE: read-data access was already authorized at open time. */
	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
9377 
9378 static int
rename_submounts_callback(mount_t mp,void * arg)9379 rename_submounts_callback(mount_t mp, void *arg)
9380 {
9381 	char *prefix = (char *)arg;
9382 	int prefix_len = (int)strlen(prefix);
9383 	int error = 0;
9384 
9385 	if (strncmp(mp->mnt_vfsstat.f_mntonname, prefix, prefix_len) != 0) {
9386 		return 0;
9387 	}
9388 
9389 	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
9390 		return 0;
9391 	}
9392 
9393 	if ((error = vfs_busy(mp, LK_NOWAIT))) {
9394 		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
9395 		return -1;
9396 	}
9397 
9398 	size_t pathlen = MAXPATHLEN;
9399 	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
9400 		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
9401 	}
9402 
9403 	vfs_unbusy(mp);
9404 
9405 	return error;
9406 }
9407 
9408 /*
9409  * Rename files.  Source and destination must either both be directories,
9410  * or both not be directories.  If target is a directory, it must be empty.
9411  */
9412 /* ARGSUSED */
/*
 * renameat_internal: common backend for rename(2), renameat(2) and
 * renameatx_np(2).
 *
 * Parameters:	ctx	Context of the caller
 *		fromfd	Directory fd the source path is resolved against
 *		from	Source path (in 'segflg' address space)
 *		tofd	Directory fd the destination path is resolved against
 *		to	Destination path (in 'segflg' address space)
 *		segflg	UIO segment flag for both paths
 *		uflags	RENAME_* flags; the VFS_RENAME_FLAGS_MASK subset is
 *			passed down to the filesystem
 *
 * Returns:	0	Success
 *		!0	errno
 */
static int
renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
    int tofd, user_addr_t to, int segflg, u_int uflags)
{
	vnode_t tvp, tdvp;
	vnode_t fvp, fdvp;
	vnode_t mnt_fvp;
	struct nameidata *fromnd, *tond;
	int error = 0;
	int do_retry;
	int retry_count;
	int mntrename;
	int dirrename;
	int need_event;
	int need_kpath2;
	int has_listeners;
	const char *oname = NULL;
	char *old_dirpath = NULL, *from_name = NULL, *to_name = NULL;
	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
	int from_len = 0, to_len = 0;
	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
	int holding_mntlock;
	int vn_authorize_skipped;
	mount_t locked_mp = NULL;
	vnode_t oparent = NULLVP;
	vnode_t locked_vp = NULLVP;
#if CONFIG_FSE
	fse_info from_finfo = {}, to_finfo;
#endif
	int from_truncated = 0, to_truncated = 0;
	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
	int batched = 0;
	struct vnode_attr *fvap, *tvap;
	int continuing = 0;
	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
	int32_t nofollow_any = 0;
	int32_t resolve_beneath = 0;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node, to_node;
		struct vnode_attr fv_attr, tv_attr;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	holding_mntlock = 0;
	do_retry = 0;
	retry_count = 0;
	/*
	 * Re-entry point: the whole operation is re-driven from here
	 * whenever 'do_retry' was set at the bottom (lookup races,
	 * ERECYCLE from the FS, dataless materialization, or to retry
	 * with the mount rename lock held).
	 */
retry:
	fvp = tvp = NULL;
	fdvp = tdvp = NULL;
	fvap = tvap = NULL;
	mnt_fvp = NULLVP;
	mntrename = dirrename = FALSE;
	vn_authorize_skipped = FALSE;

	if (uflags & RENAME_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
	}
	if (uflags & RENAME_RESOLVE_BENEATH) {
		resolve_beneath = NAMEI_RESOLVE_BENEATH;
	}
	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
	    segflg, from, ctx);
	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any | resolve_beneath;

	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    segflg, to, ctx);
	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any | resolve_beneath;

	/*
	 * Re-entry point for compound-VNOP continuation: the FS returned
	 * EKEEPLOOKING and set NAMEI_CONTLOOKUP on one (or both) of the
	 * nameidata structures.
	 */
continue_lookup:
	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(fromnd, fromfd))) {
			goto out1;
		}
		fdvp = fromnd->ni_dvp;
		fvp  = fromnd->ni_vp;

		if (fvp && fvp->v_type == VDIR) {
			tond->ni_cnd.cn_flags |= WILLBEDIR;
#if defined(XNU_TARGET_OS_OSX)
			dirrename = TRUE;
#endif
		}
	}

	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(tond, tofd))) {
			/*
			 * Translate error code for rename("dir1", "dir2/.").
			 */
			if (error == EISDIR && fvp->v_type == VDIR) {
				error = EINVAL;
			}
			goto out1;
		}
		tdvp = tond->ni_dvp;
		tvp  = tond->ni_vp;
	}

#if DEVELOPMENT || DEBUG
	/*
	 * XXX VSWAP: Check for entitlements or special flag here
	 * so we can restrict access appropriately.
	 */
#else /* DEVELOPMENT || DEBUG */

	/* Only the kernel context may rename swap files. */
	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}

	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* RENAME_SWAP requires both endpoints to exist. */
	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
		error = ENOENT;
		goto out1;
	}

	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
		int32_t pval = 0;
		int err = 0;

		/*
		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
		 * has the same name as target iff the following conditions are met:
		 * 1. the target file system is case insensitive
		 * 2. source and target directories are the same
		 * 3. source and target files are the same
		 * 4. name only differs in case (determined by underlying filesystem)
		 */
		if (fvp != tvp || fdvp != tdvp) {
			error = EEXIST;
			goto out1;
		}

		/*
		 * Assume that the target file system is case sensitive if
		 * _PC_CASE_SENSITIVE selector isn't supported.
		 */
		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
		if (err != 0 || pval != 0) {
			error = EEXIST;
			goto out1;
		}
	}

	batched = vnode_compound_rename_available(fdvp);

#if CONFIG_FSE
	need_event = need_fsevent(FSE_RENAME, fdvp);
	if (need_event) {
		if (fvp) {
			get_fse_info(fvp, &from_finfo, ctx);
		} else {
			/* Compound case: ask the FS to fill attributes during the VNOP. */
			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
			if (error) {
				goto out1;
			}

			fvap = &__rename_data->fv_attr;
		}

		if (tvp) {
			get_fse_info(tvp, &to_finfo, ctx);
		} else if (batched) {
			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
			if (error) {
				goto out1;
			}

			tvap = &__rename_data->tv_attr;
		}
	}
#else
	need_event = 0;
#endif /* CONFIG_FSE */

	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Build the full source path only if someone will consume it. */
	if (need_event || has_listeners) {
		if (from_name == NULL) {
			GET_PATH(from_name);
		}

		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);

		if (from_name_no_firmlink == NULL) {
			GET_PATH(from_name_no_firmlink);
		}

		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
	}

	if (need_event || need_kpath2 || has_listeners) {
		if (to_name == NULL) {
			GET_PATH(to_name);
		}

		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);

		if (to_name_no_firmlink == NULL) {
			GET_PATH(to_name_no_firmlink);
		}

		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
		if (to_name && need_kpath2) {
			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
		}
	}
	if (!fvp) {
		/*
		 * Claim: this check will never reject a valid rename.
		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
		 * Suppose fdvp and tdvp are not on the same mount.
		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
		 *      then you can't move it to within another dir on the same mountpoint.
		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
		 *
		 * If this check passes, then we are safe to pass these vnodes to the same FS.
		 */
		if (fdvp->v_mount != tdvp->v_mount) {
			error = EXDEV;
			goto out1;
		}
		goto skipped_lookup;
	}

	/*
	 * If the source and destination are the same (i.e. they're
	 * links to the same vnode) and the target file system is
	 * case sensitive, then there is nothing to do.
	 *
	 * XXX Come back to this.
	 */
	if (fvp == tvp) {
		int pathconf_val;

		/*
		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
		 * then assume that this file system is case sensitive.
		 */
		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
		    pathconf_val != 0) {
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	/*
	 * Allow the renaming of mount points.
	 * - target must not exist
	 * - target must reside in the same directory as source
	 * - union mounts cannot be renamed
	 * - the root fs, and tightly-linked system volumes, cannot be renamed
	 *
	 * XXX Handle this in VFS after a continued lookup (if we missed
	 * in the cache to start off)
	 *
	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
	 * we'll skip past here.  The file system is responsible for
	 * checking that @tvp is not a descendent of @fvp and vice versa
	 * so it should always return EINVAL if either @tvp or @fvp is the
	 * root of a volume.
	 */
	if ((fvp->v_flag & VROOT) &&
	    (fvp->v_type == VDIR) &&
	    (tvp == NULL) &&
	    (fvp->v_mountedhere == NULL) &&
	    (fdvp == tdvp) &&
	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
		vnode_t coveredvp;

		/* switch fvp to the covered vnode */
		coveredvp = fvp->v_mount->mnt_vnodecovered;
		if ((vnode_getwithref(coveredvp))) {
			error = ENOENT;
			goto out1;
		}
		/*
		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
		 * later.
		 */
		mnt_fvp = fvp;

		fvp = coveredvp;
		mntrename = TRUE;
	}
	/*
	 * Check for cross-device rename.
	 * For rename on mountpoint, we want to also check the source and its parent
	 * belong to the same mountpoint.
	 */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (fvp->v_mount != fdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		error = EXDEV;
		goto out1;
	}

	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do...
	 * EXCEPT if the underlying file system supports case
	 * insensitivity and is case preserving.  In this case
	 * the file system needs to handle the special case of
	 * getting the same vnode as target (fvp) and source (tvp).
	 *
	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
	 * and _PC_CASE_PRESERVING can have this exception, and they need to
	 * handle the special case of getting the same vnode as target and
	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
	 * so not to cause locking problems. There is a single reference on tvp.
	 *
	 * NOTE - that fvp == tvp also occurs if they are hard linked and
	 * that correct behaviour then is just to return success without doing
	 * anything.
	 *
	 * XXX filesystem should take care of this itself, perhaps...
	 */
	if (fvp == tvp && fdvp == tdvp) {
		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
		    fromnd->ni_cnd.cn_namelen)) {
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	if (holding_mntlock && fvp->v_mount != locked_mp) {
		/*
		 * we're holding a reference and lock
		 * on locked_mp, but it no longer matches
		 * what we want to do... so drop our hold
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp != fdvp && fvp->v_type == VDIR) {
		/*
		 * serialize renames that re-shape
		 * the tree... if holding_mntlock is
		 * set, then we're ready to go...
		 * otherwise we
		 * first need to drop the iocounts
		 * we picked up, second take the
		 * lock to serialize the access,
		 * then finally start the lookup
		 * process over with the lock held
		 */
		if (!holding_mntlock) {
			/*
			 * need to grab a reference on
			 * the mount point before we
			 * drop all the iocounts... once
			 * the iocounts are gone, the mount
			 * could follow
			 */
			locked_mp = fvp->v_mount;
			mount_ref(locked_mp, 0);

			/*
			 * nameidone has to happen before we vnode_put(tvp)
			 * since it may need to release the fs_nodelock on the tvp
			 */
			nameidone(tond);

			if (tvp) {
				vnode_put(tvp);
			}
			vnode_put(tdvp);

			/*
			 * nameidone has to happen before we vnode_put(fdvp)
			 * since it may need to release the fs_nodelock on the fvp
			 */
			nameidone(fromnd);

			vnode_put(fvp);
			vnode_put(fdvp);

			if (mnt_fvp != NULLVP) {
				vnode_put(mnt_fvp);
			}

			mount_lock_renames(locked_mp);
			holding_mntlock = 1;

			goto retry;
		}
	} else {
		/*
		 * when we dropped the iocounts to take
		 * the lock, we allowed the identity of
		 * the various vnodes to change... if they did,
		 * we may no longer be dealing with a rename
		 * that reshapes the tree... once we're holding
		 * the iocounts, the vnodes can't change type
		 * so we're free to drop the lock at this point
		 * and continue on
		 */
		if (holding_mntlock) {
			mount_unlock_renames(locked_mp);
			mount_drop(locked_mp, 0);
			holding_mntlock = 0;
		}
	}

	if (!batched) {
		/* Non-compound path: authorize here; hold the link lock across the VNOP. */
		assert(locked_vp == NULLVP);
		vnode_link_lock(fvp);
		locked_vp = fvp;
		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error) {
			if (error == ENOENT) {
				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
					/*
					 * We encountered a race where after doing the namei,
					 * tvp stops being valid. If so, simply re-drive the rename
					 * call from the top.
					 */
					do_retry = 1;
					retry_count += 1;
				}
			}
			goto out1;
		}
	}

	/* Release the 'mnt_fvp' now that it is no longer needed. */
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
		mnt_fvp = NULLVP;
	}

	// save these off so we can later verify that fvp is the same
	oname   = fvp->v_name;
	oparent = fvp->v_parent;

	/*
	 * If renaming a directory, stash its path which we need later when
	 * updating the 'f_mntonname' of sub mounts.
	 */
	if (dirrename) {
		int pathlen = MAXPATHLEN;

		old_dirpath = zalloc(ZV_NAMEI);
		error = vn_getpath_fsenter(fvp, old_dirpath, &pathlen);
		if (error) {
			/*
			 * Process that supports long path (opt-in to IO policy
			 * IOPOL_TYPE_VFS_SUPPORT_LONG_PATHS) can have directory with path
			 * length up to MAXLONGPATHLEN (8192). Since max path length in
			 * mount's 'f_mntonname' is MAXPATHLEN (1024), this means the
			 * directory can't be the parent of the sub mounts so we can just
			 * silently drop the error and skip the check to update the
			 * 'f_mntonname' of sub mounts.
			 */
			if (error == ENOSPC) {
				dirrename = false;
				error = 0;
				if (old_dirpath) {
					zfree(ZV_NAMEI, old_dirpath);
					old_dirpath = NULL;
				}
			} else {
				goto out1;
			}
		}
	}

skipped_lookup:
#if CONFIG_FILE_LEASES
	/* Lease break needed for source's parent dir? */
	vnode_breakdirlease(fdvp, false, O_WRONLY);

	/* Lease break needed for target's parent dir? */
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
	    tdvp, &tvp, &tond->ni_cnd, tvap,
	    flags, ctx);

	if (locked_vp) {
		vnode_link_unlock(fvp);
		locked_vp = NULLVP;
	}

	if (holding_mntlock) {
		/*
		 * we can drop our serialization
		 * lock now
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (error) {
		if (error == EDATALESS) {
			/*
			 * If we've been here before, something has gone
			 * horribly wrong and we should just get out lest
			 * we spiral around the drain forever.
			 */
			if (flags & VFS_RENAME_DATALESS) {
				error = EIO;
				goto out1;
			}

			/*
			 * The object we're renaming is dataless (or has a
			 * dataless descendent) and requires materialization
			 * before the rename occurs.  But we're holding the
			 * mount point's rename lock, so it's not safe to
			 * make the upcall.
			 *
			 * In this case, we release the lock (above), perform
			 * the materialization, and start the whole thing over.
			 */
			error = vfs_materialize_reparent(fvp, tdvp);
			if (error == 0) {
				/*
				 * The next time around we need to tell the
				 * file system that the materializtaion has
				 * been performed.
				 */
				flags |= VFS_RENAME_DATALESS;
				do_retry = 1;
			}
			goto out1;
		}
		if (error == EKEEPLOOKING) {
			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
				}
			}

			fromnd->ni_vp = fvp;
			tond->ni_vp = tvp;

			goto continue_lookup;
		}

		/*
		 * We may encounter a race in the VNOP where the destination didn't
		 * exist when we did the namei, but it does by the time we go and
		 * try to create the entry. In this case, we should re-drive this rename
		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
		 * but other filesystems susceptible to this race could return it, too.
		 */
		if (error == ERECYCLE) {
			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			} else {
				printf("rename retry limit due to ERECYCLE reached\n");
				error = ENOENT;
			}
		}

		/*
		 * For compound VNOPs, the authorization callback may return
		 * ENOENT in case of racing hardlink lookups hitting the name
		 * cache, redrive the lookup.
		 */
		if (batched && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}

		goto out1;
	}

	/* call out to allow 3rd party notification of rename.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	kauth_authorize_fileop(vfs_context_ucred(ctx),
	    KAUTH_FILEOP_RENAME,
	    (uintptr_t)from_name, (uintptr_t)to_name);
	if (flags & VFS_RENAME_SWAP) {
		/* A swap is effectively a rename in both directions. */
		kauth_authorize_fileop(vfs_context_ucred(ctx),
		    KAUTH_FILEOP_RENAME,
		    (uintptr_t)to_name, (uintptr_t)from_name);
	}

#if CONFIG_FSE
	if (from_name != NULL && to_name != NULL) {
		if (from_truncated || to_truncated) {
			// set it here since only the from_finfo gets reported up to user space
			from_finfo.mode |= FSE_TRUNCATED_PATH;
		}

		if (tvap && tvp) {
			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
		}
		if (fvap) {
			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
		}

		if (tvp) {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_FINFO, &to_finfo,
			    FSE_ARG_DONE);
			if (flags & VFS_RENAME_SWAP) {
				/*
				 * Strictly speaking, swap is the equivalent of
				 * *three* renames.  FSEvents clients should only take
				 * the events as a hint, so we only bother reporting
				 * two.
				 */
				add_fsevent(FSE_RENAME, ctx,
				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
				    FSE_ARG_FINFO, &to_finfo,
				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
				    FSE_ARG_FINFO, &from_finfo,
				    FSE_ARG_DONE);
			}
		} else {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_DONE);
		}
	}
#endif /* CONFIG_FSE */

	/*
	 * update filesystem's mount point data
	 */
	if (mntrename) {
		char *cp, *pathend, *mpname;
		char * tobuf;
		struct mount *mp;
		int maxlen;
		size_t len = 0;

		mp = fvp->v_mountedhere;

		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EBUSY;
			goto out1;
		}
		tobuf = zalloc(ZV_NAMEI);

		if (UIO_SEG_IS_USER_SPACE(segflg)) {
			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
		} else {
			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
		}
		if (!error) {
			/* find current mount point prefix */
			pathend = &mp->mnt_vfsstat.f_mntonname[0];
			for (cp = pathend; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					pathend = cp + 1;
				}
			}
			/* find last component of target name */
			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					mpname = cp + 1;
				}
			}

			/* Update f_mntonname of sub mounts */
			vfs_iterate(0, rename_submounts_callback,
			    (void *)mp->mnt_vfsstat.f_mntonname);

			/* append name to prefix */
			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
			bzero(pathend, maxlen);

			strlcpy(pathend, mpname, maxlen);
		}
		zfree(ZV_NAMEI, tobuf);

		vfs_unbusy(mp);

		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
	} else if (dirrename) {
		/*
		 * If we renamed a directory, we need to check if there is any sub
		 * mount(s) mounted under the directory. If so, then we need to update
		 * the sub mount's f_mntonname path.
		 */
		vfs_iterate(0, rename_submounts_callback, (void *)old_dirpath);
	}

	/*
	 * fix up name & parent pointers.  note that we first
	 * check that fvp has the same name/parent pointers it
	 * had before the rename call... this is a 'weak' check
	 * at best...
	 *
	 * XXX oparent and oname may not be set in the compound vnop case
	 */
	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
		int update_flags;

		update_flags = VNODE_UPDATE_NAME;

		if (fdvp != tdvp) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
	}
out1:
	/*
	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
	 * skipped earlier as no actual rename was performed.
	 */
	if (vn_authorize_skipped && error == 0) {
		error = vn_authorize_renamex_with_paths(fdvp, fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}
	}
	if (locked_vp) {
		assert(locked_vp == fvp);
		vnode_link_unlock(locked_vp);
		locked_vp = NULLVP;
	}
	if (to_name != NULL) {
		RELEASE_PATH(to_name);
		to_name = NULL;
	}
	if (to_name_no_firmlink != NULL) {
		RELEASE_PATH(to_name_no_firmlink);
		to_name_no_firmlink = NULL;
	}
	if (from_name != NULL) {
		RELEASE_PATH(from_name);
		from_name = NULL;
	}
	if (from_name_no_firmlink != NULL) {
		RELEASE_PATH(from_name_no_firmlink);
		from_name_no_firmlink = NULL;
	}
	if (old_dirpath != NULL) {
		zfree(ZV_NAMEI, old_dirpath);
		old_dirpath = NULL;
	}
	if (holding_mntlock) {
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp) {
		/*
		 * nameidone has to happen before we vnode_put(tdvp)
		 * since it may need to release the fs_nodelock on the tdvp
		 */
		nameidone(tond);

		if (tvp) {
			vnode_put(tvp);
		}
		vnode_put(tdvp);
	}
	if (fdvp) {
		/*
		 * nameidone has to happen before we vnode_put(fdvp)
		 * since it may need to release the fs_nodelock on the fdvp
		 */
		nameidone(fromnd);

		if (fvp) {
			vnode_put(fvp);
		}
		vnode_put(fdvp);
	}
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
	}
	/*
	 * If things changed after we did the namei, then we will re-drive
	 * this rename call from the top.
	 */
	if (do_retry) {
		do_retry = 0;
		goto retry;
	}

	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
10231 
10232 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)10233 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
10234 {
10235 	return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
10236 	           AT_FDCWD, uap->to, UIO_USERSPACE, 0);
10237 }
10238 
10239 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)10240 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
10241 {
10242 	if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY | RENAME_RESOLVE_BENEATH)) {
10243 		return EINVAL;
10244 	}
10245 
10246 	if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
10247 		return EINVAL;
10248 	}
10249 
10250 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
10251 	           uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
10252 }
10253 
10254 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)10255 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
10256 {
10257 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
10258 	           uap->tofd, uap->to, UIO_USERSPACE, 0);
10259 }
10260 
10261 /*
10262  * Make a directory file.
10263  *
10264  * Returns:	0			Success
10265  *		EEXIST
10266  *	namei:???
10267  *	vnode_authorize:???
10268  *	vn_create:???
10269  */
10270 /* ARGSUSED */
/*
 * mkdir1at: create a directory at 'path', resolved relative to 'fd', with
 * the attributes in 'vap'.  Supports compound-mkdir filesystems via the
 * NAMEI_COMPOUNDMKDIR / EKEEPLOOKING continuation protocol.
 *
 * Parameters:	ctx	Context of the caller
 *		path	Path of the directory to create (in 'segflg' space)
 *		vap	Attributes to apply (va_type is forced to VDIR here)
 *		fd	Directory fd the path is resolved against
 *		segflg	UIO segment flag for 'path'
 *
 * Returns:	0	Success
 *		EEXIST	An entry already exists at 'path'
 *		!0	Errors from nameiat/vn_authorize_mkdir/vn_create
 */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

	/* Re-entered when a compound VNOP returns EKEEPLOOKING below. */
continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry writes the parent directory; break its lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		/* Compound VNOP asked for a continued lookup; go around again. */
		if (error == EKEEPLOOKING) {
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
10386 
10387 /*
10388  * mkdir_extended: Create a directory; with extended security (ACL).
10389  *
10390  * Parameters:    p                       Process requesting to create the directory
10391  *                uap                     User argument descriptor (see below)
10392  *                retval                  (ignored)
10393  *
10394  * Indirect:      uap->path               Path of directory to create
10395  *                uap->mode               Access permissions to set
10396  *                uap->xsecurity          ACL to set
10397  *
10398  * Returns:        0                      Success
10399  *                !0                      Not success
10400  *
10401  */
10402 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)10403 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
10404 {
10405 	int ciferror;
10406 	kauth_filesec_t xsecdst;
10407 	struct vnode_attr va;
10408 
10409 	AUDIT_ARG(owner, uap->uid, uap->gid);
10410 
10411 	xsecdst = NULL;
10412 	if ((uap->xsecurity != USER_ADDR_NULL) &&
10413 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
10414 		return ciferror;
10415 	}
10416 
10417 	VATTR_INIT(&va);
10418 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10419 	if (xsecdst != NULL) {
10420 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
10421 		va.va_vaflags |= VA_FILESEC_ACL;
10422 	}
10423 
10424 	ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10425 	    UIO_USERSPACE);
10426 	if (xsecdst != NULL) {
10427 		kauth_filesec_free(xsecdst);
10428 	}
10429 	return ciferror;
10430 }
10431 
10432 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)10433 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
10434 {
10435 	struct vnode_attr va;
10436 
10437 	VATTR_INIT(&va);
10438 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10439 
10440 	return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10441 	           UIO_USERSPACE);
10442 }
10443 
10444 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)10445 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
10446 {
10447 	struct vnode_attr va;
10448 
10449 	VATTR_INIT(&va);
10450 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10451 
10452 	return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
10453 	           UIO_USERSPACE);
10454 }
10455 
/*
 * rmdirat_internal: remove the directory named by 'dirpath', resolved
 * relative to 'fd' (or the CWD for AT_FDCWD), in segment 'segflg'.
 *
 * unlink_flags may carry VNODE_REMOVE_NOFOLLOW_ANY and
 * VNODE_REMOVE_RESOLVE_BENEATH (translated to namei flags and stripped
 * here) as well as VNODE_REMOVE_DATALESS_DIR (handed to the filesystem
 * via vn_remove() when a dataless directory reports ENOTEMPTY).
 *
 * Returns 0 on success or an errno.
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/*
	 * nameidata (and the fsevents vnode_attr) are large; heap-allocate
	 * them together rather than burning kernel stack.
	 */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char     *path = NULL;
	char     *no_firmlink_path = NULL;
	int       len_path = 0;
	int       len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;
	int batched;

	int restart_flag;
	int nofollow_any = 0;
	int resolve_beneath = 0;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/* Translate lookup-policy flags into namei flags and strip them. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}
	if (unlink_flags & VNODE_REMOVE_RESOLVE_BENEATH) {
		resolve_beneath = NAMEI_RESOLVE_BENEATH;
		unlink_flags &= ~VNODE_REMOVE_RESOLVE_BENEATH;
	}

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any | resolve_beneath;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete.  Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: the lookup deferred to a compound rmdir VNOP. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info  finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Compound case: ask the FS which attrs fsevents needs. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		/* vn_rmdir() may update vp (e.g. a compound VNOP looked it up). */
		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * Only do this if the directory is actually
			 * marked as DATALESS.
			 */
			struct vnode_attr *lvap =
			    kalloc_type(struct vnode_attr, Z_WAITOK);

			VATTR_INIT(lvap);
			VATTR_WANTED(lvap, va_flags);
			if (vnode_getattr(vp, lvap, ctx) == 0 &&
			    VATTR_IS_SUPPORTED(lvap, va_flags) &&
			    (lvap->va_flags & SF_DATALESS) != 0) {
				/*
				 * If this fails, we want to keep the original
				 * error.
				 */
				if (vn_remove(dvp, &vp, ndp,
				    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
					error = 0;
				}
			}
			kfree_type(struct vnode_attr, lvap);
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		/*
		 * wakeup/tsleep on vp's address pairs racing removers of the
		 * same directory during the AppleDouble-orphan restart dance;
		 * the vnode references are already dropped — only the address
		 * is used as a sleep channel.
		 */
		if (restart_flag == 0) {
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10762 
10763 /*
10764  * Remove a directory file.
10765  */
10766 /* ARGSUSED */
int
rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
{
	/* rmdir(2): remove uap->path relative to the CWD, with no unlink flags. */
	return rmdirat_internal(vfs_context_current(), AT_FDCWD,
	           CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
}
10773 
/*
 * Get direntry length padded to 8 byte alignment.
 * sizeof(struct direntry) includes a MAXPATHLEN-sized name field, so
 * subtract the unused portion (MAXPATHLEN-1 minus namlen) and round up.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 * Same idea: sizeof(struct dirent) embeds a (__DARWIN_MAXNAMLEN + 1)-byte
 * name buffer; account only for namelen + NUL, rounded up to 4 bytes.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent (address of its last byte, per d_reclen) */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10785 
10786 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10787 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10788     int *numdirent, vfs_context_t ctxp)
10789 {
10790 	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
10791 	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10792 	    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10793 		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10794 	} else {
10795 		size_t bufsize;
10796 		void * bufptr;
10797 		uio_t auio;
10798 		struct direntry *entry64;
10799 		struct dirent *dep;
10800 		size_t bytesread;
10801 		int error;
10802 
10803 		/*
10804 		 * We're here because the underlying file system does not
10805 		 * support direnties or we mounted denying support so we must
10806 		 * fall back to dirents and convert them to direntries.
10807 		 *
10808 		 * Our kernel buffer needs to be smaller since re-packing will
10809 		 * expand each dirent.  The worse case (when the name length
10810 		 * is 3 or less) corresponds to a struct direntry size of 32
10811 		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10812 		 * (4-byte aligned).  So having a buffer that is 3/8 the size
10813 		 * will prevent us from reading more than we can pack.
10814 		 *
10815 		 * Since this buffer is wired memory, we will limit the
10816 		 * buffer size to a maximum of 32K. We would really like to
10817 		 * use 32K in the MIN(), but we use magic number 87371 to
10818 		 * prevent uio_resid() * 3 / 8 from overflowing.
10819 		 */
10820 		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10821 		bufptr = kalloc_data(bufsize, Z_WAITOK);
10822 		if (bufptr == NULL) {
10823 			return ENOMEM;
10824 		}
10825 
10826 		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10827 		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10828 		auio->uio_offset = uio->uio_offset;
10829 
10830 		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10831 
10832 		dep = (struct dirent *)bufptr;
10833 		bytesread = bufsize - uio_resid(auio);
10834 
10835 		entry64 = kalloc_type(struct direntry, Z_WAITOK);
10836 		/*
10837 		 * Convert all the entries and copy them out to user's buffer.
10838 		 */
10839 		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10840 			/* First check that the dirent struct up to d_name is within the buffer */
10841 			if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10842 			    /* Check that the length of the entire dirent is within the buffer */
10843 			    DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10844 			    /* Check that the actual length including the name doesn't exceed d_reclen */
10845 			    DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10846 				printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10847 				    vp->v_mount->mnt_vfsstat.f_mntonname,
10848 				    vp->v_name ? vp->v_name : "<unknown>");
10849 				error = EIO;
10850 				break;
10851 			}
10852 
10853 			size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
10854 
10855 			bzero(entry64, enbufsize);
10856 			/* Convert a dirent to a dirent64. */
10857 			entry64->d_ino = dep->d_ino;
10858 			entry64->d_seekoff = 0;
10859 			entry64->d_reclen = (uint16_t)enbufsize;
10860 			entry64->d_namlen = dep->d_namlen;
10861 			entry64->d_type = dep->d_type;
10862 			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10863 
10864 			/* Move to next entry. */
10865 			dep = (struct dirent *)((char *)dep + dep->d_reclen);
10866 
10867 			/* Copy entry64 to user's buffer. */
10868 			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10869 		}
10870 
10871 		/* Update the real offset using the offset we got from VNOP_READDIR. */
10872 		if (error == 0) {
10873 			uio->uio_offset = auio->uio_offset;
10874 		}
10875 		uio_free(auio);
10876 		kfree_data(bufptr, bufsize);
10877 		kfree_type(struct direntry, entry64);
10878 		return error;
10879 	}
10880 }
10881 
10882 #define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)
10883 
10884 /*
10885  * Read a block of directory entries in a file system independent format.
10886  */
/*
 * getdirentries_common: shared implementation for getdirentries(2) and
 * getdirentries64(2).
 *
 * Resolves 'fd' to a directory vnode, holds the per-fileglob offset lock
 * across the read, and fills the user buffer 'bufp'/'bufsize' via
 * VNOP_READDIR (or vnode_readdir64 when VNODE_READDIR_EXTENDED is set in
 * 'flags').  On success *bytesread is the number of bytes produced,
 * *offset is the directory offset BEFORE the read, and *eofflag reflects
 * end-of-directory.  For union mounts, an empty read on the upper layer
 * retargets the fd to the lower directory and retries.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	UIO_STACKBUF(uio_buf, 1);

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Take the offset lock, then re-check that the fd still refers to the
	 * vnode we resolved (a union-mount retarget below can swap it); if
	 * not, drop everything and retry the resolution.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Silently clamp oversized requests rather than failing them. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read starting at the fd's current offset; advance it afterwards. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: descend to the lower
	 * directory, point the fd at it, and read from the beginning there.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
11000 
11001 
11002 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)11003 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
11004 {
11005 	off_t offset;
11006 	ssize_t bytesread;
11007 	int error, eofflag;
11008 
11009 	AUDIT_ARG(fd, uap->fd);
11010 	error = getdirentries_common(uap->fd, uap->buf, uap->count,
11011 	    &bytesread, &offset, &eofflag, 0);
11012 
11013 	if (error == 0) {
11014 		if (proc_is64bit(p)) {
11015 			user64_long_t base = (user64_long_t)offset;
11016 			error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
11017 		} else {
11018 			user32_long_t base = (user32_long_t)offset;
11019 			error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
11020 		}
11021 		*retval = (int)bytesread;
11022 	}
11023 	return error;
11024 }
11025 
11026 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)11027 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
11028 {
11029 	off_t offset;
11030 	ssize_t bytesread;
11031 	int error, eofflag;
11032 	user_size_t bufsize;
11033 
11034 	AUDIT_ARG(fd, uap->fd);
11035 
11036 	/*
11037 	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
11038 	 * then the kernel carves out the last 4 bytes to return extended
11039 	 * information to userspace (namely whether we reached EOF with this call).
11040 	 */
11041 	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
11042 		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
11043 	} else {
11044 		bufsize = uap->bufsize;
11045 	}
11046 
11047 	error = getdirentries_common(uap->fd, uap->buf, bufsize,
11048 	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
11049 
11050 	if (error == 0) {
11051 		*retval = bytesread;
11052 		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
11053 
11054 		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
11055 			getdirentries64_flags_t flags = 0;
11056 			if (eofflag) {
11057 				flags |= GETDIRENTRIES64_EOF;
11058 			}
11059 			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
11060 			    sizeof(flags));
11061 		}
11062 	}
11063 	return error;
11064 }
11065 
11066 
11067 /*
11068  * Set the mode mask for creation of filesystem nodes.
11069  * XXX implement xsecurity
11070  */
11071 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
11072 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)11073 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
11074 {
11075 	AUDIT_ARG(mask, newmask);
11076 	proc_fdlock(p);
11077 	*retval = p->p_fd.fd_cmask;
11078 	p->p_fd.fd_cmask = newmask & ALLPERMS;
11079 	proc_fdunlock(p);
11080 	return 0;
11081 }
11082 
11083 /*
11084  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
11085  *
11086  * Parameters:    p                       Process requesting to set the umask
11087  *                uap                     User argument descriptor (see below)
11088  *                retval                  umask of the process (parameter p)
11089  *
11090  * Indirect:      uap->newmask            umask to set
11091  *                uap->xsecurity          ACL to set
11092  *
11093  * Returns:        0                      Success
11094  *                !0                      Not success
11095  *
11096  */
int
umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
{
	/*
	 * uap->xsecurity is currently ignored: umask1() takes its filesec
	 * argument as __unused (see the "XXX implement xsecurity" note above).
	 */
	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
}
11102 
int
umask(proc_t p, struct umask_args *uap, int32_t *retval)
{
	/* umask(2): set the new creation mask, returning the previous one. */
	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
}
11108 
/*
 * Entitlement relating to revoking a device vnode backing a mounted
 * filesystem; presumably checked by callers outside this chunk — the
 * visible revoke() path does not reference it directly.
 */
#define REVOKE_MOUNTED_DEVICE_ENTITLEMENT                               \
	"com.apple.private.vfs.revoke-mounted-device"
11111 
11112 /*
11113  * Void all references to file by ripping underlying filesystem
11114  * away from vnode.
11115  */
11116 /* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Only character and block device vnodes may be revoked. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* A block device with a filesystem mounted on it is busy. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Caller must own the device or be superuser. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only bother revoking if someone actually holds the vnode open. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
11169 
11170 
11171 /*
 *  HFS/HFS Plus SPECIFIC SYSTEM CALLS
11173  *  The following system calls are designed to support features
11174  *  which are specific to the HFS & HFS Plus volume formats
11175  */
11176 
11177 
11178 /*
11179  * Obtain attribute information on objects in a directory while enumerating
11180  * the directory.
11181  */
11182 /* ARGSUSED */
/*
 * getdirentriesattr(2): enumerate a directory and return attribute
 * information for each entry via VNOP_READDIRATTR.
 *
 * Copies the caller's attrlist and entry count in, resolves uap->fd to a
 * directory vnode under the fileglob offset lock, authorizes the listing
 * (LIST_DIRECTORY, plus SEARCH when attributes beyond ATTR_CMN_NAME are
 * requested), performs the read, and copies the updated count, the
 * directory state word, and the pre-read offset back out.  *retval is
 * the EOF indicator (0 or 1).  For union mounts, an empty read on the
 * upper layer retargets the fd to the lower directory and retries.
 */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	UIO_STACKBUF(uio_buf, 1);
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the requested count so a union-mount retry starts fresh. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Take the offset lock, then confirm the fd still maps to the vnode
	 * we resolved (a concurrent union retarget can change it); retry the
	 * resolution if not.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else {                                                // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* errors returned above; here retval holds the 0/1 EOF flag */
} /* end of getdirentriesattr system call */
11346 
11347 /*
11348  * Exchange data between two files
11349  */
11350 
11351 /* ARGSUSED */
/*
 * exchangedata() system call handler.
 *
 * Atomically exchanges the data of the two regular files named by
 * uap->path1 and uap->path2.  Both paths must resolve to plain files
 * on the same volume; FSOPT_NOFOLLOW in uap->options suppresses
 * symlink following for both lookups.  On success the vnodes' cached
 * names and parent pointers are swapped to match the new contents,
 * and FSE_EXCHANGE fsevent / KAUTH_FILEOP_EXCHANGE notifications are
 * posted when listeners exist.
 *
 * Returns 0 on success; EINVAL if the paths name the same file or a
 * non-regular file, EXDEV if the files are on different volumes, or
 * an error from lookup, MAC/kauth authorization, or VNOP_EXCHANGE.
 * retval is unused.
 */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int   flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* Look up the first path; holds an iocount on fvp on success. */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/*
	 * Second lookup; CN_NBMOUNTLOOK is added here but not for path1 —
	 * NOTE(review): presumably to avoid blocking on a mount point while
	 * already holding fvp; confirm against namei() semantics.
	 */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* Caller needs read AND write access to both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Capture both paths and file info up front, but only when an
	 * fsevent or fileop listener will actually consume them.
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}
		/*
		 * The contents swapped, so swap the cached names and parents
		 * too, keeping the name cache consistent with what is now on
		 * disk.  Done under the name cache lock.
		 */
		name_cache_lock();

		tmpname     = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp           = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
11502 
11503 /*
11504  * Return (in MB) the amount of freespace on the given vnode's volume.
11505  */
11506 uint32_t freespace_mb(vnode_t vp);
11507 
11508 uint32_t
freespace_mb(vnode_t vp)11509 freespace_mb(vnode_t vp)
11510 {
11511 	vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
11512 	return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
11513 	       vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
11514 }
11515 
11516 #if CONFIG_SEARCHFS
11517 
11518 /* ARGSUSED */
11519 
/*
 * searchfs() system call handler (CONFIG_SEARCHFS builds).
 *
 * Performs a filesystem-specific catalog search.  The user supplies an
 * fssearchblock (32- or 64-bit layout, munged here into the 64-bit
 * form), two search-parameter buffers, an attrlist describing what to
 * return per match, and an opaque searchstate used to continue the
 * search across calls.  The real work is done by VNOP_SEARCHFS on the
 * root vnode of the volume containing uap->path; matches are written
 * directly into the user's return buffer via uio.
 *
 * EAGAIN from the filesystem is not fatal — it means "more matches
 * remain, call again" — and is also synthesized here to continue the
 * search into the next layer of a union mount.
 */
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	size_t mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	UIO_STACKBUF(uio_buf, 1);

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 * This also bounds mallocsize below.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to go into local memory and it's not that big so we might as well  put it all together. */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block.                                                                                             */
	/*												      */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
	/*       assumes the size is still 556 bytes it will continue to work				      */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	/*
	 * NOTE(review): the result is not NULL-checked.  mallocsize is small
	 * and bounded by the check above, but confirm kalloc_data(Z_WAITOK)
	 * cannot return NULL here.
	 */
	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		/* The name string (offset and length) must fit inside searchparams1. */
		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		/* Swap our iocount from the covering vnode to the covered one. */
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * Alright, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (uint32_t)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (uint32_t)uap->scriptcode,
	    (uint32_t)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;        // search next layer down
		fserror = EAGAIN;
	}
#endif /* CONFIG_UNION_MOUNTS */

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 *  search state.  Everything was already put into the return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	/* Copyouts succeeded; surface the filesystem's result (may be EAGAIN). */
	error = fserror;

freeandexit:

	kfree_data(searchparams1, mallocsize);

	return error;
} /* end of searchfs system call */
11802 
11803 #else /* CONFIG_SEARCHFS */
11804 
/*
 * searchfs() stub for kernels built without CONFIG_SEARCHFS;
 * the system call always fails with ENOTSUP.
 */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11810 
11811 #endif /* CONFIG_SEARCHFS */
11812 
11813 
11814 #if CONFIG_DATALESS_FILES
11815 
11816 /*
11817  * === Namespace Resolver Up-call Mechanism ===
11818  *
11819  * When I/O is performed to a dataless file or directory (read, write,
11820  * lookup-in, etc.), the file system performs an upcall to the namespace
11821  * resolver (filecoordinationd) to materialize the object.
11822  *
11823  * We need multiple up-calls to be in flight at once, and we need these
11824  * up-calls to be interruptible, thus the following implementation:
11825  *
11826  * => The nspace_resolver_request represents the in-kernel request state.
11827  *    It contains a request ID, storage space for the errno code returned
11828  *    by filecoordinationd, and flags.
11829  *
11830  * => The request ID is simply a global monotonically incrementing 32-bit
11831  *    number.  Outstanding requests are stored in a hash table, and the
11832  *    hash function is extremely simple.
11833  *
11834  * => When an upcall is to be made to filecoordinationd, a request structure
11835  *    is allocated on the stack (it is small, and needs to live only during
11836  *    the duration of the call to resolve_nspace_item_ext()).  It is
11837  *    initialized and inserted into the table.  Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11839  *    can be inserted into the table (and thus limiting the number of
11840  *    outstanding requests issued to filecoordinationd); waiting for an
11841  *    available slot is interruptible.
11842  *
11843  * => Once the request has been inserted into the table, the up-call is made
11844  *    to filecoordinationd via a MiG-generated stub.  The up-call returns
11845  *    immediately and filecoordinationd processes the request asynchronously.
11846  *
 * => The caller now waits for the request to complete.  This is achieved by
11848  *    sleeping on the address of the request structure and waiting for
11849  *    filecoordinationd to mark the request structure as complete.  This
11850  *    is an interruptible sleep call; if interrupted, the request structure
11851  *    is removed from the table and EINTR is returned to the caller.  If
11852  *    this occurs, an advisory up-call is made to filecoordinationd with
11853  *    the request ID to indicate that the request can be aborted or
11854  *    de-prioritized at the discretion of filecoordinationd.
11855  *
11856  * => When filecoordinationd has completed the request, it signals completion
11857  *    by writing to the vfs.nspace.complete sysctl node.  Only a process
11858  *    decorated as a namespace resolver can write to this sysctl node.  The
11859  *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11860  *    The request ID is looked up in the table, and if the request is found,
11861  *    the error code is stored in the request structure and a wakeup()
11862  *    issued on the address of the request structure.  If the request is not
11863  *    found, we simply drop the completion notification, assuming that the
11864  *    caller was interrupted.
11865  *
11866  * => When the waiting thread wakes up, it extracts the error code from the
11867  *    request structure, removes the request from the table, and returns the
11868  *    error code to the calling function.  Fini!
11869  */
11870 
/*
 * In-kernel state for one outstanding materialization up-call.
 * Per the block comment above, this lives on the requesting thread's
 * stack and is linked into the request hash table for the duration
 * of the call.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink;	/* hash bucket linkage */
	vnode_t         r_vp;           /* vnode being materialized */
	vnode_t         r_tdvp;         /* destination dir vnode, if any (used for sync-root check) */
	uint32_t        r_req_id;       /* unique request ID (see next_nspace_req_id) */
	int             r_resolver_error; /* errno reported back by filecoordinationd */
	int             r_flags;        /* RRF_* flags below */
};
11879 
#define RRF_COMPLETE    0x0001  /* request done; r_resolver_error is valid */
#define RRF_COMPLETING  0x0002  /* completion handler still using the request */

/*
 * Completion tuple provided by filecoordinationd when it finishes a
 * request (via the vfs.nspace.complete sysctl).  A zero gencount /
 * syncroot means "no namespace-shape check requested".
 */
struct nspace_resolver_completion_data {
	uint32_t req_id;          /* ID of the request being completed */
	int32_t  resolver_error;  /* errno result; 0 == success */
	uint64_t orig_gencount;   /* expected recursive gencount of r_vp, or 0 */
	uint64_t orig_syncroot;   /* expected sync-root ID of r_tdvp, or 0 */
};
11889 
/*
 * Return the next namespace-resolver request ID from a global,
 * monotonically incrementing 32-bit counter (OSAddAtomic returns the
 * pre-increment value; the counter simply wraps on overflow).
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11897 
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (capped at MAX_OUTSTANDING). */
static u_int nspace_resolver_request_count;
/* True when some thread is sleeping for a free request slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Protects the hash table, count, and wait flag above. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Trivial hash: the low bits of the request ID select the bucket. */
#define NSPACE_RESOLVER_HASH(req_id)    \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	 nspace_resolver_request_hashmask])
11918 
11919 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id,bool skip_completing)11920 nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11921 {
11922 	struct nspace_resolver_requesthead *bucket;
11923 	struct nspace_resolver_request *req;
11924 
11925 	bucket = NSPACE_RESOLVER_HASH(req_id);
11926 	LIST_FOREACH(req, bucket, r_hashlink) {
11927 		if (req->r_req_id == req_id) {
11928 			/*
11929 			 * If this request already has a completion
11930 			 * pending, don't return it again.
11931 			 */
11932 			if ((req->r_flags & RRF_COMPLETING) != 0 &&
11933 			    skip_completing) {
11934 				req = NULL;
11935 			}
11936 			return req;
11937 		}
11938 	}
11939 
11940 	return NULL;
11941 }
11942 
/*
 * Insert 'req' into the request hash table.  Applies backpressure on
 * filecoordinationd: if NSPACE_RESOLVER_MAX_OUTSTANDING requests are
 * already outstanding, sleep (interruptibly) until a slot frees up.
 *
 * Returns 0 on success, or the msleep() error (e.g. EINTR) if the
 * wait for a slot was interrupted.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	NSPACE_REQ_LOCK();

	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		/* Record that someone is waiting so removal wakes us. */
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			NSPACE_REQ_UNLOCK();
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	NSPACE_REQ_UNLOCK();

	return 0;
}
11974 
11975 static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request * req)11976 nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
11977 {
11978 	/*
11979 	 * If a completion is in-progress, we have to wait for the
11980 	 * completion handler to finish because it's still using 'req',
11981 	 * which is allocated on our stack a couple of frames up.
11982 	 */
11983 	while ((req->r_flags & RRF_COMPLETING) != 0) {
11984 		(void) msleep(req, &nspace_resolver_request_hash_mutex,
11985 		    PVFS, "nspacecmplt", NULL);
11986 	}
11987 }
11988 
/*
 * Unlink 'req' from the hash table and drop NSPACE_REQ_LOCK, which
 * the caller must hold.  Wakes any thread waiting for a free request
 * slot, and waits for an in-flight completion handler to finish with
 * 'req' before returning, since the request lives on the caller's
 * stack.
 */
static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;

	/* We're called with NSPACE_REQ_LOCK held. */

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert((req->r_flags & RRF_COMPLETING) == 0);
	assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
#endif /* DIAGNOSTIC */
	LIST_REMOVE(req, r_hashlink);
	nspace_resolver_request_count--;

	/* Hand the freed slot to a thread blocked in nspace_resolver_req_add(). */
	if (nspace_resolver_request_wait_slot) {
		nspace_resolver_request_wait_slot = false;
		wakeup(&nspace_resolver_request_count);
	}

	nspace_resolver_req_wait_pending_completion(req);

	NSPACE_REQ_UNLOCK();
}
12013 
/*
 * Remove 'req' from the request table; takes NSPACE_REQ_LOCK and
 * releases it via nspace_resolver_req_remove_and_unlock().
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
12020 
12021 static void
nspace_resolver_req_cancel(uint32_t req_id)12022 nspace_resolver_req_cancel(uint32_t req_id)
12023 {
12024 	kern_return_t kr;
12025 	mach_port_t mp;
12026 
12027 	// Failures here aren't fatal -- the cancellation message
12028 	// sent to the resolver is merely advisory.
12029 
12030 	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
12031 	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
12032 		return;
12033 	}
12034 
12035 	kr = send_nspace_resolve_cancel(mp, req_id);
12036 	if (kr != KERN_SUCCESS) {
12037 		os_log_error(OS_LOG_DEFAULT,
12038 		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
12039 	}
12040 
12041 	ipc_port_release_send(mp);
12042 }
12043 
/*
 * Wait (interruptibly) for filecoordinationd to complete 'req'.  If
 * the sleep is interrupted, the request is failed locally with EINTR
 * (or ETIMEDOUT for other errors) and an advisory cancel message is
 * sent to the resolver after the request has been removed from the
 * table.  Returns the resolver's errno result.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		/* ERESTART just means "keep waiting". */
		if (error && error != ERESTART) {
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
12076 
/*
 * Record the resolver's result for 'req', transition it from
 * COMPLETING (if set) to COMPLETE, and wake the thread sleeping in
 * nspace_resolver_req_wait().  Called with NSPACE_REQ_LOCK held.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
	wakeup(req);
}
12086 
/*
 * Flag 'req' so the completion handler keeps exclusive use of it
 * after NSPACE_REQ_LOCK is dropped: lookups skip, and removal waits
 * on, requests in this state.  Called with NSPACE_REQ_LOCK held.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
12092 
12093 static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data * c)12094 nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
12095 {
12096 	struct nspace_resolver_request *req;
12097 	int error;
12098 	struct vnode_attr va;
12099 	vnode_t vp;
12100 
12101 	NSPACE_REQ_LOCK();
12102 
12103 	req = nspace_resolver_req_lookup(c->req_id, true);
12104 	if (req == NULL) {
12105 		/*
12106 		 * If we don't find the request corresponding to our req_id,
12107 		 * just drop the completion on the floor; it's likely that
12108 		 * the requester interrupted with a signal, or it may already
12109 		 * be completing.
12110 		 */
12111 		NSPACE_REQ_UNLOCK();
12112 		return;
12113 	}
12114 
12115 	/*
12116 	 * Get out now if the resolver reported an error.
12117 	 */
12118 	if ((error = c->resolver_error) != 0) {
12119 		goto out;
12120 	}
12121 
12122 	/*
12123 	 * If the resolver did not specify any namespace shape criteria
12124 	 * for letting the operation proceed, then get out now.
12125 	 */
12126 	if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
12127 		goto out;
12128 	}
12129 
12130 	/*
12131 	 * We're going to have to acquire the mount rename lock and do
12132 	 * some I/O in order to verify the criteria.  Mark the request
12133 	 * as pending so no one else messes with it after we drop the
12134 	 * NSPACE_REQ_LOCK.
12135 	 */
12136 	nspace_resolver_req_mark_completion_pending(req);
12137 	NSPACE_REQ_UNLOCK();
12138 
12139 	/*
12140 	 * Lock out renames from changing the shape of the tree while
12141 	 * validate the criteria.
12142 	 */
12143 	mount_t locked_mp = req->r_vp->v_mount;
12144 	mount_ref(locked_mp, 0);
12145 	mount_lock_renames(locked_mp);
12146 
12147 	if (c->orig_gencount != 0) {
12148 		vp = req->r_vp;
12149 		if (error) {
12150 			goto out_dropmount;
12151 		}
12152 
12153 		VATTR_INIT(&va);
12154 		VATTR_WANTED(&va, va_recursive_gencount);
12155 		error = vnode_getattr(vp, &va, vfs_context_kernel());
12156 		if (error) {
12157 			goto out_dropmount;
12158 		}
12159 		if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
12160 		    va.va_recursive_gencount != c->orig_gencount) {
12161 			printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
12162 			    c->orig_gencount, va.va_recursive_gencount);
12163 			error = EBUSY;
12164 			goto out_dropmount;
12165 		}
12166 	}
12167 
12168 	/*
12169 	 * Ignore orig_syncroot if a destination directory wasn't specified
12170 	 * in the request.
12171 	 */
12172 	if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
12173 		uint64_t syncroot_id;
12174 
12175 		if (error) {
12176 			goto out_dropmount;
12177 		}
12178 
12179 #ifndef APFSIOC_GET_SYNC_ROOT
12180 #define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
12181 #endif
12182 
12183 		error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
12184 		    (caddr_t)&syncroot_id, 0, vfs_context_kernel());
12185 		if (error) {
12186 			goto out_dropmount;
12187 		}
12188 		if (syncroot_id != c->orig_syncroot) {
12189 			printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
12190 			    c->orig_syncroot, syncroot_id);
12191 			error = EBUSY;
12192 			goto out_dropmount;
12193 		}
12194 	}
12195 
12196 out_dropmount:
12197 	mount_unlock_renames(locked_mp);
12198 	mount_drop(locked_mp, 0);
12199 	NSPACE_REQ_LOCK();
12200 
12201 out:
12202 	nspace_resolver_req_mark_complete(req, error);
12203 	NSPACE_REQ_UNLOCK();
12204 }
12205 
12206 static struct proc *nspace_resolver_proc;
12207 
12208 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)12209 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
12210 {
12211 	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
12212 	    p == nspace_resolver_proc) ? 1 : 0;
12213 	return 0;
12214 }
12215 
12216 static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
12217 
/*
 * Register (is_resolver != 0) or unregister the calling process as
 * the namespace resolver.  Only a root process decorated as the
 * dataless-file resolver may do this (EPERM otherwise), and only one
 * resolver can be registered at a time (EBUSY if another holds the
 * role).  Unregistering goes through nspace_resolver_exited(), which
 * validates the caller and clears the global.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			/* Mark the proc before publishing it as the resolver. */
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
12257 
12258 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)12259 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
12260 {
12261 	if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
12262 	    (p->p_vfs_iopolicy &
12263 	    P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
12264 		*is_prevented = 1;
12265 	} else {
12266 		*is_prevented = 0;
12267 	}
12268 	return 0;
12269 }
12270 
12271 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)12272 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
12273 {
12274 	if (p->p_lflag & P_LNSPACE_RESOLVER) {
12275 		return is_prevented ? 0 : EBUSY;
12276 	}
12277 
12278 	if (is_prevented) {
12279 		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
12280 	} else {
12281 		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
12282 	}
12283 	return 0;
12284 }
12285 
12286 static int
nspace_materialization_get_thread_state(int * is_prevented)12287 nspace_materialization_get_thread_state(int *is_prevented)
12288 {
12289 	uthread_t ut = current_uthread();
12290 
12291 	*is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
12292 	return 0;
12293 }
12294 
12295 static int
nspace_materialization_set_thread_state(int is_prevented)12296 nspace_materialization_set_thread_state(int is_prevented)
12297 {
12298 	uthread_t ut = current_uthread();
12299 
12300 	if (is_prevented) {
12301 		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
12302 	} else {
12303 		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
12304 	}
12305 	return 0;
12306 }
12307 
12308 /* the vfs.nspace branch */
12309 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
12310 
12311 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12312 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
12313     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12314 {
12315 	struct proc *p = req->p;
12316 	int new_value, old_value, changed = 0;
12317 	int error;
12318 
12319 	error = nspace_resolver_get_proc_state(p, &old_value);
12320 	if (error) {
12321 		return error;
12322 	}
12323 
12324 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12325 	    &changed);
12326 	if (error == 0 && changed) {
12327 		error = nspace_resolver_set_proc_state(p, new_value);
12328 	}
12329 	return error;
12330 }
12331 
12332 /* decorate this process as the dataless file resolver */
12333 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
12334     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12335     0, 0, sysctl_nspace_resolver, "I", "");
12336 
12337 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12338 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
12339     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12340 {
12341 	struct proc *p = req->p;
12342 	int new_value, old_value, changed = 0;
12343 	int error;
12344 
12345 	error = nspace_materialization_get_proc_state(p, &old_value);
12346 	if (error) {
12347 		return error;
12348 	}
12349 
12350 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12351 	    &changed);
12352 	if (error == 0 && changed) {
12353 		error = nspace_materialization_set_proc_state(p, new_value);
12354 	}
12355 	return error;
12356 }
12357 
12358 /* decorate this process as not wanting to materialize dataless files */
12359 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
12360     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12361     0, 0, sysctl_nspace_prevent_materialization, "I", "");
12362 
12363 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12364 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
12365     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12366 {
12367 	int new_value, old_value, changed = 0;
12368 	int error;
12369 
12370 	error = nspace_materialization_get_thread_state(&old_value);
12371 	if (error) {
12372 		return error;
12373 	}
12374 
12375 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12376 	    &changed);
12377 	if (error == 0 && changed) {
12378 		error = nspace_materialization_set_thread_state(new_value);
12379 	}
12380 	return error;
12381 }
12382 
12383 /* decorate this thread as not wanting to materialize dataless files */
12384 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
12385     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12386     0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
12387 
/*
 * sysctl handler for vfs.nspace.complete: the resolver reports that a
 * materialization request has finished.
 *
 * Only the decorated resolver process may use this; everyone else gets
 * EPERM.  The writer supplies up to three opaque fields, in order:
 *   1. req_status[2] = { request id, resolver errno }  (required)
 *   2. original generation count                       (optional)
 *   3. original syncroot ID                            (optional)
 * Errors reading the optional fields are deliberately swallowed and
 * the corresponding values default to 0.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	/* Reject anyone who is not the decorated resolver. */
	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed.  Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID, which is also optional.
	 */
	error = sysctl_io_opaque(req, &syncroot, sizeof(syncroot),
	    &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(&cd);
	}
	return error;
}

/* Resolver reports completed reqs here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
12455 
12456 #endif /* CONFIG_DATALESS_FILES */
12457 
12458 #if CONFIG_DATALESS_FILES
12459 #define __no_dataless_unused    /* nothing */
12460 #else
12461 #define __no_dataless_unused    __unused
12462 #endif
12463 
/*
 * vfs_context_dataless_materialization_is_prevented:
 *
 * Decide whether the given context may trigger materialization of a
 * dataless object.
 *
 * Returns:
 *   0           materialization may proceed.
 *   EDEADLK     materialization is prevented (kernel context, the
 *               per-thread no-faults decoration, or a process that
 *               has not opted in via its iopolicy).
 *   EJUSTRETURN the context holds the dataless-manipulation
 *               entitlement; callers treat the object as if it were
 *               not dataless (see vfs_materialize_item()).
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 *
	 * NOTE(review): as written, the manipulator-entitlement check
	 * above runs first, so the per-thread flags do not actually
	 * override that entitlement despite the comment — confirm the
	 * intended ordering.
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
12520 
/*
 * nspace_resolver_init: one-time initialization of the dataless-file
 * resolver request hash table (NSPACE_RESOLVER_REQ_HASHSIZE buckets).
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
12530 
/*
 * nspace_resolver_exited: called at process exit (and from
 * nspace_resolver_set_proc_state() when the resolver voluntarily
 * relinquishes the role).
 *
 * If 'p' is the decorated resolver, every outstanding resolver request
 * is completed with ETIMEDOUT (after waiting for any in-flight
 * completion to settle) and the resolver global is cleared.  For any
 * other process this is a no-op.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Fail all pending requests in every hash bucket. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
12557 
12558 #define DATALESS_RESOLVER_ENTITLEMENT     \
12559 	"com.apple.private.vfs.dataless-resolver"
12560 #define DATALESS_MANIPULATION_ENTITLEMENT \
12561 	"com.apple.private.vfs.dataless-manipulation"
12562 
12563 #if CONFIG_DATALESS_FILES
12564 /*
12565  * Return TRUE if the vfs context is associated with the dataless
12566  * resolver.
12567  */
12568 static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)12569 vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
12570 {
12571 	return IOTaskHasEntitlement(vfs_context_task(ctx),
12572 	           DATALESS_RESOLVER_ENTITLEMENT);
12573 }
12574 #endif /* CONFIG_DATALESS_FILES */
12575 
12576 /*
12577  * Return TRUE if the vfs context is associated with a process entitled
12578  * for dataless manipulation.
12579  *
12580  * XXX Arguably belongs in vfs_subr.c, but is here because of the
12581  * complication around CONFIG_DATALESS_FILES.
12582  */
12583 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)12584 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
12585 {
12586 #if CONFIG_DATALESS_FILES
12587 	task_t task = vfs_context_task(ctx);
12588 	return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
12589 	       IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
12590 #else
12591 	return false;
12592 #endif /* CONFIG_DATALESS_FILES */
12593 }
12594 
12595 #if CONFIG_DATALESS_FILES
12596 static void
log_materialization_prevented(vnode_t vp,uint64_t op)12597 log_materialization_prevented(vnode_t vp, uint64_t op)
12598 {
12599 	char p_name[MAXCOMLEN + 1];
12600 	char *vntype;
12601 	proc_selfname(&p_name[0], sizeof(p_name));
12602 
12603 	if (vp->v_type == VREG) {
12604 		vntype = "File";
12605 	} else if (vp->v_type == VDIR) {
12606 		vntype = "Dir";
12607 	} else if (vp->v_type == VLNK) {
12608 		vntype = "SymLink";
12609 	} else {
12610 		vntype = "Other";
12611 	}
12612 
12613 #if DEVELOPMENT
12614 	struct vnode_attr *vap = kalloc_type(struct vnode_attr, Z_WAITOK);
12615 
12616 	VATTR_INIT(vap);
12617 	VATTR_WANTED(vap, va_fsid);
12618 	VATTR_WANTED(vap, va_fileid);
12619 	if (vnode_getattr(vp, vap, vfs_context_current()) == 0) {
12620 		os_log_debug(OS_LOG_DEFAULT,
12621 		    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) fsid 0x%08x/%u fileid=%llu",
12622 		    p_name, proc_selfpid(), op, vntype,
12623 		    vap->va_fsid, vap->va_fsid, vap->va_fileid);
12624 	} else
12625 #endif
12626 	{
12627 		os_log_debug(OS_LOG_DEFAULT,
12628 		    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
12629 		    p_name, proc_selfpid(), op, vntype);
12630 	}
12631 #if DEVELOPMENT
12632 	kfree_type(struct vnode_attr, vap);
12633 #endif
12634 }
12635 #endif /* CONFIG_DATALESS_FILES */
12636 
12637 static int
vfs_materialize_item(vnode_t vp __no_dataless_unused,uint32_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused,vnode_t tdvp __no_dataless_unused)12638 vfs_materialize_item(
12639 	vnode_t vp __no_dataless_unused,
12640 	uint32_t op __no_dataless_unused,
12641 	int64_t offset __no_dataless_unused,
12642 	int64_t size __no_dataless_unused,
12643 	char *lookup_name __no_dataless_unused,
12644 	size_t const namelen __no_dataless_unused,
12645 	vnode_t tdvp __no_dataless_unused)
12646 {
12647 #if CONFIG_DATALESS_FILES
12648 	kern_return_t kern_ret;
12649 	mach_port_t mach_port;
12650 	char *path = NULL;
12651 	vfs_context_t context;
12652 	int path_len;
12653 	int error;
12654 	audit_token_t atoken;
12655 	enum vtype vp_vtype;
12656 
12657 	/* Swap files are special; ignore them */
12658 	if (vnode_isswap(vp)) {
12659 		return 0;
12660 	}
12661 
12662 	/*
12663 	 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
12664 	 * are no longer used nor supported.
12665 	 */
12666 	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
12667 		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
12668 		return ENOTSUP;
12669 	}
12670 	if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
12671 		os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
12672 		return ENOTSUP;
12673 	}
12674 
12675 	/* Normalize 'op'. */
12676 	op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;
12677 
12678 	/*
12679 	 * To-directory is only meaningful for rename operations;
12680 	 * ignore it if someone handed one to us unexpectedly.
12681 	 */
12682 	if (op != NAMESPACE_HANDLER_RENAME_OP) {
12683 		tdvp = NULL;
12684 	}
12685 
12686 	context = vfs_context_current();
12687 
12688 	/* Remember this for later. */
12689 	vp_vtype = vnode_vtype(vp);
12690 
12691 	error = vfs_context_dataless_materialization_is_prevented(context);
12692 	if (error) {
12693 		log_materialization_prevented(vp, op);
12694 		goto out_check_errors;
12695 	}
12696 
12697 	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
12698 	    &mach_port);
12699 	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
12700 		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
12701 		/*
12702 		 * Treat this like being unable to access the backing store
12703 		 * server.
12704 		 */
12705 		return ETIMEDOUT;
12706 	}
12707 
12708 	int path_alloc_len = MAXPATHLEN;
12709 	do {
12710 		path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
12711 		if (path == NULL) {
12712 			return ENOMEM;
12713 		}
12714 
12715 		path_len = path_alloc_len;
12716 		error = vn_getpath(vp, path, &path_len);
12717 		if (error == 0) {
12718 			break;
12719 		} else if (error == ENOSPC) {
12720 			kfree_data(path, path_alloc_len);
12721 			path = NULL;
12722 		} else {
12723 			goto out_release_port;
12724 		}
12725 	} while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) &&
12726 	    path_alloc_len <= MAXLONGPATHLEN);
12727 
12728 	error = vfs_context_copy_audit_token(context, &atoken);
12729 	if (error) {
12730 		goto out_release_port;
12731 	}
12732 
12733 	struct nspace_resolver_request req = {
12734 		.r_req_id = next_nspace_req_id(),
12735 		.r_vp = vp,
12736 		.r_tdvp = tdvp,
12737 	};
12738 
12739 	error = nspace_resolver_req_add(&req);
12740 	if (error) {
12741 		goto out_release_port;
12742 	}
12743 
12744 	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12745 
12746 	if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
12747 		char *dest_path = NULL;
12748 		int dest_path_len;
12749 
12750 		dest_path = zalloc(ZV_NAMEI);
12751 		dest_path_len = MAXPATHLEN;
12752 
12753 		error = vn_getpath(tdvp, dest_path, &dest_path_len);
12754 		if (error) {
12755 			zfree(ZV_NAMEI, dest_path);
12756 			goto out_release_port;
12757 		}
12758 
12759 		/*
12760 		 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
12761 		 * compatibility with existing agents in user-space
12762 		 * who get passed this value.
12763 		 */
12764 		kern_ret = send_vfs_resolve_reparent_with_audit_token(mach_port,
12765 		    req.r_req_id,
12766 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12767 		    path, dest_path, atoken);
12768 
12769 		zfree(ZV_NAMEI, dest_path);
12770 	} else if (vp_vtype == VDIR) {
12771 		char *tmpname = NULL;
12772 
12773 		/*
12774 		 * If the caller provided a lookup_name *and* a name length,
12775 		 * then we assume the lookup_name is not NUL-terminated.
12776 		 * Allocate a temporary buffer in this case to provide
12777 		 * a NUL-terminated path name to the IPC call.
12778 		 */
12779 		if (lookup_name != NULL && namelen != 0) {
12780 			if (namelen >= PATH_MAX) {
12781 				error = EINVAL;
12782 				goto out_req_remove;
12783 			}
12784 			tmpname = zalloc(ZV_NAMEI);
12785 			strlcpy(tmpname, lookup_name, namelen + 1);
12786 			lookup_name = tmpname;
12787 		} else if (lookup_name != NULL) {
12788 			/*
12789 			 * If the caller provided a lookup_name with a
12790 			 * zero name length, then we assume it's NUL-
12791 			 * terminated.  Verify it has a valid length.
12792 			 */
12793 			if (strlen(lookup_name) >= PATH_MAX) {
12794 				error = EINVAL;
12795 				goto out_req_remove;
12796 			}
12797 		}
12798 
12799 		/* (See above.) */
12800 		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12801 		    req.r_req_id,
12802 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12803 		    lookup_name == NULL ? "" : lookup_name, path, atoken);
12804 
12805 		if (tmpname != NULL) {
12806 			zfree(ZV_NAMEI, tmpname);
12807 
12808 			/*
12809 			 * Poison lookup_name rather than reference
12810 			 * freed memory.
12811 			 */
12812 			lookup_name = NULL;
12813 		}
12814 	} else {
12815 		/* (See above.) */
12816 		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12817 		    req.r_req_id,
12818 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12819 		    offset, size, path, atoken);
12820 	}
12821 	if (kern_ret != KERN_SUCCESS) {
12822 		/*
12823 		 * Also treat this like being unable to access the backing
12824 		 * store server.
12825 		 */
12826 		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12827 		    kern_ret);
12828 		error = ETIMEDOUT;
12829 		goto out_req_remove;
12830 	}
12831 
12832 	/*
12833 	 * Give back the memory we allocated earlier while we wait; we
12834 	 * no longer need it.
12835 	 */
12836 	kfree_data(path, path_alloc_len);
12837 	path = NULL;
12838 
12839 	/*
12840 	 * Request has been submitted to the resolver. Now (interruptibly)
12841 	 * wait for completion. Upon requrn, the request will have been
12842 	 * removed from the lookup table.
12843 	 */
12844 	error = nspace_resolver_req_wait(&req);
12845 
12846 out_release_port:
12847 	if (path != NULL) {
12848 		kfree_data(path, path_alloc_len);
12849 		path = NULL;
12850 	}
12851 	ipc_port_release_send(mach_port);
12852 
12853 out_check_errors:
12854 	/*
12855 	 * The file resolver owns the logic about what error to return
12856 	 * to the caller.  We only need to handle a couple of special
12857 	 * cases here:
12858 	 */
12859 	if (error == EJUSTRETURN) {
12860 		/*
12861 		 * The requesting process is allowed to interact with
12862 		 * dataless objects.  Make a couple of sanity-checks
12863 		 * here to ensure the action makes sense.
12864 		 */
12865 		switch (op) {
12866 		case NAMESPACE_HANDLER_WRITE_OP:
12867 		case NAMESPACE_HANDLER_TRUNCATE_OP:
12868 		case NAMESPACE_HANDLER_RENAME_OP:
12869 			/*
12870 			 * This handles the case of the resolver itself
12871 			 * writing data to the file (or throwing it
12872 			 * away).
12873 			 */
12874 			error = 0;
12875 			break;
12876 		case NAMESPACE_HANDLER_READ_OP:
12877 		case NAMESPACE_HANDLER_LOOKUP_OP:
12878 			/*
12879 			 * This handles the case of the resolver needing
12880 			 * to look up inside of a dataless directory while
12881 			 * it's in the process of materializing it (for
12882 			 * example, creating files or directories).
12883 			 */
12884 			error = (vp_vtype == VDIR) ? 0 : EBADF;
12885 			break;
12886 		default:
12887 			error = EBADF;
12888 			break;
12889 		}
12890 	}
12891 
12892 	return error;
12893 
12894 out_req_remove:
12895 	nspace_resolver_req_remove(&req);
12896 	goto out_release_port;
12897 #else
12898 	return ENOTSUP;
12899 #endif /* CONFIG_DATALESS_FILES */
12900 }
12901 
12902 /*
12903  * vfs_materialize_file: Materialize a regular file.
12904  *
12905  * Inputs:
12906  * vp		The dataless file to be materialized.
12907  *
12908  * op		What kind of operation is being performed:
12909  *		-> NAMESPACE_HANDLER_READ_OP
12910  *		-> NAMESPACE_HANDLER_WRITE_OP
12911  *		-> NAMESPACE_HANDLER_LINK_CREATE
12912  *		-> NAMESPACE_HANDLER_DELETE_OP
12913  *		-> NAMESPACE_HANDLER_TRUNCATE_OP
12914  *		-> NAMESPACE_HANDLER_RENAME_OP
12915  *
12916  * offset	offset of I/O for READ or WRITE.  Ignored for
12917  *		other ops.
12918  *
12919  * size		size of I/O for READ or WRITE  Ignored for
12920  *		other ops.
12921  *
12922  * If offset or size are -1 for a READ or WRITE, then the resolver should
12923  * consider the range to be unknown.
12924  *
12925  * Upon successful return, the caller may proceed with the operation.
12926  * N.B. the file may still be "dataless" in this case.
12927  */
12928 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12929 vfs_materialize_file(
12930 	struct vnode *vp,
12931 	uint64_t op,
12932 	int64_t offset,
12933 	int64_t size)
12934 {
12935 	if (vp->v_type != VREG) {
12936 		return EFTYPE;
12937 	}
12938 	return vfs_materialize_item(vp, (uint32_t)op, offset, size, NULL, 0,
12939 	           NULL);
12940 }
12941 
12942 /*
12943  * vfs_materialize_dir:
12944  *
12945  * Inputs:
12946  * vp		The dataless directory to be materialized.
12947  *
12948  * op		What kind of operation is being performed:
12949  *		-> NAMESPACE_HANDLER_READ_OP
12950  *		-> NAMESPACE_HANDLER_WRITE_OP
12951  *		-> NAMESPACE_HANDLER_DELETE_OP
12952  *		-> NAMESPACE_HANDLER_RENAME_OP
12953  *		-> NAMESPACE_HANDLER_LOOKUP_OP
12954  *
12955  * lookup_name	Name being looked up for a LOOKUP op.  Ignored for
12956  *		other ops.  May or may not be NUL-terminated; see below.
12957  *
12958  * namelen	If non-zero, then lookup_name is assumed to not be NUL-
12959  *		terminated and namelen is the number of valid bytes in
12960  *		lookup_name. If zero, then lookup_name is assumed to be
12961  *		NUL-terminated.
12962  *
12963  * Upon successful return, the caller may proceed with the operation.
12964  * N.B. the directory may still be "dataless" in this case.
12965  */
12966 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12967 vfs_materialize_dir(
12968 	struct vnode *vp,
12969 	uint64_t op,
12970 	char *lookup_name,
12971 	size_t namelen)
12972 {
12973 	if (vp->v_type != VDIR) {
12974 		return EFTYPE;
12975 	}
12976 	if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12977 		return EINVAL;
12978 	}
12979 	return vfs_materialize_item(vp, (uint32_t)op, 0, 0, lookup_name,
12980 	           namelen, NULL);
12981 }
12982 
12983 /*
12984  * vfs_materialize_reparent:
12985  *
12986  * Inputs:
12987  * vp		The dataless file or directory to be materialized.
12988  *
12989  * tdvp		The new parent directory for the dataless file.
12990  *
12991  * Upon successful return, the caller may proceed with the operation.
12992  * N.B. the item may still be "dataless" in this case.
12993  */
12994 int
vfs_materialize_reparent(vnode_t vp,vnode_t tdvp)12995 vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
12996 {
12997 	if (vp->v_type != VDIR && vp->v_type != VREG) {
12998 		return EFTYPE;
12999 	}
13000 	return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
13001 	           0, 0, NULL, 0, tdvp);
13002 }
13003 
13004 #if 0
13005 static int
13006 build_volfs_path(struct vnode *vp, char *path, int *len)
13007 {
13008 	struct vnode_attr va;
13009 	int ret;
13010 
13011 	VATTR_INIT(&va);
13012 	VATTR_WANTED(&va, va_fsid);
13013 	VATTR_WANTED(&va, va_fileid);
13014 
13015 	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
13016 		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
13017 		ret = -1;
13018 	} else {
13019 		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
13020 		ret = 0;
13021 	}
13022 
13023 	return ret;
13024 }
13025 #endif
13026 
/*
 * Compatibility shim: older user space issued certain fsctl commands
 * with the IOC direction/size bits stripped (IOCBASECMD form).  Map
 * those legacy encodings back to their full command values; anything
 * unrecognized is passed through unchanged.
 */
static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)
{
	switch (cmd) {
	case IOCBASECMD(FSIOC_SYNC_VOLUME):
		return FSIOC_SYNC_VOLUME;
	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
		return FSIOC_ROUTEFS_SETROUTEID;
	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
		return FSIOC_SET_PACKAGE_EXTS;
	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
		return FSIOC_SET_FSTYPENAME_OVERRIDE;
	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
		return DISK_CONDITIONER_IOC_GET;
	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
		return DISK_CONDITIONER_IOC_SET;
	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
		return FSIOC_FIOSEEKHOLE;
	case IOCBASECMD(FSIOC_FIOSEEKDATA):
		return FSIOC_FIOSEEKDATA;
	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
		return SPOTLIGHT_IOC_GET_LAST_MTIME;
	}

	return cmd;
}
13055 
/*
 * chflags0() callback: hand the compare-and-swap of BSD flags down to
 * the file system via the FSIOC_CAS_BSDFLAGS ioctl.  'arg' is the
 * caller's struct fsioc_cas_bsdflags.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
13061 
/*
 * handle_sync_volume: FSIOC_SYNC_VOLUME implementation.
 *
 * Syncs the volume containing 'vp'.  The caller's iocount on vp is
 * dropped here; *arg_vp is set to NULL so the caller knows the vnode
 * has been released.  'data' points to the caller's uint32_t of
 * FSCTL_SYNC_* flags.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	vnode_hold(vp);
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple file systems in a
	 * partition (e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests FSCTL_SYNC_FULLSYNC against 'arg',
	 * which at this point holds MNT_* wait flags (possibly with
	 * MNT_VOLUME), not the caller's FSCTL_* request bits.  It looks
	 * like the intent was (*(uint32_t*)data & FSCTL_SYNC_FULLSYNC)
	 * -- confirm before changing, since user space may rely on the
	 * current behavior.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
13126 
13127 #if ROUTEFS
13128 static int __attribute__((noinline))
handle_routes(user_addr_t udata)13129 handle_routes(user_addr_t udata)
13130 {
13131 	char routepath[MAXPATHLEN];
13132 	size_t len = 0;
13133 	int error;
13134 
13135 	if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
13136 		return error;
13137 	}
13138 	bzero(routepath, MAXPATHLEN);
13139 	error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
13140 	if (error) {
13141 		return error;
13142 	}
13143 	error = routefs_kernel_mount(routepath);
13144 	return error;
13145 }
13146 #endif
13147 
13148 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)13149 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
13150 {
13151 	struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
13152 	struct vnode_attr va;
13153 	int error;
13154 
13155 	VATTR_INIT(&va);
13156 	VATTR_SET(&va, va_flags, cas->new_flags);
13157 
13158 	error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
13159 
13160 #if CONFIG_FSE
13161 	if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
13162 		add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
13163 	}
13164 #endif
13165 
13166 	return error;
13167 }
13168 
13169 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)13170 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
13171 {
13172 	struct mount *mp = NULL;
13173 	errno_t rootauth = 0;
13174 
13175 	mp = vp->v_mount;
13176 
13177 	/*
13178 	 * query the underlying FS and see if it reports something
13179 	 * sane for this vnode. If volume is authenticated via
13180 	 * chunklist, leave that for the caller to determine.
13181 	 */
13182 	rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
13183 
13184 	return rootauth;
13185 }
13186 
13187 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
13188 	"com.apple.private.kernel.set-package-extensions"
13189 
13190 /*
13191  * Make a filesystem-specific control call:
13192  */
13193 /* ARGSUSED */
13194 static int
fsctl_internal(proc_t p,vnode_t * arg_vp,u_long cmd,user_addr_t udata,u_long options,vfs_context_t ctx)13195 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
13196 {
13197 	int error = 0;
13198 	boolean_t is64bit;
13199 	u_int size;
13200 #define STK_PARAMS 128
13201 	char stkbuf[STK_PARAMS] = {0};
13202 	caddr_t data, memp;
13203 	vnode_t vp = *arg_vp;
13204 
13205 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
13206 		return ENOTTY;
13207 	}
13208 
13209 	cmd = fsctl_bogus_command_compat(cmd);
13210 
13211 	size = IOCPARM_LEN(cmd);
13212 	if (size > IOCPARM_MAX) {
13213 		return EINVAL;
13214 	}
13215 
13216 	is64bit = proc_is64bit(p);
13217 
13218 	memp = NULL;
13219 
13220 	if (size > sizeof(stkbuf)) {
13221 		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
13222 			return ENOMEM;
13223 		}
13224 		data = memp;
13225 	} else {
13226 		data = &stkbuf[0];
13227 	};
13228 
13229 	if (cmd & IOC_IN) {
13230 		if (size) {
13231 			error = copyin(udata, data, size);
13232 			if (error) {
13233 				if (memp) {
13234 					kfree_data(memp, size);
13235 				}
13236 				return error;
13237 			}
13238 		} else {
13239 			if (is64bit) {
13240 				*(user_addr_t *)data = udata;
13241 			} else {
13242 				*(uint32_t *)data = (uint32_t)udata;
13243 			}
13244 		};
13245 	} else if ((cmd & IOC_OUT) && size) {
13246 		/*
13247 		 * Zero the buffer so the user always
13248 		 * gets back something deterministic.
13249 		 */
13250 		bzero(data, size);
13251 	} else if (cmd & IOC_VOID) {
13252 		if (is64bit) {
13253 			*(user_addr_t *)data = udata;
13254 		} else {
13255 			*(uint32_t *)data = (uint32_t)udata;
13256 		}
13257 	}
13258 
13259 	/* Check to see if it's a generic command */
13260 	switch (cmd) {
13261 	case FSIOC_SYNC_VOLUME:
13262 		error = handle_sync_volume(vp, arg_vp, data, ctx);
13263 		break;
13264 
13265 	case FSIOC_ROUTEFS_SETROUTEID:
13266 #if ROUTEFS
13267 		error = handle_routes(udata);
13268 #endif
13269 		break;
13270 
13271 	case FSIOC_SET_PACKAGE_EXTS: {
13272 		user_addr_t ext_strings;
13273 		uint32_t    num_entries;
13274 		uint32_t    max_width;
13275 
13276 		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
13277 		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
13278 			error = EPERM;
13279 			break;
13280 		}
13281 
13282 		if ((is64bit && size != sizeof(user64_package_ext_info))
13283 		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
13284 			// either you're 64-bit and passed a 64-bit struct or
13285 			// you're 32-bit and passed a 32-bit struct.  otherwise
13286 			// it's not ok.
13287 			error = EINVAL;
13288 			break;
13289 		}
13290 
13291 		if (is64bit) {
13292 			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
13293 				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
13294 			}
13295 			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
13296 			num_entries = ((user64_package_ext_info *)data)->num_entries;
13297 			max_width   = ((user64_package_ext_info *)data)->max_width;
13298 		} else {
13299 			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
13300 			num_entries = ((user32_package_ext_info *)data)->num_entries;
13301 			max_width   = ((user32_package_ext_info *)data)->max_width;
13302 		}
13303 		error = set_package_extensions_table(ext_strings, num_entries, max_width);
13304 	}
13305 	break;
13306 
13307 	case FSIOC_SET_FSTYPENAME_OVERRIDE:
13308 	{
13309 		mount_t mp;
13310 
13311 		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
13312 			break;
13313 		}
13314 		if ((mp = vp->v_mount) != NULL) {
13315 			mount_lock(mp);
13316 			if (data[0] != 0) {
13317 				for (int i = 0; i < MFSTYPENAMELEN; i++) {
13318 					if (!data[i]) {
13319 						goto continue_copy;
13320 					}
13321 				}
13322 				/*
13323 				 * Getting here means we have a user data
13324 				 * string which has no NULL termination in
13325 				 * its first MFSTYPENAMELEN bytes.  This is
13326 				 * bogus, let's avoid strlcpy-ing the read
13327 				 * data and return an error.
13328 				 */
13329 				error = EINVAL;
13330 				goto unlock;
13331 continue_copy:
13332 				vfs_setfstypename_locked(mp, data);
13333 				if (vfs_isrdonly(mp) &&
13334 				    strcmp(data, "mtmfs") == 0) {
13335 					mp->mnt_kern_flag |=
13336 					    MNTK_EXTENDED_SECURITY;
13337 					mp->mnt_kern_flag &=
13338 					    ~MNTK_AUTH_OPAQUE;
13339 				}
13340 			} else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
13341 				const char *name =
13342 				    vfs_getfstypenameref_locked(mp, NULL);
13343 				if (strcmp(name, "mtmfs") == 0) {
13344 					mp->mnt_kern_flag &=
13345 					    ~MNTK_EXTENDED_SECURITY;
13346 				}
13347 				vfs_setfstypename_locked(mp, NULL);
13348 			}
13349 unlock:
13350 			mount_unlock(mp);
13351 		}
13352 	}
13353 	break;
13354 
13355 	case DISK_CONDITIONER_IOC_GET: {
13356 		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
13357 	}
13358 	break;
13359 
13360 	case DISK_CONDITIONER_IOC_SET: {
13361 		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
13362 	}
13363 	break;
13364 
13365 	case FSIOC_CAS_BSDFLAGS:
13366 		error = handle_flags(vp, data, ctx);
13367 		break;
13368 
13369 	case FSIOC_FD_ONLY_OPEN_ONCE: {
13370 		error = 0;
13371 		if (vnode_usecount(vp) > 1) {
13372 			vnode_lock_spin(vp);
13373 			if (vp->v_lflag & VL_HASSTREAMS) {
13374 				if (vnode_isinuse_locked(vp, 1, 1)) {
13375 					error = EBUSY;
13376 				}
13377 			} else if (vnode_usecount(vp) > 1) {
13378 				error = EBUSY;
13379 			}
13380 			vnode_unlock(vp);
13381 		}
13382 	}
13383 	break;
13384 
13385 	case FSIOC_EVAL_ROOTAUTH:
13386 		error = handle_auth(vp, cmd, data, options, ctx);
13387 		break;
13388 
13389 	case FSIOC_TEST_FSE_ACCESS_GRANTED:
13390 		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
13391 		break;
13392 
13393 #if CONFIG_EXCLAVES
13394 	case FSIOC_EXCLAVE_FS_REGISTER:
13395 		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
13396 			error = vfs_exclave_fs_register(((fsioc_exclave_fs_register_t *)data)->fs_tag, vp);
13397 		} else {
13398 			error = EPERM;
13399 		}
13400 		break;
13401 
13402 	case FSIOC_EXCLAVE_FS_UNREGISTER:
13403 		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
13404 			error = vfs_exclave_fs_unregister(vp);
13405 		} else {
13406 			error = EPERM;
13407 		}
13408 		break;
13409 
13410 	case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: {
13411 		exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data);
13412 		exclave_fs_base_dir_t *dirs = NULL;
13413 		if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT) &&
13414 		    !IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_LIST_ENTITLEMENT)) {
13415 			error = EPERM;
13416 			break;
13417 		}
13418 		if (get_base_dirs->base_dirs) {
13419 			if ((get_base_dirs->count == 0) || (get_base_dirs->count > EXCLAVE_FS_GET_BASE_DIRS_MAX_COUNT)) {
13420 				error = EINVAL;
13421 				break;
13422 			}
13423 			dirs = kalloc_type(exclave_fs_base_dir_t, get_base_dirs->count, Z_WAITOK | Z_ZERO);
13424 			if (!dirs) {
13425 				error = ENOSPC;
13426 				break;
13427 			}
13428 		}
13429 		error = vfs_exclave_fs_get_base_dirs(dirs, &get_base_dirs->count);
13430 		if (!error && dirs) {
13431 			error = copyout(dirs, (user_addr_t)get_base_dirs->base_dirs,
13432 			    get_base_dirs->count * sizeof(exclave_fs_base_dir_t));
13433 		}
13434 		if (dirs) {
13435 			kfree_type(exclave_fs_base_dir_t, get_base_dirs->count, dirs);
13436 		}
13437 	}
13438 	break;
13439 #endif
13440 
13441 	default: {
13442 		/*
13443 		 * Other, known commands shouldn't be passed down here.
13444 		 * (When adding a selector to this list, it may be prudent
13445 		 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
13446 		 */
13447 		switch (cmd) {
13448 		case F_PUNCHHOLE:
13449 		case F_TRIM_ACTIVE_FILE:
13450 		case F_RDADVISE:
13451 		case F_TRANSCODEKEY:
13452 		case F_GETPROTECTIONLEVEL:
13453 		case F_GETDEFAULTPROTLEVEL:
13454 		case F_MAKECOMPRESSED:
13455 		case F_SET_GREEDY_MODE:
13456 		case F_SETSTATICCONTENT:
13457 		case F_SETIOTYPE:
13458 		case F_SETBACKINGSTORE:
13459 		case F_GETPATH_MTMINFO:
13460 		case APFSIOC_REVERT_TO_SNAPSHOT:
13461 		case FSIOC_FIOSEEKHOLE:
13462 		case FSIOC_FIOSEEKDATA:
13463 		case HFS_GET_BOOT_INFO:
13464 		case HFS_SET_BOOT_INFO:
13465 		case FIOPINSWAP:
13466 		case F_CHKCLEAN:
13467 		case F_FULLFSYNC:
13468 		case F_BARRIERFSYNC:
13469 		case F_FREEZE_FS:
13470 		case F_THAW_FS:
13471 		case FSIOC_KERNEL_ROOTAUTH:
13472 		case FSIOC_GRAFT_FS:
13473 		case FSIOC_UNGRAFT_FS:
13474 		case FSIOC_AUTH_FS:
13475 		case F_SPECULATIVE_READ:
13476 		case F_ATTRIBUTION_TAG:
13477 		case F_TRANSFEREXTENTS:
13478 		case F_ASSERT_BG_ACCESS:
13479 		case F_RELEASE_BG_ACCESS:
13480 			error = EINVAL;
13481 			goto outdrop;
13482 		}
13483 		/* Invoke the filesystem-specific code */
13484 		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
13485 	}
13486 	} /* end switch stmt */
13487 
13488 	/*
13489 	 * if no errors, copy any data to user. Size was
13490 	 * already set and checked above.
13491 	 */
13492 	if (error == 0 && (cmd & IOC_OUT) && size) {
13493 		error = copyout(data, udata, size);
13494 	}
13495 
13496 outdrop:
13497 	if (memp) {
13498 		kfree_data(memp, size);
13499 	}
13500 
13501 	return error;
13502 }
13503 
13504 /* ARGSUSED */
13505 int
fsctl(proc_t p,struct fsctl_args * uap,__unused int32_t * retval)13506 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
13507 {
13508 	int error;
13509 	struct nameidata nd;
13510 	uint32_t nameiflags;
13511 	vnode_t vp = NULL;
13512 	vfs_context_t ctx = vfs_context_current();
13513 
13514 	AUDIT_ARG(cmd, (int)uap->cmd);
13515 	AUDIT_ARG(value32, uap->options);
13516 	/* Get the vnode for the file we are getting info on:  */
13517 	nameiflags = 0;
13518 	//
13519 	// if we come through fsctl() then the file is by definition not open.
13520 	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
13521 	// lest the caller mistakenly thinks the only open is their own (but in
13522 	// reality it's someone elses).
13523 	//
13524 	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
13525 		return EINVAL;
13526 	}
13527 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
13528 		nameiflags |= FOLLOW;
13529 	}
13530 	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
13531 		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
13532 	}
13533 	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
13534 	    UIO_USERSPACE, uap->path, ctx);
13535 	if ((error = namei(&nd))) {
13536 		goto done;
13537 	}
13538 	vp = nd.ni_vp;
13539 	nameidone(&nd);
13540 
13541 #if CONFIG_MACF
13542 	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
13543 	if (error) {
13544 		goto done;
13545 	}
13546 #endif
13547 
13548 	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
13549 
13550 done:
13551 	if (vp) {
13552 		vnode_put(vp);
13553 	}
13554 	return error;
13555 }
13556 /* ARGSUSED */
13557 int
ffsctl(proc_t p,struct ffsctl_args * uap,__unused int32_t * retval)13558 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
13559 {
13560 	int error;
13561 	vnode_t vp = NULL;
13562 	vfs_context_t ctx = vfs_context_current();
13563 	int fd = -1;
13564 
13565 	AUDIT_ARG(fd, uap->fd);
13566 	AUDIT_ARG(cmd, (int)uap->cmd);
13567 	AUDIT_ARG(value32, uap->options);
13568 
13569 	/* Get the vnode for the file we are getting info on:  */
13570 	if ((error = file_vnode(uap->fd, &vp))) {
13571 		return error;
13572 	}
13573 	fd = uap->fd;
13574 	if ((error = vnode_getwithref(vp))) {
13575 		file_drop(fd);
13576 		return error;
13577 	}
13578 
13579 #if CONFIG_MACF
13580 	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
13581 		file_drop(fd);
13582 		vnode_put(vp);
13583 		return error;
13584 	}
13585 #endif
13586 
13587 	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
13588 
13589 	file_drop(fd);
13590 
13591 	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
13592 	if (vp) {
13593 		vnode_put(vp);
13594 	}
13595 
13596 	return error;
13597 }
13598 /* end of fsctl system call */
13599 
13600 #define FILESEC_ACCESS_ENTITLEMENT              \
13601 	"com.apple.private.vfs.filesec-access"
13602 
13603 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)13604 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
13605 {
13606 	if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
13607 		/*
13608 		 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
13609 		 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
13610 		 */
13611 		if ((!setting && vfs_context_issuser(ctx)) ||
13612 		    IOTaskHasEntitlement(vfs_context_task(ctx),
13613 		    FILESEC_ACCESS_ENTITLEMENT)) {
13614 			return 0;
13615 		}
13616 	}
13617 
13618 	return EPERM;
13619 }
13620 
13621 /*
13622  *  Retrieve the data of an extended attribute.
13623  */
13624 int
getxattr(proc_t p,struct getxattr_args * uap,user_ssize_t * retval)13625 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
13626 {
13627 	vnode_t vp;
13628 	struct nameidata nd;
13629 	char attrname[XATTR_MAXNAMELEN + 1];
13630 	vfs_context_t ctx = vfs_context_current();
13631 	uio_t auio = NULL;
13632 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13633 	size_t attrsize = 0;
13634 	size_t namelen;
13635 	u_int32_t nameiflags;
13636 	int error;
13637 	UIO_STACKBUF(uio_buf, 1);
13638 
13639 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13640 		return EINVAL;
13641 	}
13642 
13643 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13644 	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
13645 	if (uap->options & XATTR_NOFOLLOW_ANY) {
13646 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
13647 	}
13648 	if (uap->options & XATTR_RESOLVE_BENEATH) {
13649 		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
13650 	}
13651 
13652 	if ((error = namei(&nd))) {
13653 		return error;
13654 	}
13655 	vp = nd.ni_vp;
13656 	nameidone(&nd);
13657 
13658 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13659 	if (error != 0) {
13660 		goto out;
13661 	}
13662 	if (xattr_protected(attrname) &&
13663 	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13664 		goto out;
13665 	}
13666 	/*
13667 	 * the specific check for 0xffffffff is a hack to preserve
13668 	 * binaray compatibilty in K64 with applications that discovered
13669 	 * that passing in a buf pointer and a size of -1 resulted in
13670 	 * just the size of the indicated extended attribute being returned.
13671 	 * this isn't part of the documented behavior, but because of the
13672 	 * original implemtation's check for "uap->size > 0", this behavior
13673 	 * was allowed. In K32 that check turned into a signed comparison
13674 	 * even though uap->size is unsigned...  in K64, we blow by that
13675 	 * check because uap->size is unsigned and doesn't get sign smeared
13676 	 * in the munger for a 32 bit user app.  we also need to add a
13677 	 * check to limit the maximum size of the buffer being passed in...
13678 	 * unfortunately, the underlying fileystems seem to just malloc
13679 	 * the requested size even if the actual extended attribute is tiny.
13680 	 * because that malloc is for kernel wired memory, we have to put a
13681 	 * sane limit on it.
13682 	 *
13683 	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
13684 	 * U64 running on K64 will yield -1 (64 bits wide)
13685 	 * U32/U64 running on K32 will yield -1 (32 bits wide)
13686 	 */
13687 	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
13688 		goto no_uio;
13689 	}
13690 
13691 	if (uap->value) {
13692 		if (uap->size > (size_t)XATTR_MAXSIZE) {
13693 			uap->size = XATTR_MAXSIZE;
13694 		}
13695 
13696 		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13697 		    &uio_buf[0], sizeof(uio_buf));
13698 		uio_addiov(auio, uap->value, uap->size);
13699 	}
13700 no_uio:
13701 	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
13702 out:
13703 	vnode_put(vp);
13704 
13705 	if (auio) {
13706 		*retval = uap->size - uio_resid(auio);
13707 	} else {
13708 		*retval = (user_ssize_t)attrsize;
13709 	}
13710 
13711 	return error;
13712 }
13713 
13714 /*
13715  * Retrieve the data of an extended attribute.
13716  */
13717 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)13718 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
13719 {
13720 	vnode_t vp;
13721 	char attrname[XATTR_MAXNAMELEN + 1];
13722 	vfs_context_t ctx = vfs_context_current();
13723 	uio_t auio = NULL;
13724 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13725 	size_t attrsize = 0;
13726 	size_t namelen;
13727 	int error;
13728 	UIO_STACKBUF(uio_buf, 1);
13729 
13730 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13731 	    XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) {
13732 		return EINVAL;
13733 	}
13734 
13735 	if ((error = file_vnode(uap->fd, &vp))) {
13736 		return error;
13737 	}
13738 	if ((error = vnode_getwithref(vp))) {
13739 		file_drop(uap->fd);
13740 		return error;
13741 	}
13742 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13743 	if (error != 0) {
13744 		goto out;
13745 	}
13746 	if (xattr_protected(attrname) &&
13747 	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13748 		goto out;
13749 	}
13750 	if (uap->value && uap->size > 0) {
13751 		if (uap->size > (size_t)XATTR_MAXSIZE) {
13752 			uap->size = XATTR_MAXSIZE;
13753 		}
13754 
13755 		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13756 		    &uio_buf[0], sizeof(uio_buf));
13757 		uio_addiov(auio, uap->value, uap->size);
13758 	}
13759 
13760 	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13761 out:
13762 	(void)vnode_put(vp);
13763 	file_drop(uap->fd);
13764 
13765 	if (auio) {
13766 		*retval = uap->size - uio_resid(auio);
13767 	} else {
13768 		*retval = (user_ssize_t)attrsize;
13769 	}
13770 	return error;
13771 }
13772 
/*
 * Heap-allocated scratch state for setxattr() (see kalloc_type there);
 * groups the nameidata, attribute-name buffer, and uio backing store so
 * they are not carried on the kernel stack.
 * (The previous comment referring to "checkdirs iteration" was stale.)
 */
struct setxattr_ctx {
	struct nameidata nd;                    /* path-lookup state */
	char attrname[XATTR_MAXNAMELEN + 1];    /* NUL-terminated attribute name */
	UIO_STACKBUF(uio_buf, 1);               /* backing store for the value uio */
};
13779 
13780 /*
13781  * Set the data of an extended attribute.
13782  */
13783 int
setxattr(proc_t p,struct setxattr_args * uap,int * retval)13784 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
13785 {
13786 	vnode_t vp;
13787 	vfs_context_t ctx = vfs_context_current();
13788 	uio_t auio = NULL;
13789 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13790 	size_t namelen;
13791 	u_int32_t nameiflags;
13792 	int error;
13793 	struct setxattr_ctx *sactx;
13794 
13795 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13796 		return EINVAL;
13797 	}
13798 
13799 	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
13800 	if (sactx == NULL) {
13801 		return ENOMEM;
13802 	}
13803 
13804 	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
13805 	if (error != 0) {
13806 		if (error == EPERM) {
13807 			/* if the string won't fit in attrname, copyinstr emits EPERM */
13808 			error = ENAMETOOLONG;
13809 		}
13810 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13811 		goto out;
13812 	}
13813 	if (xattr_protected(sactx->attrname) &&
13814 	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
13815 		goto out;
13816 	}
13817 	if (uap->size != 0 && uap->value == 0) {
13818 		error = EINVAL;
13819 		goto out;
13820 	}
13821 	if (uap->size > INT_MAX) {
13822 		error = E2BIG;
13823 		goto out;
13824 	}
13825 
13826 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13827 #if CONFIG_FILE_LEASES
13828 	nameiflags |= WANTPARENT;
13829 #endif
13830 	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
13831 	if (uap->options & XATTR_NOFOLLOW_ANY) {
13832 		sactx->nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
13833 	}
13834 	if (uap->options & XATTR_RESOLVE_BENEATH) {
13835 		sactx->nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
13836 	}
13837 
13838 	if ((error = namei(&sactx->nd))) {
13839 		goto out;
13840 	}
13841 	vp = sactx->nd.ni_vp;
13842 #if CONFIG_FILE_LEASES
13843 	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
13844 	vnode_put(sactx->nd.ni_dvp);
13845 #endif
13846 	nameidone(&sactx->nd);
13847 
13848 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13849 	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
13850 	uio_addiov(auio, uap->value, uap->size);
13851 
13852 	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
13853 #if CONFIG_FSE
13854 	if (error == 0) {
13855 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
13856 		    FSE_ARG_VNODE, vp,
13857 		    FSE_ARG_DONE);
13858 	}
13859 #endif
13860 	vnode_put(vp);
13861 out:
13862 	kfree_type(struct setxattr_ctx, sactx);
13863 	*retval = 0;
13864 	return error;
13865 }
13866 
13867 /*
13868  * Set the data of an extended attribute.
13869  */
13870 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13871 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13872 {
13873 	vnode_t vp;
13874 	char attrname[XATTR_MAXNAMELEN + 1];
13875 	vfs_context_t ctx = vfs_context_current();
13876 	uio_t auio = NULL;
13877 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13878 	size_t namelen;
13879 	int error;
13880 	UIO_STACKBUF(uio_buf, 1);
13881 
13882 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13883 	    XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) {
13884 		return EINVAL;
13885 	}
13886 
13887 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13888 	if (error != 0) {
13889 		if (error == EPERM) {
13890 			/* if the string won't fit in attrname, copyinstr emits EPERM */
13891 			return ENAMETOOLONG;
13892 		}
13893 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13894 		return error;
13895 	}
13896 	if (xattr_protected(attrname) &&
13897 	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13898 		return error;
13899 	}
13900 	if (uap->size != 0 && uap->value == 0) {
13901 		return EINVAL;
13902 	}
13903 	if (uap->size > INT_MAX) {
13904 		return E2BIG;
13905 	}
13906 	if ((error = file_vnode(uap->fd, &vp))) {
13907 		return error;
13908 	}
13909 	if ((error = vnode_getwithref(vp))) {
13910 		file_drop(uap->fd);
13911 		return error;
13912 	}
13913 
13914 #if CONFIG_FILE_LEASES
13915 	vnode_breakdirlease(vp, true, O_WRONLY);
13916 #endif
13917 
13918 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13919 	    &uio_buf[0], sizeof(uio_buf));
13920 	uio_addiov(auio, uap->value, uap->size);
13921 
13922 	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13923 #if CONFIG_FSE
13924 	if (error == 0) {
13925 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
13926 		    FSE_ARG_VNODE, vp,
13927 		    FSE_ARG_DONE);
13928 	}
13929 #endif
13930 	vnode_put(vp);
13931 	file_drop(uap->fd);
13932 	*retval = 0;
13933 	return error;
13934 }
13935 
13936 /*
13937  * Remove an extended attribute.
13938  * XXX Code duplication here.
13939  */
13940 int
removexattr(proc_t p,struct removexattr_args * uap,int * retval)13941 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
13942 {
13943 	vnode_t vp;
13944 	struct nameidata nd;
13945 	char attrname[XATTR_MAXNAMELEN + 1];
13946 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13947 	vfs_context_t ctx = vfs_context_current();
13948 	size_t namelen;
13949 	u_int32_t nameiflags;
13950 	int error;
13951 
13952 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13953 		return EINVAL;
13954 	}
13955 
13956 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13957 	if (error != 0) {
13958 		return error;
13959 	}
13960 	if (xattr_protected(attrname)) {
13961 		return EPERM;
13962 	}
13963 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13964 #if CONFIG_FILE_LEASES
13965 	nameiflags |= WANTPARENT;
13966 #endif
13967 	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
13968 	if (uap->options & XATTR_NOFOLLOW_ANY) {
13969 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
13970 	}
13971 	if (uap->options & XATTR_RESOLVE_BENEATH) {
13972 		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
13973 	}
13974 
13975 	if ((error = namei(&nd))) {
13976 		return error;
13977 	}
13978 	vp = nd.ni_vp;
13979 #if CONFIG_FILE_LEASES
13980 	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
13981 	vnode_put(nd.ni_dvp);
13982 #endif
13983 	nameidone(&nd);
13984 
13985 	error = vn_removexattr(vp, attrname, uap->options, ctx);
13986 #if CONFIG_FSE
13987 	if (error == 0) {
13988 		add_fsevent(FSE_XATTR_REMOVED, ctx,
13989 		    FSE_ARG_VNODE, vp,
13990 		    FSE_ARG_DONE);
13991 	}
13992 #endif
13993 	vnode_put(vp);
13994 	*retval = 0;
13995 	return error;
13996 }
13997 
13998 /*
13999  * Remove an extended attribute.
14000  * XXX Code duplication here.
14001  */
14002 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)14003 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
14004 {
14005 	vnode_t vp;
14006 	char attrname[XATTR_MAXNAMELEN + 1];
14007 	size_t namelen;
14008 	int error;
14009 #if CONFIG_FSE
14010 	vfs_context_t ctx = vfs_context_current();
14011 #endif
14012 
14013 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
14014 	    XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) {
14015 		return EINVAL;
14016 	}
14017 
14018 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
14019 	if (error != 0) {
14020 		return error;
14021 	}
14022 	if (xattr_protected(attrname)) {
14023 		return EPERM;
14024 	}
14025 	if ((error = file_vnode(uap->fd, &vp))) {
14026 		return error;
14027 	}
14028 	if ((error = vnode_getwithref(vp))) {
14029 		file_drop(uap->fd);
14030 		return error;
14031 	}
14032 
14033 #if CONFIG_FILE_LEASES
14034 	vnode_breakdirlease(vp, true, O_WRONLY);
14035 #endif
14036 
14037 	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
14038 #if CONFIG_FSE
14039 	if (error == 0) {
14040 		add_fsevent(FSE_XATTR_REMOVED, ctx,
14041 		    FSE_ARG_VNODE, vp,
14042 		    FSE_ARG_DONE);
14043 	}
14044 #endif
14045 	vnode_put(vp);
14046 	file_drop(uap->fd);
14047 	*retval = 0;
14048 	return error;
14049 }
14050 
14051 /*
14052  * Retrieve the list of extended attribute names.
14053  * XXX Code duplication here.
14054  */
14055 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)14056 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
14057 {
14058 	vnode_t vp;
14059 	struct nameidata nd;
14060 	vfs_context_t ctx = vfs_context_current();
14061 	uio_t auio = NULL;
14062 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
14063 	size_t attrsize = 0;
14064 	u_int32_t nameiflags;
14065 	int error;
14066 	UIO_STACKBUF(uio_buf, 1);
14067 
14068 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
14069 		return EINVAL;
14070 	}
14071 
14072 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
14073 	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
14074 	if (uap->options & XATTR_NOFOLLOW_ANY) {
14075 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
14076 	}
14077 	if (uap->options & XATTR_RESOLVE_BENEATH) {
14078 		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
14079 	}
14080 
14081 	if ((error = namei(&nd))) {
14082 		return error;
14083 	}
14084 	vp = nd.ni_vp;
14085 	nameidone(&nd);
14086 	if (uap->namebuf != 0 && uap->bufsize > 0) {
14087 		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
14088 		    &uio_buf[0], sizeof(uio_buf));
14089 		uio_addiov(auio, uap->namebuf, uap->bufsize);
14090 	}
14091 
14092 	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
14093 
14094 	vnode_put(vp);
14095 	if (auio) {
14096 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
14097 	} else {
14098 		*retval = (user_ssize_t)attrsize;
14099 	}
14100 	return error;
14101 }
14102 
14103 /*
14104  * Retrieve the list of extended attribute names.
14105  * XXX Code duplication here.
14106  */
14107 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)14108 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
14109 {
14110 	vnode_t vp;
14111 	uio_t auio = NULL;
14112 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
14113 	size_t attrsize = 0;
14114 	int error;
14115 	UIO_STACKBUF(uio_buf, 1);
14116 
14117 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
14118 	    XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) {
14119 		return EINVAL;
14120 	}
14121 
14122 	if ((error = file_vnode(uap->fd, &vp))) {
14123 		return error;
14124 	}
14125 	if ((error = vnode_getwithref(vp))) {
14126 		file_drop(uap->fd);
14127 		return error;
14128 	}
14129 	if (uap->namebuf != 0 && uap->bufsize > 0) {
14130 		auio = uio_createwithbuffer(1, 0, spacetype,
14131 		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
14132 		uio_addiov(auio, uap->namebuf, uap->bufsize);
14133 	}
14134 
14135 	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
14136 
14137 	vnode_put(vp);
14138 	file_drop(uap->fd);
14139 	if (auio) {
14140 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
14141 	} else {
14142 		*retval = (user_ssize_t)attrsize;
14143 	}
14144 	return error;
14145 }
14146 
/*
 * Resolve a (volfs_id, objid) pair to an absolute path.
 *
 * Looks up the vnode by id, runs the MAC fsgetpath check, and builds
 * the path into the caller-supplied kernel buffer `buf` (at most
 * `bufsize` <= MAXLONGPATHLEN bytes).  build_path() races with renames
 * are retried a bounded number of times before giving up with ENOENT.
 * On success *pathlen is set to the path length (including the NUL,
 * per build_path's length output).
 */
int
fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
    vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
{
	int error;
	vnode_t vp;
	int length;
	int bpflags;
	/* maximum number of times to retry build_path */
	unsigned int retries = 0x10;

	if (bufsize > MAXLONGPATHLEN) {
		return EINVAL;
	}

	if (buf == NULL) {
		return ENOMEM;
	}

retry:
	/* Take an iocount on the vnode identified by (volfs_id, objid). */
	error = vnode_getfromid(volfs_id, objid, ctx, options & FSOPT_ISREALFSID, &vp);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_fsgetpath(ctx, vp);
	if (error) {
		vnode_put(vp);
		return error;
	}
#endif

	/* Obtain the absolute path to this vnode. */
	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
	if (options & FSOPT_NOFIRMLINKPATH) {
		bpflags |= BUILDPATH_NO_FIRMLINK;
	}
	bpflags |= BUILDPATH_CHECK_MOVED;
	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
	vnode_put(vp);

	if (error) {
		/* there was a race building the path, try a few more times */
		if (error == EAGAIN) {
			--retries;
			if (retries > 0) {
				goto retry;
			}

			error = ENOENT;
		}
		goto out;
	}

	AUDIT_ARG(text, buf);

	/*
	 * NOTE(review): vp is passed to the trace call after vnode_put();
	 * presumably it is used only as an identifier by kdebug — confirm
	 * kdebug_vfs_lookup never dereferences it.
	 */
	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
		kdebug_vfs_lookup(buf, length, vp, KDBG_VFSLKUP_LOOKUP);
	}

	*pathlen = length; /* may be superseded by error */

out:
	return error;
}
14213 
14214 /*
14215  * Obtain the full pathname of a file system object by id.
14216  */
14217 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)14218 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
14219     uint32_t options, user_ssize_t *retval)
14220 {
14221 	vfs_context_t ctx = vfs_context_current();
14222 	fsid_t fsid;
14223 	char *realpath;
14224 	int length;
14225 	int error;
14226 
14227 	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
14228 		return EINVAL;
14229 	}
14230 
14231 	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
14232 		return error;
14233 	}
14234 	AUDIT_ARG(value32, fsid.val[0]);
14235 	AUDIT_ARG(value64, objid);
14236 	/* Restrict output buffer size for now. */
14237 
14238 	if (bufsize > MAXLONGPATHLEN || bufsize <= 0) {
14239 		return EINVAL;
14240 	}
14241 	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
14242 	if (realpath == NULL) {
14243 		return ENOMEM;
14244 	}
14245 
14246 	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
14247 	    options, &length);
14248 
14249 	if (error) {
14250 		goto out;
14251 	}
14252 
14253 	error = copyout((caddr_t)realpath, buf, length);
14254 
14255 	*retval = (user_ssize_t)length; /* may be superseded by error */
14256 out:
14257 	kfree_data(realpath, bufsize);
14258 	return error;
14259 }
14260 
14261 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)14262 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
14263 {
14264 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
14265 	           0, retval);
14266 }
14267 
14268 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)14269 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
14270 {
14271 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
14272 	           uap->options, retval);
14273 }
14274 
14275 /*
14276  * Common routine to handle various flavors of statfs data heading out
14277  *	to user space.
14278  *
14279  * Returns:	0			Success
14280  *		EFAULT
14281  */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int             error;
	int             my_size, copy_size;

	/*
	 * Build a user64_statfs or user32_statfs image from the in-kernel
	 * vfsstatfs and copy it out to bufp.  When partial_copy is set, the
	 * trailing f_reserved3/f_reserved4 fields are omitted from the
	 * copyout (legacy shorter layout); my_size still reports the full
	 * structure size via *sizep.
	 */
	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Drop the trailing reserved fields from the copyout. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case.   We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int             shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large.  At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				/* Stop when the block count fits ... */
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				/* ... or when one more doubling would overflow the reported bsize. */
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Drop the trailing reserved fields from the copyout. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	/* Report the full (untrimmed) structure size to the caller. */
	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
14395 
14396 /*
14397  * copy stat structure into user_stat structure.
14398  */
14399 void
munge_user64_stat(struct stat * sbp,struct user64_stat * usbp)14400 munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
14401 {
14402 	bzero(usbp, sizeof(*usbp));
14403 
14404 	usbp->st_dev = sbp->st_dev;
14405 	usbp->st_ino = sbp->st_ino;
14406 	usbp->st_mode = sbp->st_mode;
14407 	usbp->st_nlink = sbp->st_nlink;
14408 	usbp->st_uid = sbp->st_uid;
14409 	usbp->st_gid = sbp->st_gid;
14410 	usbp->st_rdev = sbp->st_rdev;
14411 #ifndef _POSIX_C_SOURCE
14412 	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
14413 	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
14414 	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
14415 	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
14416 	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
14417 	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
14418 #else
14419 	usbp->st_atime = sbp->st_atime;
14420 	usbp->st_atimensec = sbp->st_atimensec;
14421 	usbp->st_mtime = sbp->st_mtime;
14422 	usbp->st_mtimensec = sbp->st_mtimensec;
14423 	usbp->st_ctime = sbp->st_ctime;
14424 	usbp->st_ctimensec = sbp->st_ctimensec;
14425 #endif
14426 	usbp->st_size = sbp->st_size;
14427 	usbp->st_blocks = sbp->st_blocks;
14428 	usbp->st_blksize = sbp->st_blksize;
14429 	usbp->st_flags = sbp->st_flags;
14430 	usbp->st_gen = sbp->st_gen;
14431 	usbp->st_lspare = sbp->st_lspare;
14432 	usbp->st_qspare[0] = sbp->st_qspare[0];
14433 	usbp->st_qspare[1] = sbp->st_qspare[1];
14434 }
14435 
14436 void
munge_user32_stat(struct stat * sbp,struct user32_stat * usbp)14437 munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
14438 {
14439 	bzero(usbp, sizeof(*usbp));
14440 
14441 	usbp->st_dev = sbp->st_dev;
14442 	usbp->st_ino = sbp->st_ino;
14443 	usbp->st_mode = sbp->st_mode;
14444 	usbp->st_nlink = sbp->st_nlink;
14445 	usbp->st_uid = sbp->st_uid;
14446 	usbp->st_gid = sbp->st_gid;
14447 	usbp->st_rdev = sbp->st_rdev;
14448 #ifndef _POSIX_C_SOURCE
14449 	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
14450 	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
14451 	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
14452 	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
14453 	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
14454 	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
14455 #else
14456 	usbp->st_atime = sbp->st_atime;
14457 	usbp->st_atimensec = sbp->st_atimensec;
14458 	usbp->st_mtime = sbp->st_mtime;
14459 	usbp->st_mtimensec = sbp->st_mtimensec;
14460 	usbp->st_ctime = sbp->st_ctime;
14461 	usbp->st_ctimensec = sbp->st_ctimensec;
14462 #endif
14463 	usbp->st_size = sbp->st_size;
14464 	usbp->st_blocks = sbp->st_blocks;
14465 	usbp->st_blksize = sbp->st_blksize;
14466 	usbp->st_flags = sbp->st_flags;
14467 	usbp->st_gen = sbp->st_gen;
14468 	usbp->st_lspare = sbp->st_lspare;
14469 	usbp->st_qspare[0] = sbp->st_qspare[0];
14470 	usbp->st_qspare[1] = sbp->st_qspare[1];
14471 }
14472 
14473 /*
14474  * copy stat64 structure into user_stat64 structure.
14475  */
14476 void
munge_user64_stat64(struct stat64 * sbp,struct user64_stat64 * usbp)14477 munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
14478 {
14479 	bzero(usbp, sizeof(*usbp));
14480 
14481 	usbp->st_dev = sbp->st_dev;
14482 	usbp->st_ino = sbp->st_ino;
14483 	usbp->st_mode = sbp->st_mode;
14484 	usbp->st_nlink = sbp->st_nlink;
14485 	usbp->st_uid = sbp->st_uid;
14486 	usbp->st_gid = sbp->st_gid;
14487 	usbp->st_rdev = sbp->st_rdev;
14488 #ifndef _POSIX_C_SOURCE
14489 	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
14490 	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
14491 	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
14492 	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
14493 	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
14494 	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
14495 	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
14496 	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
14497 #else
14498 	usbp->st_atime = sbp->st_atime;
14499 	usbp->st_atimensec = sbp->st_atimensec;
14500 	usbp->st_mtime = sbp->st_mtime;
14501 	usbp->st_mtimensec = sbp->st_mtimensec;
14502 	usbp->st_ctime = sbp->st_ctime;
14503 	usbp->st_ctimensec = sbp->st_ctimensec;
14504 	usbp->st_birthtime = sbp->st_birthtime;
14505 	usbp->st_birthtimensec = sbp->st_birthtimensec;
14506 #endif
14507 	usbp->st_size = sbp->st_size;
14508 	usbp->st_blocks = sbp->st_blocks;
14509 	usbp->st_blksize = sbp->st_blksize;
14510 	usbp->st_flags = sbp->st_flags;
14511 	usbp->st_gen = sbp->st_gen;
14512 	usbp->st_lspare = sbp->st_lspare;
14513 	usbp->st_qspare[0] = sbp->st_qspare[0];
14514 	usbp->st_qspare[1] = sbp->st_qspare[1];
14515 }
14516 
14517 void
munge_user32_stat64(struct stat64 * sbp,struct user32_stat64 * usbp)14518 munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
14519 {
14520 	bzero(usbp, sizeof(*usbp));
14521 
14522 	usbp->st_dev = sbp->st_dev;
14523 	usbp->st_ino = sbp->st_ino;
14524 	usbp->st_mode = sbp->st_mode;
14525 	usbp->st_nlink = sbp->st_nlink;
14526 	usbp->st_uid = sbp->st_uid;
14527 	usbp->st_gid = sbp->st_gid;
14528 	usbp->st_rdev = sbp->st_rdev;
14529 #ifndef _POSIX_C_SOURCE
14530 	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
14531 	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
14532 	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
14533 	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
14534 	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
14535 	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
14536 	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
14537 	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
14538 #else
14539 	usbp->st_atime = sbp->st_atime;
14540 	usbp->st_atimensec = sbp->st_atimensec;
14541 	usbp->st_mtime = sbp->st_mtime;
14542 	usbp->st_mtimensec = sbp->st_mtimensec;
14543 	usbp->st_ctime = sbp->st_ctime;
14544 	usbp->st_ctimensec = sbp->st_ctimensec;
14545 	usbp->st_birthtime = sbp->st_birthtime;
14546 	usbp->st_birthtimensec = sbp->st_birthtimensec;
14547 #endif
14548 	usbp->st_size = sbp->st_size;
14549 	usbp->st_blocks = sbp->st_blocks;
14550 	usbp->st_blksize = sbp->st_blksize;
14551 	usbp->st_flags = sbp->st_flags;
14552 	usbp->st_gen = sbp->st_gen;
14553 	usbp->st_lspare = sbp->st_lspare;
14554 	usbp->st_qspare[0] = sbp->st_qspare[0];
14555 	usbp->st_qspare[1] = sbp->st_qspare[1];
14556 }
14557 
14558 /*
14559  * Purge buffer cache for simulating cold starts
14560  */
14561 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)14562 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
14563 {
14564 	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
14565 
14566 	return VNODE_RETURNED;
14567 }
14568 
14569 static int
vfs_purge_callback(mount_t mp,__unused void * arg)14570 vfs_purge_callback(mount_t mp, __unused void * arg)
14571 {
14572 	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
14573 
14574 	return VFS_RETURNED;
14575 }
14576 
/*
 * Boot-arg tunable and vfs.purge_vm_pagers sysctl: when non-zero (the
 * default), vfs_purge() also purges file-backed VM pagers.
 */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
14579 
14580 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)14581 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
14582 {
14583 	if (!kauth_cred_issuser(kauth_cred_get())) {
14584 		return EPERM;
14585 	}
14586 
14587 	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
14588 
14589 	/* also flush any VM pagers backed by files */
14590 	if (vfs_purge_vm_pagers) {
14591 		vm_purge_filebacked_pagers();
14592 	}
14593 
14594 	return 0;
14595 }
14596 
14597 /*
14598  * gets the vnode associated with the (unnamed) snapshot directory
14599  * for a Filesystem. The snapshot directory vnode is returned with
14600  * an iocount on it.
14601  */
14602 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)14603 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
14604 {
14605 	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
14606 }
14607 
14608 /*
14609  * Get the snapshot vnode.
14610  *
14611  * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
14612  * needs nameidone() on ndp.
14613  *
14614  * If the snapshot vnode exists it is returned in ndp->ni_vp.
14615  *
14616  * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
14617  * not needed.
14618  */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* Resolve dirfd to a vnode; it must be a filesystem root. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Get the (unnamed) snapshot directory with an iocount. */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	/* Copy in the snapshot name from user space. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; if we stopped short of the end, one was found. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC policy check appropriate to the requested operation. */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On error, drop any iocounts we took and NULL out the returns. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
14721 
14722 /*
14723  * create a filesystem snapshot (for supporting filesystems)
14724  *
14725  * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
14726  * We get to the (unnamed) snapshot directory vnode and create the vnode
14727  * for the snapshot in it.
14728  *
14729  * Restrictions:
14730  *
14731  *    a) Passed in name for snapshot cannot have slashes.
14732  *    b) name can't be "." or ".."
14733  *
14734  * Since this requires superuser privileges, vnode_authorize calls are not
14735  * made.
14736  */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* No flags are currently defined */
	if (flags) {
		printf("snapshot_create: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	/* nameidata is too large for the stack. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/*
	 * On success we hold iocounts on rvp and snapdvp and owe
	 * nameidone(ndp); an existing snapshot is returned in ndp->ni_vp.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* A snapshot with this name already exists. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		/* The snapshot is created as a mode-0 regular file. */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* Superuser-only path, so skip authorization (see comment above). */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14789 
14790 /*
14791  * Delete a Filesystem snapshot
14792  *
14793  * get the vnode for the unnamed snapshot directory and the snapshot and
14794  * delete the snapshot.
14795  */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* No flags are currently defined */
	if (flags) {
		printf("snapshot_delete: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	/* nameidata is too large for the stack. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/*
	 * On success we hold iocounts on rvp, snapdvp and the snapshot
	 * vnode (ndp->ni_vp), and owe nameidone(ndp).
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Remove the snapshot, suppressing the namespace event. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14830 
14831 /*
14832  * Revert a filesystem to a snapshot
14833  *
14834  * Marks the filesystem to revert to the given snapshot on next mount.
14835  */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* No flags are currently defined */
	if (flags) {
		printf("snapshot_revert: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy in the snapshot name (name_len includes the terminating NUL). */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Package the snapshot name as a componentname for the FS ioctl. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/* Look the snapshot up again and issue the ioctl on its vnode. */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
14925 
14926 /*
14927  * rename a Filesystem snapshot
14928  *
14929  * get the vnode for the unnamed snapshot directory and the snapshot and
14930  * rename the snapshot. This is a very specialised (and simple) case of
14931  * rename(2) (which has to deal with a lot more complications). It differs
14932  * slightly from rename(2) in that EEXIST is returned if the new name exists.
14933  */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	/* No flags are currently defined */
	if (flags) {
		printf("snapshot_rename: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/*
	 * Look up the source snapshot; on success we hold iocounts on rvp,
	 * snapdvp and fromnd->ni_vp, and owe nameidone(fromnd).
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp  = fromnd->ni_vp;

	/* Copy in the destination name. */
	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; if we stopped short of the end, one was found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Creating the new name is governed by the snapshot-create policy. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Look up the destination name in the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
15034 
15035 /*
15036  * Mount a Filesystem snapshot
15037  *
15038  * get the vnode for the unnamed snapshot directory and the snapshot and
15039  * mount the snapshot.
15040  */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error, mount_flags = 0;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	/* Check for invalid flags */
	if (flags & ~SNAPSHOT_MNT_VALIDMASK) {
		printf("snapshot_mount: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/*
	 * Look up the snapshot; on success we hold iocounts on rvp, snapdvp
	 * and snapndp->ni_vp, and owe nameidone(snapndp).
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp  = snapndp->ni_vp;
	/* Bail if the source filesystem has been unmounted out from under us. */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Convert snapshot_mount flags to mount flags */
	if (flags & SNAPSHOT_MNT_NOEXEC) {
		mount_flags |= MNT_NOEXEC;
	}
	if (flags & SNAPSHOT_MNT_NOSUID) {
		mount_flags |= MNT_NOSUID;
	}
	if (flags & SNAPSHOT_MNT_NODEV) {
		mount_flags |= MNT_NODEV;
	}
	if (flags & SNAPSHOT_MNT_DONTBROWSE) {
		mount_flags |= MNT_DONTBROWSE;
	}
	if (flags & SNAPSHOT_MNT_IGNORE_OWNERSHIP) {
		mount_flags |= MNT_IGNORE_OWNERSHIP;
	}
	if (flags & SNAPSHOT_MNT_NOFOLLOW) {
		mount_flags |= MNT_NOFOLLOW;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	if (mount_flags & MNT_NOFOLLOW) {
		dirndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* Don't allow mounting over the root of the root filesystem. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/* Hand the snapshot identity to mount_common() via smnt_data. */
	smnt_data.sm_mp  = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), mount_flags,
	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);

out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
	return error;
}
15146 
15147 /*
15148  * Root from a snapshot of the filesystem
15149  *
15150  * Marks the filesystem to root from the given snapshot on next boot.
15151  */
static int __attribute__((noinline))
snapshot_root(int dirfd, user_addr_t name, uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* No flags are currently defined */
	if (flags) {
		printf("snapshot_root: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy in the snapshot name (name_len includes the terminating NUL). */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Package the snapshot name as a componentname for the FS ioctl. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	root_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	return error;
}
15213 
15214 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)15215 vfs_context_can_snapshot(vfs_context_t ctx)
15216 {
15217 	static const char * const snapshot_entitlements[] = {
15218 		"com.apple.private.vfs.snapshot",
15219 		"com.apple.developer.vfs.snapshot",
15220 		"com.apple.private.apfs.arv.limited.snapshot",
15221 	};
15222 	static const size_t nentitlements =
15223 	    sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
15224 	size_t i;
15225 
15226 	task_t task = vfs_context_task(ctx);
15227 	for (i = 0; i < nentitlements; i++) {
15228 		if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
15229 			return TRUE;
15230 		}
15231 	}
15232 	return FALSE;
15233 }
15234 
15235 /*
15236  * FS snapshot operations dispatcher
15237  */
15238 int
fs_snapshot(__unused proc_t p,struct fs_snapshot_args * uap,__unused int32_t * retval)15239 fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
15240     __unused int32_t *retval)
15241 {
15242 	int error;
15243 	vfs_context_t ctx = vfs_context_current();
15244 
15245 	AUDIT_ARG(fd, uap->dirfd);
15246 	AUDIT_ARG(value32, uap->op);
15247 
15248 	if (!vfs_context_can_snapshot(ctx)) {
15249 		return EPERM;
15250 	}
15251 
15252 	/*
15253 	 * Enforce user authorization for snapshot modification operations,
15254 	 * or if trying to root from snapshot.
15255 	 */
15256 	if (uap->op != SNAPSHOT_OP_MOUNT) {
15257 		vnode_t dvp = NULLVP;
15258 		vnode_t devvp = NULLVP;
15259 		mount_t mp;
15260 
15261 		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
15262 		if (error) {
15263 			return error;
15264 		}
15265 		mp = vnode_mount(dvp);
15266 		devvp = mp->mnt_devvp;
15267 
15268 		/* get an iocount on devvp */
15269 		if (devvp == NULLVP) {
15270 			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
15271 			/* for mounts which arent block devices */
15272 			if (error == ENOENT) {
15273 				error = ENXIO;
15274 			}
15275 		} else {
15276 			error = vnode_getwithref(devvp);
15277 		}
15278 
15279 		if (error) {
15280 			vnode_put(dvp);
15281 			return error;
15282 		}
15283 
15284 		if ((vfs_context_issuser(ctx) == 0) &&
15285 		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
15286 		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
15287 			error = EPERM;
15288 		}
15289 		vnode_put(dvp);
15290 		vnode_put(devvp);
15291 
15292 		if (error) {
15293 			return error;
15294 		}
15295 	}
15296 
15297 	switch (uap->op) {
15298 	case SNAPSHOT_OP_CREATE:
15299 		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
15300 		break;
15301 	case SNAPSHOT_OP_DELETE:
15302 		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
15303 		break;
15304 	case SNAPSHOT_OP_RENAME:
15305 		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
15306 		    uap->flags, ctx);
15307 		break;
15308 	case SNAPSHOT_OP_MOUNT:
15309 		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
15310 		    uap->data, uap->flags, ctx);
15311 		break;
15312 	case SNAPSHOT_OP_REVERT:
15313 		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
15314 		break;
15315 #if CONFIG_MNT_ROOTSNAP
15316 	case SNAPSHOT_OP_ROOT:
15317 		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
15318 		break;
15319 #endif /* CONFIG_MNT_ROOTSNAP */
15320 	default:
15321 		error = ENOSYS;
15322 	}
15323 
15324 	return error;
15325 }
15326