xref: /xnu-11417.121.6/bsd/vfs/vfs_syscalls.c (revision a1e26a70f38d1d7daa7b49b258e2f8538ad81650)
1 /*
2  * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1989, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  * (c) UNIX System Laboratories, Inc.
32  * All or some portions of this file are derived from material licensed
33  * to the University of California by American Telephone and Telegraph
34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35  * the permission of UNIX System Laboratories, Inc.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.41 (Berkeley) 6/15/95
66  */
67 /*
68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69  * support for mandatory and extensible security protections.  This notice
70  * is included in support of clause 2.2 (b) of the Apple Public License,
71  * Version 2.0.
72  */
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/syslimits.h> /* For MAXLONGPATHLEN */
77 #include <sys/namei.h>
78 #include <sys/filedesc.h>
79 #include <sys/kernel.h>
80 #include <sys/file_internal.h>
81 #include <sys/stat.h>
82 #include <sys/vnode_internal.h>
83 #include <sys/mount_internal.h>
84 #include <sys/proc_internal.h>
85 #include <sys/kauth.h>
86 #include <sys/uio_internal.h>
87 #include <kern/kalloc.h>
88 #include <sys/mman.h>
89 #include <sys/dirent.h>
90 #include <sys/attr.h>
91 #include <sys/sysctl.h>
92 #include <sys/ubc.h>
93 #include <sys/quota.h>
94 #include <sys/kdebug.h>
95 #include <sys/fsevents.h>
96 #include <sys/imgsrc.h>
97 #include <sys/sysproto.h>
98 #include <sys/sysctl.h>
99 #include <sys/xattr.h>
100 #include <sys/fcntl.h>
101 #include <sys/stdio.h>
102 #include <sys/fsctl.h>
103 #include <sys/ubc_internal.h>
104 #include <sys/disk.h>
105 #include <sys/content_protection.h>
106 #include <sys/clonefile.h>
107 #include <sys/snapshot.h>
108 #include <sys/priv.h>
109 #include <sys/fsgetpath.h>
110 #include <machine/cons.h>
111 #include <machine/limits.h>
112 #include <miscfs/specfs/specdev.h>
113 
114 #include <vfs/vfs_disk_conditioner.h>
115 #if CONFIG_EXCLAVES
116 #include <vfs/vfs_exclave_fs.h>
117 #endif
118 
119 #include <security/audit/audit.h>
120 #include <bsm/audit_kevents.h>
121 
122 #include <mach/mach_types.h>
123 #include <kern/kern_types.h>
124 #include <kern/kalloc.h>
125 #include <kern/task.h>
126 
127 #include <vm/vm_pageout.h>
128 #include <vm/vm_protos.h>
129 #include <vm/memory_object_xnu.h>
130 
131 #include <libkern/OSAtomic.h>
132 #include <os/atomic_private.h>
133 #include <pexpert/pexpert.h>
134 #include <IOKit/IOBSD.h>
135 
136 // deps for MIG call
137 #include <kern/host.h>
138 #include <kern/ipc_misc.h>
139 #include <mach/host_priv.h>
140 #include <mach/vfs_nspace.h>
141 #include <os/log.h>
142 
143 #include <nfs/nfs_conf.h>
144 
145 #if ROUTEFS
146 #include <miscfs/routefs/routefs.h>
147 #endif /* ROUTEFS */
148 
149 #if CONFIG_MACF
150 #include <security/mac.h>
151 #include <security/mac_framework.h>
152 #endif
153 
154 #if CONFIG_FSE
155 #define GET_PATH(x) \
156 	((x) = get_pathbuff())
157 #define RELEASE_PATH(x) \
158 	release_pathbuff(x)
159 #else
160 #define GET_PATH(x)     \
161 	((x) = zalloc(ZV_NAMEI))
162 #define RELEASE_PATH(x) \
163 	zfree(ZV_NAMEI, x)
164 #endif /* CONFIG_FSE */
165 
166 #ifndef HFS_GET_BOOT_INFO
167 #define HFS_GET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00004)
168 #endif
169 
170 #ifndef HFS_SET_BOOT_INFO
171 #define HFS_SET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00005)
172 #endif
173 
174 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
175 #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
176 #endif
177 
178 extern void disk_conditioner_unmount(mount_t mp);
179 
180 /* struct for checkdirs iteration */
181 struct cdirargs {
182 	vnode_t olddp;
183 	vnode_t newdp;
184 };
185 /* callback  for checkdirs iteration */
186 static int checkdirs_callback(proc_t p, void * arg);
187 
188 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
189 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
190 void enablequotas(struct mount *mp, vfs_context_t ctx);
191 static int getfsstat_callback(mount_t mp, void * arg);
192 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
193 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
194 static int sync_callback(mount_t, void *);
195 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
196     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
197     boolean_t partial_copy);
198 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
199 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
200     struct componentname *cnp, user_addr_t fsmountargs,
201     int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
202 void vfs_notify_mount(vnode_t pdvp);
203 
204 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
205 
206 struct fd_vn_data * fg_vn_data_alloc(void);
207 
208 /*
209  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
210  * Concurrent lookups (or lookups by ids) on hard links can cause the
211  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
212  * does) to return ENOENT as the path cannot be returned from the name cache
213  * alone. We have no option but to retry and hope to get one namei->reverse path
214  * generation done without an intervening lookup, lookup by id on the hard link
215  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
216  * which currently are the MAC hooks for rename, unlink and rmdir.
217  */
218 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
219 
220 /* Max retry limit for rename due to vnode recycling. */
221 #define MAX_RENAME_ERECYCLE_RETRIES 1024
222 
223 #define MAX_LINK_ENOENT_RETRIES 1024
224 
225 /* Max retries for concurrent mounts on the same covered vnode. */
226 #define MAX_MOUNT_RETRIES       10
227 
228 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
229     int unlink_flags);
230 
231 #ifdef CONFIG_IMGSRC_ACCESS
232 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
233 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
234 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
235 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
236 static void mount_end_update(mount_t mp);
237 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
238 #endif /* CONFIG_IMGSRC_ACCESS */
239 
240 //snapshot functions
241 #if CONFIG_MNT_ROOTSNAP
242 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
243 #else
244 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
245 #endif
246 
247 __private_extern__
248 int sync_internal(void);
249 
250 __private_extern__
251 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
252 
253 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
254 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
255 
256 /* vars for sync mutex */
257 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
258 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
259 
260 extern lck_rw_t rootvnode_rw_lock;
261 
262 VFS_SMR_DECLARE;
263 extern uint32_t nc_smr_enabled;
264 
265 /*
266  * incremented each time a mount or unmount operation occurs
267  * used to invalidate the cached value of the rootvp in the
268  * mount structure utilized by cache_lookup_path
269  */
270 uint32_t mount_generation = 0;
271 
272 /* counts number of mount and unmount operations */
273 unsigned int vfs_nummntops = 0;
274 
275 /* system-wide, per-boot unique mount ID */
276 static _Atomic uint64_t mount_unique_id = 1;
277 
278 extern const struct fileops vnops;
279 #if CONFIG_APPLEDOUBLE
280 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
281 #endif /* CONFIG_APPLEDOUBLE */
282 
283 
284 /*
285  * Virtual File System System Calls
286  */
287 
288 /*
289  * Private in-kernel mounting spi (specific use-cases only)
290  */
291 boolean_t
vfs_iskernelmount(mount_t mp)292 vfs_iskernelmount(mount_t mp)
293 {
294 	return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
295 }
296 
/*
 * kernel_mount:
 *	Private in-kernel mount primitive (always tags the mount with
 *	KERNEL_MOUNT_KMOUNT before handing off to mount_common()).
 *
 * Parameters:	fstype		filesystem type name
 *		pvp		parent of the covered vnode; may be NULLVP
 *				when `vp` is NULLVP (both are then resolved
 *				from `path` via namei())
 *		vp		vnode to cover, or NULLVP to look up `path`
 *		path		kernel-space path to the mount-on point
 *		data		filesystem-specific mount arguments
 *		datalen		unused
 *		syscall_flags	MNT_* flags as for mount(2)
 *		kern_flags	KERNEL_MOUNT_* flags (sanitized below)
 *		ctx		context used for lookup/authorization
 *
 * Returns:	0 on success, errno on failure.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
	/* MNT_NOFOLLOW: refuse to traverse symlinks anywhere in the path. */
	if (syscall_flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	/* Strip any kernel-mount flags callers are not allowed to pass in. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied both vnodes: just seed the componentname
		 * with the path so mount_common() has a name to work with.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Drop the iocounts and name buffer acquired by namei(), if any. */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
349 
350 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)351 vfs_mount_at_path(const char *fstype, const char *path,
352     vnode_t pvp, vnode_t vp, void *data, size_t datalen,
353     int mnt_flags, int flags)
354 {
355 	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
356 	int error, km_flags = 0;
357 	vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
358 
359 	/*
360 	 * This call is currently restricted to specific use cases.
361 	 */
362 	if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
363 		return ENOTSUP;
364 	}
365 
366 #if !defined(XNU_TARGET_OS_OSX)
367 	if (strcmp(fstype, "lifs") == 0) {
368 		syscall_flags |= MNT_NOEXEC;
369 	}
370 #endif
371 
372 	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
373 		km_flags |= KERNEL_MOUNT_NOAUTH;
374 	}
375 	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
376 		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
377 	}
378 
379 	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
380 	    syscall_flags, km_flags, ctx);
381 	if (error) {
382 		printf("%s: mount on %s failed, error %d\n", __func__, path,
383 		    error);
384 	}
385 
386 	return error;
387 }
388 
389 /*
390  * Mount a file system.
391  */
392 /* ARGSUSED */
393 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)394 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
395 {
396 	struct __mac_mount_args muap;
397 
398 	muap.type = uap->type;
399 	muap.path = uap->path;
400 	muap.flags = uap->flags;
401 	muap.data = uap->data;
402 	muap.mac_p = USER_ADDR_NULL;
403 	return __mac_mount(p, &muap, retval);
404 }
405 
/*
 * fmount: mount a filesystem on the directory referenced by fd `uap->fd`
 * rather than by path.  Image-source, rootfs and union mounts are rejected.
 *
 * Returns:	0 on success, errno on failure (ENOTSUP/EPERM for rejected
 *		flag combinations, EBUSY/EINVAL if no parent is available).
 */
int
fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
{
	struct componentname    cn;
	vfs_context_t           ctx = vfs_context_current();
	size_t                  dummy = 0;
	int                     error;
	int                     flags = uap->flags;
	char                    fstypename[MFSNAMELEN];
	char                    *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
	vnode_t                 pvp;
	vnode_t                 vp;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, flags);
	/* fstypename will get audited by mount_common */

	/* Sanity check the flags */
	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
		return ENOTSUP;
	}

	if (flags & MNT_UNION) {
		return EPERM;
	}

	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/* Resolve the fd to its vnode; file_drop() must balance this on all paths. */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(uap->fd);
		return error;
	}

	/*
	 * mount_common() needs the covered vnode's parent; without one we
	 * cannot proceed.  A missing parent on an existing mountpoint or
	 * the root is reported as EBUSY, anything else as EINVAL.
	 */
	pvp = vnode_getparent(vp);
	if (pvp == NULL) {
		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
			error = EBUSY;
		} else {
			error = EINVAL;
		}
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	/* Build a componentname carrying the vnode's path for mount_common(). */
	memset(&cn, 0, sizeof(struct componentname));
	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	cn.cn_pnlen = MAXPATHLEN;

	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
		zfree(ZV_NAMEI, cn.cn_pnbuf);
		vnode_put(pvp);
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);

	/* Release everything acquired above, in reverse order. */
	zfree(ZV_NAMEI, cn.cn_pnbuf);
	vnode_put(pvp);
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
479 
480 #define MAX_GRAFT_METADATA_SIZE             16384 /* bytes */
481 
482 /*
483  * Get the size of a graft file (a manifest or payload file).
484  * The vp should be an iocounted vnode.
485  */
486 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)487 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
488 {
489 	struct stat64 sb = {};
490 	int error;
491 
492 	*size = 0;
493 
494 	error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
495 	if (error) {
496 		return error;
497 	}
498 
499 	if (sb.st_size == 0) {
500 		error = ENODATA;
501 	} else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
502 		error = EFBIG;
503 	} else {
504 		*size = (size_t) sb.st_size;
505 	}
506 
507 	return error;
508 }
509 
510 /*
511  * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
512  * `size` must already be validated.
513  */
514 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)515 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
516 {
517 	return vn_rdwr(UIO_READ, graft_vp,
518 	           (caddr_t) buf, (int) size, /* offset */ 0,
519 	           UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
520 	           vfs_context_ucred(vctx), /* resid */ NULL,
521 	           vfs_context_proc(vctx));
522 }
523 
524 /*
525  * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
526  * and read it into `buf`.
527  * If `path_prefix` is non-NULL, verify that the file path has that prefix.
528  */
/*
 * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
 * and read it into `buf`.
 * If `path_prefix` is non-NULL, verify that the file path has that prefix.
 *
 * `buf` must be at least MAX_GRAFT_METADATA_SIZE bytes; the validated byte
 * count read is returned through `size`.  Returns 0 on success, errno on
 * failure; all intermediate resources are released on every path.
 */
static int
graft_secureboot_read_fd(int fd, vfs_context_t vctx, const char *path_prefix, size_t *size, void *buf)
{
	vnode_t metadata_vp = NULLVP;
	char *path = NULL;
	int error;

	// Convert this graft fd to a vnode.
	if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
		goto out;
	}

	// Verify that the vnode path starts with `path_prefix` if it was passed.
	if (path_prefix) {
		int len = MAXPATHLEN;
		path = zalloc(ZV_NAMEI);
		if ((error = vn_getpath(metadata_vp, path, &len))) {
			goto out;
		}
		// Prefix mismatch means the file is outside the required location.
		if (strncmp(path, path_prefix, strlen(path_prefix))) {
			error = EINVAL;
			goto out;
		}
	}

	// Get (and validate) size information.
	if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
		goto out;
	}

	// Read each file into the provided buffer - we must get the expected amount of bytes.
	if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
		goto out;
	}

out:
	// Unified cleanup: free the path buffer and drop the vnode iocount.
	if (path) {
		zfree(ZV_NAMEI, path);
	}
	if (metadata_vp) {
		vnode_put(metadata_vp);
		metadata_vp = NULLVP;
	}

	return error;
}
575 
576 #if XNU_TARGET_OS_OSX
577 #if defined(__arm64e__)
578 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/manifests/"
579 #else /* x86_64 */
580 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/"
581 #endif /* x86_64 */
582 #else /* !XNU_TARGET_OS_OSX */
583 #define MOBILE_ASSET_DATA_VAULT_PATH "/private/var/MobileAsset/AssetsV2/manifests/"
584 #endif /* !XNU_TARGET_OS_OSX */
585 
586 /*
587  * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
588  * provided in `gfs`, saving the size of data read in `gfs`.
589  */
590 static int
graft_secureboot_read_metadata(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)591 graft_secureboot_read_metadata(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
592     vfs_context_t vctx, fsioc_graft_fs_t *gfs)
593 {
594 	const char *manifest_path_prefix = NULL;
595 	int error;
596 
597 	// For Mobile Asset, make sure that the manifest comes from a data vault.
598 	if (graft_type == GRAFTDMG_CRYPTEX_MOBILE_ASSET) {
599 		manifest_path_prefix = MOBILE_ASSET_DATA_VAULT_PATH;
600 	}
601 
602 	// Read the authentic manifest.
603 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
604 	    manifest_path_prefix, &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
605 		return error;
606 	}
607 
608 	// The user manifest is currently unused, but set its size.
609 	gfs->user_manifest_size = 0;
610 
611 	// Read the payload.
612 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
613 	    NULL, &gfs->payload_size, gfs->payload))) {
614 		return error;
615 	}
616 
617 	return 0;
618 }
619 
620 /*
621  * Call into the filesystem to verify and graft a cryptex.
622  */
/*
 * Call into the filesystem to verify and graft a cryptex.
 *
 * Preflights the caller-supplied arguments, reads the manifest and payload
 * from the fds in `sbc_args`, translates SBC_* flags to FSCTL_GRAFT_*,
 * and issues FSIOC_GRAFT_FS on `cryptex_vp`.  `mounton_vp` may be NULLVP;
 * when present it selects the graft target directory.
 * Returns 0 on success, errno on failure.
 */
static int
graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
{
	fsioc_graft_fs_t gfs = {};
	uint64_t graft_dir_ino = 0;
	struct stat64 sb = {};
	int error;

	// Pre-flight arguments.
	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
		// Make sure that this graft version matches what we support.
		return ENOTSUP;
	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
		// For this type, cryptex VP must live on same volume as the target of graft.
		return EXDEV;
	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
		// We cannot graft upon non-directories.
		return ENOTDIR;
	} else if (cryptex_vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) {
		// We do not allow grafts inside disk images.
		return ENODEV;
	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
	    sbc_args->sbc_payload_fd < 0) {
		// We cannot graft without a manifest and payload.
		return EINVAL;
	}

	if (mounton_vp) {
		// Get the mounton's inode number.
		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
		if (error) {
			return error;
		}
		graft_dir_ino = (uint64_t) sb.st_ino;
	}

	// Create buffers (of our maximum-defined size) to store authentication info.
	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);

	if (!gfs.authentic_manifest || !gfs.payload) {
		error = ENOMEM;
		goto out;
	}

	// Read our fd's into our buffers.
	// (Note that this will set the buffer size fields in `gfs`.)
	error = graft_secureboot_read_metadata(graft_type, sbc_args, vctx, &gfs);
	if (error) {
		goto out;
	}

	// Translate caller-visible SBC_* flags into FSCTL_GRAFT_* ioctl flags.
	gfs.graft_version = FSIOC_GRAFT_VERSION;
	gfs.graft_type = graft_type;
	gfs.graft_4cc = sbc_args->sbc_4cc;
	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
	}
	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
	}
	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
	}
	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
	}
	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
	}
	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
	}
	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)

	// Call into the FS to perform the graft (and validation).
	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);

out:
	// Free both metadata buffers regardless of outcome.
	if (gfs.authentic_manifest) {
		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
		gfs.authentic_manifest = NULL;
	}
	if (gfs.payload) {
		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
		gfs.payload = NULL;
	}

	return error;
}
714 
715 #define GRAFTDMG_ENTITLEMENT  "com.apple.private.vfs.graftdmg"
716 
717 /*
718  * Graft a cryptex disk image (via FD) onto the appropriate mount-point
719  * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
720  */
/*
 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
 *
 * Requires the com.apple.private.vfs.graftdmg entitlement.  `mountdir` is
 * optional; when provided it is resolved to the graft target vnode.
 * Returns 0 on success, errno on failure.
 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	// Entitlement gate: only entitled tasks may graft.
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(&nd);
		if (error) {
			// namei failed, so no nameidone()/vnode_put() is owed here.
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		goto graftout;
	}

	// Validate the graft type before dispatching to the secureboot path.
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
	}

graftout:
	// Drop iocounts; nameidone() is only owed if namei() ran (mountdir given).
	if (cryptex_vp) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(mounton_vp);
		mounton_vp = NULLVP;
	}
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
788 
789 /*
790  * Ungraft a cryptex disk image (via mount dir FD)
791  * { int ungraftdmg(const char *mountdir, uint64_t flags); }
792  */
793 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)794 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
795 {
796 	int error = 0;
797 	user_addr_t ua_mountdir = uap->mountdir;
798 	fsioc_ungraft_fs_t ugfs;
799 	vnode_t mounton_vp = NULLVP;
800 	struct nameidata nd = {};
801 	vfs_context_t ctx = vfs_context_current();
802 
803 	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
804 		return EPERM;
805 	}
806 
807 	if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
808 		return EINVAL;
809 	}
810 
811 	ugfs.ungraft_flags = 0;
812 
813 	// Acquire vnode for mount-on path
814 	NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
815 	    UIO_USERSPACE, ua_mountdir, ctx);
816 
817 	error = namei(&nd);
818 	if (error) {
819 		return error;
820 	}
821 	mounton_vp = nd.ni_vp;
822 
823 	// Call into the FS to perform the ungraft
824 	error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
825 
826 	vnode_put(mounton_vp);
827 	nameidone(&nd);
828 
829 	return error;
830 }
831 
832 
/*
 * Notify interested parties that a mount has occurred: post a VQ_MOUNT
 * vfs event, then a NOTE_WRITE knote on `pdvp` (the parent directory of
 * the covered vnode).
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
839 
840 /*
841  * __mac_mount:
842  *	Mount a file system taking into account MAC label behavior.
843  *	See mount(2) man page for more information
844  *
845  * Parameters:    p                        Process requesting the mount
846  *                uap                      User argument descriptor (see below)
847  *                retval                   (ignored)
848  *
849  * Indirect:      uap->type                Filesystem type
850  *                uap->path                Path to mount
851  *                uap->data                Mount arguments
852  *                uap->mac_p               MAC info
853  *                uap->flags               Mount flags
854  *
855  *
856  * Returns:        0                       Success
857  *                !0                       Not success
858  */
859 boolean_t root_fs_upgrade_try = FALSE;
860 
861 #define MAX_NESTED_UNION_MOUNTS  10
862 
863 int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULLVP;           /* parent (directory) of the covered vnode */
	vnode_t vp = NULLVP;            /* vnode to be covered by the new mount */
	int need_nameidone = 0;         /* nonzero once namei() state must be torn down */
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;          /* optional MAC label string copied in from user space */
	size_t labelsz = 0;             /* allocation size of labelstr, for kfree_data() */
	int flags = uap->flags;
	int error;
	int num_retries = 0;            /* EBUSY retry counter, bounded by MAX_MOUNT_RETRIES */
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

retry:
	/*
	 * Get the vnode to be covered.  WANTPARENT gives us iocounts on both
	 * vp and pvp; both are dropped at 'out', so the retry path below must
	 * re-run the lookup from scratch.
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* NOTE(review): '#ifdef' here vs '#if CONFIG_IMGSRC_ACCESS' above — confirm
	 * the config macro is defined/undefined rather than 0/1, else the two
	 * guards can disagree. */
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		/* NOTE(review): the last argument is always true inside this branch;
		 * presumably kept for interface symmetry with the callee. */
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space.
	 * The user_mac structure is munged from the 32- or 64-bit user layout
	 * before the buffer length is validated.
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Reject absurd label sizes; minimum 2 = one char plus NUL. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		/* Z_WAITOK allocation; freed unconditionally at 'out'. */
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

	if (flags & MNT_UNION) {
#if CONFIG_UNION_MOUNTS
		mount_t mp = vp->v_mount;
		int nested_union_mounts = 0;

		/*
		 * Hold the name cache lock (shared) while walking the
		 * mnt_vnodecovered chain so the chain cannot change underneath us.
		 * 'error' is 0 on entry here (namei succeeded above).
		 */
		name_cache_lock_shared();

		/* Walk up the vnodecovered chain and check for nested union mounts. */
		mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);
		while (mp) {
			if (!(mp->mnt_flag & MNT_UNION)) {
				break;
			}
			mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);

			/*
			 * Limit the max nested union mounts to prevent stack exhaustion
			 * when calling lookup_traverse_union().
			 */
			if (++nested_union_mounts >= MAX_NESTED_UNION_MOUNTS) {
				error = ELOOP;
				break;
			}
		}

		name_cache_unlock();
		if (error) {
			goto out;
		}
#else
		/* Union mounts compiled out: refuse the request. */
		error = EPERM;
		goto out;
#endif /* CONFIG_UNION_MOUNTS */
	}

	/*
	 * Mounting over the root vnode of the root file system is treated as
	 * an update of the root FS, except for union mounts (see below).
	 */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mounting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	/* Hand off to the common mount path; 0 = no internal (kernel) flags. */
	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/* kfree_data() tolerates a NULL labelstr / zero labelsz. */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	/* Drop the iocounts taken by namei() (via WANTPARENT) before any retry. */
	if (vp) {
		vnode_put(vp);
		vp = NULLVP;
	}
	if (pvp) {
		vnode_put(pvp);
		pvp = NULLVP;
	}
	if (need_nameidone) {
		nameidone(&nd);
		need_nameidone = 0;
	}

	if (error == EBUSY) {
		/* Retry the lookup and mount again due to concurrent mounts. */
		if (++num_retries < MAX_MOUNT_RETRIES) {
			goto retry;
		}
	}

	return error;
}
1063 
/*
 * common mount implementation (final stage of mounting)
 *
 * Arguments:
 *  fstypename	file system type (ie it's vfs name)
 *  pvp		parent of covered vnode
 *  vp		covered vnode
 *  cnp		component name (ie path) of covered vnode
 *  fsmountargs	file system specific data
 *  flags	generic mount flags
 *  internal_flags	kernel-internal mount flags; KERNEL_MOUNT_KMOUNT
 *		marks mounts initiated from inside the kernel
 *  labelstr	optional MAC label
 *  ctx		caller's context
 */
1078 static int
mount_common(const char * fstypename,vnode_t pvp,vnode_t vp,struct componentname * cnp,user_addr_t fsmountargs,int flags,uint32_t internal_flags,char * labelstr,vfs_context_t ctx)1079 mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
1080     struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
1081     char *labelstr, vfs_context_t ctx)
1082 {
1083 #if !CONFIG_MACF
1084 #pragma unused(labelstr)
1085 #endif
1086 	struct vnode *devvp = NULLVP;
1087 	struct vnode *device_vnode = NULLVP;
1088 #if CONFIG_MACF
1089 	struct vnode *rvp;
1090 #endif
1091 	struct mount *mp = NULL;
1092 	struct vfstable *vfsp = (struct vfstable *)0;
1093 	struct proc *p = vfs_context_proc(ctx);
1094 	int error, flag = 0;
1095 	bool flag_set = false;
1096 	user_addr_t devpath = USER_ADDR_NULL;
1097 	int ronly = 0;
1098 	int mntalloc = 0;
1099 	boolean_t vfsp_ref = FALSE;
1100 	boolean_t is_rwlock_locked = FALSE;
1101 	boolean_t did_rele = FALSE;
1102 	boolean_t have_usecount = FALSE;
1103 	boolean_t did_set_lmount = FALSE;
1104 	boolean_t did_set_vmount = FALSE;
1105 	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
1106 
1107 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
1108 	/* Check for mutually-exclusive flag bits */
1109 	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
1110 	int bitcount = 0;
1111 	while (checkflags != 0) {
1112 		checkflags &= (checkflags - 1);
1113 		bitcount++;
1114 	}
1115 
1116 	if (bitcount > 1) {
1117 		//not allowed to request multiple mount-by-role flags
1118 		error = EINVAL;
1119 		goto out1;
1120 	}
1121 #endif
1122 
1123 	/*
1124 	 * Process an update for an existing mount
1125 	 */
1126 	if (flags & MNT_UPDATE) {
1127 		if ((vp->v_flag & VROOT) == 0) {
1128 			error = EINVAL;
1129 			goto out1;
1130 		}
1131 		mp = vp->v_mount;
1132 
1133 		/* if unmount or mount in progress, return error */
1134 		mount_lock_spin(mp);
1135 		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
1136 			mount_unlock(mp);
1137 			error = EBUSY;
1138 			goto out1;
1139 		}
1140 		mp->mnt_lflag |= MNT_LMOUNT;
1141 		did_set_lmount = TRUE;
1142 		mount_unlock(mp);
1143 		lck_rw_lock_exclusive(&mp->mnt_rwlock);
1144 		is_rwlock_locked = TRUE;
1145 		/*
1146 		 * We only allow the filesystem to be reloaded if it
1147 		 * is currently mounted read-only.
1148 		 */
1149 		if ((flags & MNT_RELOAD) &&
1150 		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1151 			error = ENOTSUP;
1152 			goto out1;
1153 		}
1154 
1155 		/*
1156 		 * If content protection is enabled, update mounts are not
1157 		 * allowed to turn it off.
1158 		 */
1159 		if ((mp->mnt_flag & MNT_CPROTECT) &&
1160 		    ((flags & MNT_CPROTECT) == 0)) {
1161 			error = EINVAL;
1162 			goto out1;
1163 		}
1164 
1165 		/*
1166 		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
1167 		 * failure to return an error for this so we'll just silently
1168 		 * add it if it is not passed in.
1169 		 */
1170 		if ((mp->mnt_flag & MNT_REMOVABLE) &&
1171 		    ((flags & MNT_REMOVABLE) == 0)) {
1172 			flags |= MNT_REMOVABLE;
1173 		}
1174 
1175 		/* Can't downgrade the backer of the root FS */
1176 		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
1177 		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
1178 			error = ENOTSUP;
1179 			goto out1;
1180 		}
1181 
1182 		/*
1183 		 * Only root, or the user that did the original mount is
1184 		 * permitted to update it.
1185 		 */
1186 		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1187 		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
1188 			goto out1;
1189 		}
1190 #if CONFIG_MACF
1191 		error = mac_mount_check_remount(ctx, mp, flags);
1192 		if (error != 0) {
1193 			goto out1;
1194 		}
1195 #endif
1196 		/*
1197 		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
1198 		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
1199 		 */
1200 		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1201 			flags |= MNT_NOSUID | MNT_NODEV;
1202 			if (mp->mnt_flag & MNT_NOEXEC) {
1203 				flags |= MNT_NOEXEC;
1204 			}
1205 		}
1206 		flag = mp->mnt_flag;
1207 		flag_set = true;
1208 
1209 
1210 
1211 		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
1212 
1213 		vfsp = mp->mnt_vtable;
1214 		goto update;
1215 	} // MNT_UPDATE
1216 
1217 	/*
1218 	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
1219 	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
1220 	 */
1221 	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1222 		flags |= MNT_NOSUID | MNT_NODEV;
1223 		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
1224 			flags |= MNT_NOEXEC;
1225 		}
1226 	}
1227 
1228 	/* XXXAUDIT: Should we capture the type on the error path as well? */
1229 	/* XXX cast-away const (audit_arg_text() does not modify its input) */
1230 	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
1231 	mount_list_lock();
1232 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1233 		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
1234 			vfsp->vfc_refcount++;
1235 			vfsp_ref = TRUE;
1236 			break;
1237 		}
1238 	}
1239 	mount_list_unlock();
1240 	if (vfsp == NULL) {
1241 		error = ENODEV;
1242 		goto out1;
1243 	}
1244 
1245 	/*
1246 	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
1247 	 * except in ROSV configs and for the initial BaseSystem root.
1248 	 */
1249 	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
1250 	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
1251 	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
1252 		error = EINVAL;  /* unsupported request */
1253 		goto out1;
1254 	}
1255 
1256 	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
1257 	if (error != 0) {
1258 		goto out1;
1259 	}
1260 
1261 	/*
1262 	 * Upon successful of prepare_coveredvp(), VMOUNT is set for the covered vp.
1263 	 */
1264 	did_set_vmount = TRUE;
1265 
1266 	/*
1267 	 * Allocate and initialize the filesystem (mount_t)
1268 	 */
1269 	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
1270 	mntalloc = 1;
1271 
1272 	/* Initialize the default IO constraints */
1273 	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
1274 	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
1275 	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
1276 	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
1277 	mp->mnt_devblocksize = DEV_BSIZE;
1278 	mp->mnt_alignmentmask = PAGE_MASK;
1279 	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
1280 	mp->mnt_ioscale = 1;
1281 	mp->mnt_ioflags = 0;
1282 	mp->mnt_realrootvp = NULLVP;
1283 	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
1284 
1285 	mp->mnt_lflag |= MNT_LMOUNT;
1286 	did_set_lmount = TRUE;
1287 
1288 	TAILQ_INIT(&mp->mnt_vnodelist);
1289 	TAILQ_INIT(&mp->mnt_workerqueue);
1290 	TAILQ_INIT(&mp->mnt_newvnodes);
1291 	mount_lock_init(mp);
1292 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
1293 	is_rwlock_locked = TRUE;
1294 	mp->mnt_op = vfsp->vfc_vfsops;
1295 	mp->mnt_vtable = vfsp;
1296 	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
1297 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
1298 	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
1299 	do {
1300 		size_t pathlen = MAXPATHLEN;
1301 
1302 		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
1303 			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1304 		}
1305 	} while (0);
1306 	mp->mnt_vnodecovered = vp;
1307 	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
1308 	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
1309 	mp->mnt_devbsdunit = 0;
1310 	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
1311 
1312 	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
1313 	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
1314 
1315 	if (kernelmount) {
1316 		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
1317 	}
1318 	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
1319 		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
1320 	}
1321 
1322 	if (KERNEL_MOUNT_DEVFS & internal_flags) {
1323 		// kernel mounted devfs
1324 		mp->mnt_kern_flag |= MNTK_SYSTEM;
1325 	}
1326 
1327 update:
1328 
1329 	/*
1330 	 * Set the mount level flags.
1331 	 */
1332 	if (flags & MNT_RDONLY) {
1333 		mp->mnt_flag |= MNT_RDONLY;
1334 	} else if (mp->mnt_flag & MNT_RDONLY) {
1335 		// disallow read/write upgrades of file systems that
1336 		// had the TYPENAME_OVERRIDE feature set.
1337 		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
1338 			error = EPERM;
1339 			goto out1;
1340 		}
1341 		mp->mnt_kern_flag |= MNTK_WANTRDWR;
1342 	}
1343 	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1344 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1345 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1346 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1347 	    MNT_QUARANTINE | MNT_CPROTECT);
1348 
1349 #if SECURE_KERNEL
1350 #if !CONFIG_MNT_SUID
1351 	/*
1352 	 * On release builds of iOS based platforms, always enforce NOSUID on
1353 	 * all mounts. We do this here because we can catch update mounts as well as
1354 	 * non-update mounts in this case.
1355 	 */
1356 	mp->mnt_flag |= (MNT_NOSUID);
1357 #endif
1358 #endif
1359 
1360 	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1361 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1362 	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1363 	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1364 	    MNT_QUARANTINE | MNT_CPROTECT);
1365 
1366 #if CONFIG_MACF
1367 	if (flags & MNT_MULTILABEL) {
1368 		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
1369 			error = EINVAL;
1370 			goto out1;
1371 		}
1372 		mp->mnt_flag |= MNT_MULTILABEL;
1373 	}
1374 #endif
1375 	/*
1376 	 * Process device path for local file systems if requested.
1377 	 *
1378 	 * Snapshot and mount-by-role mounts do not use this path; they are
1379 	 * passing other opaque data in the device path field.
1380 	 *
1381 	 * Basesystemroot mounts pass a device path to be resolved here,
1382 	 * but it's just a char * already inside the kernel, which
1383 	 * kernel_mount() shoved into a user_addr_t to call us. So for such
1384 	 * mounts we must skip copyin (both of the address and of the string
1385 	 * (in NDINIT).
1386 	 */
1387 	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
1388 	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
1389 		boolean_t do_copyin_devpath = true;
1390 #if CONFIG_BASESYSTEMROOT
1391 		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1392 			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worh nothing:
1393 			// We have been passed fsmountargs, which is typed as a user_addr_t,
1394 			// but is actually a char ** pointing to a (kernelspace) string.
1395 			// We manually unpack it with a series of casts and dereferences
1396 			// that reverses what was done just above us on the stack in
1397 			// imageboot_pivot_image().
1398 			// After retrieving the path to the dev node (which we will NDINIT
1399 			// in a moment), we pass NULL fsmountargs on to the filesystem.
1400 			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
1401 			char **devnamepp = (char **)fsmountargs;
1402 			char *devnamep = *devnamepp;
1403 			devpath = CAST_USER_ADDR_T(devnamep);
1404 			do_copyin_devpath = false;
1405 			fsmountargs = USER_ADDR_NULL;
1406 
1407 			//Now that we have a mp, denote that this mount is for the basesystem.
1408 			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
1409 		}
1410 #endif // CONFIG_BASESYSTEMROOT
1411 
1412 		if (do_copyin_devpath) {
1413 			if (vfs_context_is64bit(ctx)) {
1414 				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1415 					goto out1;
1416 				}
1417 				fsmountargs += sizeof(devpath);
1418 			} else {
1419 				user32_addr_t tmp;
1420 				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1421 					goto out1;
1422 				}
1423 				/* munge into LP64 addr */
1424 				devpath = CAST_USER_ADDR_T(tmp);
1425 				fsmountargs += sizeof(tmp);
1426 			}
1427 		}
1428 
1429 		/* Lookup device and authorize access to it */
1430 		if ((devpath)) {
1431 			struct nameidata nd;
1432 
1433 			enum uio_seg seg = UIO_USERSPACE;
1434 #if CONFIG_BASESYSTEMROOT
1435 			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1436 				seg = UIO_SYSSPACE;
1437 			}
1438 #endif // CONFIG_BASESYSTEMROOT
1439 
1440 			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
1441 			if (flags & MNT_NOFOLLOW) {
1442 				nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
1443 			}
1444 			if ((error = namei(&nd))) {
1445 				goto out1;
1446 			}
1447 
1448 			devvp = nd.ni_vp;
1449 
1450 			if (devvp->v_type != VBLK) {
1451 				error = ENOTBLK;
1452 				nameidone(&nd);
1453 				goto out2;
1454 			}
1455 			if (major(devvp->v_rdev) >= nblkdev) {
1456 				error = ENXIO;
1457 				nameidone(&nd);
1458 				goto out2;
1459 			}
1460 			/*
1461 			 * If mount by non-root, then verify that user has necessary
1462 			 * permissions on the device.
1463 			 */
1464 			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1465 				kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;
1466 
1467 				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1468 					accessmode |= KAUTH_VNODE_WRITE_DATA;
1469 				}
1470 				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1471 					nameidone(&nd);
1472 					goto out2;
1473 				}
1474 			}
1475 
1476 			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1477 			nameidone(&nd);
1478 		}
1479 		/* On first mount, preflight and open device */
1480 		if (devpath && ((flags & MNT_UPDATE) == 0)) {
1481 			if ((error = vnode_ref(devvp))) {
1482 				goto out2;
1483 			}
1484 			/*
1485 			 * Disallow multiple mounts of the same device.
1486 			 * Disallow mounting of a device that is currently in use
1487 			 * (except for root, which might share swap device for miniroot).
1488 			 * Flush out any old buffers remaining from a previous use.
1489 			 */
1490 			if ((error = vfs_setmounting(devvp))) {
1491 				vnode_rele(devvp);
1492 				goto out2;
1493 			}
1494 
1495 			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1496 				error = EBUSY;
1497 				goto out3;
1498 			}
1499 			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1500 				error = ENOTBLK;
1501 				goto out3;
1502 			}
1503 			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1504 				goto out3;
1505 			}
1506 
1507 			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1508 #if CONFIG_MACF
1509 			error = mac_vnode_check_open(ctx,
1510 			    devvp,
1511 			    ronly ? FREAD : FREAD | FWRITE);
1512 			if (error) {
1513 				goto out3;
1514 			}
1515 #endif /* MAC */
1516 			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1517 				goto out3;
1518 			}
1519 
1520 			mp->mnt_devvp = devvp;
1521 			device_vnode = devvp;
1522 		} else if ((mp->mnt_flag & MNT_RDONLY) &&
1523 		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1524 		    (device_vnode = mp->mnt_devvp)) {
1525 			dev_t dev;
1526 			int maj;
1527 			/*
1528 			 * If upgrade to read-write by non-root, then verify
1529 			 * that user has necessary permissions on the device.
1530 			 */
1531 			vnode_getalways(device_vnode);
1532 
1533 			if (suser(vfs_context_ucred(ctx), NULL) &&
1534 			    (error = vnode_authorize(device_vnode, NULL,
1535 			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1536 			    ctx)) != 0) {
1537 				vnode_put(device_vnode);
1538 				goto out2;
1539 			}
1540 
1541 			/* Tell the device that we're upgrading */
1542 			dev = (dev_t)device_vnode->v_rdev;
1543 			maj = major(dev);
1544 
1545 			if ((u_int)maj >= (u_int)nblkdev) {
1546 				panic("Volume mounted on a device with invalid major number.");
1547 			}
1548 
1549 			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1550 			vnode_put(device_vnode);
1551 			device_vnode = NULLVP;
1552 			if (error != 0) {
1553 				goto out2;
1554 			}
1555 		}
1556 	} // localargs && !(snapshot | data | vm)
1557 
1558 #if CONFIG_MACF
1559 	if ((flags & MNT_UPDATE) == 0) {
1560 		mac_mount_label_init(mp);
1561 		mac_mount_label_associate(ctx, mp);
1562 	}
1563 	if (labelstr) {
1564 		if ((flags & MNT_UPDATE) != 0) {
1565 			error = mac_mount_check_label_update(ctx, mp);
1566 			if (error != 0) {
1567 				goto out3;
1568 			}
1569 		}
1570 	}
1571 #endif
1572 	/*
1573 	 * Mount the filesystem.  We already asserted that internal_flags
1574 	 * cannot have more than one mount-by-role bit set.
1575 	 */
1576 	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1577 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1578 		    (caddr_t)fsmountargs, 0, ctx);
1579 	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1580 #if CONFIG_ROSV_STARTUP
1581 		struct mount *origin_mp = (struct mount*)fsmountargs;
1582 		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1583 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1584 		if (error) {
1585 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1586 		} else {
1587 			/* Mark volume associated with system volume */
1588 			mp->mnt_kern_flag |= MNTK_SYSTEM;
1589 
1590 			/* Attempt to acquire the mnt_devvp and set it up */
1591 			struct vnode *mp_devvp = NULL;
1592 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1593 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1594 				    0, &mp_devvp, vfs_context_kernel());
1595 				if (!lerr) {
1596 					mp->mnt_devvp = mp_devvp;
1597 					//vnode_lookup took an iocount, need to drop it.
1598 					vnode_put(mp_devvp);
1599 					// now set `device_vnode` to the devvp that was acquired.
1600 					// this is needed in order to ensure vfs_init_io_attributes is invoked.
1601 					// note that though the iocount above was dropped, the mount acquires
1602 					// an implicit reference against the device.
1603 					device_vnode = mp_devvp;
1604 				}
1605 			}
1606 		}
1607 #else
1608 		error = EINVAL;
1609 #endif
1610 	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1611 #if CONFIG_MOUNT_VM
1612 		struct mount *origin_mp = (struct mount*)fsmountargs;
1613 		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1614 		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1615 		if (error) {
1616 			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1617 		} else {
1618 			/* Mark volume associated with system volume and a swap mount */
1619 			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1620 			/* Attempt to acquire the mnt_devvp and set it up */
1621 			struct vnode *mp_devvp = NULL;
1622 			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1623 				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1624 				    0, &mp_devvp, vfs_context_kernel());
1625 				if (!lerr) {
1626 					mp->mnt_devvp = mp_devvp;
1627 					//vnode_lookup took an iocount, need to drop it.
1628 					vnode_put(mp_devvp);
1629 
1630 					// now set `device_vnode` to the devvp that was acquired.
1631 					// note that though the iocount above was dropped, the mount acquires
1632 					// an implicit reference against the device.
1633 					device_vnode = mp_devvp;
1634 				}
1635 			}
1636 		}
1637 #else
1638 		error = EINVAL;
1639 #endif
1640 	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1641 #if CONFIG_MOUNT_PREBOOTRECOVERY
1642 		struct mount *origin_mp = (struct mount*)fsmountargs;
1643 		uint32_t mount_role = 0;
1644 		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1645 			mount_role = VFS_PREBOOT_ROLE;
1646 		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1647 			mount_role = VFS_RECOVERY_ROLE;
1648 		}
1649 
1650 		if (mount_role != 0) {
1651 			fs_role_mount_args_t frma = {origin_mp, mount_role};
1652 			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1653 			if (error) {
1654 				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1655 			} else {
1656 				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1657 				/* Mark volume associated with system volume */
1658 				//mp->mnt_kern_flag |= MNTK_SYSTEM;
1659 				/* Attempt to acquire the mnt_devvp and set it up */
1660 				struct vnode *mp_devvp = NULL;
1661 				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1662 					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1663 					    0, &mp_devvp, vfs_context_kernel());
1664 					if (!lerr) {
1665 						mp->mnt_devvp = mp_devvp;
1666 						//vnode_lookup took an iocount, need to drop it.
1667 						vnode_put(mp_devvp);
1668 
1669 						// now set `device_vnode` to the devvp that was acquired.
1670 						// note that though the iocount above was dropped, the mount acquires
1671 						// an implicit reference against the device.
1672 						device_vnode = mp_devvp;
1673 					}
1674 				}
1675 			}
1676 		} else {
1677 			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1678 			error = EINVAL;
1679 		}
1680 #else
1681 		error = EINVAL;
1682 #endif
1683 	} else {
1684 		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1685 	}
1686 
1687 	if (flags & MNT_UPDATE) {
1688 		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1689 			mp->mnt_flag &= ~MNT_RDONLY;
1690 		}
1691 		mp->mnt_flag &= ~
1692 		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1693 		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1694 		if (error) {
1695 			mp->mnt_flag = flag;  /* restore flag value */
1696 		}
1697 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1698 		lck_rw_done(&mp->mnt_rwlock);
1699 		is_rwlock_locked = FALSE;
1700 		if (!error) {
1701 			enablequotas(mp, ctx);
1702 		}
1703 		goto exit;
1704 	}
1705 
1706 	/*
1707 	 * Put the new filesystem on the mount list after root.
1708 	 */
1709 	if (error == 0) {
1710 		struct vfs_attr vfsattr;
1711 		if (device_vnode) {
1712 			/*
1713 			 *   cache the IO attributes for the underlying physical media...
1714 			 *   an error return indicates the underlying driver doesn't
1715 			 *   support all the queries necessary... however, reasonable
1716 			 *   defaults will have been set, so no reason to bail or care
1717 			 *
1718 			 *   Need to do this before calling the MAC hook as it needs
1719 			 *   information from this call.
1720 			 */
1721 			vfs_init_io_attributes(device_vnode, mp);
1722 		}
1723 
1724 #if CONFIG_MACF
1725 		error = mac_mount_check_mount_late(ctx, mp);
1726 		if (error != 0) {
1727 			goto out4;
1728 		}
1729 
1730 		if (vfs_flags(mp) & MNT_MULTILABEL) {
1731 			error = VFS_ROOT(mp, &rvp, ctx);
1732 			if (error) {
1733 				printf("%s() VFS_ROOT returned %d\n", __func__, error);
1734 				goto out4;
1735 			}
1736 			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1737 			/*
1738 			 * drop reference provided by VFS_ROOT
1739 			 */
1740 			vnode_put(rvp);
1741 
1742 			if (error) {
1743 				goto out4;
1744 			}
1745 		}
1746 #endif  /* MAC */
1747 
1748 		vnode_lock_spin(vp);
1749 		CLR(vp->v_flag, VMOUNT);
1750 		vp->v_mountedhere = mp;
1751 		SET(vp->v_flag, VMOUNTEDHERE);
1752 
1753 		/*
1754 		 * Wakeup any waiter(s) in prepare_coveredvp() that is waiting for the
1755 		 * 'v_mountedhere' to be planted.
1756 		 */
1757 		wakeup(&vp->v_flag);
1758 		vnode_unlock(vp);
1759 
1760 		/*
1761 		 * taking the name_cache_lock exclusively will
1762 		 * insure that everyone is out of the fast path who
1763 		 * might be trying to use a now stale copy of
1764 		 * vp->v_mountedhere->mnt_realrootvp
1765 		 * bumping mount_generation causes the cached values
1766 		 * to be invalidated
1767 		 */
1768 		name_cache_lock();
1769 		mount_generation++;
1770 		name_cache_unlock();
1771 
1772 		error = vnode_ref(vp);
1773 		if (error != 0) {
1774 			goto out4;
1775 		}
1776 
1777 		have_usecount = TRUE;
1778 
1779 		error = checkdirs(vp, ctx);
1780 		if (error != 0) {
1781 			/* Unmount the filesystem as cdir/rdirs cannot be updated */
1782 			goto out4;
1783 		}
1784 		/*
1785 		 * there is no cleanup code here so I have made it void
1786 		 * we need to revisit this
1787 		 */
1788 		(void)VFS_START(mp, 0, ctx);
1789 
1790 		if (mount_list_add(mp) != 0) {
1791 			/*
1792 			 * The system is shutting down trying to umount
1793 			 * everything, so fail with a plausible errno.
1794 			 */
1795 			error = EBUSY;
1796 			goto out4;
1797 		}
1798 		lck_rw_done(&mp->mnt_rwlock);
1799 		is_rwlock_locked = FALSE;
1800 
1801 		/* Check if this mounted file system supports EAs or named streams. */
1802 		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1803 		VFSATTR_INIT(&vfsattr);
1804 		VFSATTR_WANTED(&vfsattr, f_capabilities);
1805 		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1806 		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1807 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1808 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1809 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1810 				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1811 			}
1812 #if NAMEDSTREAMS
1813 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1814 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1815 				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1816 			}
1817 #endif
1818 			/* Check if this file system supports path from id lookups. */
1819 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1820 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1821 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1822 			} else if (mp->mnt_flag & MNT_DOVOLFS) {
1823 				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1824 				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1825 			}
1826 
1827 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1828 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1829 				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1830 			}
1831 		}
1832 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1833 			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1834 		}
1835 		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1836 			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1837 		}
1838 		/* Get subtype if supported to cache it */
1839 		VFSATTR_INIT(&vfsattr);
1840 		VFSATTR_WANTED(&vfsattr, f_fssubtype);
1841 		if (vfs_getattr(mp, &vfsattr, ctx) == 0 && VFSATTR_IS_SUPPORTED(&vfsattr, f_fssubtype)) {
1842 			mp->mnt_vfsstat.f_fssubtype = vfsattr.f_fssubtype;
1843 		}
1844 
1845 		/* increment the operations count */
1846 		OSAddAtomic(1, &vfs_nummntops);
1847 		enablequotas(mp, ctx);
1848 
1849 		if (device_vnode) {
1850 			vfs_setmountedon(device_vnode);
1851 		}
1852 
1853 		/* Now that mount is setup, notify the listeners */
1854 		vfs_notify_mount(pvp);
1855 		IOBSDMountChange(mp, kIOMountChangeMount);
1856 #if CONFIG_MACF
1857 		mac_mount_notify_mount(ctx, mp);
1858 #endif /* CONFIG_MACF */
1859 	} else {
1860 		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1861 		if (mp->mnt_vnodelist.tqh_first != NULL) {
1862 			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1863 			    mp->mnt_vtable->vfc_name, error);
1864 		}
1865 
1866 		vnode_lock_spin(vp);
1867 		CLR(vp->v_flag, VMOUNT);
1868 		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
1869 		wakeup(&vp->v_flag);
1870 		vnode_unlock(vp);
1871 		mount_list_lock();
1872 		mp->mnt_vtable->vfc_refcount--;
1873 		mount_list_unlock();
1874 
1875 		if (device_vnode) {
1876 			vnode_rele(device_vnode);
1877 			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1878 			vfs_clearmounting(device_vnode);
1879 		}
1880 		lck_rw_done(&mp->mnt_rwlock);
1881 		is_rwlock_locked = FALSE;
1882 
1883 		if (nc_smr_enabled) {
1884 			vfs_smr_synchronize();
1885 		}
1886 
1887 		/*
1888 		 * if we get here, we have a mount structure that needs to be freed,
1889 		 * but since the coveredvp hasn't yet been updated to point at it,
1890 		 * no need to worry about other threads holding a crossref on this mp
1891 		 * so it's ok to just free it
1892 		 */
1893 		mount_lock_destroy(mp);
1894 #if CONFIG_MACF
1895 		mac_mount_label_destroy(mp);
1896 #endif
1897 		zfree(mount_zone, mp);
1898 		did_set_lmount = false;
1899 	}
1900 exit:
1901 	/*
1902 	 * drop I/O count on the device vp if there was one
1903 	 */
1904 	if (devpath && devvp) {
1905 		vnode_put(devvp);
1906 	}
1907 
1908 	if (did_set_lmount) {
1909 		mount_lock_spin(mp);
1910 		mp->mnt_lflag &= ~MNT_LMOUNT;
1911 		mount_unlock(mp);
1912 	}
1913 
1914 	return error;
1915 
1916 /* Error condition exits */
1917 out4:
1918 	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1919 
1920 	/*
1921 	 * If the mount has been placed on the covered vp,
1922 	 * it may have been discovered by now, so we have
1923 	 * to treat this just like an unmount
1924 	 */
1925 	mount_lock_spin(mp);
1926 	mp->mnt_lflag |= MNT_LDEAD;
1927 	mount_unlock(mp);
1928 
1929 	if (device_vnode != NULLVP) {
1930 		vnode_rele(device_vnode);
1931 		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1932 		    ctx);
1933 		vfs_clearmounting(device_vnode);
1934 		did_rele = TRUE;
1935 	}
1936 
1937 	vnode_lock_spin(vp);
1938 
1939 	mp->mnt_crossref++;
1940 	CLR(vp->v_flag, VMOUNTEDHERE);
1941 	vp->v_mountedhere = (mount_t) 0;
1942 
1943 	vnode_unlock(vp);
1944 
1945 	if (have_usecount) {
1946 		vnode_rele(vp);
1947 	}
1948 out3:
1949 	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1950 		vnode_rele(devvp);
1951 		vfs_clearmounting(devvp);
1952 	}
1953 out2:
1954 	if (devpath && devvp) {
1955 		vnode_put(devvp);
1956 	}
1957 out1:
1958 	/* Release mnt_rwlock only when it was taken */
1959 	if (is_rwlock_locked == TRUE) {
1960 		if (flag_set) {
1961 			mp->mnt_flag = flag;  /* restore mnt_flag value */
1962 		}
1963 		lck_rw_done(&mp->mnt_rwlock);
1964 	}
1965 
1966 	if (did_set_lmount) {
1967 		mount_lock_spin(mp);
1968 		mp->mnt_lflag &= ~MNT_LMOUNT;
1969 		mount_unlock(mp);
1970 	}
1971 
1972 	if (did_set_vmount) {
1973 		vnode_lock_spin(vp);
1974 		CLR(vp->v_flag, VMOUNT);
1975 		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
1976 		wakeup(&vp->v_flag);
1977 		vnode_unlock(vp);
1978 	}
1979 
1980 	if (mntalloc) {
1981 		if (mp->mnt_crossref) {
1982 			mount_dropcrossref(mp, vp, 0);
1983 		} else {
1984 			if (nc_smr_enabled) {
1985 				vfs_smr_synchronize();
1986 			}
1987 
1988 			mount_lock_destroy(mp);
1989 #if CONFIG_MACF
1990 			mac_mount_label_destroy(mp);
1991 #endif
1992 			zfree(mount_zone, mp);
1993 		}
1994 	}
1995 	if (vfsp_ref) {
1996 		mount_list_lock();
1997 		vfsp->vfc_refcount--;
1998 		mount_list_unlock();
1999 	}
2000 
2001 	return error;
2002 }
2003 
2004 /*
2005  * Flush in-core data, check for competing mount attempts,
2006  * and set VMOUNT
2007  */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	/* Decode caller intent from the KERNEL_MOUNT_* internal flag bits. */
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_kmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			/* Any getattr failure is reported as a permission problem. */
			error = EPERM;
			goto out;
		}
	}

	/* Flush dirty data on the soon-to-be-covered vnode before hiding it. */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	/* Write out and invalidate any cached buffers for this vnode. */
	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Mounting is only permitted on top of a directory. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	vnode_lock_spin(vp);

	if (is_fmount && (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL))) {
		/* fmount(2)-style callers fail immediately on any competing mount. */
		error = EBUSY;
	} else if (!is_kmount && (ISSET(vp->v_flag, VMOUNT) ||
	    (vp->v_mountedhere != NULL))) {
		/*
		 * For mount triggered from mount() call, we want to wait for the
		 * current in-progress mount to complete, redo lookup and retry the
		 * mount again. Similarly, we also want to retry if we lost the race
		 * due to concurrent mounts and the 'VMOUNT' flag has been cleared and
		 * 'v_mountedhere' has been planted after initial lookup.
		 */
		if (ISSET(vp->v_flag, VMOUNT)) {
			/* Upgrade to a full lock so we can sleep on the flag word. */
			vnode_lock_convert(vp);
			msleep(&vp->v_flag, &vp->v_lock, PVFS, "vnode_waitformount", NULL);
		}
		error = EBUSY;
	} else if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
		/* Kernel-internal mounts only fail when both indicators are set. */
		error = EBUSY;
	}

	if (error) {
		vnode_unlock(vp);
		goto out;
	}
	/* Claim the vnode: VMOUNT marks a mount-in-progress on this directory. */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	/* MAC policy veto point; on denial, undo the VMOUNT claim. */
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
2092 
2093 #if CONFIG_IMGSRC_ACCESS
2094 
2095 #define DEBUG_IMGSRC 0
2096 
2097 #if DEBUG_IMGSRC
2098 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
2099 #else
2100 #define IMGSRC_DEBUG(args...) do { } while(0)
2101 #endif
2102 
2103 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)2104 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
2105 {
2106 	struct nameidata nd;
2107 	vnode_t vp, realdevvp;
2108 	kauth_action_t accessmode;
2109 	int error;
2110 	enum uio_seg uio = UIO_USERSPACE;
2111 
2112 	if (ctx == vfs_context_kernel()) {
2113 		uio = UIO_SYSSPACE;
2114 	}
2115 
2116 	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
2117 	if ((error = namei(&nd))) {
2118 		IMGSRC_DEBUG("namei() failed with %d\n", error);
2119 		return error;
2120 	}
2121 
2122 	vp = nd.ni_vp;
2123 
2124 	if (!vnode_isblk(vp)) {
2125 		IMGSRC_DEBUG("Not block device.\n");
2126 		error = ENOTBLK;
2127 		goto out;
2128 	}
2129 
2130 	realdevvp = mp->mnt_devvp;
2131 	if (realdevvp == NULLVP) {
2132 		IMGSRC_DEBUG("No device backs the mount.\n");
2133 		error = ENXIO;
2134 		goto out;
2135 	}
2136 
2137 	error = vnode_getwithref(realdevvp);
2138 	if (error != 0) {
2139 		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
2140 		goto out;
2141 	}
2142 
2143 	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
2144 		IMGSRC_DEBUG("Wrong dev_t.\n");
2145 		error = ENXIO;
2146 		goto out1;
2147 	}
2148 
2149 	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
2150 
2151 	/*
2152 	 * If mount by non-root, then verify that user has necessary
2153 	 * permissions on the device.
2154 	 */
2155 	if (!vfs_context_issuser(ctx)) {
2156 		accessmode = KAUTH_VNODE_READ_DATA;
2157 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2158 			accessmode |= KAUTH_VNODE_WRITE_DATA;
2159 		}
2160 		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2161 			IMGSRC_DEBUG("Access denied.\n");
2162 			goto out1;
2163 		}
2164 	}
2165 
2166 	*devvpp = vp;
2167 
2168 out1:
2169 	vnode_put(realdevvp);
2170 
2171 out:
2172 	nameidone(&nd);
2173 
2174 	if (error) {
2175 		vnode_put(vp);
2176 	}
2177 
2178 	return error;
2179 }
2180 
2181 /*
2182  * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2183  * and call checkdirs()
2184  */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Swap VMOUNT (mount-in-progress) for VMOUNTEDHERE (mount planted). */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	SET(vp->v_flag, VMOUNTEDHERE);
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* The covered vnode holds a long-term usecount while the mount exists. */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	/* Redirect any process whose cwd/root was the covered vnode. */
	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		/* On failure, undo the covered-vnode linkage set up above. */
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2233 
/*
 * Reverse the effects of place_mount_and_checkdirs(): drop the covered
 * vnode's usecount, clear the mount linkage and flags, and wake anyone
 * waiting on the in-progress mount.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	/* Drop the usecount taken by place_mount_and_checkdirs(). */
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2247 
2248 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2249 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2250 {
2251 	int error;
2252 
2253 	/* unmount in progress return error */
2254 	mount_lock_spin(mp);
2255 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2256 		mount_unlock(mp);
2257 		return EBUSY;
2258 	}
2259 	mount_unlock(mp);
2260 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2261 
2262 	/*
2263 	 * We only allow the filesystem to be reloaded if it
2264 	 * is currently mounted read-only.
2265 	 */
2266 	if ((flags & MNT_RELOAD) &&
2267 	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2268 		error = ENOTSUP;
2269 		goto out;
2270 	}
2271 
2272 	/*
2273 	 * Only root, or the user that did the original mount is
2274 	 * permitted to update it.
2275 	 */
2276 	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2277 	    (!vfs_context_issuser(ctx))) {
2278 		error = EPERM;
2279 		goto out;
2280 	}
2281 #if CONFIG_MACF
2282 	error = mac_mount_check_remount(ctx, mp, flags);
2283 	if (error != 0) {
2284 		goto out;
2285 	}
2286 #endif
2287 
2288 out:
2289 	if (error) {
2290 		lck_rw_done(&mp->mnt_rwlock);
2291 	}
2292 
2293 	return error;
2294 }
2295 
/* Release the exclusive mount rwlock taken by mount_begin_update(). */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2301 
2302 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2303 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2304 {
2305 	vnode_t vp;
2306 
2307 	if (height >= MAX_IMAGEBOOT_NESTING) {
2308 		return EINVAL;
2309 	}
2310 
2311 	vp = imgsrc_rootvnodes[height];
2312 	if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2313 		*rvpp = vp;
2314 		return 0;
2315 	} else {
2316 		return ENOENT;
2317 	}
2318 }
2319 
/*
 * Relocate (re-cover) the root filesystem of an image boot: take the
 * already-mounted filesystem whose root vnode is recorded at nesting
 * level 'height' in imgsrc_rootvnodes[] and plant it on top of 'vp',
 * updating f_mntonname and adding it to the mount list.  A mount may be
 * moved at most once (MNTK_HAS_MOVED).  Root-only.
 *
 * 'by_index' selects the argument layout copied in from 'fsmountargs':
 * a mnt_imgsrc_args structure (height/flags/devpath) versus the legacy
 * single device-path pointer (height 0).
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;	/* set once mp is planted on vp */
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;		/* saved for rollback on failure */
	vnode_t rvp;			/* root vnode of the moving filesystem */
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flag bits are currently defined; reject anything nonzero. */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* Takes an iocount on rvp that is held until exit. */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once.  Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		/* NOTE: error is 0 here, so this path returns success. */
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed it for the check; drop the iocount. */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Save the old mount-on name so out3 can restore it on failure. */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);
#if CONFIG_MACF
	mac_mount_notify_mount(ctx, mp);
#endif /* CONFIG_MACF */

	return 0;

	/* Error unwind: each label undoes one more stage of the setup above. */
out3:
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2545 
2546 #endif /* CONFIG_IMGSRC_ACCESS */
2547 
/*
 * Turn on disk quotas for a newly mounted HFS filesystem, one quota type
 * at a time, when the per-type quota "ops" trigger file exists at the
 * mount root.  Errors are deliberately ignored: a quota failure must not
 * fail the mount itself.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Probe for the "<mnton>/<opsname>.<type>" trigger file. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue;           /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Trigger present: enable quotas using the actual quota file. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2581 
2582 
/*
 * Per-process callback for checkdirs(), run via proc_iterate(): if the
 * process's current or root directory is 'olddp' (the newly covered
 * vnode), repoint it at 'newdp' (the root of the new mount), adjusting
 * usecounts accordingly.  Always returns PROC_RETURNED to continue the
 * iteration.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* new_* start as "unused spare ref"; old_* collect refs to drop. */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		/* Second ref failed: give back the first and bail. */
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;		/* this spare ref was consumed */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;		/* this spare ref was consumed */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2662 
2663 
2664 
2665 /*
2666  * Scan all active processes to see if any of them have a current
2667  * or root directory onto which the new filesystem has just been
2668  * mounted. If so, replace them with the new mount point.
2669  */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/*
	 * Fast exit: a lone usecount on the covered vnode (presumably the
	 * mount's own reference) means no process can be holding it as a
	 * cwd or root directory.
	 */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the root of the freshly planted mount; users are moved to it. */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root itself was covered, swap it under the rw lock. */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	/* Drop the iocount from VFS_ROOT(). */
	vnode_put(newdp);
	return 0;
}
2707 
2708 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT        \
2709 	"com.apple.private.vfs.role-account-unmount"
2710 #define SYSTEM_VOLUME_UNMOUNT_ENTITLEMENT       \
2711 	"com.apple.private.vfs.system-volume-unmount"
2712 
2713 /*
2714  * Unmount a file system.
2715  *
2716  * Note: unmount takes a path to the vnode mounted on as argument,
2717  * not special file (as before).
2718  */
2719 /* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int flags = uap->flags;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* MNT_NOFOLLOW: refuse to traverse symlinks anywhere in the path. */
	if (flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
#if CONFIG_MACF
	/* MAC policy veto point for the unmount. */
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/* Hold the mount across the vnode_put(); the vp may be about to die. */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, flags, ctx);
}
2771 
2772 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2773 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2774 {
2775 	mount_t mp;
2776 
2777 	mp = mount_list_lookupby_fsid(fsid, 0, 1);
2778 	if (mp == (mount_t)0) {
2779 		return ENOENT;
2780 	}
2781 	mount_ref(mp, 0);
2782 	mount_iterdrop(mp);
2783 	/* safedounmount consumes the mount ref */
2784 	return safedounmount(mp, flags, ctx);
2785 }
2786 
2787 /*
2788  * The mount struct comes with a mount ref which will be consumed.
2789  * Do the actual file system unmount, prevent some common foot shooting.
2790  */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_lflag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}

	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if (mp->mnt_flag & MNT_ROOTFS) {
		error = EBUSY; /* the root is always busy */
		goto out;
	}
	/* MNTK_SYSTEM volumes require a dedicated unmount entitlement. */
	if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !IOCurrentTaskHasEntitlement(SYSTEM_VOLUME_UNMOUNT_ENTITLEMENT)) {
		printf("attempt to unmount a system mount (%s), will return EBUSY\n",
		    mp->mnt_vfsstat.f_mntonname);
		error = EBUSY; /* root-associated volumes are always busy unless caller is entitled */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* All checks passed; dounmount() consumes the mount ref (withref=1). */
	return dounmount(mp, flags, 1, ctx);

out:
	/* Failure: drop the mount ref the caller handed us. */
	mount_drop(mp, 0);
	return error;
}
2857 
2858 /*
2859  * Do the actual file system unmount.
2860  */
2861 int
dounmount(struct mount * mp,int flags,int withref,vfs_context_t ctx)2862 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
2863 {
2864 	vnode_t coveredvp = (vnode_t)0;
2865 	int error;
2866 	int needwakeup = 0;
2867 	int forcedunmount = 0;
2868 	int lflags = 0;
2869 	struct vnode *devvp = NULLVP;
2870 #if CONFIG_TRIGGERS
2871 	proc_t p = vfs_context_proc(ctx);
2872 	int did_vflush = 0;
2873 	int pflags_save = 0;
2874 #endif /* CONFIG_TRIGGERS */
2875 
2876 #if CONFIG_FSE
2877 	if (!(flags & MNT_FORCE)) {
2878 		fsevent_unmount(mp, ctx);  /* has to come first! */
2879 	}
2880 #endif
2881 
2882 	mount_lock(mp);
2883 
2884 	/*
2885 	 * If already an unmount in progress just return EBUSY.
2886 	 * Even a forced unmount cannot override.
2887 	 */
2888 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2889 		if (withref != 0) {
2890 			mount_drop(mp, 1);
2891 		}
2892 		mount_unlock(mp);
2893 		return EBUSY;
2894 	}
2895 
2896 	if (flags & MNT_FORCE) {
2897 		forcedunmount = 1;
2898 		mp->mnt_lflag |= MNT_LFORCE;
2899 	}
2900 
2901 #if CONFIG_TRIGGERS
2902 	if (flags & MNT_NOBLOCK && p != kernproc) {
2903 		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
2904 	}
2905 #endif
2906 
2907 	mp->mnt_kern_flag |= MNTK_UNMOUNT;
2908 	mp->mnt_lflag |= MNT_LUNMOUNT;
2909 	mp->mnt_flag &= ~MNT_ASYNC;
2910 	/*
2911 	 * anyone currently in the fast path that
2912 	 * trips over the cached rootvp will be
2913 	 * dumped out and forced into the slow path
2914 	 * to regenerate a new cached value
2915 	 */
2916 	mp->mnt_realrootvp = NULLVP;
2917 	mount_unlock(mp);
2918 
2919 	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2920 		/*
2921 		 * Force unmount any mounts in this filesystem.
2922 		 * If any unmounts fail - just leave them dangling.
2923 		 * Avoids recursion.
2924 		 */
2925 		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2926 	}
2927 
2928 	/*
2929 	 * taking the name_cache_lock exclusively will
2930 	 * insure that everyone is out of the fast path who
2931 	 * might be trying to use a now stale copy of
2932 	 * vp->v_mountedhere->mnt_realrootvp
2933 	 * bumping mount_generation causes the cached values
2934 	 * to be invalidated
2935 	 */
2936 	name_cache_lock();
2937 	mount_generation++;
2938 	name_cache_unlock();
2939 
2940 
2941 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2942 	if (withref != 0) {
2943 		mount_drop(mp, 0);
2944 	}
2945 	error = 0;
2946 	if (forcedunmount == 0) {
2947 		ubc_umount(mp); /* release cached vnodes */
2948 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2949 			error = VFS_SYNC(mp, MNT_WAIT, ctx);
2950 			if (error) {
2951 				mount_lock(mp);
2952 				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2953 				mp->mnt_lflag &= ~MNT_LUNMOUNT;
2954 				mp->mnt_lflag &= ~MNT_LFORCE;
2955 				goto out;
2956 			}
2957 		}
2958 	}
2959 
2960 	IOBSDMountChange(mp, kIOMountChangeUnmount);
2961 
2962 #if CONFIG_TRIGGERS
2963 	vfs_nested_trigger_unmounts(mp, flags, ctx);
2964 	did_vflush = 1;
2965 #endif
2966 	if (forcedunmount) {
2967 		lflags |= FORCECLOSE;
2968 	}
2969 	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
2970 	if ((forcedunmount == 0) && error) {
2971 		mount_lock(mp);
2972 		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2973 		mp->mnt_lflag &= ~MNT_LUNMOUNT;
2974 		mp->mnt_lflag &= ~MNT_LFORCE;
2975 		goto out;
2976 	}
2977 
2978 	/* make sure there are no one in the mount iterations or lookup */
2979 	mount_iterdrain(mp);
2980 
2981 	error = VFS_UNMOUNT(mp, flags, ctx);
2982 	if (error) {
2983 		mount_iterreset(mp);
2984 		mount_lock(mp);
2985 		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2986 		mp->mnt_lflag &= ~MNT_LUNMOUNT;
2987 		mp->mnt_lflag &= ~MNT_LFORCE;
2988 		goto out;
2989 	}
2990 
2991 	/* increment the operations count */
2992 	if (!error) {
2993 		OSAddAtomic(1, &vfs_nummntops);
2994 	}
2995 
2996 	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2997 		/* hold an io reference and drop the usecount before close */
2998 		devvp = mp->mnt_devvp;
2999 		vnode_getalways(devvp);
3000 		vnode_rele(devvp);
3001 		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
3002 		    ctx);
3003 		vnode_clearmountedon(devvp);
3004 		vnode_put(devvp);
3005 	}
3006 	lck_rw_done(&mp->mnt_rwlock);
3007 	mount_list_remove(mp);
3008 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
3009 
3010 	/* mark the mount point hook in the vp but not drop the ref yet */
3011 	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
3012 		/*
3013 		 * The covered vnode needs special handling. Trying to get an
3014 		 * iocount must not block here as this may lead to deadlocks
3015 		 * if the Filesystem to which the covered vnode belongs is
3016 		 * undergoing forced unmounts. Since we hold a usecount, the
3017 		 * vnode cannot be reused (it can, however, still be terminated)
3018 		 */
3019 		vnode_getalways(coveredvp);
3020 		vnode_lock_spin(coveredvp);
3021 
3022 		mp->mnt_crossref++;
3023 		coveredvp->v_mountedhere = (struct mount *)0;
3024 		CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
3025 		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
3026 		wakeup(&coveredvp->v_flag);
3027 		vnode_unlock(coveredvp);
3028 		vnode_put(coveredvp);
3029 	}
3030 
3031 	mount_list_lock();
3032 	mp->mnt_vtable->vfc_refcount--;
3033 	mount_list_unlock();
3034 
3035 	cache_purgevfs(mp);     /* remove cache entries for this file sys */
3036 	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
3037 	mount_lock(mp);
3038 	mp->mnt_lflag |= MNT_LDEAD;
3039 
3040 	if (mp->mnt_lflag & MNT_LWAIT) {
3041 		/*
3042 		 * do the wakeup here
3043 		 * in case we block in mount_refdrain
3044 		 * which will drop the mount lock
3045 		 * and allow anyone blocked in vfs_busy
3046 		 * to wakeup and see the LDEAD state
3047 		 */
3048 		mp->mnt_lflag &= ~MNT_LWAIT;
3049 		wakeup((caddr_t)mp);
3050 	}
3051 	mount_refdrain(mp);
3052 
3053 	/* free disk_conditioner_info structure for this mount */
3054 	disk_conditioner_unmount(mp);
3055 
3056 out:
3057 	if (mp->mnt_lflag & MNT_LWAIT) {
3058 		mp->mnt_lflag &= ~MNT_LWAIT;
3059 		needwakeup = 1;
3060 	}
3061 
3062 #if CONFIG_TRIGGERS
3063 	if (flags & MNT_NOBLOCK && p != kernproc) {
3064 		// Restore P_NOREMOTEHANG bit to its previous value
3065 		if ((pflags_save & P_NOREMOTEHANG) == 0) {
3066 			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
3067 		}
3068 	}
3069 
3070 	/*
3071 	 * Callback and context are set together under the mount lock, and
3072 	 * never cleared, so we're safe to examine them here, drop the lock,
3073 	 * and call out.
3074 	 */
3075 	if (mp->mnt_triggercallback != NULL) {
3076 		mount_unlock(mp);
3077 		if (error == 0) {
3078 			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
3079 		} else if (did_vflush) {
3080 			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
3081 		}
3082 	} else {
3083 		mount_unlock(mp);
3084 	}
3085 #else
3086 	mount_unlock(mp);
3087 #endif /* CONFIG_TRIGGERS */
3088 
3089 	lck_rw_done(&mp->mnt_rwlock);
3090 
3091 	if (needwakeup) {
3092 		wakeup((caddr_t)mp);
3093 	}
3094 
3095 	if (!error) {
3096 		if ((coveredvp != NULLVP)) {
3097 			vnode_t pvp = NULLVP;
3098 
3099 			/*
3100 			 * The covered vnode needs special handling. Trying to
3101 			 * get an iocount must not block here as this may lead
3102 			 * to deadlocks if the Filesystem to which the covered
3103 			 * vnode belongs is undergoing forced unmounts. Since we
3104 			 * hold a usecount, the  vnode cannot be reused
3105 			 * (it can, however, still be terminated).
3106 			 */
3107 			vnode_getalways(coveredvp);
3108 
3109 			mount_dropcrossref(mp, coveredvp, 0);
3110 			/*
3111 			 * We'll _try_ to detect if this really needs to be
3112 			 * done. The coveredvp can only be in termination (or
3113 			 * terminated) if the coveredvp's mount point is in a
3114 			 * forced unmount (or has been) since we still hold the
3115 			 * ref.
3116 			 */
3117 			if (!vnode_isrecycled(coveredvp)) {
3118 				pvp = vnode_getparent(coveredvp);
3119 #if CONFIG_TRIGGERS
3120 				if (coveredvp->v_resolve) {
3121 					vnode_trigger_rearm(coveredvp, ctx);
3122 				}
3123 #endif
3124 			}
3125 
3126 			vnode_rele(coveredvp);
3127 			vnode_put(coveredvp);
3128 			coveredvp = NULLVP;
3129 
3130 			if (pvp) {
3131 				lock_vnode_and_post(pvp, NOTE_WRITE);
3132 				vnode_put(pvp);
3133 			}
3134 		} else if (mp->mnt_flag & MNT_ROOTFS) {
3135 			if (nc_smr_enabled) {
3136 				vfs_smr_synchronize();
3137 			}
3138 
3139 			mount_lock_destroy(mp);
3140 #if CONFIG_MACF
3141 			mac_mount_label_destroy(mp);
3142 #endif
3143 			zfree(mount_zone, mp);
3144 		} else {
3145 			panic("dounmount: no coveredvp");
3146 		}
3147 	}
3148 	return error;
3149 }
3150 
/*
 * Unmount any mounts in this filesystem.
 *
 * Called from dounmount() on a forced unmount (when MNT_LNOSUB is not set)
 * so everything mounted on top of 'mp' is taken down as well.  Submounts
 * are unmounted in reverse mount order; failures are ignored and the
 * offending mounts are simply left dangling.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;	/* m indexes the last fsid recorded */
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT: we hold the mount list lock, so we must not sleep here. */
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				/* Covered vnode lives in a doomed mount: record this one too. */
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			/* dounmount() consumes the mount ref taken here. */
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
3211 
/*
 * Drop one cross reference on 'mp' held on behalf of the covered vnode
 * 'dp'.  If that was the last cross reference and 'dp' no longer points
 * at this mount via v_mountedhere, the mount structure itself is freed.
 *
 * 'need_put' indicates the caller's iocount on 'dp' should also be
 * released here.  The vnode lock for 'dp' is taken internally, so the
 * caller passes it in unlocked.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/*
	 * Last cross reference gone and the vnode no longer covered by this
	 * mount: tear the mount structure down.
	 */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		/* Let SMR readers drain before the mount memory is recycled. */
		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3245 
3246 
/*
 * Sync each mounted filesystem.
 */
#if DIAGNOSTIC
int syncprt = 0;	/* when non-zero, dump buffer stats (vfs_bufstats) after a sync */
#endif

int print_vmpage_stat = 0;	/* when non-zero, dump dirty page counts after a sync */
3255 
3256 /*
3257  * sync_callback:	simple wrapper that calls VFS_SYNC() on volumes
3258  *			mounted read-write with the passed waitfor value.
3259  *
3260  * Parameters:	mp	mount-point descriptor per mounted file-system instance.
3261  *		arg	user argument (please see below)
3262  *
3263  * User argument is a pointer to 32 bit unsigned integer which describes the
3264  * type of waitfor value to set for calling VFS_SYNC().  If user argument is
3265  * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3266  * waitfor value.
3267  *
3268  * Returns:		VFS_RETURNED
3269  */
3270 static int
sync_callback(mount_t mp,void * arg)3271 sync_callback(mount_t mp, void *arg)
3272 {
3273 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3274 		int asyncflag = mp->mnt_flag & MNT_ASYNC;
3275 		unsigned waitfor = MNT_NOWAIT;
3276 
3277 		if (arg) {
3278 			waitfor = *(uint32_t*)arg;
3279 		}
3280 
3281 		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
3282 		if (waitfor != MNT_WAIT &&
3283 		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
3284 		    waitfor != MNT_NOWAIT &&
3285 		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3286 		    waitfor != MNT_DWAIT &&
3287 		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3288 			panic("Passed inappropriate waitfor %u to "
3289 			    "sync_callback()", waitfor);
3290 		}
3291 
3292 		mp->mnt_flag &= ~MNT_ASYNC;
3293 		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3294 		if (asyncflag) {
3295 			mp->mnt_flag |= MNT_ASYNC;
3296 		}
3297 	}
3298 
3299 	return VFS_RETURNED;
3300 }
3301 
3302 /* ARGSUSED */
3303 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)3304 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
3305 {
3306 	vfs_iterate(LK_NOWAIT, sync_callback, NULL);
3307 
3308 	if (print_vmpage_stat) {
3309 		vm_countdirtypages();
3310 	}
3311 
3312 #if DIAGNOSTIC
3313 	if (syncprt) {
3314 		vfs_bufstats();
3315 	}
3316 #endif /* DIAGNOSTIC */
3317 	return 0;
3318 }
3319 
/* Media classes for sync_internal_callback() filtering. */
typedef enum {
	SYNC_ALL = 0,                   /* sync every mount */
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* only local, non-virtual devices */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* only virtual or non-local devices */
} sync_type_t;
3325 
3326 static int
sync_internal_callback(mount_t mp,void * arg)3327 sync_internal_callback(mount_t mp, void *arg)
3328 {
3329 	if (arg) {
3330 		int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3331 		    (mp->mnt_flag & MNT_LOCAL);
3332 		sync_type_t sync_type = *((sync_type_t *)arg);
3333 
3334 		if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3335 			return VFS_RETURNED;
3336 		} else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3337 			return VFS_RETURNED;
3338 		}
3339 	}
3340 
3341 	(void)sync_callback(mp, NULL);
3342 
3343 	return VFS_RETURNED;
3344 }
3345 
int sync_thread_state = 0;	/* SYNC_THREAD_* bits; protected by sync_mtx_lck */
int sync_timeout_seconds = 5;	/* max time sync_internal() waits for sync_thread */

#define SYNC_THREAD_RUN       0x0001	/* a sync pass has been requested */
#define SYNC_THREAD_RUNNING   0x0002	/* a sync_thread instance is alive */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;	/* the current sync_thread, while it runs */
#endif /* CONFIG_PHYS_WRITE_ACCT */
3355 
/*
 * Body of the kernel thread spawned by sync_internal(): keep syncing all
 * filesystems for as long as SYNC_THREAD_RUN keeps being re-posted, then
 * wake any waiters and exit.  Reliable (local, non-virtual) media are
 * synced before unreliable media on each pass.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the request; another may be posted while we sync. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3399 
/* Last time a sync timeout was logged; rate-limits the message (once per 120s). */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3401 
/*
 * An in-kernel sync for power management to call.
 * This function always returns within sync_timeout seconds.
 *
 * Posts SYNC_THREAD_RUN and, if no sync thread is currently running,
 * spawns one; then waits (bounded by sync_timeout_seconds) for the
 * thread's completion wakeup.  Always returns 0 -- a timeout is only
 * logged, rate-limited via sync_timeout_last_print.
 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		/* Claim RUNNING before dropping the lock so only one thread is spawned. */
		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/* Sleep until sync_thread's wakeup or the timeout; PDROP releases the mutex. */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	/* Drop the thread reference returned by kernel_thread_start(). */
	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
3448 
3449 /*
3450  * Change filesystem quotas.
3451  */
3452 #if QUOTA
/*
 * Manipulate filesystem quotas: resolve uap->path to its mount point,
 * copy in any command-specific argument, dispatch to VFS_QUOTACTL(),
 * then copy results back out for the query commands.
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	/* Only the mount is needed; keep a mount ref and release the vnode. */
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			struct user_dqblk       my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Post-processing: free the pathname buffer / copy out query results. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk       my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3559 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support compiled out (QUOTA not defined). */
	return EOPNOTSUPP;
}
3565 #endif /* QUOTA */
3566 
/*
 * Common guts of statfs() and fstatfs(): MAC-check the mount, refresh
 * its vfsstat from the filesystem, and copy a struct statfs (sized for
 * the calling process's ABI) out to user address 'bufp'.
 */
static int
statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

#if CONFIG_MACF
	error = mac_mount_check_stat(ctx, mp);
	if (error != 0) {
		return error;
	}
#endif

	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
	if (error != 0) {
		return error;
	}

	/* TRUE: copy directly to user space; my_size out-param not needed. */
	return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
}
3587 
3588 /*
3589  * Get filesystem statistics.
3590  *
3591  * Returns:	0			Success
3592  *	namei:???
3593  *	vfs_update_vfsstat:???
3594  *	munge_statfs:EFAULT
3595  */
3596 /* ARGSUSED */
3597 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3598 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3599 {
3600 	int error;
3601 	struct mount *mp;
3602 	struct nameidata nd;
3603 	vfs_context_t ctx = vfs_context_current();
3604 	vnode_t vp;
3605 
3606 	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3607 	    UIO_USERSPACE, uap->path, ctx);
3608 	error = namei(&nd);
3609 	if (error != 0) {
3610 		return error;
3611 	}
3612 	vp = nd.ni_vp;
3613 	mp = vp->v_mount;
3614 	nameidone(&nd);
3615 
3616 	error = statfs_internal(p, mp, uap->buf);
3617 	vnode_put(vp);
3618 
3619 	return error;
3620 }
3621 
/*
 * Get filesystem statistics for the filesystem containing the open
 * file described by uap->fd.
 */
/* ARGSUSED */
int
fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	struct mount *mp;

	AUDIT_ARG(fd, uap->fd);

	/* Map the fd to its vnode and take an iocount on it. */
	if ((error = file_vnode(uap->fd, &vp)) ||
	    (error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out_vnode;
	}

	error = statfs_internal(p, mp, uap->buf);

out_vnode:
	vnode_put(vp);

out:
	/* vp != NULL means file_vnode() succeeded, so the fd ref must be dropped. */
	if (vp != NULL) {
		file_drop(uap->fd);
	}

	return error;
}
3660 
/*
 * Fill in a struct statfs64 from a mount's cached vfsstat.
 * Does not refresh the statistics; callers that need fresh numbers run
 * vfs_update_vfsstat() first (see statfs64()/fstatfs64()).
 */
void
vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
{
	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;

	bzero(sfs, sizeof(*sfs));

	sfs->f_bsize = vsfs->f_bsize;
	sfs->f_iosize = (int32_t)vsfs->f_iosize;
	sfs->f_blocks = vsfs->f_blocks;
	sfs->f_bfree = vsfs->f_bfree;
	sfs->f_bavail = vsfs->f_bavail;
	sfs->f_files = vsfs->f_files;
	sfs->f_ffree = vsfs->f_ffree;
	sfs->f_fsid = vsfs->f_fsid;
	sfs->f_owner = vsfs->f_owner;
	sfs->f_type = mp->mnt_vtable->vfc_typenum;
	/* Expose only user-visible mount flags. */
	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	sfs->f_fssubtype = vsfs->f_fssubtype;
	sfs->f_flags_ext = vfs_getextflags(mp);
	vfs_getfstypename(mp, sfs->f_fstypename, MFSTYPENAMELEN);
	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
}
3685 
3686 /*
3687  * Get file system statistics in 64-bit mode
3688  */
3689 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3690 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3691 {
3692 	struct mount *mp;
3693 	int error;
3694 	struct nameidata *ndp;
3695 	struct statfs64 *sfsp;
3696 	vfs_context_t ctxp = vfs_context_current();
3697 	vnode_t vp;
3698 	struct {
3699 		struct nameidata nd;
3700 		struct statfs64 sfs;
3701 	} *__nameidata_statfs64;
3702 
3703 	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3704 	    Z_WAITOK);
3705 	ndp = &__nameidata_statfs64->nd;
3706 
3707 	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3708 	    UIO_USERSPACE, uap->path, ctxp);
3709 	error = namei(ndp);
3710 	if (error != 0) {
3711 		goto out;
3712 	}
3713 	vp = ndp->ni_vp;
3714 	mp = vp->v_mount;
3715 	nameidone(ndp);
3716 
3717 #if CONFIG_MACF
3718 	error = mac_mount_check_stat(ctxp, mp);
3719 	if (error != 0) {
3720 		vnode_put(vp);
3721 		goto out;
3722 	}
3723 #endif
3724 
3725 	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3726 	if (error != 0) {
3727 		vnode_put(vp);
3728 		goto out;
3729 	}
3730 
3731 	sfsp = &__nameidata_statfs64->sfs;
3732 	vfs_get_statfs64(mp, sfsp);
3733 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3734 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3735 		/* This process does not want to see a seperate data volume mountpoint */
3736 		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3737 	}
3738 	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3739 	vnode_put(vp);
3740 
3741 out:
3742 	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3743 
3744 	return error;
3745 }
3746 
3747 /*
3748  * Get file system statistics in 64-bit mode
3749  */
3750 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3751 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3752 {
3753 	struct vnode *vp;
3754 	struct mount *mp;
3755 	struct statfs64 sfs;
3756 	int error;
3757 
3758 	AUDIT_ARG(fd, uap->fd);
3759 
3760 	if ((error = file_vnode(uap->fd, &vp))) {
3761 		return error;
3762 	}
3763 
3764 	error = vnode_getwithref(vp);
3765 	if (error) {
3766 		file_drop(uap->fd);
3767 		return error;
3768 	}
3769 
3770 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3771 
3772 	mp = vp->v_mount;
3773 	if (!mp) {
3774 		error = EBADF;
3775 		goto out;
3776 	}
3777 
3778 #if CONFIG_MACF
3779 	error = mac_mount_check_stat(vfs_context_current(), mp);
3780 	if (error != 0) {
3781 		goto out;
3782 	}
3783 #endif
3784 
3785 	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3786 		goto out;
3787 	}
3788 
3789 	vfs_get_statfs64(mp, &sfs);
3790 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3791 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3792 		/* This process does not want to see a seperate data volume mountpoint */
3793 		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3794 	}
3795 	error = copyout(&sfs, uap->buf, sizeof(sfs));
3796 
3797 out:
3798 	file_drop(uap->fd);
3799 	vnode_put(vp);
3800 
3801 	return error;
3802 }
3803 
/*
 * State shared between the getfsstat family of syscalls and their
 * vfs_iterate() callbacks.
 */
struct getfsstat_struct {
	user_addr_t     sfsp;		/* user buffer cursor; advanced per entry copied */
	user_addr_t     *mp;		/* optional per-mount MAC label pointers (or NULL) */
	int             count;		/* mounts visited (may exceed maxcount) */
	int             maxcount;	/* entries that fit in the user buffer */
	int             flags;		/* caller's MNT_WAIT/MNT_NOWAIT/MNT_DWAIT flags */
	int             error;		/* first error hit by a callback, 0 if none */
};
3812 
3813 
/*
 * vfs_iterate() callback for __mac_getfsstat(): copy one mount's statfs
 * (and optional MAC label) out to the user buffer tracked in 'arg'.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	/* Only copy out while there is still room in the user buffer. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance the cursor by the ABI-dependent size just copied. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	/* Always count the mount, even when it did not fit in the buffer. */
	fstp->count++;
	return VFS_RETURNED;
}
3867 
3868 /*
3869  * Get statistics on all filesystems.
3870  */
3871 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3872 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3873 {
3874 	struct __mac_getfsstat_args muap;
3875 
3876 	muap.buf = uap->buf;
3877 	muap.bufsize = uap->bufsize;
3878 	muap.mac = USER_ADDR_NULL;
3879 	muap.macsize = 0;
3880 	muap.flags = uap->flags;
3881 
3882 	return __mac_getfsstat(p, &muap, retval);
3883 }
3884 
3885 /*
3886  * __mac_getfsstat: Get MAC-related file system statistics
3887  *
3888  * Parameters:    p                        (ignored)
3889  *                uap                      User argument descriptor (see below)
3890  *                retval                   Count of file system statistics (N stats)
3891  *
3892  * Indirect:      uap->bufsize             Buffer size
3893  *                uap->macsize             MAC info size
3894  *                uap->buf                 Buffer where information will be returned
3895  *                uap->mac                 MAC info
3896  *                uap->flags               File system flags
3897  *
3898  *
3899  * Returns:        0                       Success
3900  *                !0                       Not success
3901  *
3902  */
3903 int
__mac_getfsstat(__unused proc_t p,struct __mac_getfsstat_args * uap,int * retval)3904 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
3905 {
3906 	user_addr_t sfsp;
3907 	user_addr_t *mp;
3908 	size_t count, maxcount, bufsize, macsize;
3909 	struct getfsstat_struct fst;
3910 
3911 	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
3912 		return EINVAL;
3913 	}
3914 
3915 	bufsize = (size_t) uap->bufsize;
3916 	macsize = (size_t) uap->macsize;
3917 
3918 	if (IS_64BIT_PROCESS(p)) {
3919 		maxcount = bufsize / sizeof(struct user64_statfs);
3920 	} else {
3921 		maxcount = bufsize / sizeof(struct user32_statfs);
3922 	}
3923 	sfsp = uap->buf;
3924 	count = 0;
3925 
3926 	mp = NULL;
3927 
3928 #if CONFIG_MACF
3929 	if (uap->mac != USER_ADDR_NULL) {
3930 		u_int32_t *mp0;
3931 		int error;
3932 		unsigned int i;
3933 
3934 		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
3935 		if (count != maxcount) {
3936 			return EINVAL;
3937 		}
3938 
3939 		/* Copy in the array */
3940 		mp0 = kalloc_data(macsize, Z_WAITOK);
3941 		if (mp0 == NULL) {
3942 			return ENOMEM;
3943 		}
3944 
3945 		error = copyin(uap->mac, mp0, macsize);
3946 		if (error) {
3947 			kfree_data(mp0, macsize);
3948 			return error;
3949 		}
3950 
3951 		/* Normalize to an array of user_addr_t */
3952 		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
3953 		if (mp == NULL) {
3954 			kfree_data(mp0, macsize);
3955 			return ENOMEM;
3956 		}
3957 
3958 		for (i = 0; i < count; i++) {
3959 			if (IS_64BIT_PROCESS(p)) {
3960 				mp[i] = ((user_addr_t *)mp0)[i];
3961 			} else {
3962 				mp[i] = (user_addr_t)mp0[i];
3963 			}
3964 		}
3965 		kfree_data(mp0, macsize);
3966 	}
3967 #endif
3968 
3969 
3970 	fst.sfsp = sfsp;
3971 	fst.mp = mp;
3972 	fst.flags = uap->flags;
3973 	fst.count = 0;
3974 	fst.error = 0;
3975 	fst.maxcount = (int)maxcount;
3976 
3977 
3978 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);
3979 
3980 	if (mp) {
3981 		kfree_data(mp, count * sizeof(user_addr_t));
3982 	}
3983 
3984 	if (fst.error) {
3985 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3986 		return fst.error;
3987 	}
3988 
3989 	if (fst.sfsp && fst.count > fst.maxcount) {
3990 		*retval = fst.maxcount;
3991 	} else {
3992 		*retval = fst.count;
3993 	}
3994 	return 0;
3995 }
3996 
/*
 * vfs_iterate() callback for getfsstat64(): copy one mount's statfs64
 * out to the user buffer tracked in 'arg'.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	/* Only copy out while there is still room in the user buffer. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += sizeof(sfs);
	}
	/* Always count the mount, even when it did not fit in the buffer. */
	fstp->count++;
	return VFS_RETURNED;
}
4041 
4042 /*
4043  * Get statistics on all file systems in 64 bit mode.
4044  */
4045 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)4046 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
4047 {
4048 	user_addr_t sfsp;
4049 	int count, maxcount;
4050 	struct getfsstat_struct fst;
4051 
4052 	maxcount = uap->bufsize / sizeof(struct statfs64);
4053 
4054 	sfsp = uap->buf;
4055 	count = 0;
4056 
4057 	fst.sfsp = sfsp;
4058 	fst.flags = uap->flags;
4059 	fst.count = 0;
4060 	fst.error = 0;
4061 	fst.maxcount = maxcount;
4062 
4063 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
4064 
4065 	if (fst.error) {
4066 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
4067 		return fst.error;
4068 	}
4069 
4070 	if (fst.sfsp && fst.count > fst.maxcount) {
4071 		*retval = fst.maxcount;
4072 	} else {
4073 		*retval = fst.count;
4074 	}
4075 
4076 	return 0;
4077 }
4078 
4079 /*
4080  * gets the associated vnode with the file descriptor passed.
4081  * as input
4082  *
4083  * INPUT
4084  * ctx - vfs context of caller
4085  * fd - file descriptor for which vnode is required.
4086  * vpp - Pointer to pointer to vnode to be returned.
4087  *
4088  * The vnode is returned with an iocount so any vnode obtained
4089  * by this call needs a vnode_put
4090  *
4091  */
4092 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)4093 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
4094 {
4095 	int error;
4096 	vnode_t vp;
4097 	struct fileproc *fp;
4098 	proc_t p = vfs_context_proc(ctx);
4099 
4100 	*vpp =  NULLVP;
4101 
4102 	error = fp_getfvp(p, fd, &fp, &vp);
4103 	if (error) {
4104 		return error;
4105 	}
4106 
4107 	error = vnode_getwithref(vp);
4108 	if (error) {
4109 		(void)fp_drop(p, fd, fp, 0);
4110 		return error;
4111 	}
4112 
4113 	(void)fp_drop(p, fd, fp, 0);
4114 	*vpp = vp;
4115 	return error;
4116 }
4117 
4118 /*
4119  * Wrapper function around namei to start lookup from a directory
4120  * specified by a file descriptor ni_dirfd.
4121  *
4122  * In addition to all the errors returned by namei, this call can
4123  * return ENOTDIR if the file descriptor does not refer to a directory.
4124  * and EBADF if the file descriptor is not valid.
4125  */
4126 int
nameiat(struct nameidata * ndp,int dirfd)4127 nameiat(struct nameidata *ndp, int dirfd)
4128 {
4129 	if ((dirfd != AT_FDCWD) &&
4130 	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
4131 	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
4132 		int error = 0;
4133 		char c;
4134 
4135 		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4136 			error = copyin(ndp->ni_dirp, &c, sizeof(char));
4137 			if (error) {
4138 				return error;
4139 			}
4140 		} else {
4141 			c = *((char *)(ndp->ni_dirp));
4142 		}
4143 
4144 		if (c != '/') {
4145 			vnode_t dvp_at;
4146 
4147 			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
4148 			    &dvp_at);
4149 			if (error) {
4150 				return error;
4151 			}
4152 
4153 			if (vnode_vtype(dvp_at) != VDIR) {
4154 				vnode_put(dvp_at);
4155 				return ENOTDIR;
4156 			}
4157 
4158 			ndp->ni_dvp = dvp_at;
4159 			ndp->ni_cnd.cn_flags |= USEDVP;
4160 			error = namei(ndp);
4161 			ndp->ni_cnd.cn_flags &= ~USEDVP;
4162 			vnode_put(dvp_at);
4163 			return error;
4164 		}
4165 	}
4166 
4167 	return namei(ndp);
4168 }
4169 
4170 /*
4171  * Change current working directory to a given file descriptor.
4172  */
4173 /* ARGSUSED */
int
fchdir(proc_t p, vfs_context_t ctx, int fd, bool per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;

	AUDIT_ARG(fd, fd);
	if (per_thread && fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding.  The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	/* Resolve the descriptor to a vnode and take an iocount on it. */
	if ((error = file_vnode(fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* The new working directory must be a directory. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	/* Give MAC policies a chance to veto the directory change. */
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs search permission on the directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If the directory is covered by a mount, descend to the root of the
	 * mounted filesystem (repeatedly, for stacked mounts) so the cwd
	 * ends up inside the mounted filesystem, not on the covered vnode.
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/*
	 * Convert to a long-term usecount for the cwd slot, then drop the
	 * short-term iocount taken above.
	 */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		/* Per-thread cwd lives in the uthread, flagged on the proc. */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			/* No thread to attach the cwd to; undo the usecount. */
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Swap the process-wide cwd under the dirs and fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount held by the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(fd);

	return error;
}
4285 
4286 int
sys_fchdir(proc_t p,struct fchdir_args * uap,__unused int32_t * retval)4287 sys_fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
4288 {
4289 	return fchdir(p, vfs_context_current(), uap->fd, false);
4290 }
4291 
4292 int
__pthread_fchdir(proc_t p,struct __pthread_fchdir_args * uap,__unused int32_t * retval)4293 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
4294 {
4295 	return fchdir(p, vfs_context_current(), uap->fd, true);
4296 }
4297 
4298 
4299 /*
4300  * Change current working directory (".").
4301  *
4302  * Returns:	0			Success
4303  *	change_dir:ENOTDIR
4304  *	change_dir:???
4305  *	vnode_ref:ENOENT		No such file or directory
4306  */
4307 /* ARGSUSED */
4308 int
chdir_internal(proc_t p,vfs_context_t ctx,struct nameidata * ndp,int per_thread)4309 chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
4310 {
4311 	int error;
4312 	vnode_t tvp;
4313 
4314 	error = change_dir(ndp, ctx);
4315 	if (error) {
4316 		return error;
4317 	}
4318 	if ((error = vnode_ref(ndp->ni_vp))) {
4319 		vnode_put(ndp->ni_vp);
4320 		return error;
4321 	}
4322 	/*
4323 	 * drop the iocount we picked up in change_dir
4324 	 */
4325 	vnode_put(ndp->ni_vp);
4326 
4327 	if (per_thread) {
4328 		thread_t th = vfs_context_thread(ctx);
4329 		if (th) {
4330 			uthread_t uth = get_bsdthread_info(th);
4331 			tvp = uth->uu_cdir;
4332 			uth->uu_cdir = ndp->ni_vp;
4333 			OSBitOrAtomic(P_THCWD, &p->p_flag);
4334 		} else {
4335 			vnode_rele(ndp->ni_vp);
4336 			return ENOENT;
4337 		}
4338 	} else {
4339 		proc_dirs_lock_exclusive(p);
4340 		proc_fdlock(p);
4341 		tvp = p->p_fd.fd_cdir;
4342 		p->p_fd.fd_cdir = ndp->ni_vp;
4343 		proc_fdunlock(p);
4344 		proc_dirs_unlock_exclusive(p);
4345 	}
4346 
4347 	if (tvp) {
4348 		vnode_rele(tvp);
4349 	}
4350 
4351 	return 0;
4352 }
4353 
4354 
4355 /*
4356  * Change current working directory (".").
4357  *
4358  * Returns:	0			Success
4359  *	chdir_internal:ENOTDIR
4360  *	chdir_internal:ENOENT		No such file or directory
4361  *	chdir_internal:???
4362  */
4363 /* ARGSUSED */
4364 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4365 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4366 {
4367 	struct nameidata nd;
4368 	vfs_context_t ctx = vfs_context_current();
4369 
4370 	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4371 	    UIO_USERSPACE, uap->path, ctx);
4372 
4373 	return chdir_internal(p, ctx, &nd, per_thread);
4374 }
4375 
4376 
4377 /*
4378  * chdir
4379  *
4380  * Change current working directory (".") for the entire process
4381  *
4382  * Parameters:  p       Process requesting the call
4383  *              uap     User argument descriptor (see below)
4384  *              retval  (ignored)
4385  *
4386  * Indirect parameters:	uap->path	Directory path
4387  *
4388  * Returns:	0			Success
4389  *              common_chdir: ENOTDIR
4390  *              common_chdir: ENOENT	No such file or directory
4391  *              common_chdir: ???
4392  *
4393  */
4394 int
sys_chdir(proc_t p,struct chdir_args * uap,__unused int32_t * retval)4395 sys_chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
4396 {
4397 	return common_chdir(p, (void *)uap, 0);
4398 }
4399 
4400 /*
4401  * __pthread_chdir
4402  *
4403  * Change current working directory (".") for a single thread
4404  *
4405  * Parameters:  p       Process requesting the call
4406  *              uap     User argument descriptor (see below)
4407  *              retval  (ignored)
4408  *
4409  * Indirect parameters:	uap->path	Directory path
4410  *
4411  * Returns:	0			Success
4412  *              common_chdir: ENOTDIR
4413  *		common_chdir: ENOENT	No such file or directory
4414  *		common_chdir: ???
4415  *
4416  */
4417 int
__pthread_chdir(proc_t p,struct __pthread_chdir_args * uap,__unused int32_t * retval)4418 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
4419 {
4420 	return common_chdir(p, (void *)uap, 1);
4421 }
4422 
4423 
4424 /*
4425  * Change notion of root (``/'') directory.
4426  */
4427 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* chroot is restricted to the superuser. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/* Resolve and authorize the new root (returns with an iocount). */
	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	/* Give MAC policies a chance to veto the root change. */
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Convert to a long-term usecount for the fd_rdir slot. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	/* Drop the iocount picked up in change_dir. */
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Release the usecount held by the previous root, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4485 
4486 #define PATHSTATICBUFLEN 256
4487 #define PIVOT_ROOT_ENTITLEMENT              \
4488        "com.apple.private.vfs.pivot-root"
4489 
4490 #if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Stack buffers for the common short-path case; heap fallback below. */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the incoming root path; retry with a MAXPATHLEN heap buffer. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Copy in the path where the outgoing root will be re-homed. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Point at whichever buffer (stack or heap) ended up holding each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	/* Perform the actual root switch; virtual devices are prohibited. */
	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4582 #else
4583 int
pivot_root(proc_t p,__unused struct pivot_root_args * uap,int * retval)4584 pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
4585 {
4586 	return nosys(p, NULL, retval);
4587 }
4588 #endif /* XNU_TARGET_OS_OSX */
4589 
4590 /*
4591  * Common routine for chroot and chdir.
4592  *
4593  * Returns:	0			Success
4594  *		ENOTDIR			Not a directory
4595  *		namei:???		[anything namei can return]
4596  *		vnode_authorize:???	[anything vnode_authorize can return]
4597  */
4598 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4599 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4600 {
4601 	vnode_t vp;
4602 	int error;
4603 
4604 	if ((error = namei(ndp))) {
4605 		return error;
4606 	}
4607 	nameidone(ndp);
4608 	vp = ndp->ni_vp;
4609 
4610 	if (vp->v_type != VDIR) {
4611 		vnode_put(vp);
4612 		return ENOTDIR;
4613 	}
4614 
4615 #if CONFIG_MACF
4616 	error = mac_vnode_check_chdir(ctx, vp);
4617 	if (error) {
4618 		vnode_put(vp);
4619 		return error;
4620 	}
4621 #endif
4622 
4623 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4624 	if (error) {
4625 		vnode_put(vp);
4626 		return error;
4627 	}
4628 
4629 	return error;
4630 }
4631 
4632 /*
4633  * Free the vnode data (for directories) associated with the file glob.
4634  */
4635 struct fd_vn_data *
fg_vn_data_alloc(void)4636 fg_vn_data_alloc(void)
4637 {
4638 	struct fd_vn_data *fvdata;
4639 
4640 	/* Allocate per fd vnode data */
4641 	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4642 	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4643 	return fvdata;
4644 }
4645 
4646 /*
4647  * Free the vnode data (for directories) associated with the file glob.
4648  */
4649 void
fg_vn_data_free(void * fgvndata)4650 fg_vn_data_free(void *fgvndata)
4651 {
4652 	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4653 
4654 	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4655 	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4656 	kfree_type(struct fd_vn_data, fvdata);
4657 }
4658 
4659 /*
4660  * Check permissions, allocate an open file structure,
4661  * and call the device open routine if any.
4662  *
4663  * Returns:	0			Success
4664  *		EINVAL
4665  *		EINTR
4666  *	falloc:ENFILE
4667  *	falloc:EMFILE
4668  *	falloc:ENOMEM
4669  *	vn_open_auth:???
4670  *	dupfdopen:???
4671  *	VNOP_ADVLOCK:???
4672  *	vnode_setsize:???
4673  *
4674  * XXX Need to implement uid, gid
4675  */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	kauth_cred_t p_cred = current_cached_proc_cred(PROC_NULL);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	flags = FFLAGS(uflags);
	/* These flags are kernel-internal; strip any userland attempt to set them. */
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a descriptor slot and a fileproc before attempting the open. */
	if ((error = falloc_withinit(p, p_cred, ctx, &fp, &indx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* An authfd supplies the vnode used to authenticate this open. */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/*
		 * ENODEV/ENXIO with uu_dupfd set means fdesc_open (/dev/fd)
		 * asked us to dup an existing descriptor instead of opening.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the vnode into the fileglob; vnops handles all file operations. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* Apply a whole-file advisory lock for O_EXLOCK/O_SHLOCK opens. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if (flags & O_TRUNC) {
		if ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASWRITTEN;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	/*
	 * Decide whether this file's pages are eligible for the secluded
	 * page pool, based on writability and on which binary/path is being
	 * opened.
	 */
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer  eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len) ||
			    !strncmp(v_name, "cameracaptured", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "audiomxd", len) ||
			    !strncmp(v_name, "mediaplaybackd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/* Drop the iocount from vn_open_auth; the fileglob holds the vnode now. */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor in the process fd table. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Undo: release any advisory lock, close the vnode, free the fd slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4982 
4983 /*
4984  * While most of the *at syscall handlers can call nameiat() which
4985  * is a wrapper around namei, the use of namei and initialisation
4986  * of nameidata are far removed and in different functions  - namei
4987  * gets called in vn_open_auth for open1. So we'll just do here what
4988  * nameiat() does.
4989  */
4990 static int
open1at(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval,int dirfd,int authfd)4991 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4992     struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
4993     int dirfd, int authfd)
4994 {
4995 	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
4996 		int error;
4997 		char c;
4998 
4999 		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
5000 			error = copyin(ndp->ni_dirp, &c, sizeof(char));
5001 			if (error) {
5002 				return error;
5003 			}
5004 		} else {
5005 			c = *((char *)(ndp->ni_dirp));
5006 		}
5007 
5008 		if (c != '/') {
5009 			vnode_t dvp_at;
5010 
5011 			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
5012 			    &dvp_at);
5013 			if (error) {
5014 				return error;
5015 			}
5016 
5017 			if (vnode_vtype(dvp_at) != VDIR) {
5018 				vnode_put(dvp_at);
5019 				return ENOTDIR;
5020 			}
5021 
5022 			ndp->ni_dvp = dvp_at;
5023 			ndp->ni_cnd.cn_flags |= USEDVP;
5024 			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
5025 			    retval, authfd);
5026 			vnode_put(dvp_at);
5027 			return error;
5028 		}
5029 	}
5030 
5031 	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
5032 }
5033 
5034 /*
5035  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
5036  *
5037  * Parameters:	p			Process requesting the open
5038  *		uap			User argument descriptor (see below)
5039  *		retval			Pointer to an area to receive the
5040  *					return calue from the system call
5041  *
5042  * Indirect:	uap->path		Path to open (same as 'open')
5043  *		uap->flags		Flags to open (same as 'open'
5044  *		uap->uid		UID to set, if creating
5045  *		uap->gid		GID to set, if creating
5046  *		uap->mode		File mode, if creating (same as 'open')
5047  *		uap->xsecurity		ACL to set, if creating
5048  *
5049  * Returns:	0			Success
5050  *		!0			errno value
5051  *
5052  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5053  *
5054  * XXX:		We should enummerate the possible errno values here, and where
5055  *		in the code they originated.
5056  */
5057 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)5058 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
5059 {
5060 	int ciferror;
5061 	kauth_filesec_t xsecdst;
5062 	struct vnode_attr va;
5063 	struct nameidata nd;
5064 	int cmode;
5065 
5066 	AUDIT_ARG(owner, uap->uid, uap->gid);
5067 
5068 	xsecdst = NULL;
5069 	if ((uap->xsecurity != USER_ADDR_NULL) &&
5070 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
5071 		return ciferror;
5072 	}
5073 
5074 	VATTR_INIT(&va);
5075 	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
5076 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
5077 	if (uap->uid != KAUTH_UID_NONE) {
5078 		VATTR_SET(&va, va_uid, uap->uid);
5079 	}
5080 	if (uap->gid != KAUTH_GID_NONE) {
5081 		VATTR_SET(&va, va_gid, uap->gid);
5082 	}
5083 	if (xsecdst != NULL) {
5084 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5085 		va.va_vaflags |= VA_FILESEC_ACL;
5086 	}
5087 
5088 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
5089 	    uap->path, vfs_context_current());
5090 
5091 	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
5092 	    NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
5093 	if (xsecdst != NULL) {
5094 		kauth_filesec_free(xsecdst);
5095 	}
5096 
5097 	return ciferror;
5098 }
5099 
5100 /*
5101  * Go through the data-protected atomically controlled open (2)
5102  *
5103  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
5104  */
5105 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)5106 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5107     int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
5108 {
5109 	/*
5110 	 * Follow the same path as normal open(2)
5111 	 * Look up the item if it exists, and acquire the vnode.
5112 	 */
5113 	struct vnode_attr va;
5114 	struct nameidata nd;
5115 	int cmode;
5116 	int error;
5117 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5118 
5119 	VATTR_INIT(&va);
5120 	/* Mask off all but regular access permissions */
5121 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5122 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
5123 
5124 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
5125 	    path, ctx);
5126 
5127 	/*
5128 	 * Initialize the extra fields in vnode_attr to pass down our
5129 	 * extra fields.
5130 	 * 1. target cprotect class.
5131 	 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
5132 	 */
5133 	if (flags & O_CREAT) {
5134 		/* lower level kernel code validates that the class is valid before applying it. */
5135 		if (class != PROTECTION_CLASS_DEFAULT) {
5136 			/*
5137 			 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
5138 			 * file behave the same as open (2)
5139 			 */
5140 			VATTR_SET(&va, va_dataprotect_class, class);
5141 		}
5142 	}
5143 
5144 	if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
5145 		if (flags & (O_RDWR | O_WRONLY)) {
5146 			/*
5147 			 * Not allowed to write raw encrypted bytes or when opening authenticated.
5148 			 */
5149 			return EINVAL;
5150 		}
5151 		if (dpflags & O_DP_GETRAWENCRYPTED) {
5152 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
5153 		}
5154 		if (dpflags & O_DP_GETRAWUNENCRYPTED) {
5155 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
5156 		}
5157 		if (dpflags & O_DP_AUTHENTICATE) {
5158 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
5159 		}
5160 	}
5161 
5162 	error = open1at(vfs_context_current(), &nd, flags, &va,
5163 	    NULL, NULL, retval, fd, authfd);
5164 
5165 	return error;
5166 }
5167 
5168 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)5169 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5170 {
5171 	if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5172 		return EINVAL;
5173 	}
5174 
5175 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5176 	           uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
5177 }
5178 
5179 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)5180 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5181 {
5182 	if (uap->dpflags & O_DP_AUTHENTICATE) {
5183 		return EINVAL;
5184 	}
5185 
5186 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5187 	           uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5188 }
5189 
/*
 * Common implementation for open(2)/openat(2) and their _nocancel variants.
 *
 * Builds the creation attributes (mode masked by the process umask) and the
 * nameidata for the lookup, then hands off to open1at(), which performs the
 * actual open and allocates the file descriptor.
 *
 * Returns:	0	Success (new fd returned via *retval)
 *		!0	errno value from open1at()
 */
static int
openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
    int fd, enum uio_seg segflg, int *retval)
{
	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
	/*
	 * vnode_attr + nameidata are allocated together on the heap —
	 * presumably to keep this large pair off the kernel stack; confirm.
	 */
	struct {
		struct vnode_attr va;
		struct nameidata nd;
	} *__open_data;
	struct vnode_attr *vap;
	struct nameidata *ndp;
	int cmode;
	int error;

	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
	vap = &__open_data->va;
	ndp = &__open_data->nd;

	VATTR_INIT(vap);
	/* Mask off all but regular access permissions */
	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);

	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
	    segflg, path, ctx);

	/* Plain opens never carry an authenticating fd. */
	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);

	kfree_type(typeof(*__open_data), __open_data);

	return error;
}
5222 
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	/* Check for pending pthread cancellation, then share the non-cancellable path. */
	__pthread_testcancel(1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
5229 
5230 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)5231 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
5232     int32_t *retval)
5233 {
5234 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5235 	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
5236 }
5237 
5238 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)5239 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
5240     int32_t *retval)
5241 {
5242 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5243 	           uap->mode, uap->fd, UIO_USERSPACE, retval);
5244 }
5245 
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	/* Check for pending pthread cancellation, then share the non-cancellable path. */
	__pthread_testcancel(1);
	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
}
5252 
5253 #define OPEN_BY_ID_ENTITLEMENT  "com.apple.private.vfs.open-by-id"
5254 
5255 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5256 vfs_context_can_open_by_id(vfs_context_t ctx)
5257 {
5258 	if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5259 		return TRUE;
5260 	}
5261 
5262 	return IOTaskHasEntitlement(vfs_context_task(ctx),
5263 	           OPEN_BY_ID_ENTITLEMENT);
5264 }
5265 
5266 /*
5267  * openbyid_np: open a file given a file system id and a file system object id
5268  *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
5269  *	file systems that don't support object ids it is a node id (uint64_t).
5270  *
5271  * Parameters:	p			Process requesting the open
5272  *		uap			User argument descriptor (see below)
5273  *		retval			Pointer to an area to receive the
5274  *					return calue from the system call
5275  *
5276  * Indirect:	uap->path		Path to open (same as 'open')
5277  *
5278  *		uap->fsid		id of target file system
5279  *		uap->objid		id of target file system object
5280  *		uap->flags		Flags to open (same as 'open')
5281  *
5282  * Returns:	0			Success
5283  *		!0			errno value
5284  *
5285  *
 * XXX:		We should enumerate the possible errno values here, and where
5287  *		in the code they originated.
5288  */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Restricted to platform binaries / tasks holding the open-by-id entitlement. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/* Resolve a path from (fsid, objid), growing the buffer on ENOSPC. */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate at the length fsgetpath_internal reported. */
	buf[pathlen] = 0;

	/* Re-enter the normal open path with the resolved kernel-space path. */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5345 
5346 
5347 /*
5348  * Create a special file.
5349  */
5350 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5351     int fd);
5352 
/*
 * Common implementation for mknod(2)/mknodat(2).
 *
 * FIFOs are handed off to mkfifo1(); creating a character or block special
 * file requires superuser.  On success the new vnode's name/parent pointers
 * are wired into the name cache and an FSE_CREATE_FILE event is posted.
 *
 * Returns:	0		Success
 *		EEXIST		Path already exists
 *		EINVAL		mode is not S_IFCHR, S_IFBLK, or S_IFIFO
 *		???		errno from suser/namei/MAC checks/vn_create
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Only the superuser may create device nodes. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Translate the S_IF* file type into the corresponding vnode type. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* About to mutate the directory: break any outstanding directory lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int     update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5455 
5456 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5457 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5458 {
5459 	struct vnode_attr va;
5460 
5461 	VATTR_INIT(&va);
5462 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5463 	VATTR_SET(&va, va_rdev, uap->dev);
5464 
5465 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5466 }
5467 
5468 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5469 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5470 {
5471 	struct vnode_attr va;
5472 
5473 	VATTR_INIT(&va);
5474 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5475 	VATTR_SET(&va, va_rdev, uap->dev);
5476 
5477 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5478 }
5479 
5480 /*
5481  * Create a named pipe.
5482  *
5483  * Returns:	0			Success
5484  *		EEXIST
5485  *	namei:???
5486  *	vnode_authorize:???
5487  *	vn_create:???
5488  */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	/* Look up the parent directory; LOCKPARENT keeps dvp referenced for create. */
	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5531 
5532 
5533 /*
5534  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5535  *
5536  * Parameters:	p			Process requesting the open
5537  *		uap			User argument descriptor (see below)
5538  *		retval			(Ignored)
5539  *
5540  * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
5541  *		uap->uid		UID to set
5542  *		uap->gid		GID to set
5543  *		uap->mode		File mode to set (same as 'mkfifo')
5544  *		uap->xsecurity		ACL to set, if creating
5545  *
5546  * Returns:	0			Success
5547  *		!0			errno value
5548  *
5549  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5550  *
 * XXX:		We should enumerate the possible errno values here, and where
5552  *		in the code they originated.
5553  */
int
mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied ACL, if one was provided. */
	xsecdst = KAUTH_FILESEC_NONE;
	if (uap->xsecurity != USER_ADDR_NULL) {
		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return ciferror;
		}
	}

	VATTR_INIT(&va);
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
	/* KAUTH_UID_NONE / KAUTH_GID_NONE mean "do not set owner/group". */
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != KAUTH_FILESEC_NONE) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);

	/* 'va' only borrowed the filesec; we still own and must free it. */
	if (xsecdst != KAUTH_FILESEC_NONE) {
		kauth_filesec_free(xsecdst);
	}
	return ciferror;
}
5590 
5591 /* ARGSUSED */
5592 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5593 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5594 {
5595 	struct vnode_attr va;
5596 
5597 	VATTR_INIT(&va);
5598 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5599 
5600 	return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5601 }
5602 
5603 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5604 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5605 {
5606 	struct vnode_attr va;
5607 
5608 	VATTR_INIT(&va);
5609 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5610 
5611 	return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5612 }
5613 
5614 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5615 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5616 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5617 
/*
 * Build the full path of 'dvp' (with 'leafname' appended, if non-NULL) into
 * 'path', a buffer of '_len' bytes (MAXPATHLEN at every call site in this
 * file).  'firmlink' selects vn_getpath() vs vn_getpath_no_firmlink().
 *
 * Never fails outright: when the exact path cannot be produced,
 * *truncated_path is set to 1 and a best-effort prefix is returned instead
 * (the path truncated at the last component, an ancestor's path, the mount
 * point, or "/").
 *
 * Returns the path length in bytes INCLUDING the terminating NUL (cf. the
 * "strlen(path) + 1" recomputation below).
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* 'len' counts the NUL, so path[len - 1] replaces it with the separator. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path fit but is suspiciously close to MAXPATHLEN; report as truncated. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/* Walk up v_parent until some ancestor's path fits in the buffer. */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5685 
/* safe_getpath_new() with firmlink resolution enabled (firmlink == 1). */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1);
}
5691 
/* safe_getpath_new() with firmlink resolution disabled (firmlink == 0). */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0);
}
5697 
5698 /*
5699  * Make a hard file link.
5700  *
5701  * Returns:	0			Success
5702  *		EPERM
5703  *		EEXIST
5704  *		EXDEV
5705  *	namei:???
5706  *	vnode_authorize:???
5707  *	VNOP_LINK:???
5708  */
5709 /* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	char *target_path = NULL;
	char  *no_firmlink_path = NULL;
	vnode_t locked_vp = NULLVP;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;
	int num_retries = 0;
	int need_event, has_listeners, need_kpath2;
	bool do_retry;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;

	/* Restarted (at most MAX_LINK_ENOENT_RETRIES times) if VNOP_LINK hits ENOENT. */
retry:
	do_retry = false;
	vp = dvp = lvp = NULLVP;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node; the nameidata is reused for the second lookup */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

	/* Take the per-vnode link lock on the source; released at out2/out. */
	assert(locked_vp == NULLVP);
	vnode_link_lock(vp);
	locked_vp = vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	/* About to mutate the directory: break any outstanding directory lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		if (error == ENOENT && num_retries < MAX_LINK_ENOENT_RETRIES) {
			do_retry = true;
			num_retries += 1;
		}
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

	/* The link exists; the lock can be dropped before posting notifications. */
	assert(locked_vp == vp);
	vnode_link_unlock(locked_vp);
	locked_vp = NULLVP;

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Paths are only materialized if someone (fsevents/kauth/audit) needs them. */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int  len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
		target_path = NULL;
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	/* Error paths arrive here with the link lock still held; drop it. */
	if (locked_vp) {
		assert(locked_vp == vp);
		vnode_link_unlock(locked_vp);
		locked_vp = NULLVP;
	}
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);

	if (do_retry) {
		goto retry;
	}

	return error;
}
5951 
5952 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5953 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5954 {
5955 	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5956 	           AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5957 }
5958 
5959 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5960 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5961 {
5962 	if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5963 		return EINVAL;
5964 	}
5965 
5966 	return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5967 	           uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5968 }
5969 
5970 /*
5971  * Make a symbolic link.
5972  *
5973  * We could add support for ACLs here too...
5974  */
5975 /* ARGSUSED */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	/* The link target string is copied in only when it lives in user space. */
	error = 0;
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	/* About to mutate the directory: break any outstanding directory lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int     update_flags = 0;

		/*
		 * Check if a new vnode was created, else try to get one: some
		 * filesystems do not return the vnode from VNOP_SYMLINK, so a
		 * relookup is needed for the notifications below.
		 */
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int             len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Free the copy only if we made one (user-space segflg). */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
6139 
6140 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)6141 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
6142 {
6143 	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
6144 	           uap->link, UIO_USERSPACE);
6145 }
6146 
6147 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)6148 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
6149     __unused int32_t *retval)
6150 {
6151 	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
6152 	           uap->path2, UIO_USERSPACE);
6153 }
6154 
6155 /*
6156  * Delete a whiteout from the filesystem.
6157  * No longer supported.
6158  */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	/* Unconditionally unsupported; the stub presumably remains for ABI compatibility. */
	return ENOTSUP;
}
6164 
6165 /*
6166  * Delete a name from the filesystem.
6167  */
6168 /* ARGSUSED */
/*
 * unlinkat_internal: common guts of unlink(2)/unlinkat(2)/delete(2).
 *
 * Resolves the path (relative to fd, or anchored at start_dvp if one is
 * supplied) and removes the resulting name via vn_remove(), which may use
 * a compound remove VNOP when the filesystem supports it.  Also emits
 * fsevents and kauth fileop-listener notifications, and redrives the
 * lookup a bounded number of times when authorization races a hardlink
 * lookup (ENOENT retry).  Returns 0 or an errno.
 */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated as one chunk to keep the large nameidata (plus the
	 * fsevent scratch state) off the kernel stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char  *path = NULL;
	char  *no_firmlink_path = NULL;
	int  len_path = 0;
	int  len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	vnode_t locked_vp = NULLVP;	/* vp whose link lock we hold, if any */
	int do_retry;
	int retry_count = 0;
	int cn_flags;
	int nofollow_any = 0;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/* NOFOLLOW_ANY is a namei-level option, not a vn_remove flag; strip it. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* Per-attempt state must be reset on every redrive of the lookup. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;	/* may be NULL when the FS does compound remove */

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	/* Update speculative telemetry with system discarded use state */
	if (unlink_flags & VNODE_REMOVE_SYSTEM_DISCARDED) {
		flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		/* Only the kernel itself may unlink an in-use swap file. */
		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			/* Hold the link lock across authorization and removal so
			 * the link count cannot change underneath us. */
			vnode_link_lock(vp);
			locked_vp = vp;
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				if (error == ENOENT) {
					/* Likely a stale name-cache hit on a racing
					 * hardlink; redrive the lookup (bounded). */
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				vnode_link_unlock(vp);
				locked_vp = NULLVP;
				goto out;
			}
		}
	} else {
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			/* Compound path: ask the FS to capture the attributes
			 * we will need for the event during the remove itself. */
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		/* Both paths must be captured before the name is removed. */
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		/* Resource forks are removed as named streams, not directory entries. */
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			/* The compound remove found it needs more lookup work. */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid.  this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	/* Drop the link lock (if held) before releasing any vnode refs. */
	if (locked_vp) {
		assert(locked_vp == vp);
		vnode_link_unlock(locked_vp);
		locked_vp = NULLVP;
	}

	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6471 
6472 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6473 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6474     enum uio_seg segflg, int unlink_flags)
6475 {
6476 	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6477 	           unlink_flags);
6478 }
6479 
6480 /*
6481  * Delete a name from the filesystem using Carbon semantics.
6482  */
6483 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6484 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6485 {
6486 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6487 	           uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6488 }
6489 
6490 /*
6491  * Delete a name from the filesystem using POSIX semantics.
6492  */
6493 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6494 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6495 {
6496 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6497 	           uap->path, UIO_USERSPACE, 0);
6498 }
6499 
6500 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6501 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6502 {
6503 	int unlink_flags = 0;
6504 
6505 	if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY | AT_SYSTEM_DISCARDED)) {
6506 		return EINVAL;
6507 	}
6508 
6509 	if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6510 		unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6511 	}
6512 
6513 	if (uap->flag & AT_SYSTEM_DISCARDED) {
6514 		unlink_flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
6515 	}
6516 
6517 	if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6518 		if (uap->flag & AT_REMOVEDIR_DATALESS) {
6519 			unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6520 		}
6521 		return rmdirat_internal(vfs_context_current(), uap->fd,
6522 		           uap->path, UIO_USERSPACE, unlink_flags);
6523 	} else {
6524 		return unlinkat_internal(vfs_context_current(), uap->fd,
6525 		           NULLVP, uap->path, UIO_USERSPACE, unlink_flags);
6526 	}
6527 }
6528 
6529 /*
6530  * Reposition read/write file offset.
6531  */
/*
 * lseek(2): reposition the read/write offset of an open file.
 *
 * Supports the classic L_SET/L_INCR/L_XTND whences plus SEEK_HOLE and
 * SEEK_DATA (delegated to the filesystem via VNOP_IOCTL).  FIFOs and
 * TTYs are rejected with ESPIPE.  On success the new offset is stored
 * in fp_glob->fg_offset and returned via *retval.
 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/* fd exists but is not a vnode (e.g. socket): report ESPIPE. */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (
		// rdar://3837316: Seeking a pipe is disallowed by POSIX.
		vnode_isfifo(vp)
		// rdar://120750171: Seeking a TTY is undefined and should be denied.
		|| vnode_istty(vp)
		) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* lseek(fd, 0, SEEK_CUR) only reads the offset; everything else
	 * potentially changes it, which is a distinct MAC check. */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* Compute the candidate absolute offset per the requested whence. */
	switch (uap->whence) {
	case L_INCR:
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001.  Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6628 
6629 
6630 /*
6631  * Check access permissions.
6632  *
6633  * Returns:	0			Success
6634  *		vnode_authorize:???
6635  */
6636 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6637 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6638 {
6639 	kauth_action_t action;
6640 	int error;
6641 
6642 	/*
6643 	 * If just the regular access bits, convert them to something
6644 	 * that vnode_authorize will understand.
6645 	 */
6646 	if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6647 		action = 0;
6648 		if (uflags & R_OK) {
6649 			action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
6650 		}
6651 		if (uflags & W_OK) {
6652 			if (vnode_isdir(vp)) {
6653 				action |= KAUTH_VNODE_ADD_FILE |
6654 				    KAUTH_VNODE_ADD_SUBDIRECTORY;
6655 				/* might want delete rights here too */
6656 			} else {
6657 				action |= KAUTH_VNODE_WRITE_DATA;
6658 			}
6659 		}
6660 		if (uflags & X_OK) {
6661 			if (vnode_isdir(vp)) {
6662 				action |= KAUTH_VNODE_SEARCH;
6663 			} else {
6664 				action |= KAUTH_VNODE_EXECUTE;
6665 			}
6666 		}
6667 	} else {
6668 		/* take advantage of definition of uflags */
6669 		action = uflags >> 8;
6670 	}
6671 
6672 #if CONFIG_MACF
6673 	error = mac_vnode_check_access(ctx, vp, uflags);
6674 	if (error) {
6675 		return error;
6676 	}
6677 #endif /* MAC */
6678 
6679 	/* action == 0 means only check for existence */
6680 	if (action != 0) {
6681 		error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6682 	} else {
6683 		error = 0;
6684 	}
6685 
6686 	return error;
6687 }
6688 
6689 
6690 
6691 /*
6692  * access_extended: Check access permissions in bulk.
6693  *
6694  * Description:	uap->entries		Pointer to an array of accessx
6695  *                                      descriptor structs, plus one or
6696  *                                      more NULL terminated strings (see
6697  *                                      "Notes" section below).
6698  *		uap->size		Size of the area pointed to by
6699  *					uap->entries.
6700  *		uap->results		Pointer to the results array.
6701  *
6702  * Returns:	0			Success
6703  *		ENOMEM			Insufficient memory
6704  *		EINVAL			Invalid arguments
6705  *		namei:EFAULT		Bad address
6706  *		namei:ENAMETOOLONG	Filename too long
6707  *		namei:ENOENT		No such file or directory
6708  *		namei:ELOOP		Too many levels of symbolic links
6709  *		namei:EBADF		Bad file descriptor
6710  *		namei:ENOTDIR		Not a directory
6711  *		namei:???
6712  *		access1:
6713  *
6714  * Implicit returns:
6715  *		uap->results		Array contents modified
6716  *
6717  * Notes:	The uap->entries are structured as an arbitrary length array
6718  *		of accessx descriptors, followed by one or more NULL terminated
6719  *		strings
6720  *
6721  *			struct accessx_descriptor[0]
6722  *			...
6723  *			struct accessx_descriptor[n]
6724  *			char name_data[0];
6725  *
6726  *		We determine the entry count by walking the buffer containing
6727  *		the uap->entries argument descriptor.  For each descriptor we
6728  *		see, the valid values for the offset ad_name_offset will be
6729  *		in the byte range:
6730  *
6731  *			[ uap->entries + sizeof(struct accessx_descriptor) ]
6732  *						to
6733  *				[ uap->entries + uap->size - 2 ]
6734  *
6735  *		since we must have at least one string, and the string must
6736  *		be at least one character plus the NULL terminator in length.
6737  *
6738  * XXX:		Need to support the check-as uid argument
6739  */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* NULL cred marks "nothing to unref" for the out: path below. */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o	There must be sufficient room in the request for at least one
	 *	descriptor and a one-byte NUL terminated string.
	 * o	The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* Small requests use the on-stack buffer; larger ones are heap-allocated. */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid nami() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  Fail the request if
	 * the number being requested exceeds this limit.
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  Per-entry failures are recorded in
		 * the result array and processing continues; anything else
		 * aborts the whole call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6981 
6982 
6983 /*
6984  * Returns:	0			Success
6985  *		namei:EFAULT		Bad address
6986  *		namei:ENAMETOOLONG	Filename too long
6987  *		namei:ENOENT		No such file or directory
6988  *		namei:ELOOP		Too many levels of symbolic links
6989  *		namei:EBADF		Bad file descriptor
6990  *		namei:ENOTDIR		Not a directory
6991  *		namei:???
6992  *		access1:
6993  */
/*
 * faccessat_internal: common guts of access(2)/faccessat(2).
 *
 * Looks up the path (relative to fd) and runs the access1() permission
 * check in a context whose credential is either the process' real
 * identity (default) or its effective identity (AT_EACCESS).
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 *
	 * Note the ownership asymmetry: the copied real cred is owned here
	 * (and released at out:), while the AT_EACCESS cred is borrowed from
	 * the caller's context and must not be unref'd.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	/* The parent iocount is only held when the delete check asked for it. */
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* Release the cred only if we made our own copy above. */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
7075 
7076 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)7077 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
7078 {
7079 	return faccessat_internal(vfs_context_current(), AT_FDCWD,
7080 	           uap->path, uap->flags, 0, UIO_USERSPACE);
7081 }
7082 
7083 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)7084 faccessat(__unused proc_t p, struct faccessat_args *uap,
7085     __unused int32_t *retval)
7086 {
7087 	if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7088 		return EINVAL;
7089 	}
7090 
7091 	return faccessat_internal(vfs_context_current(), uap->fd,
7092 	           uap->path, uap->amode, uap->flag, UIO_USERSPACE);
7093 }
7094 
7095 /*
7096  * Returns:	0			Success
7097  *		EFAULT
7098  *	copyout:EFAULT
7099  *	namei:???
7100  *	vn_stat:???
7101  */
7102 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)7103 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
7104     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
7105     enum uio_seg segflg, int fd, int flag)
7106 {
7107 	struct nameidata *ndp = NULL;
7108 	int follow;
7109 	union {
7110 		struct stat sb;
7111 		struct stat64 sb64;
7112 	} source = {};
7113 	union {
7114 		struct user64_stat user64_sb;
7115 		struct user32_stat user32_sb;
7116 		struct user64_stat64 user64_sb64;
7117 		struct user32_stat64 user32_sb64;
7118 	} dest = {};
7119 	caddr_t sbp;
7120 	int error, my_size;
7121 	kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
7122 	size_t xsecurity_bufsize;
7123 	void * statptr;
7124 	struct fileproc *fp = NULL;
7125 	int needsrealdev = 0;
7126 
7127 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7128 	ndp = kalloc_type(struct nameidata, Z_WAITOK);
7129 	NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
7130 	    segflg, path, ctx);
7131 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7132 		ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
7133 	}
7134 
7135 #if NAMEDRSRCFORK
7136 	int is_namedstream = 0;
7137 	/* stat calls are allowed for resource forks. */
7138 	ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
7139 #endif
7140 
7141 	if (flag & AT_FDONLY) {
7142 		vnode_t fvp;
7143 
7144 		error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
7145 		if (error) {
7146 			goto out;
7147 		}
7148 		if ((error = vnode_getwithref(fvp))) {
7149 			file_drop(fd);
7150 			goto out;
7151 		}
7152 		ndp->ni_vp = fvp;
7153 	} else {
7154 		error = nameiat(ndp, fd);
7155 		if (error) {
7156 			goto out;
7157 		}
7158 	}
7159 
7160 	statptr = (void *)&source;
7161 
7162 #if NAMEDRSRCFORK
7163 	/* Grab reference on the shadow stream file vnode to
7164 	 * force an inactive on release which will mark it
7165 	 * for recycle.
7166 	 */
7167 	if (vnode_isnamedstream(ndp->ni_vp) &&
7168 	    (ndp->ni_vp->v_parent != NULLVP) &&
7169 	    vnode_isshadow(ndp->ni_vp)) {
7170 		is_namedstream = 1;
7171 		vnode_ref(ndp->ni_vp);
7172 	}
7173 #endif
7174 
7175 	needsrealdev = flag & AT_REALDEV ? 1 : 0;
7176 	if (fp && (xsecurity == USER_ADDR_NULL)) {
7177 		/*
7178 		 * If the caller has the file open, and is not
7179 		 * requesting extended security information, we are
7180 		 * going to let them get the basic stat information.
7181 		 */
7182 		error = vn_stat_noauth(ndp->ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
7183 		    fp->fp_glob->fg_cred);
7184 	} else {
7185 		error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
7186 		    isstat64, needsrealdev, ctx);
7187 	}
7188 
7189 #if NAMEDRSRCFORK
7190 	if (is_namedstream) {
7191 		vnode_rele(ndp->ni_vp);
7192 	}
7193 #endif
7194 	vnode_put(ndp->ni_vp);
7195 	nameidone(ndp);
7196 
7197 	if (fp) {
7198 		file_drop(fd);
7199 		fp = NULL;
7200 	}
7201 
7202 	if (error) {
7203 		goto out;
7204 	}
7205 	/* Zap spare fields */
7206 	if (isstat64 != 0) {
7207 		source.sb64.st_lspare = 0;
7208 		source.sb64.st_qspare[0] = 0LL;
7209 		source.sb64.st_qspare[1] = 0LL;
7210 		if (vfs_context_is64bit(ctx)) {
7211 			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
7212 			my_size = sizeof(dest.user64_sb64);
7213 			sbp = (caddr_t)&dest.user64_sb64;
7214 		} else {
7215 			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
7216 			my_size = sizeof(dest.user32_sb64);
7217 			sbp = (caddr_t)&dest.user32_sb64;
7218 		}
7219 		/*
7220 		 * Check if we raced (post lookup) against the last unlink of a file.
7221 		 */
7222 		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7223 			source.sb64.st_nlink = 1;
7224 		}
7225 	} else {
7226 		source.sb.st_lspare = 0;
7227 		source.sb.st_qspare[0] = 0LL;
7228 		source.sb.st_qspare[1] = 0LL;
7229 		if (vfs_context_is64bit(ctx)) {
7230 			munge_user64_stat(&source.sb, &dest.user64_sb);
7231 			my_size = sizeof(dest.user64_sb);
7232 			sbp = (caddr_t)&dest.user64_sb;
7233 		} else {
7234 			munge_user32_stat(&source.sb, &dest.user32_sb);
7235 			my_size = sizeof(dest.user32_sb);
7236 			sbp = (caddr_t)&dest.user32_sb;
7237 		}
7238 
7239 		/*
7240 		 * Check if we raced (post lookup) against the last unlink of a file.
7241 		 */
7242 		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7243 			source.sb.st_nlink = 1;
7244 		}
7245 	}
7246 	if ((error = copyout(sbp, ub, my_size)) != 0) {
7247 		goto out;
7248 	}
7249 
7250 	/* caller wants extended security information? */
7251 	if (xsecurity != USER_ADDR_NULL) {
7252 		/* did we get any? */
7253 		if (fsec == KAUTH_FILESEC_NONE) {
7254 			if (susize(xsecurity_size, 0) != 0) {
7255 				error = EFAULT;
7256 				goto out;
7257 			}
7258 		} else {
7259 			/* find the user buffer size */
7260 			xsecurity_bufsize = fusize(xsecurity_size);
7261 
7262 			/* copy out the actual data size */
7263 			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7264 				error = EFAULT;
7265 				goto out;
7266 			}
7267 
7268 			/* if the caller supplied enough room, copy out to it */
7269 			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7270 				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7271 			}
7272 		}
7273 	}
7274 out:
7275 	if (ndp) {
7276 		kfree_type(struct nameidata, ndp);
7277 	}
7278 	if (fsec != KAUTH_FILESEC_NONE) {
7279 		kauth_filesec_free(fsec);
7280 	}
7281 	return error;
7282 }
7283 
7284 /*
7285  * stat_extended: Get file status; with extended security (ACL).
7286  *
7287  * Parameters:    p                       (ignored)
7288  *                uap                     User argument descriptor (see below)
7289  *                retval                  (ignored)
7290  *
7291  * Indirect:      uap->path               Path of file to get status from
7292  *                uap->ub                 User buffer (holds file status info)
7293  *                uap->xsecurity          ACL to get (extended security)
7294  *                uap->xsecurity_size     Size of ACL
7295  *
7296  * Returns:        0                      Success
7297  *                !0                      errno value
7298  *
7299  */
7300 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7301 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7302     __unused int32_t *retval)
7303 {
7304 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7305 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7306 	           0);
7307 }
7308 
7309 /*
7310  * Returns:	0			Success
7311  *	fstatat_internal:???		[see fstatat_internal() in this file]
7312  */
7313 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7314 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7315 {
7316 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7317 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7318 }
7319 
7320 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7321 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7322 {
7323 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7324 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7325 }
7326 
7327 /*
7328  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7329  *
7330  * Parameters:    p                       (ignored)
7331  *                uap                     User argument descriptor (see below)
7332  *                retval                  (ignored)
7333  *
7334  * Indirect:      uap->path               Path of file to get status from
7335  *                uap->ub                 User buffer (holds file status info)
7336  *                uap->xsecurity          ACL to get (extended security)
7337  *                uap->xsecurity_size     Size of ACL
7338  *
7339  * Returns:        0                      Success
7340  *                !0                      errno value
7341  *
7342  */
7343 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7344 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7345 {
7346 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7347 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7348 	           0);
7349 }
7350 
7351 /*
7352  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7353  *
7354  * Parameters:    p                       (ignored)
7355  *                uap                     User argument descriptor (see below)
7356  *                retval                  (ignored)
7357  *
7358  * Indirect:      uap->path               Path of file to get status from
7359  *                uap->ub                 User buffer (holds file status info)
7360  *                uap->xsecurity          ACL to get (extended security)
7361  *                uap->xsecurity_size     Size of ACL
7362  *
7363  * Returns:        0                      Success
7364  *                !0                      errno value
7365  *
7366  */
7367 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7368 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7369 {
7370 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7371 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7372 	           AT_SYMLINK_NOFOLLOW);
7373 }
7374 
7375 /*
7376  * Get file status; this version does not follow links.
7377  */
7378 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7379 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7380 {
7381 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7382 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7383 }
7384 
7385 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7386 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7387 {
7388 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7389 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7390 }
7391 
7392 /*
7393  * lstat64_extended: Get file status; can handle large inode numbers; does not
7394  * follow links; with extended security (ACL).
7395  *
7396  * Parameters:    p                       (ignored)
7397  *                uap                     User argument descriptor (see below)
7398  *                retval                  (ignored)
7399  *
7400  * Indirect:      uap->path               Path of file to get status from
7401  *                uap->ub                 User buffer (holds file status info)
7402  *                uap->xsecurity          ACL to get (extended security)
7403  *                uap->xsecurity_size     Size of ACL
7404  *
7405  * Returns:        0                      Success
7406  *                !0                      errno value
7407  *
7408  */
7409 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7410 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7411 {
7412 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7413 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7414 	           AT_SYMLINK_NOFOLLOW);
7415 }
7416 
7417 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7418 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7419 {
7420 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7421 		return EINVAL;
7422 	}
7423 
7424 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7425 	           0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7426 }
7427 
7428 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7429 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7430     __unused int32_t *retval)
7431 {
7432 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7433 		return EINVAL;
7434 	}
7435 
7436 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7437 	           0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7438 }
7439 
7440 /*
7441  * Get configurable pathname variables.
7442  *
7443  * Returns:	0			Success
7444  *	namei:???
7445  *	vn_pathconf:???
7446  *
7447  * Notes:	Global implementation  constants are intended to be
7448  *		implemented in this function directly; all other constants
7449  *		are per-FS implementation, and therefore must be handled in
7450  *		each respective FS, instead.
7451  *
7452  * XXX We implement some things globally right now that should actually be
7453  * XXX per-FS; we will need to deal with this at some point.
7454  */
7455 /* ARGSUSED */
7456 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7457 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7458 {
7459 	int error;
7460 	struct nameidata nd;
7461 	vfs_context_t ctx = vfs_context_current();
7462 
7463 	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7464 	    UIO_USERSPACE, uap->path, ctx);
7465 	error = namei(&nd);
7466 	if (error) {
7467 		return error;
7468 	}
7469 
7470 	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7471 
7472 	vnode_put(nd.ni_vp);
7473 	nameidone(&nd);
7474 	return error;
7475 }
7476 
7477 /*
7478  * Return target name of a symbolic link.
7479  */
7480 /* ARGSUSED */
7481 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7482 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7483     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7484     int *retval)
7485 {
7486 	vnode_t vp;
7487 	uio_t auio;
7488 	int error;
7489 	struct nameidata nd;
7490 	UIO_STACKBUF(uio_buf, 1);
7491 	bool put_vnode;
7492 
7493 	if (bufsize > INT32_MAX) {
7494 		return EINVAL;
7495 	}
7496 
7497 	if (lnk_vp) {
7498 		vp = lnk_vp;
7499 		put_vnode = false;
7500 	} else {
7501 		NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7502 		    seg, path, ctx);
7503 
7504 		error = nameiat(&nd, fd);
7505 		if (error) {
7506 			return error;
7507 		}
7508 		vp = nd.ni_vp;
7509 		put_vnode = true;
7510 		nameidone(&nd);
7511 	}
7512 
7513 	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7514 	    &uio_buf[0], sizeof(uio_buf));
7515 	uio_addiov(auio, buf, bufsize);
7516 	if (vp->v_type != VLNK) {
7517 		error = EINVAL;
7518 	} else {
7519 #if CONFIG_MACF
7520 		error = mac_vnode_check_readlink(ctx, vp);
7521 #endif
7522 		if (error == 0) {
7523 			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7524 			    ctx);
7525 		}
7526 		if (error == 0) {
7527 			error = VNOP_READLINK(vp, auio, ctx);
7528 		}
7529 	}
7530 
7531 	if (put_vnode) {
7532 		vnode_put(vp);
7533 	}
7534 
7535 	*retval = (int)(bufsize - uio_resid(auio));
7536 	return error;
7537 }
7538 
7539 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7540 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7541 {
7542 	enum uio_seg procseg;
7543 	vnode_t vp;
7544 	int error;
7545 
7546 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7547 
7548 	AUDIT_ARG(fd, uap->fd);
7549 
7550 	if ((error = file_vnode(uap->fd, &vp))) {
7551 		return error;
7552 	}
7553 	if ((error = vnode_getwithref(vp))) {
7554 		file_drop(uap->fd);
7555 		return error;
7556 	}
7557 
7558 	error = readlinkat_internal(vfs_context_current(), -1,
7559 	    vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7560 	    uap->bufsize, procseg, retval);
7561 
7562 	vnode_put(vp);
7563 	file_drop(uap->fd);
7564 	return error;
7565 }
7566 
7567 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7568 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7569 {
7570 	enum uio_seg procseg;
7571 
7572 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7573 	return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7574 	           CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7575 	           uap->count, procseg, retval);
7576 }
7577 
7578 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7579 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7580 {
7581 	enum uio_seg procseg;
7582 
7583 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7584 	return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7585 	           CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7586 	           retval);
7587 }
7588 
7589 /*
7590  * Change file flags, the deep inner layer.
7591  */
7592 static int
chflags0(vnode_t vp,struct vnode_attr * va,int (* setattr)(vnode_t,void *,vfs_context_t),void * arg,vfs_context_t ctx)7593 chflags0(vnode_t vp, struct vnode_attr *va,
7594     int (*setattr)(vnode_t, void *, vfs_context_t),
7595     void *arg, vfs_context_t ctx)
7596 {
7597 	kauth_action_t action = 0;
7598 	int error;
7599 
7600 #if CONFIG_MACF
7601 	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
7602 	if (error) {
7603 		goto out;
7604 	}
7605 #endif
7606 
7607 	/* request authorisation, disregard immutability */
7608 	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
7609 		goto out;
7610 	}
7611 	/*
7612 	 * Request that the auth layer disregard those file flags it's allowed to when
7613 	 * authorizing this operation; we need to do this in order to be able to
7614 	 * clear immutable flags.
7615 	 */
7616 	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
7617 		goto out;
7618 	}
7619 	error = (*setattr)(vp, arg, ctx);
7620 
7621 #if CONFIG_MACF
7622 	if (error == 0) {
7623 		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
7624 	}
7625 #endif
7626 
7627 out:
7628 	return error;
7629 }
7630 
7631 /*
7632  * Change file flags.
7633  *
7634  * NOTE: this will vnode_put() `vp'
7635  */
7636 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)7637 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
7638 {
7639 	struct vnode_attr va;
7640 	int error;
7641 
7642 	VATTR_INIT(&va);
7643 	VATTR_SET(&va, va_flags, flags);
7644 
7645 	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
7646 	vnode_put(vp);
7647 
7648 	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
7649 		error = ENOTSUP;
7650 	}
7651 
7652 	return error;
7653 }
7654 
7655 /*
7656  * Change flags of a file given a path name.
7657  */
7658 /* ARGSUSED */
7659 int
chflags(__unused proc_t p,struct chflags_args * uap,__unused int32_t * retval)7660 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
7661 {
7662 	vnode_t vp;
7663 	vfs_context_t ctx = vfs_context_current();
7664 	int error;
7665 	struct nameidata nd;
7666 	uint32_t wantparent = 0;
7667 
7668 #if CONFIG_FILE_LEASES
7669 	wantparent = WANTPARENT;
7670 #endif
7671 
7672 	AUDIT_ARG(fflags, uap->flags);
7673 	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
7674 	    UIO_USERSPACE, uap->path, ctx);
7675 	error = namei(&nd);
7676 	if (error) {
7677 		return error;
7678 	}
7679 	vp = nd.ni_vp;
7680 
7681 #if CONFIG_FILE_LEASES
7682 	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
7683 	vnode_put(nd.ni_dvp);
7684 #endif
7685 
7686 	nameidone(&nd);
7687 
7688 	/* we don't vnode_put() here because chflags1 does internally */
7689 	error = chflags1(vp, uap->flags, ctx);
7690 
7691 	return error;
7692 }
7693 
7694 /*
7695  * Change flags of a file given a file descriptor.
7696  */
7697 /* ARGSUSED */
7698 int
fchflags(__unused proc_t p,struct fchflags_args * uap,__unused int32_t * retval)7699 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
7700 {
7701 	vnode_t vp;
7702 	int error;
7703 
7704 	AUDIT_ARG(fd, uap->fd);
7705 	AUDIT_ARG(fflags, uap->flags);
7706 	if ((error = file_vnode(uap->fd, &vp))) {
7707 		return error;
7708 	}
7709 
7710 	if ((error = vnode_getwithref(vp))) {
7711 		file_drop(uap->fd);
7712 		return error;
7713 	}
7714 
7715 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7716 
7717 #if CONFIG_FILE_LEASES
7718 	vnode_breakdirlease(vp, true, O_WRONLY);
7719 #endif
7720 
7721 	/* we don't vnode_put() here because chflags1 does internally */
7722 	error = chflags1(vp, uap->flags, vfs_context_current());
7723 
7724 	file_drop(uap->fd);
7725 	return error;
7726 }
7727 
7728 /*
7729  * Change security information on a filesystem object.
7730  *
7731  * Returns:	0			Success
7732  *		EPERM			Operation not permitted
7733  *		vnode_authattr:???	[anything vnode_authattr can return]
7734  *		vnode_authorize:???	[anything vnode_authorize can return]
7735  *		vnode_setattr:???	[anything vnode_setattr can return]
7736  *
7737  * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
7738  *		translated to EPERM before being returned.
7739  */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/*
	 * MAC checks run before any authorization: each check covers only
	 * the attribute classes (mode, ownership, ACL) actually being set.
	 */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		/* -1 stands in for "not being changed" in the MAC check. */
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* per the function header comment: EACCES becomes EPERM here */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* Notify MAC modules only after the attribute change succeeded. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7807 
7808 
7809 /*
7810  * Change mode of a file given a path name.
7811  *
7812  * Returns:	0			Success
7813  *		namei:???		[anything namei can return]
7814  *		chmod_vnode:???		[anything chmod_vnode can return]
7815  */
7816 static int
chmodat(vfs_context_t ctx,user_addr_t path,struct vnode_attr * vap,int fd,int flag,enum uio_seg segflg)7817 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
7818     int fd, int flag, enum uio_seg segflg)
7819 {
7820 	struct nameidata nd;
7821 	int follow, error;
7822 	uint32_t wantparent = 0;
7823 
7824 #if CONFIG_FILE_LEASES
7825 	wantparent = WANTPARENT;
7826 #endif
7827 
7828 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7829 	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
7830 	    segflg, path, ctx);
7831 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7832 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
7833 	}
7834 	if ((error = nameiat(&nd, fd))) {
7835 		return error;
7836 	}
7837 
7838 #if CONFIG_FILE_LEASES
7839 	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
7840 	vnode_put(nd.ni_dvp);
7841 #endif
7842 
7843 	error = chmod_vnode(ctx, nd.ni_vp, vap);
7844 	vnode_put(nd.ni_vp);
7845 	nameidone(&nd);
7846 	return error;
7847 }
7848 
7849 static int
chmod_extended_init(struct vnode_attr * pva,kauth_filesec_t * pxsecdst,int mode,uid_t uid,gid_t gid,user_addr_t xsecurity)7850 chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
7851     gid_t gid, user_addr_t xsecurity)
7852 {
7853 	int error;
7854 
7855 	VATTR_INIT(pva);
7856 
7857 	if (mode != -1) {
7858 		VATTR_SET(pva, va_mode, mode & ALLPERMS);
7859 	} else {
7860 		pva->va_mode = 0;
7861 	}
7862 
7863 	if (uid != KAUTH_UID_NONE) {
7864 		VATTR_SET(pva, va_uid, uid);
7865 	}
7866 
7867 	if (gid != KAUTH_GID_NONE) {
7868 		VATTR_SET(pva, va_gid, gid);
7869 	}
7870 
7871 	*pxsecdst = NULL;
7872 	switch (xsecurity) {
7873 	case USER_ADDR_NULL:
7874 		break;
7875 
7876 	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
7877 		VATTR_SET(pva, va_acl, NULL);
7878 		break;
7879 
7880 	default:
7881 		if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
7882 			return error;
7883 		}
7884 
7885 		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
7886 		pva->va_vaflags |= VA_FILESEC_ACL;
7887 		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
7888 		break;
7889 	}
7890 
7891 	return 0;
7892 }
7893 
7894 /*
7895  * chmod_extended: Change the mode of a file given a path name; with extended
7896  * argument list (including extended security (ACL)).
7897  *
7898  * Parameters:	p			Process requesting the open
7899  *		uap			User argument descriptor (see below)
7900  *		retval			(ignored)
7901  *
7902  * Indirect:	uap->path		Path to object (same as 'chmod')
7903  *		uap->uid		UID to set
7904  *		uap->gid		GID to set
7905  *		uap->mode		File mode to set (same as 'chmod')
7906  *		uap->xsecurity		ACL to set (or delete)
7907  *
7908  * Returns:	0			Success
7909  *		!0			errno value
7910  *
7911  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
7912  *
7913  * XXX:		We should enummerate the possible errno values here, and where
7914  *		in the code they originated.
7915  */
7916 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7917 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7918 {
7919 	int error;
7920 	struct vnode_attr va;
7921 	kauth_filesec_t xsecdst = NULL;
7922 
7923 	AUDIT_ARG(owner, uap->uid, uap->gid);
7924 
7925 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7926 	    uap->gid, uap->xsecurity);
7927 
7928 	if (error) {
7929 		return error;
7930 	}
7931 
7932 	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7933 	    UIO_USERSPACE);
7934 
7935 	if (xsecdst != NULL) {
7936 		kauth_filesec_free(xsecdst);
7937 	}
7938 	return error;
7939 }
7940 
7941 /*
7942  * Returns:	0			Success
7943  *		chmodat:???		[anything chmodat can return]
7944  */
7945 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7946 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7947     int flag, enum uio_seg segflg)
7948 {
7949 	struct vnode_attr va;
7950 
7951 	VATTR_INIT(&va);
7952 	VATTR_SET(&va, va_mode, mode & ALLPERMS);
7953 
7954 	return chmodat(ctx, path, &va, fd, flag, segflg);
7955 }
7956 
7957 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7958 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7959 {
7960 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7961 	           AT_FDCWD, 0, UIO_USERSPACE);
7962 }
7963 
7964 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7965 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7966 {
7967 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7968 		return EINVAL;
7969 	}
7970 
7971 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7972 	           uap->fd, uap->flag, UIO_USERSPACE);
7973 }
7974 
7975 /*
7976  * Change mode of a file given a file descriptor.
7977  */
7978 static int
fchmod1(__unused proc_t p,int fd,struct vnode_attr * vap)7979 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
7980 {
7981 	vnode_t vp;
7982 	int error;
7983 
7984 	AUDIT_ARG(fd, fd);
7985 
7986 	if ((error = file_vnode(fd, &vp)) != 0) {
7987 		return error;
7988 	}
7989 	if ((error = vnode_getwithref(vp)) != 0) {
7990 		file_drop(fd);
7991 		return error;
7992 	}
7993 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7994 
7995 #if CONFIG_FILE_LEASES
7996 	vnode_breakdirlease(vp, true, O_WRONLY);
7997 #endif
7998 
7999 	error = chmod_vnode(vfs_context_current(), vp, vap);
8000 	(void)vnode_put(vp);
8001 	file_drop(fd);
8002 
8003 	return error;
8004 }
8005 
8006 /*
8007  * fchmod_extended: Change mode of a file given a file descriptor; with
8008  * extended argument list (including extended security (ACL)).
8009  *
8010  * Parameters:    p                       Process requesting to change file mode
8011  *                uap                     User argument descriptor (see below)
8012  *                retval                  (ignored)
8013  *
8014  * Indirect:      uap->mode               File mode to set (same as 'chmod')
8015  *                uap->uid                UID to set
8016  *                uap->gid                GID to set
8017  *                uap->xsecurity          ACL to set (or delete)
8018  *                uap->fd                 File descriptor of file to change mode
8019  *
8020  * Returns:        0                      Success
8021  *                !0                      errno value
8022  *
8023  */
8024 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)8025 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
8026 {
8027 	int error;
8028 	struct vnode_attr va;
8029 	kauth_filesec_t xsecdst = NULL;
8030 
8031 	AUDIT_ARG(owner, uap->uid, uap->gid);
8032 
8033 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
8034 	    uap->gid, uap->xsecurity);
8035 
8036 	if (error) {
8037 		return error;
8038 	}
8039 
8040 	error = fchmod1(p, uap->fd, &va);
8041 
8042 	if (xsecdst != NULL) {
8043 		kauth_filesec_free(xsecdst);
8044 	}
8045 	return error;
8046 }
8047 
8048 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)8049 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
8050 {
8051 	struct vnode_attr va;
8052 
8053 	VATTR_INIT(&va);
8054 	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
8055 
8056 	return fchmod1(p, uap->fd, &va);
8057 }
8058 
/*
 * Common chown work: validate, authorize and apply a uid/gid change on a
 * vnode on which both visible callers (fchownat_internal, fchown) hold an
 * iocount.  VNOVAL for uid or gid means "leave that id unchanged".
 * Authorization failures are reported as EPERM, not EACCES.
 */
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break leases before the ownership change hits the filesystem. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only when the change actually succeeded. */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
8120 
8121 /*
8122  * Set ownership given a path name.
8123  */
8124 /* ARGSUSED */
8125 static int
fchownat_internal(vfs_context_t ctx,int fd,user_addr_t path,uid_t uid,gid_t gid,int flag,enum uio_seg segflg)8126 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
8127     gid_t gid, int flag, enum uio_seg segflg)
8128 {
8129 	vnode_t vp;
8130 	int error;
8131 	struct nameidata nd;
8132 	int follow;
8133 
8134 	AUDIT_ARG(owner, uid, gid);
8135 
8136 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
8137 	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
8138 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
8139 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
8140 	}
8141 
8142 	error = nameiat(&nd, fd);
8143 	if (error) {
8144 		return error;
8145 	}
8146 
8147 	vp = nd.ni_vp;
8148 	error = vn_chown_internal(ctx, vp, uid, gid);
8149 
8150 	nameidone(&nd);
8151 	vnode_put(vp);
8152 	return error;
8153 }
8154 
8155 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)8156 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
8157 {
8158 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8159 	           uap->uid, uap->gid, 0, UIO_USERSPACE);
8160 }
8161 
8162 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)8163 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
8164 {
8165 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8166 	           uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
8167 }
8168 
8169 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)8170 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
8171 {
8172 	if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
8173 		return EINVAL;
8174 	}
8175 
8176 	return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
8177 	           uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
8178 }
8179 
8180 /*
8181  * Set ownership given a file descriptor.
8182  */
8183 /* ARGSUSED */
8184 int
fchown(__unused proc_t p,struct fchown_args * uap,__unused int32_t * retval)8185 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
8186 {
8187 	vfs_context_t ctx = vfs_context_current();
8188 	vnode_t vp;
8189 	int error;
8190 
8191 	AUDIT_ARG(owner, uap->uid, uap->gid);
8192 	AUDIT_ARG(fd, uap->fd);
8193 
8194 	if ((error = file_vnode(uap->fd, &vp))) {
8195 		return error;
8196 	}
8197 
8198 	if ((error = vnode_getwithref(vp))) {
8199 		file_drop(uap->fd);
8200 		return error;
8201 	}
8202 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8203 
8204 	error = vn_chown_internal(ctx, vp, uap->uid, uap->gid);
8205 
8206 	(void)vnode_put(vp);
8207 	file_drop(uap->fd);
8208 	return error;
8209 }
8210 
8211 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)8212 getutimes(user_addr_t usrtvp, struct timespec *tsp)
8213 {
8214 	int error;
8215 
8216 	if (usrtvp == USER_ADDR_NULL) {
8217 		struct timeval old_tv;
8218 		/* XXX Y2038 bug because of microtime argument */
8219 		microtime(&old_tv);
8220 		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
8221 		tsp[1] = tsp[0];
8222 	} else {
8223 		if (IS_64BIT_PROCESS(current_proc())) {
8224 			struct user64_timeval tv[2];
8225 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
8226 			if (error) {
8227 				return error;
8228 			}
8229 			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
8230 			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
8231 		} else {
8232 			struct user32_timeval tv[2];
8233 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
8234 			if (error) {
8235 				return error;
8236 			}
8237 			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
8238 			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
8239 		}
8240 	}
8241 	return 0;
8242 }
8243 
/*
 * Apply access/modification timestamps (ts[0] = atime, ts[1] = mtime) to a
 * vnode.  `nullflag' indicates the timestamps were defaulted to "now"
 * rather than supplied explicitly; it relaxes the EACCES->EPERM mapping
 * below and is recorded via VA_UTIMES_NULL for the filesystem.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		/* Tell the FS this was a "set to current time" request. */
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* explicit-times requests report permission failure as EPERM */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only after a successful change. */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8300 
8301 /*
8302  * Set the access and modification times of a file.
8303  */
8304 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also look up the parent so any directory lease can be broken. */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	/* Last argument is the nullflag: true when no times were supplied. */
	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
#if CONFIG_FILE_LEASES
	/* Release the parent iocount obtained via WANTPARENT above. */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8353 
8354 /*
8355  * Set the access and modification times of a file.
8356  */
8357 /* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	/* Resolve the requested times before touching the descriptor. */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* Last argument is the nullflag: true when no times were supplied. */
	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8389 
8390 static int
truncate_validate_common(proc_t p,off_t length)8391 truncate_validate_common(proc_t p, off_t length)
8392 {
8393 	rlim_t fsize_limit;
8394 
8395 	if (length < 0) {
8396 		return EINVAL;
8397 	}
8398 
8399 	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8400 	if ((rlim_t)length > fsize_limit) {
8401 		psignal(p, SIGXFSZ);
8402 		return EFBIG;
8403 	}
8404 
8405 	return 0;
8406 }
8407 
/*
 * Truncate vp to 'length' via vnode_setattr of va_data_size.
 *
 * 'need_auth' selects whether kauth authorization is performed here;
 * it is false on the ftruncate path, where write access was already
 * authorized when the file was opened.  'cred' is only used by the
 * MAC truncate check/notify hooks.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open.  We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		/* vnode_authattr() may decide that nothing needs authorizing. */
		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8458 
8459 /*
8460  * Truncate a file given its path name.
8461  */
8462 /* ARGSUSED */
int
truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	struct nameidata nd;

	/* Reject negative lengths and enforce RLIMIT_FSIZE up front. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	if ((error = namei(&nd))) {
		return error;
	}

	vp = nd.ni_vp;
	nameidone(&nd);

	/* need_auth == true: path-based truncate must authorize here. */
	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
	vnode_put(vp);

	return error;
}
8490 
8491 /*
8492  * Truncate a file given a file descriptor.
8493  */
8494 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	struct vnode_attr va;
	vnode_t vp = NULLVP;
	struct fileproc *fp;
	bool need_vnode_put = false;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Reject negative lengths and enforce RLIMIT_FSIZE up front. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	/* POSIX shared memory objects take their own truncate path. */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}
	need_vnode_put = true;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_flags);

	error = vnode_getattr(vp, &va, vfs_context_current());
	if (error) {
		goto out;
	}

	/* Don't allow ftruncate if the file has append-only flag set. */
	if (va.va_flags & APPEND) {
		error = EPERM;
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* need_auth == false: write access was authorized at open time. */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	if (!error) {
		fp->fp_glob->fg_flag |= FWASWRITTEN;
	}

out:
	if (vp && need_vnode_put) {
		vnode_put(vp);
	}

	file_drop(uap->fd);
	return error;
}
8568 
8569 
8570 /*
8571  * Sync an open file with synchronized I/O _file_ integrity completion
8572  */
8573 /* ARGSUSED */
8574 int
fsync(proc_t p,struct fsync_args * uap,__unused int32_t * retval)8575 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8576 {
8577 	__pthread_testcancel(1);
8578 	return fsync_common(p, uap, MNT_WAIT);
8579 }
8580 
8581 
8582 /*
8583  * Sync an open file with synchronized I/O _file_ integrity completion
8584  *
8585  * Notes:	This is a legacy support function that does not test for
8586  *		thread cancellation points.
8587  */
8588 /* ARGSUSED */
8589 int
fsync_nocancel(proc_t p,struct fsync_nocancel_args * uap,__unused int32_t * retval)8590 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8591 {
8592 	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
8593 }
8594 
8595 
8596 /*
8597  * Sync an open file with synchronized I/O _data_ integrity completion
8598  */
8599 /* ARGSUSED */
8600 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)8601 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8602 {
8603 	__pthread_testcancel(1);
8604 	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
8605 }
8606 
8607 
8608 /*
8609  * fsync_common
8610  *
8611  * Common fsync code to support both synchronized I/O file integrity completion
8612  * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8613  *
8614  * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8615  * will only guarantee that the file data contents are retrievable.  If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8617  * includes additional metadata unnecessary for retrieving the file data
8618  * contents, such as atime, mtime, ctime, etc., also be committed to stable
8619  * storage.
8620  *
8621  * Parameters:	p				The process
8622  *		uap->fd				The descriptor to synchronize
8623  *		flags				The data integrity flags
8624  *
8625  * Returns:	int				Success
8626  *	fp_getfvp:EBADF				Bad file descriptor
8627  *	fp_getfvp:ENOTSUP			fd does not refer to a vnode
8628  *	VNOP_FSYNC:???				unspecified
8629  *
8630  * Notes:	We use struct fsync_args because it is a short name, and all
8631  *		caller argument structures are otherwise identical.
8632  */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve the descriptor to its fileproc and vnode. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	/* Take an iocount so the vnode stays valid across the fsync. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* 'flags' is MNT_WAIT (file integrity) or MNT_DWAIT (data integrity). */
	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8670 
8671 /*
8672  * Duplicate files.  Source must be a file, target must be a file or
8673  * must not exist.
8674  *
8675  * XXX Copyfile authorisation checking is woefully inadequate, and will not
8676  *     perform inheritance correctly.
8677  */
8678 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/*
	 * Look up the target with SAVESTART so ni_startdir remains valid for
	 * the cleanup path below.
	 */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only acceptable with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Source must be a file; an existing target must not be a directory. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Read on the source, delete on an existing target, add in the dir. */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* Copying a vnode over its own parent directory makes no sense. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		/* -1 is a sentinel meaning "succeed without copying"; see below. */
		error = -1;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* Translate the same-vnode sentinel into success. */
	if (error == -1) {
		return 0;
	}
	return error;
}
8785 
8786 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8787 
8788 /*
8789  * Helper function for doing clones. The caller is expected to provide an
8790  * iocounted source vnode and release it.
8791  */
/*
 * Helper function for doing clones.  The caller is expected to provide
 * an iocounted source vnode (fvp) and release it.  'dst_dirfd'/'dst'
 * name the clone to create; 'data_read_authorised' is true when the
 * caller already holds read access on the source (fclonefileat path),
 * so KAUTH_VNODE_READ_DATA is not re-checked here.
 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata *tondp = NULL;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted = 0;
	/* vnode_attr is too large for the kernel stack; heap-allocate a pair. */
	struct {
		struct vnode_attr va[2];
	} *va2p = NULL;
	struct vnode_attr *vap = NULL;
	struct vnode_attr *nvap = NULL;
	uint32_t vnop_flags;

	/* Only regular files, symlinks and (non-root) directories clone. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
	/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Look up the destination; WANTPARENT keeps an iocount on tdvp. */
	tondp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(tondp, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if (flags & CLONE_NOFOLLOW_ANY) {
		tondp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = nameiat(tondp, dst_dirfd))) {
		kfree_type(struct nameidata, tondp);
		return error;
	}
	cnp = &tondp->ni_cnd;
	tdvp = tondp->ni_dvp;
	tvp = tondp->ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* The destination must not already exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Cloning is only supported within a single mount. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/* Authorize read access on the source, unless already done by caller. */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	va2p = kalloc_type(typeof(*va2p), Z_WAITOK | Z_NOFAIL);
	vap = &va2p->va[0];   /* source attributes */
	nvap = &va2p->va[1];  /* attributes for the new (clone) vnode */

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACLs unless the CLONE_ACL
	 * flag is specified. By default, the clone file will inherit the target
	 * directory's ACLs unless the CLONE_ACL flag is specified, in which case
	 * it will inherit the source file's ACLs instead.
	 */
	VATTR_INIT(vap);
	VATTR_WANTED(vap, va_uid);
	VATTR_WANTED(vap, va_gid);
	VATTR_WANTED(vap, va_mode);
	VATTR_WANTED(vap, va_flags);
	if (flags & CLONE_ACL) {
		VATTR_WANTED(vap, va_acl);
	}

	if ((error = vnode_getattr(fvp, vap, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(nvap);
	VATTR_SET(nvap, va_type, v_type);
	if (VATTR_IS_SUPPORTED(vap, va_acl) && vap->va_acl != NULL) {
		VATTR_SET(nvap, va_acl, vap->va_acl);
		/* The ACL returned by vnode_getattr must be freed on exit. */
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, nvap, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, nvap, &defaulted, ctx);
		if (error) {
			goto out;
		}
		/* vn_attribute_prepare succeeded: pair with vn_attribute_cleanup. */
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(vap, va_uid)) {
			VATTR_SET(nvap, va_uid, vap->va_uid);
		}
		if (VATTR_IS_SUPPORTED(vap, va_gid)) {
			VATTR_SET(nvap, va_gid, vap->va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(vap, va_mode)) {
		VATTR_SET(nvap, va_mode, vap->va_mode);
	}
	if (VATTR_IS_SUPPORTED(vap, va_flags)) {
		VATTR_SET(nvap, va_flags,
		    ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, nvap, vnop_flags, ctx);

	if (!error && tvp) {
		int     update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(nvap)) {
			(void)vnode_setattr_fallback(tvp, nvap, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
		/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(nvap, defaulted);
	}
	if (free_src_acl && vap->va_acl) {
		kauth_acl_free(vap->va_acl);
	}
	if (va2p) {
		kfree_type(typeof(*va2p), va2p);
	}
	nameidone(tondp);
	kfree_type(struct nameidata, tondp);
	/* On success VNOP_CLONEFILE returned tvp with an iocount; drop it. */
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
9038 
9039 /*
9040  * clone files or directories, target must not exist.
9041  */
9042 /* ARGSUSED */
int
clonefileat(__unused proc_t p, struct clonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct nameidata *ndp = NULL;
	int follow;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
	    CLONE_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_dirfd);

	/* nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Look up the source relative to src_dirfd. */
	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(ndp, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
	    UIO_USERSPACE, uap->src, ctx);
	if (uap->flags & CLONE_NOFOLLOW_ANY) {
		ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = nameiat(ndp, uap->src_dirfd))) {
		kfree_type(struct nameidata, ndp);
		return error;
	}

	fvp = ndp->ni_vp;
	nameidone(ndp);
	kfree_type(struct nameidata, ndp);

	/* FALSE: read access on the source has not been authorized yet. */
	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
	return error;
}
9085 
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
	    CLONE_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
	if (error) {
		return error;
	}

	/* The source descriptor must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if ((error = vnode_getwithref(fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	/* TRUE: FREAD on the open descriptor already authorized data read. */
	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
9127 
/*
 * Mount-iteration callback: for each mount 'mp' whose mount path lies
 * strictly beneath that of the parent mount 'arg', refresh its
 * f_mntonname from its covered vnode's current path.  Returns 0 to
 * continue iteration, -1 if the mount could not be busied.
 */
static int
rename_submounts_callback(mount_t mp, void *arg)
{
	int error = 0;
	mount_t pmp = (mount_t)arg;
	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);

	/* Skip mounts whose path does not start with the parent's path. */
	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
		return 0;
	}

	/* Require a real path-component boundary, not just a prefix match. */
	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
		return 0;
	}

	/* Busy the mount (non-blocking) so f_mntonname can be rewritten. */
	if ((error = vfs_busy(mp, LK_NOWAIT))) {
		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
		return -1;
	}

	size_t pathlen = MAXPATHLEN;
	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
	}

	vfs_unbusy(mp);

	return error;
}
9157 
9158 /*
9159  * Rename files.  Source and destination must either both be directories,
9160  * or both not be directories.  If target is a directory, it must be empty.
9161  */
9162 /* ARGSUSED */
9163 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)9164 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
9165     int tofd, user_addr_t to, int segflg, u_int uflags)
9166 {
9167 	vnode_t tvp, tdvp;
9168 	vnode_t fvp, fdvp;
9169 	vnode_t mnt_fvp;
9170 	struct nameidata *fromnd, *tond;
9171 	int error = 0;
9172 	int do_retry;
9173 	int retry_count;
9174 	int mntrename;
9175 	int need_event;
9176 	int need_kpath2;
9177 	int has_listeners;
9178 	const char *oname = NULL;
9179 	char *from_name = NULL, *to_name = NULL;
9180 	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
9181 	int from_len = 0, to_len = 0;
9182 	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
9183 	int holding_mntlock;
9184 	int vn_authorize_skipped;
9185 	mount_t locked_mp = NULL;
9186 	vnode_t oparent = NULLVP;
9187 	vnode_t locked_vp = NULLVP;
9188 #if CONFIG_FSE
9189 	fse_info from_finfo = {}, to_finfo;
9190 #endif
9191 	int from_truncated = 0, to_truncated = 0;
9192 	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
9193 	int batched = 0;
9194 	struct vnode_attr *fvap, *tvap;
9195 	int continuing = 0;
9196 	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
9197 	int32_t nofollow_any = 0;
9198 	/* carving out a chunk for structs that are too big to be on stack. */
9199 	struct {
9200 		struct nameidata from_node, to_node;
9201 		struct vnode_attr fv_attr, tv_attr;
9202 	} * __rename_data;
9203 
9204 	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
9205 	fromnd = &__rename_data->from_node;
9206 	tond = &__rename_data->to_node;
9207 
9208 	holding_mntlock = 0;
9209 	do_retry = 0;
9210 	retry_count = 0;
9211 retry:
9212 	fvp = tvp = NULL;
9213 	fdvp = tdvp = NULL;
9214 	fvap = tvap = NULL;
9215 	mnt_fvp = NULLVP;
9216 	mntrename = FALSE;
9217 	vn_authorize_skipped = FALSE;
9218 
9219 	if (uflags & RENAME_NOFOLLOW_ANY) {
9220 		nofollow_any = NAMEI_NOFOLLOW_ANY;
9221 	}
9222 	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
9223 	    segflg, from, ctx);
9224 	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
9225 
9226 	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
9227 	    segflg, to, ctx);
9228 	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
9229 
9230 continue_lookup:
9231 	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9232 		if ((error = nameiat(fromnd, fromfd))) {
9233 			goto out1;
9234 		}
9235 		fdvp = fromnd->ni_dvp;
9236 		fvp  = fromnd->ni_vp;
9237 
9238 		if (fvp && fvp->v_type == VDIR) {
9239 			tond->ni_cnd.cn_flags |= WILLBEDIR;
9240 		}
9241 	}
9242 
9243 	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9244 		if ((error = nameiat(tond, tofd))) {
9245 			/*
9246 			 * Translate error code for rename("dir1", "dir2/.").
9247 			 */
9248 			if (error == EISDIR && fvp->v_type == VDIR) {
9249 				error = EINVAL;
9250 			}
9251 			goto out1;
9252 		}
9253 		tdvp = tond->ni_dvp;
9254 		tvp  = tond->ni_vp;
9255 	}
9256 
9257 #if DEVELOPMENT || DEBUG
9258 	/*
9259 	 * XXX VSWAP: Check for entitlements or special flag here
9260 	 * so we can restrict access appropriately.
9261 	 */
9262 #else /* DEVELOPMENT || DEBUG */
9263 
9264 	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
9265 		error = EPERM;
9266 		goto out1;
9267 	}
9268 
9269 	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
9270 		error = EPERM;
9271 		goto out1;
9272 	}
9273 #endif /* DEVELOPMENT || DEBUG */
9274 
9275 	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
9276 		error = ENOENT;
9277 		goto out1;
9278 	}
9279 
9280 	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
9281 		int32_t pval = 0;
9282 		int err = 0;
9283 
9284 		/*
9285 		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
9286 		 * has the same name as target iff the following conditions are met:
9287 		 * 1. the target file system is case insensitive
9288 		 * 2. source and target directories are the same
9289 		 * 3. source and target files are the same
9290 		 * 4. name only differs in case (determined by underlying filesystem)
9291 		 */
9292 		if (fvp != tvp || fdvp != tdvp) {
9293 			error = EEXIST;
9294 			goto out1;
9295 		}
9296 
9297 		/*
9298 		 * Assume that the target file system is case sensitive if
9299 		 * _PC_CASE_SENSITIVE selector isn't supported.
9300 		 */
9301 		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
9302 		if (err != 0 || pval != 0) {
9303 			error = EEXIST;
9304 			goto out1;
9305 		}
9306 	}
9307 
9308 	batched = vnode_compound_rename_available(fdvp);
9309 
9310 #if CONFIG_FSE
9311 	need_event = need_fsevent(FSE_RENAME, fdvp);
9312 	if (need_event) {
9313 		if (fvp) {
9314 			get_fse_info(fvp, &from_finfo, ctx);
9315 		} else {
9316 			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
9317 			if (error) {
9318 				goto out1;
9319 			}
9320 
9321 			fvap = &__rename_data->fv_attr;
9322 		}
9323 
9324 		if (tvp) {
9325 			get_fse_info(tvp, &to_finfo, ctx);
9326 		} else if (batched) {
9327 			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
9328 			if (error) {
9329 				goto out1;
9330 			}
9331 
9332 			tvap = &__rename_data->tv_attr;
9333 		}
9334 	}
9335 #else
9336 	need_event = 0;
9337 #endif /* CONFIG_FSE */
9338 
9339 	has_listeners = kauth_authorize_fileop_has_listeners();
9340 
9341 	need_kpath2 = 0;
9342 #if CONFIG_AUDIT
9343 	if (AUDIT_RECORD_EXISTS()) {
9344 		need_kpath2 = 1;
9345 	}
9346 #endif
9347 
9348 	if (need_event || has_listeners) {
9349 		if (from_name == NULL) {
9350 			GET_PATH(from_name);
9351 		}
9352 
9353 		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
9354 
9355 		if (from_name_no_firmlink == NULL) {
9356 			GET_PATH(from_name_no_firmlink);
9357 		}
9358 
9359 		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
9360 	}
9361 
9362 	if (need_event || need_kpath2 || has_listeners) {
9363 		if (to_name == NULL) {
9364 			GET_PATH(to_name);
9365 		}
9366 
9367 		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
9368 
9369 		if (to_name_no_firmlink == NULL) {
9370 			GET_PATH(to_name_no_firmlink);
9371 		}
9372 
9373 		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
9374 		if (to_name && need_kpath2) {
9375 			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9376 		}
9377 	}
9378 	if (!fvp) {
9379 		/*
9380 		 * Claim: this check will never reject a valid rename.
9381 		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9382 		 * Suppose fdvp and tdvp are not on the same mount.
9383 		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
9384 		 *      then you can't move it to within another dir on the same mountpoint.
9385 		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9386 		 *
9387 		 * If this check passes, then we are safe to pass these vnodes to the same FS.
9388 		 */
9389 		if (fdvp->v_mount != tdvp->v_mount) {
9390 			error = EXDEV;
9391 			goto out1;
9392 		}
9393 		goto skipped_lookup;
9394 	}
9395 
9396 	/*
9397 	 * If the source and destination are the same (i.e. they're
9398 	 * links to the same vnode) and the target file system is
9399 	 * case sensitive, then there is nothing to do.
9400 	 *
9401 	 * XXX Come back to this.
9402 	 */
9403 	if (fvp == tvp) {
9404 		int pathconf_val;
9405 
9406 		/*
9407 		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9408 		 * then assume that this file system is case sensitive.
9409 		 */
9410 		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9411 		    pathconf_val != 0) {
9412 			vn_authorize_skipped = TRUE;
9413 			goto out1;
9414 		}
9415 	}
9416 
9417 	/*
9418 	 * Allow the renaming of mount points.
9419 	 * - target must not exist
9420 	 * - target must reside in the same directory as source
9421 	 * - union mounts cannot be renamed
9422 	 * - the root fs, and tightly-linked system volumes, cannot be renamed
9423 	 *
9424 	 * XXX Handle this in VFS after a continued lookup (if we missed
9425 	 * in the cache to start off)
9426 	 *
9427 	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9428 	 * we'll skip past here.  The file system is responsible for
9429 	 * checking that @tvp is not a descendent of @fvp and vice versa
9430 	 * so it should always return EINVAL if either @tvp or @fvp is the
9431 	 * root of a volume.
9432 	 */
9433 	if ((fvp->v_flag & VROOT) &&
9434 	    (fvp->v_type == VDIR) &&
9435 	    (tvp == NULL) &&
9436 	    (fvp->v_mountedhere == NULL) &&
9437 	    (fdvp == tdvp) &&
9438 	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9439 	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9440 	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9441 		vnode_t coveredvp;
9442 
9443 		/* switch fvp to the covered vnode */
9444 		coveredvp = fvp->v_mount->mnt_vnodecovered;
9445 		if ((vnode_getwithref(coveredvp))) {
9446 			error = ENOENT;
9447 			goto out1;
9448 		}
9449 		/*
9450 		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9451 		 * later.
9452 		 */
9453 		mnt_fvp = fvp;
9454 
9455 		fvp = coveredvp;
9456 		mntrename = TRUE;
9457 	}
9458 	/*
9459 	 * Check for cross-device rename.
9460 	 * For rename on mountpoint, we want to also check the source and its parent
9461 	 * belong to the same mountpoint.
9462 	 */
9463 	if ((fvp->v_mount != tdvp->v_mount) ||
9464 	    (fvp->v_mount != fdvp->v_mount) ||
9465 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
9466 		error = EXDEV;
9467 		goto out1;
9468 	}
9469 
9470 	/*
9471 	 * If source is the same as the destination (that is the
9472 	 * same inode number) then there is nothing to do...
9473 	 * EXCEPT if the underlying file system supports case
9474 	 * insensitivity and is case preserving.  In this case
9475 	 * the file system needs to handle the special case of
9476 	 * getting the same vnode as target (fvp) and source (tvp).
9477 	 *
9478 	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9479 	 * and _PC_CASE_PRESERVING can have this exception, and they need to
9480 	 * handle the special case of getting the same vnode as target and
9481 	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
9482 	 * so not to cause locking problems. There is a single reference on tvp.
9483 	 *
9484 	 * NOTE - that fvp == tvp also occurs if they are hard linked and
9485 	 * that correct behaviour then is just to return success without doing
9486 	 * anything.
9487 	 *
9488 	 * XXX filesystem should take care of this itself, perhaps...
9489 	 */
9490 	if (fvp == tvp && fdvp == tdvp) {
9491 		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9492 		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
9493 		    fromnd->ni_cnd.cn_namelen)) {
9494 			vn_authorize_skipped = TRUE;
9495 			goto out1;
9496 		}
9497 	}
9498 
9499 	if (holding_mntlock && fvp->v_mount != locked_mp) {
9500 		/*
9501 		 * we're holding a reference and lock
9502 		 * on locked_mp, but it no longer matches
9503 		 * what we want to do... so drop our hold
9504 		 */
9505 		mount_unlock_renames(locked_mp);
9506 		mount_drop(locked_mp, 0);
9507 		holding_mntlock = 0;
9508 	}
9509 	if (tdvp != fdvp && fvp->v_type == VDIR) {
9510 		/*
9511 		 * serialize renames that re-shape
9512 		 * the tree... if holding_mntlock is
9513 		 * set, then we're ready to go...
9514 		 * otherwise we
9515 		 * first need to drop the iocounts
9516 		 * we picked up, second take the
9517 		 * lock to serialize the access,
9518 		 * then finally start the lookup
9519 		 * process over with the lock held
9520 		 */
9521 		if (!holding_mntlock) {
9522 			/*
9523 			 * need to grab a reference on
9524 			 * the mount point before we
9525 			 * drop all the iocounts... once
9526 			 * the iocounts are gone, the mount
9527 			 * could follow
9528 			 */
9529 			locked_mp = fvp->v_mount;
9530 			mount_ref(locked_mp, 0);
9531 
9532 			/*
9533 			 * nameidone has to happen before we vnode_put(tvp)
9534 			 * since it may need to release the fs_nodelock on the tvp
9535 			 */
9536 			nameidone(tond);
9537 
9538 			if (tvp) {
9539 				vnode_put(tvp);
9540 			}
9541 			vnode_put(tdvp);
9542 
9543 			/*
9544 			 * nameidone has to happen before we vnode_put(fdvp)
9545 			 * since it may need to release the fs_nodelock on the fvp
9546 			 */
9547 			nameidone(fromnd);
9548 
9549 			vnode_put(fvp);
9550 			vnode_put(fdvp);
9551 
9552 			if (mnt_fvp != NULLVP) {
9553 				vnode_put(mnt_fvp);
9554 			}
9555 
9556 			mount_lock_renames(locked_mp);
9557 			holding_mntlock = 1;
9558 
9559 			goto retry;
9560 		}
9561 	} else {
9562 		/*
9563 		 * when we dropped the iocounts to take
9564 		 * the lock, we allowed the identity of
9565 		 * the various vnodes to change... if they did,
9566 		 * we may no longer be dealing with a rename
9567 		 * that reshapes the tree... once we're holding
9568 		 * the iocounts, the vnodes can't change type
9569 		 * so we're free to drop the lock at this point
9570 		 * and continue on
9571 		 */
9572 		if (holding_mntlock) {
9573 			mount_unlock_renames(locked_mp);
9574 			mount_drop(locked_mp, 0);
9575 			holding_mntlock = 0;
9576 		}
9577 	}
9578 
9579 	if (!batched) {
9580 		assert(locked_vp == NULLVP);
9581 		vnode_link_lock(fvp);
9582 		locked_vp = fvp;
9583 		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
9584 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9585 		    flags, NULL);
9586 		if (error) {
9587 			if (error == ENOENT) {
9588 				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9589 					/*
9590 					 * We encountered a race where after doing the namei,
9591 					 * tvp stops being valid. If so, simply re-drive the rename
9592 					 * call from the top.
9593 					 */
9594 					do_retry = 1;
9595 					retry_count += 1;
9596 				}
9597 			}
9598 			vnode_link_unlock(fvp);
9599 			locked_vp = NULLVP;
9600 			goto out1;
9601 		}
9602 	}
9603 
9604 	/* Release the 'mnt_fvp' now that it is no longer needed. */
9605 	if (mnt_fvp != NULLVP) {
9606 		vnode_put(mnt_fvp);
9607 		mnt_fvp = NULLVP;
9608 	}
9609 
9610 	// save these off so we can later verify that fvp is the same
9611 	oname   = fvp->v_name;
9612 	oparent = fvp->v_parent;
9613 
9614 skipped_lookup:
9615 #if CONFIG_FILE_LEASES
9616 	/* Lease break needed for source's parent dir? */
9617 	vnode_breakdirlease(fdvp, false, O_WRONLY);
9618 
9619 	/* Lease break needed for target's parent dir? */
9620 	vnode_breakdirlease(tdvp, false, O_WRONLY);
9621 #endif
9622 
9623 	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
9624 	    tdvp, &tvp, &tond->ni_cnd, tvap,
9625 	    flags, ctx);
9626 
9627 	if (locked_vp) {
9628 		vnode_link_unlock(fvp);
9629 		locked_vp = NULLVP;
9630 	}
9631 
9632 	if (holding_mntlock) {
9633 		/*
9634 		 * we can drop our serialization
9635 		 * lock now
9636 		 */
9637 		mount_unlock_renames(locked_mp);
9638 		mount_drop(locked_mp, 0);
9639 		holding_mntlock = 0;
9640 	}
9641 	if (error) {
9642 		if (error == EDATALESS) {
9643 			/*
9644 			 * If we've been here before, something has gone
9645 			 * horribly wrong and we should just get out lest
9646 			 * we spiral around the drain forever.
9647 			 */
9648 			if (flags & VFS_RENAME_DATALESS) {
9649 				error = EIO;
9650 				goto out1;
9651 			}
9652 
9653 			/*
9654 			 * The object we're renaming is dataless (or has a
9655 			 * dataless descendent) and requires materialization
9656 			 * before the rename occurs.  But we're holding the
9657 			 * mount point's rename lock, so it's not safe to
9658 			 * make the upcall.
9659 			 *
9660 			 * In this case, we release the lock (above), perform
9661 			 * the materialization, and start the whole thing over.
9662 			 */
9663 			error = vfs_materialize_reparent(fvp, tdvp);
9664 			if (error == 0) {
9665 				/*
9666 				 * The next time around we need to tell the
9667 				 * file system that the materializtaion has
9668 				 * been performed.
9669 				 */
9670 				flags |= VFS_RENAME_DATALESS;
9671 				do_retry = 1;
9672 			}
9673 			goto out1;
9674 		}
9675 		if (error == EKEEPLOOKING) {
9676 			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9677 				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9678 					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
9679 				}
9680 			}
9681 
9682 			fromnd->ni_vp = fvp;
9683 			tond->ni_vp = tvp;
9684 
9685 			goto continue_lookup;
9686 		}
9687 
9688 		/*
9689 		 * We may encounter a race in the VNOP where the destination didn't
9690 		 * exist when we did the namei, but it does by the time we go and
9691 		 * try to create the entry. In this case, we should re-drive this rename
9692 		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
9693 		 * but other filesystems susceptible to this race could return it, too.
9694 		 */
9695 		if (error == ERECYCLE) {
9696 			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
9697 				do_retry = 1;
9698 				retry_count += 1;
9699 			} else {
9700 				printf("rename retry limit due to ERECYCLE reached\n");
9701 				error = ENOENT;
9702 			}
9703 		}
9704 
9705 		/*
9706 		 * For compound VNOPs, the authorization callback may return
9707 		 * ENOENT in case of racing hardlink lookups hitting the name
9708 		 * cache, redrive the lookup.
9709 		 */
9710 		if (batched && error == ENOENT) {
9711 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9712 				do_retry = 1;
9713 				retry_count += 1;
9714 			}
9715 		}
9716 
9717 		goto out1;
9718 	}
9719 
9720 	/* call out to allow 3rd party notification of rename.
9721 	 * Ignore result of kauth_authorize_fileop call.
9722 	 */
9723 	kauth_authorize_fileop(vfs_context_ucred(ctx),
9724 	    KAUTH_FILEOP_RENAME,
9725 	    (uintptr_t)from_name, (uintptr_t)to_name);
9726 	if (flags & VFS_RENAME_SWAP) {
9727 		kauth_authorize_fileop(vfs_context_ucred(ctx),
9728 		    KAUTH_FILEOP_RENAME,
9729 		    (uintptr_t)to_name, (uintptr_t)from_name);
9730 	}
9731 
9732 #if CONFIG_FSE
9733 	if (from_name != NULL && to_name != NULL) {
9734 		if (from_truncated || to_truncated) {
9735 			// set it here since only the from_finfo gets reported up to user space
9736 			from_finfo.mode |= FSE_TRUNCATED_PATH;
9737 		}
9738 
9739 		if (tvap && tvp) {
9740 			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
9741 		}
9742 		if (fvap) {
9743 			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
9744 		}
9745 
9746 		if (tvp) {
9747 			add_fsevent(FSE_RENAME, ctx,
9748 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9749 			    FSE_ARG_FINFO, &from_finfo,
9750 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9751 			    FSE_ARG_FINFO, &to_finfo,
9752 			    FSE_ARG_DONE);
9753 			if (flags & VFS_RENAME_SWAP) {
9754 				/*
9755 				 * Strictly speaking, swap is the equivalent of
9756 				 * *three* renames.  FSEvents clients should only take
9757 				 * the events as a hint, so we only bother reporting
9758 				 * two.
9759 				 */
9760 				add_fsevent(FSE_RENAME, ctx,
9761 				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9762 				    FSE_ARG_FINFO, &to_finfo,
9763 				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9764 				    FSE_ARG_FINFO, &from_finfo,
9765 				    FSE_ARG_DONE);
9766 			}
9767 		} else {
9768 			add_fsevent(FSE_RENAME, ctx,
9769 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9770 			    FSE_ARG_FINFO, &from_finfo,
9771 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9772 			    FSE_ARG_DONE);
9773 		}
9774 	}
9775 #endif /* CONFIG_FSE */
9776 
9777 	/*
9778 	 * update filesystem's mount point data
9779 	 */
9780 	if (mntrename) {
9781 		char *cp, *pathend, *mpname;
9782 		char * tobuf;
9783 		struct mount *mp;
9784 		int maxlen;
9785 		size_t len = 0;
9786 
9787 		mp = fvp->v_mountedhere;
9788 
9789 		if (vfs_busy(mp, LK_NOWAIT)) {
9790 			error = EBUSY;
9791 			goto out1;
9792 		}
9793 		tobuf = zalloc(ZV_NAMEI);
9794 
9795 		if (UIO_SEG_IS_USER_SPACE(segflg)) {
9796 			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
9797 		} else {
9798 			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
9799 		}
9800 		if (!error) {
9801 			/* find current mount point prefix */
9802 			pathend = &mp->mnt_vfsstat.f_mntonname[0];
9803 			for (cp = pathend; *cp != '\0'; ++cp) {
9804 				if (*cp == '/') {
9805 					pathend = cp + 1;
9806 				}
9807 			}
9808 			/* find last component of target name */
9809 			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
9810 				if (*cp == '/') {
9811 					mpname = cp + 1;
9812 				}
9813 			}
9814 
9815 			/* Update f_mntonname of sub mounts */
9816 			vfs_iterate(0, rename_submounts_callback, (void *)mp);
9817 
9818 			/* append name to prefix */
9819 			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
9820 			bzero(pathend, maxlen);
9821 
9822 			strlcpy(pathend, mpname, maxlen);
9823 		}
9824 		zfree(ZV_NAMEI, tobuf);
9825 
9826 		vfs_unbusy(mp);
9827 
9828 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
9829 	}
9830 	/*
9831 	 * fix up name & parent pointers.  note that we first
9832 	 * check that fvp has the same name/parent pointers it
9833 	 * had before the rename call... this is a 'weak' check
9834 	 * at best...
9835 	 *
9836 	 * XXX oparent and oname may not be set in the compound vnop case
9837 	 */
9838 	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
9839 		int update_flags;
9840 
9841 		update_flags = VNODE_UPDATE_NAME;
9842 
9843 		if (fdvp != tdvp) {
9844 			update_flags |= VNODE_UPDATE_PARENT;
9845 		}
9846 
9847 		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
9848 	}
9849 out1:
9850 	/*
9851 	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
9852 	 * skipped earlier as no actual rename was performed.
9853 	 */
9854 	if (vn_authorize_skipped && error == 0) {
9855 		error = vn_authorize_renamex_with_paths(fdvp, fvp,
9856 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9857 		    flags, NULL);
9858 		if (error && error == ENOENT) {
9859 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9860 				do_retry = 1;
9861 				retry_count += 1;
9862 			}
9863 		}
9864 	}
9865 	if (to_name != NULL) {
9866 		RELEASE_PATH(to_name);
9867 		to_name = NULL;
9868 	}
9869 	if (to_name_no_firmlink != NULL) {
9870 		RELEASE_PATH(to_name_no_firmlink);
9871 		to_name_no_firmlink = NULL;
9872 	}
9873 	if (from_name != NULL) {
9874 		RELEASE_PATH(from_name);
9875 		from_name = NULL;
9876 	}
9877 	if (from_name_no_firmlink != NULL) {
9878 		RELEASE_PATH(from_name_no_firmlink);
9879 		from_name_no_firmlink = NULL;
9880 	}
9881 	if (holding_mntlock) {
9882 		mount_unlock_renames(locked_mp);
9883 		mount_drop(locked_mp, 0);
9884 		holding_mntlock = 0;
9885 	}
9886 	if (tdvp) {
9887 		/*
9888 		 * nameidone has to happen before we vnode_put(tdvp)
9889 		 * since it may need to release the fs_nodelock on the tdvp
9890 		 */
9891 		nameidone(tond);
9892 
9893 		if (tvp) {
9894 			vnode_put(tvp);
9895 		}
9896 		vnode_put(tdvp);
9897 	}
9898 	if (fdvp) {
9899 		/*
9900 		 * nameidone has to happen before we vnode_put(fdvp)
9901 		 * since it may need to release the fs_nodelock on the fdvp
9902 		 */
9903 		nameidone(fromnd);
9904 
9905 		if (fvp) {
9906 			vnode_put(fvp);
9907 		}
9908 		vnode_put(fdvp);
9909 	}
9910 	if (mnt_fvp != NULLVP) {
9911 		vnode_put(mnt_fvp);
9912 	}
9913 	/*
9914 	 * If things changed after we did the namei, then we will re-drive
9915 	 * this rename call from the top.
9916 	 */
9917 	if (do_retry) {
9918 		do_retry = 0;
9919 		goto retry;
9920 	}
9921 
9922 	kfree_type(typeof(*__rename_data), __rename_data);
9923 	return error;
9924 }
9925 
9926 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9927 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9928 {
9929 	return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9930 	           AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9931 }
9932 
9933 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9934 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9935 {
9936 	if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9937 		return EINVAL;
9938 	}
9939 
9940 	if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9941 		return EINVAL;
9942 	}
9943 
9944 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9945 	           uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9946 }
9947 
9948 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9949 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9950 {
9951 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9952 	           uap->tofd, uap->to, UIO_USERSPACE, 0);
9953 }
9954 
9955 /*
9956  * Make a directory file.
9957  *
9958  * Returns:	0			Success
9959  *		EEXIST
9960  *	namei:???
9961  *	vnode_authorize:???
9962  *	vn_create:???
9963  */
/* ARGSUSED */
/*
 * Common implementation for the mkdir family: create a directory at 'path'
 * (resolved relative to 'fd' when the path is relative) with attributes
 * from 'vap'.  Uses the compound-mkdir VNOP when the file system offers it.
 */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/* CREATE lookup of the new entry; keep the parent (dvp) locked/held. */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	/* Allow the FS to perform lookup+mkdir as a single compound VNOP. */
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* Lookup found an existing entry: mkdir must fail with EEXIST. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* batched != 0 means the FS will authorize+create in one VNOP. */
	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop the CREATE lookup state before re-driving the lookup. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				/* Target truly absent: report the original EACCES/EPERM. */
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Adding an entry to dvp may conflict with a directory lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued; retry from the top. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
10080 
10081 /*
10082  * mkdir_extended: Create a directory; with extended security (ACL).
10083  *
10084  * Parameters:    p                       Process requesting to create the directory
10085  *                uap                     User argument descriptor (see below)
10086  *                retval                  (ignored)
10087  *
10088  * Indirect:      uap->path               Path of directory to create
10089  *                uap->mode               Access permissions to set
10090  *                uap->xsecurity          ACL to set
10091  *
10092  * Returns:        0                      Success
10093  *                !0                      Not success
10094  *
10095  */
10096 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)10097 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
10098 {
10099 	int ciferror;
10100 	kauth_filesec_t xsecdst;
10101 	struct vnode_attr va;
10102 
10103 	AUDIT_ARG(owner, uap->uid, uap->gid);
10104 
10105 	xsecdst = NULL;
10106 	if ((uap->xsecurity != USER_ADDR_NULL) &&
10107 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
10108 		return ciferror;
10109 	}
10110 
10111 	VATTR_INIT(&va);
10112 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10113 	if (xsecdst != NULL) {
10114 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
10115 		va.va_vaflags |= VA_FILESEC_ACL;
10116 	}
10117 
10118 	ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10119 	    UIO_USERSPACE);
10120 	if (xsecdst != NULL) {
10121 		kauth_filesec_free(xsecdst);
10122 	}
10123 	return ciferror;
10124 }
10125 
10126 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)10127 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
10128 {
10129 	struct vnode_attr va;
10130 
10131 	VATTR_INIT(&va);
10132 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10133 
10134 	return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10135 	           UIO_USERSPACE);
10136 }
10137 
10138 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)10139 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
10140 {
10141 	struct vnode_attr va;
10142 
10143 	VATTR_INIT(&va);
10144 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10145 
10146 	return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
10147 	           UIO_USERSPACE);
10148 }
10149 
10150 static int
rmdirat_internal(vfs_context_t ctx,int fd,user_addr_t dirpath,enum uio_seg segflg,int unlink_flags)10151 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
10152     enum uio_seg segflg, int unlink_flags)
10153 {
10154 	struct {
10155 		struct nameidata nd;
10156 #if CONFIG_FSE
10157 		struct vnode_attr va;
10158 #endif /* CONFIG_FSE */
10159 	} *__rmdir_data;
10160 	vnode_t vp, dvp;
10161 	int error;
10162 	struct nameidata *ndp;
10163 	char     *path = NULL;
10164 	char     *no_firmlink_path = NULL;
10165 	int       len_path = 0;
10166 	int       len_no_firmlink_path = 0;
10167 	int has_listeners = 0;
10168 	int need_event = 0;
10169 	int truncated_path = 0;
10170 	int truncated_no_firmlink_path = 0;
10171 	struct vnode_attr *vap = NULL;
10172 	int restart_count = 0;
10173 	int batched;
10174 
10175 	int restart_flag;
10176 	int nofollow_any = 0;
10177 
10178 	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
10179 	ndp = &__rmdir_data->nd;
10180 
10181 	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
10182 		nofollow_any = NAMEI_NOFOLLOW_ANY;
10183 		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
10184 	}
10185 
10186 	/*
10187 	 * This loop exists to restart rmdir in the unlikely case that two
10188 	 * processes are simultaneously trying to remove the same directory
10189 	 * containing orphaned appleDouble files.
10190 	 */
10191 	do {
10192 		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
10193 		    segflg, dirpath, ctx);
10194 		ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any;
10195 continue_lookup:
10196 		restart_flag = 0;
10197 		vap = NULL;
10198 
10199 		error = nameiat(ndp, fd);
10200 		if (error) {
10201 			goto err_out;
10202 		}
10203 
10204 		dvp = ndp->ni_dvp;
10205 		vp = ndp->ni_vp;
10206 
10207 		if (vp) {
10208 			batched = vnode_compound_rmdir_available(vp);
10209 
10210 			if (vp->v_flag & VROOT) {
10211 				/*
10212 				 * The root of a mounted filesystem cannot be deleted.
10213 				 */
10214 				error = EBUSY;
10215 				goto out;
10216 			}
10217 
10218 #if DEVELOPMENT || DEBUG
10219 			/*
10220 			 * XXX VSWAP: Check for entitlements or special flag here
10221 			 * so we can restrict access appropriately.
10222 			 */
10223 #else /* DEVELOPMENT || DEBUG */
10224 
10225 			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
10226 				error = EPERM;
10227 				goto out;
10228 			}
10229 #endif /* DEVELOPMENT || DEBUG */
10230 
10231 			/*
10232 			 * Removed a check here; we used to abort if vp's vid
10233 			 * was not the same as what we'd seen the last time around.
10234 			 * I do not think that check was valid, because if we retry
10235 			 * and all dirents are gone, the directory could legitimately
10236 			 * be recycled but still be present in a situation where we would
10237 			 * have had permission to delete.  Therefore, we won't make
10238 			 * an effort to preserve that check now that we may not have a
10239 			 * vp here.
10240 			 */
10241 
10242 			if (!batched) {
10243 				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
10244 				if (error) {
10245 					if (error == ENOENT) {
10246 						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
10247 							restart_flag = 1;
10248 							restart_count += 1;
10249 						}
10250 					}
10251 					goto out;
10252 				}
10253 			}
10254 		} else {
10255 			batched = 1;
10256 
10257 			if (!vnode_compound_rmdir_available(dvp)) {
10258 				panic("No error, but no compound rmdir?");
10259 			}
10260 		}
10261 
10262 #if CONFIG_FSE
10263 		fse_info  finfo = {0};
10264 
10265 		need_event = need_fsevent(FSE_DELETE, dvp);
10266 		if (need_event) {
10267 			if (!batched) {
10268 				get_fse_info(vp, &finfo, ctx);
10269 			} else {
10270 				error = vfs_get_notify_attributes(&__rmdir_data->va);
10271 				if (error) {
10272 					goto out;
10273 				}
10274 
10275 				vap = &__rmdir_data->va;
10276 			}
10277 		}
10278 #endif
10279 		has_listeners = kauth_authorize_fileop_has_listeners();
10280 		if (need_event || has_listeners) {
10281 			if (path == NULL) {
10282 				GET_PATH(path);
10283 			}
10284 
10285 			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
10286 
10287 			if (no_firmlink_path == NULL) {
10288 				GET_PATH(no_firmlink_path);
10289 			}
10290 
10291 			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
10292 #if CONFIG_FSE
10293 			if (truncated_no_firmlink_path) {
10294 				finfo.mode |= FSE_TRUNCATED_PATH;
10295 			}
10296 #endif
10297 		}
10298 
10299 #if CONFIG_FILE_LEASES
10300 		vnode_breakdirlease(dvp, false, O_WRONLY);
10301 #endif
10302 
10303 		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
10304 		ndp->ni_vp = vp;
10305 		if (vp == NULLVP) {
10306 			/* Couldn't find a vnode */
10307 			goto out;
10308 		}
10309 
10310 		if (error == EKEEPLOOKING) {
10311 			goto continue_lookup;
10312 		} else if (batched && error == ENOENT) {
10313 			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
10314 				/*
10315 				 * For compound VNOPs, the authorization callback
10316 				 * may return ENOENT in case of racing hard link lookups
10317 				 * redrive the lookup.
10318 				 */
10319 				restart_flag = 1;
10320 				restart_count += 1;
10321 				goto out;
10322 			}
10323 		}
10324 
10325 		/*
10326 		 * XXX There's no provision for passing flags
10327 		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
10328 		 * because it's not empty, then we try again
10329 		 * with VNOP_REMOVE(), passing in a special
10330 		 * flag that clever file systems will know
10331 		 * how to handle.
10332 		 */
10333 		if (error == ENOTEMPTY &&
10334 		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
10335 			/*
10336 			 * Only do this if the directory is actually
10337 			 * marked as DATALESS.
10338 			 */
10339 			struct vnode_attr *lvap =
10340 			    kalloc_type(struct vnode_attr, Z_WAITOK);
10341 
10342 			VATTR_INIT(lvap);
10343 			VATTR_WANTED(lvap, va_flags);
10344 			if (vnode_getattr(vp, lvap, ctx) == 0 &&
10345 			    VATTR_IS_SUPPORTED(lvap, va_flags) &&
10346 			    (lvap->va_flags & SF_DATALESS) != 0) {
10347 				/*
10348 				 * If this fails, we want to keep the original
10349 				 * error.
10350 				 */
10351 				if (vn_remove(dvp, &vp, ndp,
10352 				    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
10353 					error = 0;
10354 				}
10355 			}
10356 			kfree_type(struct vnode_attr, lvap);
10357 		}
10358 
10359 #if CONFIG_APPLEDOUBLE
10360 		/*
10361 		 * Special case to remove orphaned AppleDouble
10362 		 * files. I don't like putting this in the kernel,
10363 		 * but carbon does not like putting this in carbon either,
10364 		 * so here we are.
10365 		 */
10366 		if (error == ENOTEMPTY) {
10367 			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
10368 			if (ad_error == EBUSY) {
10369 				error = ad_error;
10370 				goto out;
10371 			}
10372 
10373 
10374 			/*
10375 			 * Assuming everything went well, we will try the RMDIR again
10376 			 */
10377 			if (!ad_error) {
10378 				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
10379 			}
10380 		}
10381 #endif /* CONFIG_APPLEDOUBLE */
10382 		/*
10383 		 * Call out to allow 3rd party notification of delete.
10384 		 * Ignore result of kauth_authorize_fileop call.
10385 		 */
10386 		if (!error) {
10387 			if (has_listeners) {
10388 				kauth_authorize_fileop(vfs_context_ucred(ctx),
10389 				    KAUTH_FILEOP_DELETE,
10390 				    (uintptr_t)vp,
10391 				    (uintptr_t)path);
10392 			}
10393 
10394 			if (vp->v_flag & VISHARDLINK) {
10395 				// see the comment in unlink1() about why we update
10396 				// the parent of a hard link when it is removed
10397 				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
10398 			}
10399 
10400 #if CONFIG_FSE
10401 			if (need_event) {
10402 				if (vap) {
10403 					vnode_get_fse_info_from_vap(vp, &finfo, vap);
10404 				}
10405 				add_fsevent(FSE_DELETE, ctx,
10406 				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
10407 				    FSE_ARG_FINFO, &finfo,
10408 				    FSE_ARG_DONE);
10409 			}
10410 #endif
10411 
10412 #if CONFIG_MACF
10413 			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
10414 #endif
10415 		}
10416 
10417 out:
10418 		if (path != NULL) {
10419 			RELEASE_PATH(path);
10420 			path = NULL;
10421 		}
10422 
10423 		if (no_firmlink_path != NULL) {
10424 			RELEASE_PATH(no_firmlink_path);
10425 			no_firmlink_path = NULL;
10426 		}
10427 
10428 		/*
10429 		 * nameidone has to happen before we vnode_put(dvp)
10430 		 * since it may need to release the fs_nodelock on the dvp
10431 		 */
10432 		nameidone(ndp);
10433 		vnode_put(dvp);
10434 
10435 		if (vp) {
10436 			vnode_put(vp);
10437 		}
10438 
10439 		if (restart_flag == 0) {
10440 			wakeup_one((caddr_t)vp);
10441 			goto err_out;
10442 		}
10443 		tsleep(vp, PVFS, "rm AD", 1);
10444 	} while (restart_flag != 0);
10445 
10446 err_out:
10447 	kfree_type(typeof(*__rmdir_data), __rmdir_data);
10448 
10449 	return error;
10450 }
10451 
10452 /*
10453  * Remove a directory file.
10454  */
10455 /* ARGSUSED */
10456 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10457 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10458 {
10459 	return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10460 	           CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10461 }
10462 
/*
 * Get direntry length padded to 8 byte alignment.
 * (struct direntry embeds a MAXPATHLEN-sized name field; this sizes the
 * record for an actual name of namlen bytes.)
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 * (struct dirent embeds a (__DARWIN_MAXNAMLEN + 1)-sized name field; this
 * sizes the record for namelen bytes plus the NUL terminator.)
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the (inclusive) address of the last byte of this dirent */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10474 
10475 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10476 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10477     int *numdirent, vfs_context_t ctxp)
10478 {
10479 	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
10480 	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10481 	    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10482 		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10483 	} else {
10484 		size_t bufsize;
10485 		void * bufptr;
10486 		uio_t auio;
10487 		struct direntry *entry64;
10488 		struct dirent *dep;
10489 		size_t bytesread;
10490 		int error;
10491 
10492 		/*
10493 		 * We're here because the underlying file system does not
10494 		 * support direnties or we mounted denying support so we must
10495 		 * fall back to dirents and convert them to direntries.
10496 		 *
10497 		 * Our kernel buffer needs to be smaller since re-packing will
10498 		 * expand each dirent.  The worse case (when the name length
10499 		 * is 3 or less) corresponds to a struct direntry size of 32
10500 		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10501 		 * (4-byte aligned).  So having a buffer that is 3/8 the size
10502 		 * will prevent us from reading more than we can pack.
10503 		 *
10504 		 * Since this buffer is wired memory, we will limit the
10505 		 * buffer size to a maximum of 32K. We would really like to
10506 		 * use 32K in the MIN(), but we use magic number 87371 to
10507 		 * prevent uio_resid() * 3 / 8 from overflowing.
10508 		 */
10509 		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10510 		bufptr = kalloc_data(bufsize, Z_WAITOK);
10511 		if (bufptr == NULL) {
10512 			return ENOMEM;
10513 		}
10514 
10515 		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10516 		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10517 		auio->uio_offset = uio->uio_offset;
10518 
10519 		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10520 
10521 		dep = (struct dirent *)bufptr;
10522 		bytesread = bufsize - uio_resid(auio);
10523 
10524 		entry64 = kalloc_type(struct direntry, Z_WAITOK);
10525 		/*
10526 		 * Convert all the entries and copy them out to user's buffer.
10527 		 */
10528 		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10529 			/* First check that the dirent struct up to d_name is within the buffer */
10530 			if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10531 			    /* Check that the length of the entire dirent is within the buffer */
10532 			    DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10533 			    /* Check that the actual length including the name doesn't exceed d_reclen */
10534 			    DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10535 				printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10536 				    vp->v_mount->mnt_vfsstat.f_mntonname,
10537 				    vp->v_name ? vp->v_name : "<unknown>");
10538 				error = EIO;
10539 				break;
10540 			}
10541 
10542 			size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
10543 
10544 			bzero(entry64, enbufsize);
10545 			/* Convert a dirent to a dirent64. */
10546 			entry64->d_ino = dep->d_ino;
10547 			entry64->d_seekoff = 0;
10548 			entry64->d_reclen = (uint16_t)enbufsize;
10549 			entry64->d_namlen = dep->d_namlen;
10550 			entry64->d_type = dep->d_type;
10551 			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10552 
10553 			/* Move to next entry. */
10554 			dep = (struct dirent *)((char *)dep + dep->d_reclen);
10555 
10556 			/* Copy entry64 to user's buffer. */
10557 			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10558 		}
10559 
10560 		/* Update the real offset using the offset we got from VNOP_READDIR. */
10561 		if (error == 0) {
10562 			uio->uio_offset = auio->uio_offset;
10563 		}
10564 		uio_free(auio);
10565 		kfree_data(bufptr, bufsize);
10566 		kfree_type(struct direntry, entry64);
10567 		return error;
10568 	}
10569 }
10570 
10571 #define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)
10572 
10573 /*
10574  * Read a block of directory entries in a file system independent format.
10575  */
10576 static int
getdirentries_common(int fd,user_addr_t bufp,user_size_t bufsize,ssize_t * bytesread,off_t * offset,int * eofflag,int flags)10577 getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
10578     off_t *offset, int *eofflag, int flags)
10579 {
10580 	vnode_t vp;
10581 	struct vfs_context context = *vfs_context_current();    /* local copy */
10582 	struct fileproc *fp;
10583 	uio_t auio;
10584 	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10585 	off_t loff;
10586 	int error, numdirent;
10587 	UIO_STACKBUF(uio_buf, 1);
10588 
10589 get_from_fd:
10590 	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
10591 	if (error) {
10592 		return error;
10593 	}
10594 
10595 	vn_offset_lock(fp->fp_glob);
10596 	if (((vnode_t)fp_get_data(fp)) != vp) {
10597 		vn_offset_unlock(fp->fp_glob);
10598 		file_drop(fd);
10599 		goto get_from_fd;
10600 	}
10601 
10602 	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
10603 		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
10604 		error = EBADF;
10605 		goto out;
10606 	}
10607 
10608 	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
10609 		bufsize = GETDIRENTRIES_MAXBUFSIZE;
10610 	}
10611 
10612 #if CONFIG_MACF
10613 	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
10614 	if (error) {
10615 		goto out;
10616 	}
10617 #endif
10618 
10619 	if ((error = vnode_getwithref(vp))) {
10620 		goto out;
10621 	}
10622 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
10623 
10624 #if CONFIG_UNION_MOUNTS
10625 unionread:
10626 #endif /* CONFIG_UNION_MOUNTS */
10627 	if (vp->v_type != VDIR) {
10628 		(void)vnode_put(vp);
10629 		error = EINVAL;
10630 		goto out;
10631 	}
10632 
10633 #if CONFIG_MACF
10634 	error = mac_vnode_check_readdir(&context, vp);
10635 	if (error != 0) {
10636 		(void)vnode_put(vp);
10637 		goto out;
10638 	}
10639 #endif /* MAC */
10640 
10641 	loff = fp->fp_glob->fg_offset;
10642 	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
10643 	uio_addiov(auio, bufp, bufsize);
10644 
10645 	if (flags & VNODE_READDIR_EXTENDED) {
10646 		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
10647 		fp->fp_glob->fg_offset = uio_offset(auio);
10648 	} else {
10649 		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
10650 		fp->fp_glob->fg_offset = uio_offset(auio);
10651 	}
10652 	if (error) {
10653 		(void)vnode_put(vp);
10654 		goto out;
10655 	}
10656 
10657 #if CONFIG_UNION_MOUNTS
10658 	if ((user_ssize_t)bufsize == uio_resid(auio) &&
10659 	    (vp->v_mount->mnt_flag & MNT_UNION)) {
10660 		vnode_t uvp;
10661 
10662 		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
10663 			if (vnode_ref(uvp) == 0) {
10664 				fp_set_data(fp, uvp);
10665 				fp->fp_glob->fg_offset = 0;
10666 				vnode_rele(vp);
10667 				vnode_put(vp);
10668 				vp = uvp;
10669 				goto unionread;
10670 			} else {
10671 				/* could not get a ref, can't replace in fd */
10672 				vnode_put(uvp);
10673 			}
10674 		}
10675 	}
10676 #endif /* CONFIG_UNION_MOUNTS */
10677 
10678 	vnode_put(vp);
10679 	if (offset) {
10680 		*offset = loff;
10681 	}
10682 
10683 	*bytesread = bufsize - uio_resid(auio);
10684 out:
10685 	vn_offset_unlock(fp->fp_glob);
10686 	file_drop(fd);
10687 	return error;
10688 }
10689 
10690 
10691 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10692 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10693 {
10694 	off_t offset;
10695 	ssize_t bytesread;
10696 	int error, eofflag;
10697 
10698 	AUDIT_ARG(fd, uap->fd);
10699 	error = getdirentries_common(uap->fd, uap->buf, uap->count,
10700 	    &bytesread, &offset, &eofflag, 0);
10701 
10702 	if (error == 0) {
10703 		if (proc_is64bit(p)) {
10704 			user64_long_t base = (user64_long_t)offset;
10705 			error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10706 		} else {
10707 			user32_long_t base = (user32_long_t)offset;
10708 			error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10709 		}
10710 		*retval = (int)bytesread;
10711 	}
10712 	return error;
10713 }
10714 
10715 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10716 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10717 {
10718 	off_t offset;
10719 	ssize_t bytesread;
10720 	int error, eofflag;
10721 	user_size_t bufsize;
10722 
10723 	AUDIT_ARG(fd, uap->fd);
10724 
10725 	/*
10726 	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10727 	 * then the kernel carves out the last 4 bytes to return extended
10728 	 * information to userspace (namely whether we reached EOF with this call).
10729 	 */
10730 	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10731 		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10732 	} else {
10733 		bufsize = uap->bufsize;
10734 	}
10735 
10736 	error = getdirentries_common(uap->fd, uap->buf, bufsize,
10737 	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10738 
10739 	if (error == 0) {
10740 		*retval = bytesread;
10741 		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10742 
10743 		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10744 			getdirentries64_flags_t flags = 0;
10745 			if (eofflag) {
10746 				flags |= GETDIRENTRIES64_EOF;
10747 			}
10748 			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10749 			    sizeof(flags));
10750 		}
10751 	}
10752 	return error;
10753 }
10754 
10755 
10756 /*
10757  * Set the mode mask for creation of filesystem nodes.
10758  * XXX implement xsecurity
10759  */
10760 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
10761 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10762 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10763 {
10764 	AUDIT_ARG(mask, newmask);
10765 	proc_fdlock(p);
10766 	*retval = p->p_fd.fd_cmask;
10767 	p->p_fd.fd_cmask = newmask & ALLPERMS;
10768 	proc_fdunlock(p);
10769 	return 0;
10770 }
10771 
10772 /*
10773  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10774  *
10775  * Parameters:    p                       Process requesting to set the umask
10776  *                uap                     User argument descriptor (see below)
10777  *                retval                  umask of the process (parameter p)
10778  *
10779  * Indirect:      uap->newmask            umask to set
10780  *                uap->xsecurity          ACL to set
10781  *
10782  * Returns:        0                      Success
10783  *                !0                      Not success
10784  *
10785  */
int
umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
{
	/*
	 * XXX xsecurity is currently ignored: KAUTH_FILESEC_NONE is passed
	 * instead of the caller-supplied uap->xsecurity, and umask1() takes
	 * its fsec argument as __unused, so the ACL has no effect here.
	 */
	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
}
10791 
10792 int
umask(proc_t p,struct umask_args * uap,int32_t * retval)10793 umask(proc_t p, struct umask_args *uap, int32_t *retval)
10794 {
10795 	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
10796 }
10797 
10798 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT                               \
10799 	"com.apple.private.vfs.revoke-mounted-device"
10800 
10801 /*
10802  * Void all references to file by ripping underlying filesystem
10803  * away from vnode.
10804  */
10805 /* ARGSUSED */
10806 int
revoke(proc_t p,struct revoke_args * uap,__unused int32_t * retval)10807 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
10808 {
10809 	vnode_t vp;
10810 	struct vnode_attr va;
10811 	vfs_context_t ctx = vfs_context_current();
10812 	int error;
10813 	struct nameidata nd;
10814 
10815 	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
10816 	    uap->path, ctx);
10817 	error = namei(&nd);
10818 	if (error) {
10819 		return error;
10820 	}
10821 	vp = nd.ni_vp;
10822 
10823 	nameidone(&nd);
10824 
10825 	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
10826 		error = ENOTSUP;
10827 		goto out;
10828 	}
10829 
10830 	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
10831 		error = EBUSY;
10832 		goto out;
10833 	}
10834 
10835 #if CONFIG_MACF
10836 	error = mac_vnode_check_revoke(ctx, vp);
10837 	if (error) {
10838 		goto out;
10839 	}
10840 #endif
10841 
10842 	VATTR_INIT(&va);
10843 	VATTR_WANTED(&va, va_uid);
10844 	if ((error = vnode_getattr(vp, &va, ctx))) {
10845 		goto out;
10846 	}
10847 	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
10848 	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
10849 		goto out;
10850 	}
10851 	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
10852 		VNOP_REVOKE(vp, REVOKEALL, ctx);
10853 	}
10854 out:
10855 	vnode_put(vp);
10856 	return error;
10857 }
10858 
10859 
10860 /*
10861  *  HFS/HFS PlUS SPECIFIC SYSTEM CALLS
10862  *  The following system calls are designed to support features
10863  *  which are specific to the HFS & HFS Plus volume formats
10864  */
10865 
10866 
10867 /*
10868  * Obtain attribute information on objects in a directory while enumerating
10869  * the directory.
10870  */
10871 /* ARGSUSED */
10872 int
getdirentriesattr(proc_t p,struct getdirentriesattr_args * uap,int32_t * retval)10873 getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
10874 {
10875 	vnode_t vp;
10876 	struct fileproc *fp;
10877 	uio_t auio = NULL;
10878 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10879 	uint32_t count = 0, savecount = 0;
10880 	uint32_t newstate = 0;
10881 	int error, eofflag = 0;
10882 	off_t loff = 0;
10883 	struct attrlist attributelist;
10884 	vfs_context_t ctx = vfs_context_current();
10885 	int fd = uap->fd;
10886 	UIO_STACKBUF(uio_buf, 1);
10887 	kauth_action_t action;
10888 
10889 	AUDIT_ARG(fd, fd);
10890 
10891 	/* Get the attributes into kernel space */
10892 	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
10893 		return error;
10894 	}
10895 	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
10896 		return error;
10897 	}
10898 	savecount = count;
10899 
10900 get_from_fd:
10901 	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
10902 		return error;
10903 	}
10904 
10905 	vn_offset_lock(fp->fp_glob);
10906 	if (((vnode_t)fp_get_data(fp)) != vp) {
10907 		vn_offset_unlock(fp->fp_glob);
10908 		file_drop(fd);
10909 		goto get_from_fd;
10910 	}
10911 
10912 	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
10913 		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
10914 		error = EBADF;
10915 		goto out;
10916 	}
10917 
10918 
10919 #if CONFIG_MACF
10920 	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
10921 	    fp->fp_glob);
10922 	if (error) {
10923 		goto out;
10924 	}
10925 #endif
10926 
10927 
10928 	if ((error = vnode_getwithref(vp))) {
10929 		goto out;
10930 	}
10931 
10932 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
10933 
10934 #if CONFIG_UNION_MOUNTS
10935 unionread:
10936 #endif /* CONFIG_UNION_MOUNTS */
10937 	if (vp->v_type != VDIR) {
10938 		(void)vnode_put(vp);
10939 		error = EINVAL;
10940 		goto out;
10941 	}
10942 
10943 #if CONFIG_MACF
10944 	error = mac_vnode_check_readdir(ctx, vp);
10945 	if (error != 0) {
10946 		(void)vnode_put(vp);
10947 		goto out;
10948 	}
10949 #endif /* MAC */
10950 
10951 	/* set up the uio structure which will contain the users return buffer */
10952 	loff = fp->fp_glob->fg_offset;
10953 	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
10954 	uio_addiov(auio, uap->buffer, uap->buffersize);
10955 
10956 	/*
10957 	 * If the only item requested is file names, we can let that past with
10958 	 * just LIST_DIRECTORY.  If they want any other attributes, that means
10959 	 * they need SEARCH as well.
10960 	 */
10961 	action = KAUTH_VNODE_LIST_DIRECTORY;
10962 	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
10963 	    attributelist.fileattr || attributelist.dirattr) {
10964 		action |= KAUTH_VNODE_SEARCH;
10965 	}
10966 
10967 	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
10968 		/* Believe it or not, uap->options only has 32-bits of valid
10969 		 * info, so truncate before extending again */
10970 
10971 		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
10972 		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
10973 	}
10974 
10975 	if (error) {
10976 		(void) vnode_put(vp);
10977 		goto out;
10978 	}
10979 
10980 #if CONFIG_UNION_MOUNTS
10981 	/*
10982 	 * If we've got the last entry of a directory in a union mount
10983 	 * then reset the eofflag and pretend there's still more to come.
10984 	 * The next call will again set eofflag and the buffer will be empty,
10985 	 * so traverse to the underlying directory and do the directory
10986 	 * read there.
10987 	 */
10988 	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
10989 		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
10990 			eofflag = 0;
10991 		} else {                                                // Empty buffer
10992 			vnode_t uvp;
10993 			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
10994 				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
10995 					fp_set_data(fp, uvp);
10996 					fp->fp_glob->fg_offset = 0; // reset index for new dir
10997 					count = savecount;
10998 					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
10999 					vnode_put(vp);
11000 					vp = uvp;
11001 					goto unionread;
11002 				} else {
11003 					/* could not get a ref, can't replace in fd */
11004 					vnode_put(uvp);
11005 				}
11006 			}
11007 		}
11008 	}
11009 #endif /* CONFIG_UNION_MOUNTS */
11010 
11011 	(void)vnode_put(vp);
11012 
11013 	if (error) {
11014 		goto out;
11015 	}
11016 	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
11017 
11018 	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
11019 		goto out;
11020 	}
11021 	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
11022 		goto out;
11023 	}
11024 	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
11025 		goto out;
11026 	}
11027 
11028 	*retval = eofflag;  /* similar to getdirentries */
11029 	error = 0;
11030 out:
11031 	vn_offset_unlock(fp->fp_glob);
11032 	file_drop(fd);
11033 	return error; /* return error earlier, an retval of 0 or 1 now */
11034 } /* end of getdirentriesattr system call */
11035 
11036 /*
11037  * Exchange data between two files
11038  */
11039 
11040 /* ARGSUSED */
11041 int
exchangedata(__unused proc_t p,struct exchangedata_args * uap,__unused int32_t * retval)11042 exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
11043 {
11044 	struct nameidata fnd, snd;
11045 	vfs_context_t ctx = vfs_context_current();
11046 	vnode_t fvp;
11047 	vnode_t svp;
11048 	int error;
11049 	u_int32_t nameiflags;
11050 	char *fpath = NULL;
11051 	char *spath = NULL;
11052 	int   flen = 0, slen = 0;
11053 	int from_truncated = 0, to_truncated = 0;
11054 #if CONFIG_FSE
11055 	fse_info f_finfo, s_finfo;
11056 #endif
11057 
11058 	nameiflags = 0;
11059 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11060 		nameiflags |= FOLLOW;
11061 	}
11062 
11063 	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
11064 	    UIO_USERSPACE, uap->path1, ctx);
11065 
11066 	error = namei(&fnd);
11067 	if (error) {
11068 		goto out2;
11069 	}
11070 
11071 	nameidone(&fnd);
11072 	fvp = fnd.ni_vp;
11073 
11074 	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
11075 	    UIO_USERSPACE, uap->path2, ctx);
11076 
11077 	error = namei(&snd);
11078 	if (error) {
11079 		vnode_put(fvp);
11080 		goto out2;
11081 	}
11082 	nameidone(&snd);
11083 	svp = snd.ni_vp;
11084 
11085 	/*
11086 	 * if the files are the same, return an inval error
11087 	 */
11088 	if (svp == fvp) {
11089 		error = EINVAL;
11090 		goto out;
11091 	}
11092 
11093 	/*
11094 	 * if the files are on different volumes, return an error
11095 	 */
11096 	if (svp->v_mount != fvp->v_mount) {
11097 		error = EXDEV;
11098 		goto out;
11099 	}
11100 
11101 	/* If they're not files, return an error */
11102 	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
11103 		error = EINVAL;
11104 		goto out;
11105 	}
11106 
11107 #if CONFIG_MACF
11108 	error = mac_vnode_check_exchangedata(ctx,
11109 	    fvp, svp);
11110 	if (error) {
11111 		goto out;
11112 	}
11113 #endif
11114 	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
11115 	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
11116 		goto out;
11117 	}
11118 
11119 	if (
11120 #if CONFIG_FSE
11121 		need_fsevent(FSE_EXCHANGE, fvp) ||
11122 #endif
11123 		kauth_authorize_fileop_has_listeners()) {
11124 		GET_PATH(fpath);
11125 		GET_PATH(spath);
11126 
11127 		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
11128 		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
11129 
11130 #if CONFIG_FSE
11131 		get_fse_info(fvp, &f_finfo, ctx);
11132 		get_fse_info(svp, &s_finfo, ctx);
11133 		if (from_truncated || to_truncated) {
11134 			// set it here since only the f_finfo gets reported up to user space
11135 			f_finfo.mode |= FSE_TRUNCATED_PATH;
11136 		}
11137 #endif
11138 	}
11139 	/* Ok, make the call */
11140 	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
11141 
11142 	if (error == 0) {
11143 		const char *tmpname;
11144 
11145 		if (fpath != NULL && spath != NULL) {
11146 			/* call out to allow 3rd party notification of exchangedata.
11147 			 * Ignore result of kauth_authorize_fileop call.
11148 			 */
11149 			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
11150 			    (uintptr_t)fpath, (uintptr_t)spath);
11151 		}
11152 		name_cache_lock();
11153 
11154 		tmpname     = fvp->v_name;
11155 		fvp->v_name = svp->v_name;
11156 		svp->v_name = tmpname;
11157 
11158 		if (fvp->v_parent != svp->v_parent) {
11159 			vnode_t tmp;
11160 
11161 			tmp           = fvp->v_parent;
11162 			fvp->v_parent = svp->v_parent;
11163 			svp->v_parent = tmp;
11164 		}
11165 		name_cache_unlock();
11166 
11167 #if CONFIG_FSE
11168 		if (fpath != NULL && spath != NULL) {
11169 			add_fsevent(FSE_EXCHANGE, ctx,
11170 			    FSE_ARG_STRING, flen, fpath,
11171 			    FSE_ARG_FINFO, &f_finfo,
11172 			    FSE_ARG_STRING, slen, spath,
11173 			    FSE_ARG_FINFO, &s_finfo,
11174 			    FSE_ARG_DONE);
11175 		}
11176 #endif
11177 	}
11178 
11179 out:
11180 	if (fpath != NULL) {
11181 		RELEASE_PATH(fpath);
11182 	}
11183 	if (spath != NULL) {
11184 		RELEASE_PATH(spath);
11185 	}
11186 	vnode_put(svp);
11187 	vnode_put(fvp);
11188 out2:
11189 	return error;
11190 }
11191 
11192 /*
11193  * Return (in MB) the amount of freespace on the given vnode's volume.
11194  */
11195 uint32_t freespace_mb(vnode_t vp);
11196 
11197 uint32_t
freespace_mb(vnode_t vp)11198 freespace_mb(vnode_t vp)
11199 {
11200 	vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
11201 	return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
11202 	       vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
11203 }
11204 
11205 #if CONFIG_SEARCHFS
11206 
11207 /* ARGSUSED */
11208 
11209 int
searchfs(proc_t p,struct searchfs_args * uap,__unused int32_t * retval)11210 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
11211 {
11212 	vnode_t vp, tvp;
11213 	int i, error = 0;
11214 	int fserror = 0;
11215 	struct nameidata nd;
11216 	struct user64_fssearchblock searchblock;
11217 	struct searchstate *state;
11218 	struct attrlist *returnattrs;
11219 	struct timeval timelimit;
11220 	void *searchparams1, *searchparams2;
11221 	uio_t auio = NULL;
11222 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11223 	uint32_t nummatches;
11224 	size_t mallocsize;
11225 	uint32_t nameiflags;
11226 	vfs_context_t ctx = vfs_context_current();
11227 	UIO_STACKBUF(uio_buf, 1);
11228 
11229 	/* Start by copying in fsearchblock parameter list */
11230 	if (IS_64BIT_PROCESS(p)) {
11231 		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
11232 		timelimit.tv_sec = searchblock.timelimit.tv_sec;
11233 		timelimit.tv_usec = searchblock.timelimit.tv_usec;
11234 	} else {
11235 		struct user32_fssearchblock tmp_searchblock;
11236 
11237 		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
11238 		// munge into 64-bit version
11239 		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
11240 		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
11241 		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
11242 		searchblock.maxmatches = tmp_searchblock.maxmatches;
11243 		/*
11244 		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
11245 		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
11246 		 */
11247 		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
11248 		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
11249 		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
11250 		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
11251 		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
11252 		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
11253 		searchblock.searchattrs = tmp_searchblock.searchattrs;
11254 	}
11255 	if (error) {
11256 		return error;
11257 	}
11258 
11259 	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
11260 	 */
11261 	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
11262 	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
11263 		return EINVAL;
11264 	}
11265 
11266 	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
11267 	/* It all has to do into local memory and it's not that big so we might as well  put it all together. */
11268 	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
11269 	/* block.                                                                                             */
11270 	/*												      */
11271 	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
11272 	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
11273 	/*       assumes the size is still 556 bytes it will continue to work				      */
11274 
11275 	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
11276 	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
11277 
11278 	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);
11279 
11280 	/* Now set up the various pointers to the correct place in our newly allocated memory */
11281 
11282 	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
11283 	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
11284 	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
11285 
11286 	/* Now copy in the stuff given our local variables. */
11287 
11288 	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
11289 		goto freeandexit;
11290 	}
11291 
11292 	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
11293 		goto freeandexit;
11294 	}
11295 
11296 	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
11297 		goto freeandexit;
11298 	}
11299 
11300 	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
11301 		goto freeandexit;
11302 	}
11303 
11304 	/*
11305 	 * When searching a union mount, need to set the
11306 	 * start flag at the first call on each layer to
11307 	 * reset state for the new volume.
11308 	 */
11309 	if (uap->options & SRCHFS_START) {
11310 		state->ss_union_layer = 0;
11311 	} else {
11312 		uap->options |= state->ss_union_flags;
11313 	}
11314 	state->ss_union_flags = 0;
11315 
11316 	/*
11317 	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
11318 	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
11319 	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
11320 	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
11321 	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
11322 	 */
11323 
11324 	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
11325 		attrreference_t* string_ref;
11326 		u_int32_t* start_length;
11327 		user64_size_t param_length;
11328 
11329 		/* validate searchparams1 */
11330 		param_length = searchblock.sizeofsearchparams1;
11331 		/* skip the word that specifies length of the buffer */
11332 		start_length = (u_int32_t*) searchparams1;
11333 		start_length = start_length + 1;
11334 		string_ref = (attrreference_t*) start_length;
11335 
11336 		/* ensure no negative offsets or too big offsets */
11337 		if (string_ref->attr_dataoffset < 0) {
11338 			error = EINVAL;
11339 			goto freeandexit;
11340 		}
11341 		if (string_ref->attr_length > MAXPATHLEN) {
11342 			error = EINVAL;
11343 			goto freeandexit;
11344 		}
11345 
11346 		/* Check for pointer overflow in the string ref */
11347 		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
11348 			error = EINVAL;
11349 			goto freeandexit;
11350 		}
11351 
11352 		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
11353 			error = EINVAL;
11354 			goto freeandexit;
11355 		}
11356 		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
11357 			error = EINVAL;
11358 			goto freeandexit;
11359 		}
11360 	}
11361 
11362 	/* set up the uio structure which will contain the users return buffer */
11363 	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
11364 	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
11365 
11366 	nameiflags = 0;
11367 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11368 		nameiflags |= FOLLOW;
11369 	}
11370 	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
11371 	    UIO_USERSPACE, uap->path, ctx);
11372 
11373 	error = namei(&nd);
11374 	if (error) {
11375 		goto freeandexit;
11376 	}
11377 	vp = nd.ni_vp;
11378 	nameidone(&nd);
11379 
11380 	/*
11381 	 * Switch to the root vnode for the volume
11382 	 */
11383 	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
11384 	vnode_put(vp);
11385 	if (error) {
11386 		goto freeandexit;
11387 	}
11388 	vp = tvp;
11389 
11390 #if CONFIG_UNION_MOUNTS
11391 	/*
11392 	 * If it's a union mount, the path lookup takes
11393 	 * us to the top layer. But we may need to descend
11394 	 * to a lower layer. For non-union mounts the layer
11395 	 * is always zero.
11396 	 */
11397 	for (i = 0; i < (int) state->ss_union_layer; i++) {
11398 		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
11399 			break;
11400 		}
11401 		tvp = vp;
11402 		vp = vp->v_mount->mnt_vnodecovered;
11403 		if (vp == NULL) {
11404 			vnode_put(tvp);
11405 			error = ENOENT;
11406 			goto freeandexit;
11407 		}
11408 		error = vnode_getwithref(vp);
11409 		vnode_put(tvp);
11410 		if (error) {
11411 			goto freeandexit;
11412 		}
11413 	}
11414 #endif /* CONFIG_UNION_MOUNTS */
11415 
11416 #if CONFIG_MACF
11417 	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
11418 	if (error) {
11419 		vnode_put(vp);
11420 		goto freeandexit;
11421 	}
11422 #endif
11423 
11424 
11425 	/*
11426 	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
11428 	 */
11429 	if (searchblock.maxmatches == 0) {
11430 		nummatches = 0;
11431 		goto saveandexit;
11432 	}
11433 
11434 	/*
11435 	 * Allright, we have everything we need, so lets make that call.
11436 	 *
11437 	 * We keep special track of the return value from the file system:
11438 	 * EAGAIN is an acceptable error condition that shouldn't keep us
11439 	 * from copying out any results...
11440 	 */
11441 
11442 	fserror = VNOP_SEARCHFS(vp,
11443 	    searchparams1,
11444 	    searchparams2,
11445 	    &searchblock.searchattrs,
11446 	    (uint32_t)searchblock.maxmatches,
11447 	    &timelimit,
11448 	    returnattrs,
11449 	    &nummatches,
11450 	    (uint32_t)uap->scriptcode,
11451 	    (uint32_t)uap->options,
11452 	    auio,
11453 	    (struct searchstate *) &state->ss_fsstate,
11454 	    ctx);
11455 
11456 #if CONFIG_UNION_MOUNTS
11457 	/*
11458 	 * If it's a union mount we need to be called again
11459 	 * to search the mounted-on filesystem.
11460 	 */
11461 	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
11462 		state->ss_union_flags = SRCHFS_START;
11463 		state->ss_union_layer++;        // search next layer down
11464 		fserror = EAGAIN;
11465 	}
11466 #endif /* CONFIG_UNION_MOUNTS */
11467 
11468 saveandexit:
11469 
11470 	vnode_put(vp);
11471 
11472 	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 *  search state.  Everything was already put into the return buffer by the vop call. */
11474 
11475 	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
11476 		goto freeandexit;
11477 	}
11478 
11479 	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
11480 		goto freeandexit;
11481 	}
11482 
11483 	error = fserror;
11484 
11485 freeandexit:
11486 
11487 	kfree_data(searchparams1, mallocsize);
11488 
11489 	return error;
11490 } /* end of searchfs system call */
11491 
11492 #else /* CONFIG_SEARCHFS */
11493 
/*
 * searchfs stub for kernels built without CONFIG_SEARCHFS:
 * unconditionally reports that the system call is unsupported.
 */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11499 
11500 #endif /* CONFIG_SEARCHFS */
11501 
11502 
11503 #if CONFIG_DATALESS_FILES
11504 
11505 /*
11506  * === Namespace Resolver Up-call Mechanism ===
11507  *
11508  * When I/O is performed to a dataless file or directory (read, write,
11509  * lookup-in, etc.), the file system performs an upcall to the namespace
11510  * resolver (filecoordinationd) to materialize the object.
11511  *
11512  * We need multiple up-calls to be in flight at once, and we need these
11513  * up-calls to be interruptible, thus the following implementation:
11514  *
11515  * => The nspace_resolver_request represents the in-kernel request state.
11516  *    It contains a request ID, storage space for the errno code returned
11517  *    by filecoordinationd, and flags.
11518  *
11519  * => The request ID is simply a global monotonically incrementing 32-bit
11520  *    number.  Outstanding requests are stored in a hash table, and the
11521  *    hash function is extremely simple.
11522  *
11523  * => When an upcall is to be made to filecoordinationd, a request structure
11524  *    is allocated on the stack (it is small, and needs to live only during
11525  *    the duration of the call to resolve_nspace_item_ext()).  It is
11526  *    initialized and inserted into the table.  Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11528  *    can be inserted into the table (and thus limiting the number of
11529  *    outstanding requests issued to filecoordinationd); waiting for an
11530  *    available slot is interruptible.
11531  *
11532  * => Once the request has been inserted into the table, the up-call is made
11533  *    to filecoordinationd via a MiG-generated stub.  The up-call returns
11534  *    immediately and filecoordinationd processes the request asynchronously.
11535  *
 * => The caller now waits for the request to complete.  This is achieved by
11537  *    sleeping on the address of the request structure and waiting for
11538  *    filecoordinationd to mark the request structure as complete.  This
11539  *    is an interruptible sleep call; if interrupted, the request structure
11540  *    is removed from the table and EINTR is returned to the caller.  If
11541  *    this occurs, an advisory up-call is made to filecoordinationd with
11542  *    the request ID to indicate that the request can be aborted or
11543  *    de-prioritized at the discretion of filecoordinationd.
11544  *
11545  * => When filecoordinationd has completed the request, it signals completion
11546  *    by writing to the vfs.nspace.complete sysctl node.  Only a process
11547  *    decorated as a namespace resolver can write to this sysctl node.  The
11548  *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11549  *    The request ID is looked up in the table, and if the request is found,
11550  *    the error code is stored in the request structure and a wakeup()
11551  *    issued on the address of the request structure.  If the request is not
11552  *    found, we simply drop the completion notification, assuming that the
11553  *    caller was interrupted.
11554  *
11555  * => When the waiting thread wakes up, it extracts the error code from the
11556  *    request structure, removes the request from the table, and returns the
11557  *    error code to the calling function.  Fini!
11558  */
11559 
/*
 * In-kernel state for a single outstanding up-call to the namespace
 * resolver.  Allocated on the requesting thread's stack and linked
 * into the request hash table for the duration of the request (see
 * the block comment above).
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash bucket linkage */
	vnode_t         r_vp;           /* vnode the request refers to */
	vnode_t         r_tdvp;         /* destination dir, if specified (may be NULL) */
	uint32_t        r_req_id;       /* request ID; hash table key */
	int             r_resolver_error; /* errno reported by the resolver */
	int             r_flags;        /* RRF_* flags below */
};

#define RRF_COMPLETE    0x0001  /* resolver has completed this request */
#define RRF_COMPLETING  0x0002  /* completion in progress; request still in use */

/*
 * Completion tuple written by the resolver via the vfs.nspace.complete
 * sysctl (see sysctl_nspace_complete).
 */
struct nspace_resolver_completion_data {
	uint32_t req_id;         /* ID of the request being completed */
	int32_t  resolver_error; /* errno result; 0 on success */
	uint64_t orig_gencount;  /* expected recursive gencount; 0 = no check */
	uint64_t orig_syncroot;  /* expected sync-root ID; 0 = no check */
};
11578 
11579 static uint32_t
next_nspace_req_id(void)11580 next_nspace_req_id(void)
11581 {
11582 	static uint32_t next_req_id;
11583 
11584 	return OSAddAtomic(1, &next_req_id);
11585 }
11586 
/* Hash table size and cap on in-flight up-calls to the resolver. */
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;   /* mask from hashinit() */
static u_int nspace_resolver_request_count;       /* outstanding requests */
static bool nspace_resolver_request_wait_slot;    /* a thread waits for a free slot */
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

/* Mutex protecting the table, count, and wait flag above. */
#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Trivial hash: low bits of the request ID select the bucket. */
#define NSPACE_RESOLVER_HASH(req_id)    \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	 nspace_resolver_request_hashmask])
11607 
11608 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id,bool skip_completing)11609 nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11610 {
11611 	struct nspace_resolver_requesthead *bucket;
11612 	struct nspace_resolver_request *req;
11613 
11614 	bucket = NSPACE_RESOLVER_HASH(req_id);
11615 	LIST_FOREACH(req, bucket, r_hashlink) {
11616 		if (req->r_req_id == req_id) {
11617 			/*
11618 			 * If this request already has a completion
11619 			 * pending, don't return it again.
11620 			 */
11621 			if ((req->r_flags & RRF_COMPLETING) != 0 &&
11622 			    skip_completing) {
11623 				req = NULL;
11624 			}
11625 			return req;
11626 		}
11627 	}
11628 
11629 	return NULL;
11630 }
11631 
/*
 * Insert 'req' into the outstanding-request hash table, applying
 * backpressure: while NSPACE_RESOLVER_MAX_OUTSTANDING requests are
 * pending, sleep (interruptibly) until a slot frees up.  Returns 0
 * on success or the msleep error (e.g. EINTR).
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	NSPACE_REQ_LOCK();

	/* Wait for a free slot; the sleep drops and re-takes the lock. */
	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			/* Interrupted; give up without inserting. */
			NSPACE_REQ_UNLOCK();
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	NSPACE_REQ_UNLOCK();

	return 0;
}
11663 
/*
 * Wait for an in-progress completion (RRF_COMPLETING) on 'req' to
 * finish.  Caller must hold NSPACE_REQ_LOCK; msleep drops and
 * re-acquires it around each sleep.
 */
static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
{
	/*
	 * If a completion is in-progress, we have to wait for the
	 * completion handler to finish because it's still using 'req',
	 * which is allocated on our stack a couple of frames up.
	 */
	while ((req->r_flags & RRF_COMPLETING) != 0) {
		(void) msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS, "nspacecmplt", NULL);
	}
}
11677 
/*
 * Unlink 'req' from the request hash table and drop NSPACE_REQ_LOCK.
 * Wakes a slot-waiter if the table was full, and waits out any
 * in-flight completion that is still using 'req' before returning,
 * since the caller is about to reclaim the stack-allocated request.
 */
static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;

	/* We're called with NSPACE_REQ_LOCK held. */

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert((req->r_flags & RRF_COMPLETING) == 0);
	assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
#endif /* DIAGNOSTIC */
	LIST_REMOVE(req, r_hashlink);
	nspace_resolver_request_count--;

	/* If someone is waiting for a free slot, let them retry. */
	if (nspace_resolver_request_wait_slot) {
		nspace_resolver_request_wait_slot = false;
		wakeup(&nspace_resolver_request_count);
	}

	/* Let any completion handler finish with 'req' before it goes away. */
	nspace_resolver_req_wait_pending_completion(req);

	NSPACE_REQ_UNLOCK();
}
11702 
/*
 * Remove 'req' from the request table.  Takes NSPACE_REQ_LOCK; the
 * lock is released by nspace_resolver_req_remove_and_unlock().
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
11709 
11710 static void
nspace_resolver_req_cancel(uint32_t req_id)11711 nspace_resolver_req_cancel(uint32_t req_id)
11712 {
11713 	kern_return_t kr;
11714 	mach_port_t mp;
11715 
11716 	// Failures here aren't fatal -- the cancellation message
11717 	// sent to the resolver is merely advisory.
11718 
11719 	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11720 	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11721 		return;
11722 	}
11723 
11724 	kr = send_nspace_resolve_cancel(mp, req_id);
11725 	if (kr != KERN_SUCCESS) {
11726 		os_log_error(OS_LOG_DEFAULT,
11727 		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11728 	}
11729 
11730 	ipc_port_release_send(mp);
11731 }
11732 
/*
 * Wait for the resolver to complete 'req'.  The sleep is
 * interruptible: on EINTR the request is failed with EINTR, on any
 * other msleep error with ETIMEDOUT, and an advisory cancel message
 * is then sent to the resolver.  Returns the request's final errno.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		/* ERESTART just means "keep waiting". */
		if (error && error != ERESTART) {
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11765 
11766 static void
nspace_resolver_req_mark_complete(struct nspace_resolver_request * req,int resolver_error)11767 nspace_resolver_req_mark_complete(
11768 	struct nspace_resolver_request *req,
11769 	int resolver_error)
11770 {
11771 	req->r_resolver_error = resolver_error;
11772 	req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
11773 	wakeup(req);
11774 }
11775 
/*
 * Flag 'req' as having a completion in progress (RRF_COMPLETING) so
 * it is neither completed twice nor reclaimed while the completion
 * handler is still using it.  Caller must hold NSPACE_REQ_LOCK.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
11781 
/*
 * Handle a completion report from the resolver for request
 * c->req_id: look the request up, optionally verify that the
 * namespace shape (recursive gencount and/or sync-root ID) has not
 * changed since the request was issued, then mark the request
 * complete and wake the waiting thread.
 */
static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
{
	struct nspace_resolver_request *req;
	int error;
	struct vnode_attr va;
	vnode_t vp;

	NSPACE_REQ_LOCK();

	/* skip_completing=true: never complete the same request twice. */
	req = nspace_resolver_req_lookup(c->req_id, true);
	if (req == NULL) {
		/*
		 * If we don't find the request corresponding to our req_id,
		 * just drop the completion on the floor; it's likely that
		 * the requester interrupted with a signal, or it may already
		 * be completing.
		 */
		NSPACE_REQ_UNLOCK();
		return;
	}

	/*
	 * Get out now if the resolver reported an error.
	 */
	if ((error = c->resolver_error) != 0) {
		goto out;
	}

	/*
	 * If the resolver did not specify any namespace shape criteria
	 * for letting the operation proceed, then get out now.
	 */
	if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
		goto out;
	}

	/*
	 * We're going to have to acquire the mount rename lock and do
	 * some I/O in order to verify the criteria.  Mark the request
	 * as pending so no one else messes with it after we drop the
	 * NSPACE_REQ_LOCK.
	 */
	nspace_resolver_req_mark_completion_pending(req);
	NSPACE_REQ_UNLOCK();

	/*
	 * Lock out renames from changing the shape of the tree while
	 * we validate the criteria.
	 */
	mount_t locked_mp = req->r_vp->v_mount;
	mount_ref(locked_mp, 0);
	mount_lock_renames(locked_mp);

	if (c->orig_gencount != 0) {
		vp = req->r_vp;
		/* NOTE(review): error is 0 on this path; check appears defensive. */
		if (error) {
			goto out_dropmount;
		}

		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_recursive_gencount);
		error = vnode_getattr(vp, &va, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		/* Fail with EBUSY if the sub-tree changed shape. */
		if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
		    va.va_recursive_gencount != c->orig_gencount) {
			printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
			    c->orig_gencount, va.va_recursive_gencount);
			error = EBUSY;
			goto out_dropmount;
		}
	}

	/*
	 * Ignore orig_syncroot if a destination directory wasn't specified
	 * in the request.
	 */
	if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
		uint64_t syncroot_id;

		/* NOTE(review): error is also 0 here; check appears defensive. */
		if (error) {
			goto out_dropmount;
		}

#ifndef APFSIOC_GET_SYNC_ROOT
#define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
#endif

		error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
		    (caddr_t)&syncroot_id, 0, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		/* Fail with EBUSY if the destination's sync root changed. */
		if (syncroot_id != c->orig_syncroot) {
			printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
			    c->orig_syncroot, syncroot_id);
			error = EBUSY;
			goto out_dropmount;
		}
	}

out_dropmount:
	mount_unlock_renames(locked_mp);
	mount_drop(locked_mp, 0);
	NSPACE_REQ_LOCK();

out:
	/* Store the final error, clear RRF_COMPLETING, wake the waiter. */
	nspace_resolver_req_mark_complete(req, error);
	NSPACE_REQ_UNLOCK();
}
11894 
/* The process currently decorated as the namespace resolver, or NULL. */
static struct proc *nspace_resolver_proc;

/*
 * Report (via *is_resolver) whether 'p' is the currently-registered
 * namespace resolver process.  Always returns 0.
 */
static int
nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
{
	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) ? 1 : 0;
	return 0;
}
11904 
static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);

/*
 * Decorate (or un-decorate) process 'p' as the system's dataless-file
 * namespace resolver.  The caller must be root and hold the
 * dataless-resolver entitlement; only one resolver may be registered
 * at a time (EBUSY otherwise).
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			/* Another process is already the resolver. */
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11946 
11947 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11948 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11949 {
11950 	if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11951 	    (p->p_vfs_iopolicy &
11952 	    P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11953 		*is_prevented = 1;
11954 	} else {
11955 		*is_prevented = 0;
11956 	}
11957 	return 0;
11958 }
11959 
11960 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)11961 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
11962 {
11963 	if (p->p_lflag & P_LNSPACE_RESOLVER) {
11964 		return is_prevented ? 0 : EBUSY;
11965 	}
11966 
11967 	if (is_prevented) {
11968 		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
11969 	} else {
11970 		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
11971 	}
11972 	return 0;
11973 }
11974 
11975 static int
nspace_materialization_get_thread_state(int * is_prevented)11976 nspace_materialization_get_thread_state(int *is_prevented)
11977 {
11978 	uthread_t ut = current_uthread();
11979 
11980 	*is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11981 	return 0;
11982 }
11983 
11984 static int
nspace_materialization_set_thread_state(int is_prevented)11985 nspace_materialization_set_thread_state(int is_prevented)
11986 {
11987 	uthread_t ut = current_uthread();
11988 
11989 	if (is_prevented) {
11990 		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11991 	} else {
11992 		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11993 	}
11994 	return 0;
11995 }
11996 
11997 /* the vfs.nspace branch */
11998 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11999 
12000 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12001 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
12002     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12003 {
12004 	struct proc *p = req->p;
12005 	int new_value, old_value, changed = 0;
12006 	int error;
12007 
12008 	error = nspace_resolver_get_proc_state(p, &old_value);
12009 	if (error) {
12010 		return error;
12011 	}
12012 
12013 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12014 	    &changed);
12015 	if (error == 0 && changed) {
12016 		error = nspace_resolver_set_proc_state(p, new_value);
12017 	}
12018 	return error;
12019 }
12020 
12021 /* decorate this process as the dataless file resolver */
12022 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
12023     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12024     0, 0, sysctl_nspace_resolver, "I", "");
12025 
12026 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12027 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
12028     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12029 {
12030 	struct proc *p = req->p;
12031 	int new_value, old_value, changed = 0;
12032 	int error;
12033 
12034 	error = nspace_materialization_get_proc_state(p, &old_value);
12035 	if (error) {
12036 		return error;
12037 	}
12038 
12039 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12040 	    &changed);
12041 	if (error == 0 && changed) {
12042 		error = nspace_materialization_set_proc_state(p, new_value);
12043 	}
12044 	return error;
12045 }
12046 
12047 /* decorate this process as not wanting to materialize dataless files */
12048 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
12049     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12050     0, 0, sysctl_nspace_prevent_materialization, "I", "");
12051 
12052 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12053 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
12054     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12055 {
12056 	int new_value, old_value, changed = 0;
12057 	int error;
12058 
12059 	error = nspace_materialization_get_thread_state(&old_value);
12060 	if (error) {
12061 		return error;
12062 	}
12063 
12064 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12065 	    &changed);
12066 	if (error == 0 && changed) {
12067 		error = nspace_materialization_set_thread_state(new_value);
12068 	}
12069 	return error;
12070 }
12071 
12072 /* decorate this thread as not wanting to materialize dataless files */
12073 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
12074     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12075     0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
12076 
/*
 * vfs.nspace.complete sysctl handler.  The namespace resolver writes
 * here to report completion of a request.  The payload is a
 * { req_id, errno } pair of uint32_t's, optionally followed by a
 * 64-bit recursive gencount and a 64-bit sync-root ID used as
 * namespace-shape criteria (0 means "not specified").  Only the
 * process decorated as the resolver may use this node.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	/* Only the registered resolver may complete requests. */
	if (!is_resolver) {
		return EPERM;
	}

	/* Mandatory part of the payload: the req_id/errno pair. */
	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed.  Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID.
	 */
	error = sysctl_io_opaque(req, &syncroot, sizeof(syncroot),
	    &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(&cd);
	}
	return error;
}

/* Resolver reports completed reqs here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
12144 
12145 #endif /* CONFIG_DATALESS_FILES */
12146 
/*
 * Parameter annotation for arguments that are referenced only when
 * CONFIG_DATALESS_FILES is enabled; expands to __unused otherwise to
 * keep the compiler quiet.
 */
#if CONFIG_DATALESS_FILES
#define __no_dataless_unused    /* nothing */
#else
#define __no_dataless_unused    __unused
#endif
12152 
/*
 * Decide whether dataless-file materialization is prevented for the
 * given vfs context.  Returns 0 when materialization may proceed,
 * EDEADLK when it must not, and EJUSTRETURN when the caller holds
 * the dataless-manipulation entitlement (the operation proceeds as
 * if the object were not dataless).
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
12209 
/*
 * One-time initialization: allocate the namespace-resolver request
 * hash table (no-op when CONFIG_DATALESS_FILES is disabled).
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
12219 
/*
 * Called when process 'p' exits (or resigns via the resolver sysctl).
 * If 'p' was the namespace resolver, complete every outstanding
 * request with ETIMEDOUT so waiting threads are released, and clear
 * the resolver decoration.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Walk every hash bucket and fail all pending requests. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				/* Let any in-flight completion finish first. */
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
12246 
12247 #define DATALESS_RESOLVER_ENTITLEMENT     \
12248 	"com.apple.private.vfs.dataless-resolver"
12249 #define DATALESS_MANIPULATION_ENTITLEMENT \
12250 	"com.apple.private.vfs.dataless-manipulation"
12251 
#if CONFIG_DATALESS_FILES
/*
 * Return TRUE if the vfs context is associated with the dataless
 * resolver (i.e. its task holds the resolver entitlement).
 */
static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
{
	task_t resolver_task = vfs_context_task(ctx);

	return IOTaskHasEntitlement(resolver_task,
	           DATALESS_RESOLVER_ENTITLEMENT);
}
#endif /* CONFIG_DATALESS_FILES */
12264 
12265 /*
12266  * Return TRUE if the vfs context is associated with a process entitled
12267  * for dataless manipulation.
12268  *
12269  * XXX Arguably belongs in vfs_subr.c, but is here because of the
12270  * complication around CONFIG_DATALESS_FILES.
12271  */
12272 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)12273 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
12274 {
12275 #if CONFIG_DATALESS_FILES
12276 	task_t task = vfs_context_task(ctx);
12277 	return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
12278 	       IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
12279 #else
12280 	return false;
12281 #endif /* CONFIG_DATALESS_FILES */
12282 }
12283 
#if CONFIG_DATALESS_FILES
/*
 * Log that the current process was denied materialization of 'vp' for
 * operation 'op' because it is decorated as no-materialization.  On
 * DEVELOPMENT kernels we additionally try to log the fsid/fileid of
 * the object for easier triage.
 */
static void
log_materialization_prevented(vnode_t vp, uint64_t op)
{
	char p_name[MAXCOMLEN + 1];
	char *vntype;
	proc_selfname(&p_name[0], sizeof(p_name));

	if (vp->v_type == VREG) {
		vntype = "File";
	} else if (vp->v_type == VDIR) {
		vntype = "Dir";
	} else if (vp->v_type == VLNK) {
		vntype = "SymLink";
	} else {
		vntype = "Other";
	}

#if DEVELOPMENT
	struct vnode_attr *vap = kalloc_type(struct vnode_attr, Z_WAITOK);

	/*
	 * NB: kalloc_type(..., Z_WAITOK) can still fail; previously a
	 * NULL return would have been dereferenced by VATTR_INIT.
	 * Fall back to the generic log message in that case.
	 */
	if (vap != NULL) {
		VATTR_INIT(vap);
		VATTR_WANTED(vap, va_fsid);
		VATTR_WANTED(vap, va_fileid);
	}
	if (vap != NULL &&
	    vnode_getattr(vp, vap, vfs_context_current()) == 0) {
		os_log_debug(OS_LOG_DEFAULT,
		    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) fsid 0x%08x/%u fileid=%llu",
		    p_name, proc_selfpid(), op, vntype,
		    vap->va_fsid, vap->va_fsid, vap->va_fileid);
	} else
#endif
	{
		os_log_debug(OS_LOG_DEFAULT,
		    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
		    p_name, proc_selfpid(), op, vntype);
	}
#if DEVELOPMENT
	if (vap != NULL) {
		kfree_type(struct vnode_attr, vap);
	}
#endif
}
#endif /* CONFIG_DATALESS_FILES */
12325 
12326 static int
vfs_materialize_item(vnode_t vp __no_dataless_unused,uint32_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused,vnode_t tdvp __no_dataless_unused)12327 vfs_materialize_item(
12328 	vnode_t vp __no_dataless_unused,
12329 	uint32_t op __no_dataless_unused,
12330 	int64_t offset __no_dataless_unused,
12331 	int64_t size __no_dataless_unused,
12332 	char *lookup_name __no_dataless_unused,
12333 	size_t const namelen __no_dataless_unused,
12334 	vnode_t tdvp __no_dataless_unused)
12335 {
12336 #if CONFIG_DATALESS_FILES
12337 	kern_return_t kern_ret;
12338 	mach_port_t mach_port;
12339 	char *path = NULL;
12340 	vfs_context_t context;
12341 	int path_len;
12342 	int error;
12343 	audit_token_t atoken;
12344 	enum vtype vp_vtype;
12345 
12346 	/* Swap files are special; ignore them */
12347 	if (vnode_isswap(vp)) {
12348 		return 0;
12349 	}
12350 
12351 	/*
12352 	 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
12353 	 * are no longer used nor supported.
12354 	 */
12355 	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
12356 		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
12357 		return ENOTSUP;
12358 	}
12359 	if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
12360 		os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
12361 		return ENOTSUP;
12362 	}
12363 
12364 	/* Normalize 'op'. */
12365 	op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;
12366 
12367 	/*
12368 	 * To-directory is only meaningful for rename operations;
12369 	 * ignore it if someone handed one to us unexpectedly.
12370 	 */
12371 	if (op != NAMESPACE_HANDLER_RENAME_OP) {
12372 		tdvp = NULL;
12373 	}
12374 
12375 	context = vfs_context_current();
12376 
12377 	/* Remember this for later. */
12378 	vp_vtype = vnode_vtype(vp);
12379 
12380 	error = vfs_context_dataless_materialization_is_prevented(context);
12381 	if (error) {
12382 		log_materialization_prevented(vp, op);
12383 		goto out_check_errors;
12384 	}
12385 
12386 	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
12387 	    &mach_port);
12388 	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
12389 		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
12390 		/*
12391 		 * Treat this like being unable to access the backing store
12392 		 * server.
12393 		 */
12394 		return ETIMEDOUT;
12395 	}
12396 
12397 	int path_alloc_len = MAXPATHLEN;
12398 	do {
12399 		path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
12400 		if (path == NULL) {
12401 			return ENOMEM;
12402 		}
12403 
12404 		path_len = path_alloc_len;
12405 		error = vn_getpath(vp, path, &path_len);
12406 		if (error == 0) {
12407 			break;
12408 		} else if (error == ENOSPC) {
12409 			kfree_data(path, path_alloc_len);
12410 			path = NULL;
12411 		} else {
12412 			goto out_release_port;
12413 		}
12414 	} while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) &&
12415 	    path_alloc_len <= MAXLONGPATHLEN);
12416 
12417 	error = vfs_context_copy_audit_token(context, &atoken);
12418 	if (error) {
12419 		goto out_release_port;
12420 	}
12421 
12422 	struct nspace_resolver_request req = {
12423 		.r_req_id = next_nspace_req_id(),
12424 		.r_vp = vp,
12425 		.r_tdvp = tdvp,
12426 	};
12427 
12428 	error = nspace_resolver_req_add(&req);
12429 	if (error) {
12430 		goto out_release_port;
12431 	}
12432 
12433 	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12434 
12435 	if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
12436 		char *dest_path = NULL;
12437 		int dest_path_len;
12438 
12439 		dest_path = zalloc(ZV_NAMEI);
12440 		dest_path_len = MAXPATHLEN;
12441 
12442 		error = vn_getpath(tdvp, dest_path, &dest_path_len);
12443 		if (error) {
12444 			zfree(ZV_NAMEI, dest_path);
12445 			goto out_release_port;
12446 		}
12447 
12448 		/*
12449 		 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
12450 		 * compatibility with existing agents in user-space
12451 		 * who get passed this value.
12452 		 */
12453 		kern_ret = send_vfs_resolve_reparent_with_audit_token(mach_port,
12454 		    req.r_req_id,
12455 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12456 		    path, dest_path, atoken);
12457 
12458 		zfree(ZV_NAMEI, dest_path);
12459 	} else if (vp_vtype == VDIR) {
12460 		char *tmpname = NULL;
12461 
12462 		/*
12463 		 * If the caller provided a lookup_name *and* a name length,
12464 		 * then we assume the lookup_name is not NUL-terminated.
12465 		 * Allocate a temporary buffer in this case to provide
12466 		 * a NUL-terminated path name to the IPC call.
12467 		 */
12468 		if (lookup_name != NULL && namelen != 0) {
12469 			if (namelen >= PATH_MAX) {
12470 				error = EINVAL;
12471 				goto out_req_remove;
12472 			}
12473 			tmpname = zalloc(ZV_NAMEI);
12474 			strlcpy(tmpname, lookup_name, namelen + 1);
12475 			lookup_name = tmpname;
12476 		} else if (lookup_name != NULL) {
12477 			/*
12478 			 * If the caller provided a lookup_name with a
12479 			 * zero name length, then we assume it's NUL-
12480 			 * terminated.  Verify it has a valid length.
12481 			 */
12482 			if (strlen(lookup_name) >= PATH_MAX) {
12483 				error = EINVAL;
12484 				goto out_req_remove;
12485 			}
12486 		}
12487 
12488 		/* (See above.) */
12489 		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12490 		    req.r_req_id,
12491 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12492 		    lookup_name == NULL ? "" : lookup_name, path, atoken);
12493 
12494 		if (tmpname != NULL) {
12495 			zfree(ZV_NAMEI, tmpname);
12496 
12497 			/*
12498 			 * Poison lookup_name rather than reference
12499 			 * freed memory.
12500 			 */
12501 			lookup_name = NULL;
12502 		}
12503 	} else {
12504 		/* (See above.) */
12505 		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12506 		    req.r_req_id,
12507 		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
12508 		    offset, size, path, atoken);
12509 	}
12510 	if (kern_ret != KERN_SUCCESS) {
12511 		/*
12512 		 * Also treat this like being unable to access the backing
12513 		 * store server.
12514 		 */
12515 		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12516 		    kern_ret);
12517 		error = ETIMEDOUT;
12518 		goto out_req_remove;
12519 	}
12520 
12521 	/*
12522 	 * Give back the memory we allocated earlier while we wait; we
12523 	 * no longer need it.
12524 	 */
12525 	kfree_data(path, path_alloc_len);
12526 	path = NULL;
12527 
12528 	/*
12529 	 * Request has been submitted to the resolver. Now (interruptibly)
12530 	 * wait for completion. Upon requrn, the request will have been
12531 	 * removed from the lookup table.
12532 	 */
12533 	error = nspace_resolver_req_wait(&req);
12534 
12535 out_release_port:
12536 	if (path != NULL) {
12537 		kfree_data(path, path_alloc_len);
12538 		path = NULL;
12539 	}
12540 	ipc_port_release_send(mach_port);
12541 
12542 out_check_errors:
12543 	/*
12544 	 * The file resolver owns the logic about what error to return
12545 	 * to the caller.  We only need to handle a couple of special
12546 	 * cases here:
12547 	 */
12548 	if (error == EJUSTRETURN) {
12549 		/*
12550 		 * The requesting process is allowed to interact with
12551 		 * dataless objects.  Make a couple of sanity-checks
12552 		 * here to ensure the action makes sense.
12553 		 */
12554 		switch (op) {
12555 		case NAMESPACE_HANDLER_WRITE_OP:
12556 		case NAMESPACE_HANDLER_TRUNCATE_OP:
12557 		case NAMESPACE_HANDLER_RENAME_OP:
12558 			/*
12559 			 * This handles the case of the resolver itself
12560 			 * writing data to the file (or throwing it
12561 			 * away).
12562 			 */
12563 			error = 0;
12564 			break;
12565 		case NAMESPACE_HANDLER_READ_OP:
12566 		case NAMESPACE_HANDLER_LOOKUP_OP:
12567 			/*
12568 			 * This handles the case of the resolver needing
12569 			 * to look up inside of a dataless directory while
12570 			 * it's in the process of materializing it (for
12571 			 * example, creating files or directories).
12572 			 */
12573 			error = (vp_vtype == VDIR) ? 0 : EBADF;
12574 			break;
12575 		default:
12576 			error = EBADF;
12577 			break;
12578 		}
12579 	}
12580 
12581 	return error;
12582 
12583 out_req_remove:
12584 	nspace_resolver_req_remove(&req);
12585 	goto out_release_port;
12586 #else
12587 	return ENOTSUP;
12588 #endif /* CONFIG_DATALESS_FILES */
12589 }
12590 
12591 /*
12592  * vfs_materialize_file: Materialize a regular file.
12593  *
12594  * Inputs:
12595  * vp		The dataless file to be materialized.
12596  *
12597  * op		What kind of operation is being performed:
12598  *		-> NAMESPACE_HANDLER_READ_OP
12599  *		-> NAMESPACE_HANDLER_WRITE_OP
12600  *		-> NAMESPACE_HANDLER_LINK_CREATE
12601  *		-> NAMESPACE_HANDLER_DELETE_OP
12602  *		-> NAMESPACE_HANDLER_TRUNCATE_OP
12603  *		-> NAMESPACE_HANDLER_RENAME_OP
12604  *
12605  * offset	offset of I/O for READ or WRITE.  Ignored for
12606  *		other ops.
12607  *
12608  * size		size of I/O for READ or WRITE  Ignored for
12609  *		other ops.
12610  *
12611  * If offset or size are -1 for a READ or WRITE, then the resolver should
12612  * consider the range to be unknown.
12613  *
12614  * Upon successful return, the caller may proceed with the operation.
12615  * N.B. the file may still be "dataless" in this case.
12616  */
12617 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12618 vfs_materialize_file(
12619 	struct vnode *vp,
12620 	uint64_t op,
12621 	int64_t offset,
12622 	int64_t size)
12623 {
12624 	if (vp->v_type != VREG) {
12625 		return EFTYPE;
12626 	}
12627 	return vfs_materialize_item(vp, (uint32_t)op, offset, size, NULL, 0,
12628 	           NULL);
12629 }
12630 
12631 /*
12632  * vfs_materialize_dir:
12633  *
12634  * Inputs:
12635  * vp		The dataless directory to be materialized.
12636  *
12637  * op		What kind of operation is being performed:
12638  *		-> NAMESPACE_HANDLER_READ_OP
12639  *		-> NAMESPACE_HANDLER_WRITE_OP
12640  *		-> NAMESPACE_HANDLER_DELETE_OP
12641  *		-> NAMESPACE_HANDLER_RENAME_OP
12642  *		-> NAMESPACE_HANDLER_LOOKUP_OP
12643  *
12644  * lookup_name	Name being looked up for a LOOKUP op.  Ignored for
12645  *		other ops.  May or may not be NUL-terminated; see below.
12646  *
12647  * namelen	If non-zero, then lookup_name is assumed to not be NUL-
12648  *		terminated and namelen is the number of valid bytes in
12649  *		lookup_name. If zero, then lookup_name is assumed to be
12650  *		NUL-terminated.
12651  *
12652  * Upon successful return, the caller may proceed with the operation.
12653  * N.B. the directory may still be "dataless" in this case.
12654  */
12655 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12656 vfs_materialize_dir(
12657 	struct vnode *vp,
12658 	uint64_t op,
12659 	char *lookup_name,
12660 	size_t namelen)
12661 {
12662 	if (vp->v_type != VDIR) {
12663 		return EFTYPE;
12664 	}
12665 	if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12666 		return EINVAL;
12667 	}
12668 	return vfs_materialize_item(vp, (uint32_t)op, 0, 0, lookup_name,
12669 	           namelen, NULL);
12670 }
12671 
12672 /*
12673  * vfs_materialize_reparent:
12674  *
12675  * Inputs:
12676  * vp		The dataless file or directory to be materialized.
12677  *
12678  * tdvp		The new parent directory for the dataless file.
12679  *
12680  * Upon successful return, the caller may proceed with the operation.
12681  * N.B. the item may still be "dataless" in this case.
12682  */
12683 int
vfs_materialize_reparent(vnode_t vp,vnode_t tdvp)12684 vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
12685 {
12686 	if (vp->v_type != VDIR && vp->v_type != VREG) {
12687 		return EFTYPE;
12688 	}
12689 	return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
12690 	           0, 0, NULL, 0, tdvp);
12691 }
12692 
#if 0
/*
 * Disabled legacy helper: builds a "/.vol/<fsid>/<fileid>" style path
 * for 'vp'.  Not compiled; kept for reference only.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	/* If attributes are unavailable, emit a recognizable bogus path. */
	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12715 
12716 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12717 fsctl_bogus_command_compat(unsigned long cmd)
12718 {
12719 	switch (cmd) {
12720 	case IOCBASECMD(FSIOC_SYNC_VOLUME):
12721 		return FSIOC_SYNC_VOLUME;
12722 	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12723 		return FSIOC_ROUTEFS_SETROUTEID;
12724 	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12725 		return FSIOC_SET_PACKAGE_EXTS;
12726 	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12727 		return FSIOC_SET_FSTYPENAME_OVERRIDE;
12728 	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12729 		return DISK_CONDITIONER_IOC_GET;
12730 	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12731 		return DISK_CONDITIONER_IOC_SET;
12732 	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12733 		return FSIOC_FIOSEEKHOLE;
12734 	case IOCBASECMD(FSIOC_FIOSEEKDATA):
12735 		return FSIOC_FIOSEEKDATA;
12736 	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12737 		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12738 	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12739 		return SPOTLIGHT_IOC_GET_LAST_MTIME;
12740 	}
12741 
12742 	return cmd;
12743 }
12744 
12745 static int
cas_bsdflags_setattr(vnode_t vp,void * arg,vfs_context_t ctx)12746 cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
12747 {
12748 	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
12749 }
12750 
12751 static int __attribute__((noinline))
handle_sync_volume(vnode_t vp,vnode_t * arg_vp,caddr_t data,vfs_context_t ctx)12752 handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
12753 {
12754 	struct vfs_attr vfa;
12755 	mount_t mp = vp->v_mount;
12756 	unsigned arg;
12757 	int error;
12758 
12759 	/* record vid of vp so we can drop it below. */
12760 	uint32_t vvid = vp->v_id;
12761 
12762 	/*
12763 	 * Then grab mount_iterref so that we can release the vnode.
12764 	 * Without this, a thread may call vnode_iterate_prepare then
12765 	 * get into a deadlock because we've never released the root vp
12766 	 */
12767 	error = mount_iterref(mp, 0);
12768 	if (error) {
12769 		return error;
12770 	}
12771 	vnode_hold(vp);
12772 	vnode_put(vp);
12773 
12774 	arg = MNT_NOWAIT;
12775 	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
12776 		arg = MNT_WAIT;
12777 	}
12778 
12779 	/*
12780 	 * If the filessytem supports multiple filesytems in a
12781 	 * partition (For eg APFS volumes in a container, it knows
12782 	 * that the waitfor argument to VFS_SYNC are flags.
12783 	 */
12784 	VFSATTR_INIT(&vfa);
12785 	VFSATTR_WANTED(&vfa, f_capabilities);
12786 	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
12787 	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
12788 	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
12789 	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
12790 		arg |= MNT_VOLUME;
12791 	}
12792 
12793 	/* issue the sync for this volume */
12794 	(void)sync_callback(mp, &arg);
12795 
12796 	/*
12797 	 * Then release the mount_iterref once we're done syncing; it's not
12798 	 * needed for the VNOP_IOCTL below
12799 	 */
12800 	mount_iterdrop(mp);
12801 
12802 	if (arg & FSCTL_SYNC_FULLSYNC) {
12803 		/* re-obtain vnode iocount on the root vp, if possible */
12804 		error = vnode_getwithvid(vp, vvid);
12805 		if (error == 0) {
12806 			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
12807 			vnode_put(vp);
12808 		}
12809 	}
12810 	vnode_drop(vp);
12811 	/* mark the argument VP as having been released */
12812 	*arg_vp = NULL;
12813 	return error;
12814 }
12815 
#if ROUTEFS
static int __attribute__((noinline))
handle_routes(user_addr_t udata)
{
	char routepath[MAXPATHLEN];
	size_t copied = 0;
	int err;

	/* Mounting the route filesystem requires superuser privileges. */
	err = suser(kauth_cred_get(), &(current_proc()->p_acflag));
	if (err != 0) {
		return err;
	}

	/* Copy the mount path in from user space. */
	bzero(routepath, MAXPATHLEN);
	err = copyinstr(udata, &routepath[0], MAXPATHLEN, &copied);
	if (err != 0) {
		return err;
	}

	return routefs_kernel_mount(routepath);
}
#endif
12836 
12837 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12838 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12839 {
12840 	struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12841 	struct vnode_attr va;
12842 	int error;
12843 
12844 	VATTR_INIT(&va);
12845 	VATTR_SET(&va, va_flags, cas->new_flags);
12846 
12847 	error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12848 
12849 #if CONFIG_FSE
12850 	if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12851 		add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12852 	}
12853 #endif
12854 
12855 	return error;
12856 }
12857 
12858 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12859 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12860 {
12861 	struct mount *mp = NULL;
12862 	errno_t rootauth = 0;
12863 
12864 	mp = vp->v_mount;
12865 
12866 	/*
12867 	 * query the underlying FS and see if it reports something
12868 	 * sane for this vnode. If volume is authenticated via
12869 	 * chunklist, leave that for the caller to determine.
12870 	 */
12871 	rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12872 
12873 	return rootauth;
12874 }
12875 
12876 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12877 	"com.apple.private.kernel.set-package-extensions"
12878 
12879 /*
12880  * Make a filesystem-specific control call:
12881  */
12882 /* ARGSUSED */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	/* Small arguments live on the stack; larger ones are heap-allocated. */
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* fsctl does not apply to device special files. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* Map legacy size-stripped selectors back to full commands. */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Choose the argument staging buffer: heap if too big for the stack. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	/*
	 * Stage the user argument according to the command's direction
	 * bits: copy in for IOC_IN, zero the buffer for IOC_OUT (so the
	 * user always gets deterministic contents back), and stash the
	 * raw udata value for IOC_VOID / zero-size commands.
	 */
	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* NB: may drop the vnode iocount and NULL out *arg_vp. */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t    num_entries;
		uint32_t    max_width;

		/* Installing package extensions is entitlement-gated. */
		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct.  otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width   = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width   = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		mount_t mp;

		/* Overriding the fstype name requires superuser privileges. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if ((mp = vp->v_mount) != NULL) {
			mount_lock(mp);
			if (data[0] != 0) {
				/* Require a NUL within the first MFSTYPENAMELEN bytes. */
				for (int i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data
				 * string which has no NULL termination in
				 * its first MFSTYPENAMELEN bytes.  This is
				 * bogus, let's avoid strlcpy-ing the read
				 * data and return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				vfs_setfstypename_locked(mp, data);
				/* "mtmfs" read-only mounts get extended security semantics. */
				if (vfs_isrdonly(mp) &&
				    strcmp(data, "mtmfs") == 0) {
					mp->mnt_kern_flag |=
					    MNTK_EXTENDED_SECURITY;
					mp->mnt_kern_flag &=
					    ~MNTK_AUTH_OPAQUE;
				}
			} else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
				/* Empty string clears an existing override. */
				const char *name =
				    vfs_getfstypenameref_locked(mp, NULL);
				if (strcmp(name, "mtmfs") == 0) {
					mp->mnt_kern_flag &=
					    ~MNTK_EXTENDED_SECURITY;
				}
				vfs_setfstypename_locked(mp, NULL);
			}
unlock:
			mount_unlock(mp);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/*
		 * Report EBUSY if anyone else has this vnode in use
		 * (named streams on the vnode are tolerated).
		 */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
		break;

#if CONFIG_EXCLAVES
	case FSIOC_EXCLAVE_FS_REGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_register(((fsioc_exclave_fs_register_t *)data)->fs_tag, vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_UNREGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_unregister(vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: {
		exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data);
		exclave_fs_base_dir_t *dirs = NULL;
		if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = EPERM;
			break;
		}
		if (get_base_dirs->base_dirs) {
			/* Bound the caller-supplied count before allocating. */
			if ((get_base_dirs->count == 0) || (get_base_dirs->count > EXCLAVE_FS_GET_BASE_DIRS_MAX_COUNT)) {
				error = EINVAL;
				break;
			}
			dirs = kalloc_type(exclave_fs_base_dir_t, get_base_dirs->count, Z_WAITOK | Z_ZERO);
			if (!dirs) {
				error = ENOSPC;
				break;
			}
		}
		error = vfs_exclave_fs_get_base_dirs(dirs, &get_base_dirs->count);
		if (!error && dirs) {
			error = copyout(dirs, (user_addr_t)get_base_dirs->base_dirs,
			    get_base_dirs->count * sizeof(exclave_fs_base_dir_t));
		}
		if (dirs) {
			kfree_type(exclave_fs_base_dir_t, get_base_dirs->count, dirs);
		}
	}
	break;
#endif

	default: {
		/*
		 * Other, known commands shouldn't be passed down here.
		 * (When adding a selector to this list, it may be prudent
		 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
		 */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
		case F_SPECULATIVE_READ:
		case F_ATTRIBUTION_TAG:
		case F_TRANSFEREXTENTS:
		case F_ASSERT_BG_ACCESS:
		case F_RELEASE_BG_ACCESS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
13191 
/* ARGSUSED */
/*
 * fsctl: path-based filesystem control syscall.  Resolves uap->path to
 * a vnode (honoring FSOPT_NOFOLLOW and firmlink semantics for
 * FSIOC_FIRMLINK_CTL) and dispatches to fsctl_internal().
 */
int
fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	uint32_t nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on:  */
	nameiflags = 0;
	//
	// if we come through fsctl() then the file is by definition not open.
	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
	// lest the caller mistakenly thinks the only open is their own (but in
	// reality it's someone elses).
	//
	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
		return EINVAL;
	}
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
		/* Firmlink control must see the firmlink itself, uncached. */
		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
	}
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		goto done;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	/* NB: fsctl_internal() may drop the iocount and NULL out vp. */
	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	if (vp) {
		vnode_put(vp);
	}
	return error;
}
13244 /* ARGSUSED */
13245 int
ffsctl(proc_t p,struct ffsctl_args * uap,__unused int32_t * retval)13246 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
13247 {
13248 	int error;
13249 	vnode_t vp = NULL;
13250 	vfs_context_t ctx = vfs_context_current();
13251 	int fd = -1;
13252 
13253 	AUDIT_ARG(fd, uap->fd);
13254 	AUDIT_ARG(cmd, (int)uap->cmd);
13255 	AUDIT_ARG(value32, uap->options);
13256 
13257 	/* Get the vnode for the file we are getting info on:  */
13258 	if ((error = file_vnode(uap->fd, &vp))) {
13259 		return error;
13260 	}
13261 	fd = uap->fd;
13262 	if ((error = vnode_getwithref(vp))) {
13263 		file_drop(fd);
13264 		return error;
13265 	}
13266 
13267 #if CONFIG_MACF
13268 	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
13269 		file_drop(fd);
13270 		vnode_put(vp);
13271 		return error;
13272 	}
13273 #endif
13274 
13275 	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
13276 
13277 	file_drop(fd);
13278 
13279 	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
13280 	if (vp) {
13281 		vnode_put(vp);
13282 	}
13283 
13284 	return error;
13285 }
13286 /* end of fsctl system call */
13287 
13288 #define FILESEC_ACCESS_ENTITLEMENT              \
13289 	"com.apple.private.vfs.filesec-access"
13290 
13291 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)13292 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
13293 {
13294 	if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
13295 		/*
13296 		 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
13297 		 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
13298 		 */
13299 		if ((!setting && vfs_context_issuser(ctx)) ||
13300 		    IOTaskHasEntitlement(vfs_context_task(ctx),
13301 		    FILESEC_ACCESS_ENTITLEMENT)) {
13302 			return 0;
13303 		}
13304 	}
13305 
13306 	return EPERM;
13307 }
13308 
13309 /*
13310  *  Retrieve the data of an extended attribute.
13311  */
13312 int
getxattr(proc_t p,struct getxattr_args * uap,user_ssize_t * retval)13313 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
13314 {
13315 	vnode_t vp;
13316 	struct nameidata nd;
13317 	char attrname[XATTR_MAXNAMELEN + 1];
13318 	vfs_context_t ctx = vfs_context_current();
13319 	uio_t auio = NULL;
13320 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13321 	size_t attrsize = 0;
13322 	size_t namelen;
13323 	u_int32_t nameiflags;
13324 	int error;
13325 	UIO_STACKBUF(uio_buf, 1);
13326 
13327 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13328 		return EINVAL;
13329 	}
13330 
13331 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13332 	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
13333 	if (uap->options & XATTR_NOFOLLOW_ANY) {
13334 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
13335 	}
13336 
13337 	if ((error = namei(&nd))) {
13338 		return error;
13339 	}
13340 	vp = nd.ni_vp;
13341 	nameidone(&nd);
13342 
13343 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13344 	if (error != 0) {
13345 		goto out;
13346 	}
13347 	if (xattr_protected(attrname) &&
13348 	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13349 		goto out;
13350 	}
13351 	/*
13352 	 * the specific check for 0xffffffff is a hack to preserve
13353 	 * binaray compatibilty in K64 with applications that discovered
13354 	 * that passing in a buf pointer and a size of -1 resulted in
13355 	 * just the size of the indicated extended attribute being returned.
13356 	 * this isn't part of the documented behavior, but because of the
13357 	 * original implemtation's check for "uap->size > 0", this behavior
13358 	 * was allowed. In K32 that check turned into a signed comparison
13359 	 * even though uap->size is unsigned...  in K64, we blow by that
13360 	 * check because uap->size is unsigned and doesn't get sign smeared
13361 	 * in the munger for a 32 bit user app.  we also need to add a
13362 	 * check to limit the maximum size of the buffer being passed in...
13363 	 * unfortunately, the underlying fileystems seem to just malloc
13364 	 * the requested size even if the actual extended attribute is tiny.
13365 	 * because that malloc is for kernel wired memory, we have to put a
13366 	 * sane limit on it.
13367 	 *
13368 	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
13369 	 * U64 running on K64 will yield -1 (64 bits wide)
13370 	 * U32/U64 running on K32 will yield -1 (32 bits wide)
13371 	 */
13372 	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
13373 		goto no_uio;
13374 	}
13375 
13376 	if (uap->value) {
13377 		if (uap->size > (size_t)XATTR_MAXSIZE) {
13378 			uap->size = XATTR_MAXSIZE;
13379 		}
13380 
13381 		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13382 		    &uio_buf[0], sizeof(uio_buf));
13383 		uio_addiov(auio, uap->value, uap->size);
13384 	}
13385 no_uio:
13386 	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
13387 out:
13388 	vnode_put(vp);
13389 
13390 	if (auio) {
13391 		*retval = uap->size - uio_resid(auio);
13392 	} else {
13393 		*retval = (user_ssize_t)attrsize;
13394 	}
13395 
13396 	return error;
13397 }
13398 
13399 /*
13400  * Retrieve the data of an extended attribute.
13401  */
13402 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)13403 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
13404 {
13405 	vnode_t vp;
13406 	char attrname[XATTR_MAXNAMELEN + 1];
13407 	vfs_context_t ctx = vfs_context_current();
13408 	uio_t auio = NULL;
13409 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13410 	size_t attrsize = 0;
13411 	size_t namelen;
13412 	int error;
13413 	UIO_STACKBUF(uio_buf, 1);
13414 
13415 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13416 	    XATTR_NOFOLLOW_ANY)) {
13417 		return EINVAL;
13418 	}
13419 
13420 	if ((error = file_vnode(uap->fd, &vp))) {
13421 		return error;
13422 	}
13423 	if ((error = vnode_getwithref(vp))) {
13424 		file_drop(uap->fd);
13425 		return error;
13426 	}
13427 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13428 	if (error != 0) {
13429 		goto out;
13430 	}
13431 	if (xattr_protected(attrname) &&
13432 	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13433 		goto out;
13434 	}
13435 	if (uap->value && uap->size > 0) {
13436 		if (uap->size > (size_t)XATTR_MAXSIZE) {
13437 			uap->size = XATTR_MAXSIZE;
13438 		}
13439 
13440 		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13441 		    &uio_buf[0], sizeof(uio_buf));
13442 		uio_addiov(auio, uap->value, uap->size);
13443 	}
13444 
13445 	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13446 out:
13447 	(void)vnode_put(vp);
13448 	file_drop(uap->fd);
13449 
13450 	if (auio) {
13451 		*retval = uap->size - uio_resid(auio);
13452 	} else {
13453 		*retval = (user_ssize_t)attrsize;
13454 	}
13455 	return error;
13456 }
13457 
/* Heap-allocated scratch state for setxattr(); kept off the kernel stack. */
struct setxattr_ctx {
	struct nameidata nd;                    /* path lookup state for namei() */
	char attrname[XATTR_MAXNAMELEN + 1];    /* NUL-terminated attribute name copied from user space */
	UIO_STACKBUF(uio_buf, 1);               /* backing store for the single-iovec write uio */
};
13464 
13465 /*
13466  * Set the data of an extended attribute.
13467  */
13468 int
setxattr(proc_t p,struct setxattr_args * uap,int * retval)13469 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
13470 {
13471 	vnode_t vp;
13472 	vfs_context_t ctx = vfs_context_current();
13473 	uio_t auio = NULL;
13474 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13475 	size_t namelen;
13476 	u_int32_t nameiflags;
13477 	int error;
13478 	struct setxattr_ctx *sactx;
13479 
13480 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13481 		return EINVAL;
13482 	}
13483 
13484 	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
13485 	if (sactx == NULL) {
13486 		return ENOMEM;
13487 	}
13488 
13489 	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
13490 	if (error != 0) {
13491 		if (error == EPERM) {
13492 			/* if the string won't fit in attrname, copyinstr emits EPERM */
13493 			error = ENAMETOOLONG;
13494 		}
13495 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13496 		goto out;
13497 	}
13498 	if (xattr_protected(sactx->attrname) &&
13499 	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
13500 		goto out;
13501 	}
13502 	if (uap->size != 0 && uap->value == 0) {
13503 		error = EINVAL;
13504 		goto out;
13505 	}
13506 	if (uap->size > INT_MAX) {
13507 		error = E2BIG;
13508 		goto out;
13509 	}
13510 
13511 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13512 #if CONFIG_FILE_LEASES
13513 	nameiflags |= WANTPARENT;
13514 #endif
13515 	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
13516 	if (uap->options & XATTR_NOFOLLOW_ANY) {
13517 		sactx->nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
13518 	}
13519 
13520 	if ((error = namei(&sactx->nd))) {
13521 		goto out;
13522 	}
13523 	vp = sactx->nd.ni_vp;
13524 #if CONFIG_FILE_LEASES
13525 	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
13526 	vnode_put(sactx->nd.ni_dvp);
13527 #endif
13528 	nameidone(&sactx->nd);
13529 
13530 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13531 	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
13532 	uio_addiov(auio, uap->value, uap->size);
13533 
13534 	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
13535 #if CONFIG_FSE
13536 	if (error == 0) {
13537 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
13538 		    FSE_ARG_VNODE, vp,
13539 		    FSE_ARG_DONE);
13540 	}
13541 #endif
13542 	vnode_put(vp);
13543 out:
13544 	kfree_type(struct setxattr_ctx, sactx);
13545 	*retval = 0;
13546 	return error;
13547 }
13548 
13549 /*
13550  * Set the data of an extended attribute.
13551  */
13552 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13553 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13554 {
13555 	vnode_t vp;
13556 	char attrname[XATTR_MAXNAMELEN + 1];
13557 	vfs_context_t ctx = vfs_context_current();
13558 	uio_t auio = NULL;
13559 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13560 	size_t namelen;
13561 	int error;
13562 	UIO_STACKBUF(uio_buf, 1);
13563 
13564 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13565 	    XATTR_NOFOLLOW_ANY)) {
13566 		return EINVAL;
13567 	}
13568 
13569 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13570 	if (error != 0) {
13571 		if (error == EPERM) {
13572 			/* if the string won't fit in attrname, copyinstr emits EPERM */
13573 			return ENAMETOOLONG;
13574 		}
13575 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13576 		return error;
13577 	}
13578 	if (xattr_protected(attrname) &&
13579 	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13580 		return error;
13581 	}
13582 	if (uap->size != 0 && uap->value == 0) {
13583 		return EINVAL;
13584 	}
13585 	if (uap->size > INT_MAX) {
13586 		return E2BIG;
13587 	}
13588 	if ((error = file_vnode(uap->fd, &vp))) {
13589 		return error;
13590 	}
13591 	if ((error = vnode_getwithref(vp))) {
13592 		file_drop(uap->fd);
13593 		return error;
13594 	}
13595 
13596 #if CONFIG_FILE_LEASES
13597 	vnode_breakdirlease(vp, true, O_WRONLY);
13598 #endif
13599 
13600 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13601 	    &uio_buf[0], sizeof(uio_buf));
13602 	uio_addiov(auio, uap->value, uap->size);
13603 
13604 	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13605 #if CONFIG_FSE
13606 	if (error == 0) {
13607 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
13608 		    FSE_ARG_VNODE, vp,
13609 		    FSE_ARG_DONE);
13610 	}
13611 #endif
13612 	vnode_put(vp);
13613 	file_drop(uap->fd);
13614 	*retval = 0;
13615 	return error;
13616 }
13617 
13618 /*
13619  * Remove an extended attribute.
13620  * XXX Code duplication here.
13621  */
13622 int
removexattr(proc_t p,struct removexattr_args * uap,int * retval)13623 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
13624 {
13625 	vnode_t vp;
13626 	struct nameidata nd;
13627 	char attrname[XATTR_MAXNAMELEN + 1];
13628 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13629 	vfs_context_t ctx = vfs_context_current();
13630 	size_t namelen;
13631 	u_int32_t nameiflags;
13632 	int error;
13633 
13634 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13635 		return EINVAL;
13636 	}
13637 
13638 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13639 	if (error != 0) {
13640 		return error;
13641 	}
13642 	if (xattr_protected(attrname)) {
13643 		return EPERM;
13644 	}
13645 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13646 #if CONFIG_FILE_LEASES
13647 	nameiflags |= WANTPARENT;
13648 #endif
13649 	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
13650 	if (uap->options & XATTR_NOFOLLOW_ANY) {
13651 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
13652 	}
13653 
13654 	if ((error = namei(&nd))) {
13655 		return error;
13656 	}
13657 	vp = nd.ni_vp;
13658 #if CONFIG_FILE_LEASES
13659 	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
13660 	vnode_put(nd.ni_dvp);
13661 #endif
13662 	nameidone(&nd);
13663 
13664 	error = vn_removexattr(vp, attrname, uap->options, ctx);
13665 #if CONFIG_FSE
13666 	if (error == 0) {
13667 		add_fsevent(FSE_XATTR_REMOVED, ctx,
13668 		    FSE_ARG_VNODE, vp,
13669 		    FSE_ARG_DONE);
13670 	}
13671 #endif
13672 	vnode_put(vp);
13673 	*retval = 0;
13674 	return error;
13675 }
13676 
13677 /*
13678  * Remove an extended attribute.
13679  * XXX Code duplication here.
13680  */
13681 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)13682 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
13683 {
13684 	vnode_t vp;
13685 	char attrname[XATTR_MAXNAMELEN + 1];
13686 	size_t namelen;
13687 	int error;
13688 #if CONFIG_FSE
13689 	vfs_context_t ctx = vfs_context_current();
13690 #endif
13691 
13692 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13693 	    XATTR_NOFOLLOW_ANY)) {
13694 		return EINVAL;
13695 	}
13696 
13697 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13698 	if (error != 0) {
13699 		return error;
13700 	}
13701 	if (xattr_protected(attrname)) {
13702 		return EPERM;
13703 	}
13704 	if ((error = file_vnode(uap->fd, &vp))) {
13705 		return error;
13706 	}
13707 	if ((error = vnode_getwithref(vp))) {
13708 		file_drop(uap->fd);
13709 		return error;
13710 	}
13711 
13712 #if CONFIG_FILE_LEASES
13713 	vnode_breakdirlease(vp, true, O_WRONLY);
13714 #endif
13715 
13716 	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
13717 #if CONFIG_FSE
13718 	if (error == 0) {
13719 		add_fsevent(FSE_XATTR_REMOVED, ctx,
13720 		    FSE_ARG_VNODE, vp,
13721 		    FSE_ARG_DONE);
13722 	}
13723 #endif
13724 	vnode_put(vp);
13725 	file_drop(uap->fd);
13726 	*retval = 0;
13727 	return error;
13728 }
13729 
13730 /*
13731  * Retrieve the list of extended attribute names.
13732  * XXX Code duplication here.
13733  */
13734 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)13735 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
13736 {
13737 	vnode_t vp;
13738 	struct nameidata nd;
13739 	vfs_context_t ctx = vfs_context_current();
13740 	uio_t auio = NULL;
13741 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13742 	size_t attrsize = 0;
13743 	u_int32_t nameiflags;
13744 	int error;
13745 	UIO_STACKBUF(uio_buf, 1);
13746 
13747 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13748 		return EINVAL;
13749 	}
13750 
13751 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13752 	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
13753 	if (uap->options & XATTR_NOFOLLOW_ANY) {
13754 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
13755 	}
13756 
13757 	if ((error = namei(&nd))) {
13758 		return error;
13759 	}
13760 	vp = nd.ni_vp;
13761 	nameidone(&nd);
13762 	if (uap->namebuf != 0 && uap->bufsize > 0) {
13763 		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
13764 		    &uio_buf[0], sizeof(uio_buf));
13765 		uio_addiov(auio, uap->namebuf, uap->bufsize);
13766 	}
13767 
13768 	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
13769 
13770 	vnode_put(vp);
13771 	if (auio) {
13772 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13773 	} else {
13774 		*retval = (user_ssize_t)attrsize;
13775 	}
13776 	return error;
13777 }
13778 
13779 /*
13780  * Retrieve the list of extended attribute names.
13781  * XXX Code duplication here.
13782  */
13783 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)13784 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13785 {
13786 	vnode_t vp;
13787 	uio_t auio = NULL;
13788 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13789 	size_t attrsize = 0;
13790 	int error;
13791 	UIO_STACKBUF(uio_buf, 1);
13792 
13793 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13794 	    XATTR_NOFOLLOW_ANY)) {
13795 		return EINVAL;
13796 	}
13797 
13798 	if ((error = file_vnode(uap->fd, &vp))) {
13799 		return error;
13800 	}
13801 	if ((error = vnode_getwithref(vp))) {
13802 		file_drop(uap->fd);
13803 		return error;
13804 	}
13805 	if (uap->namebuf != 0 && uap->bufsize > 0) {
13806 		auio = uio_createwithbuffer(1, 0, spacetype,
13807 		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
13808 		uio_addiov(auio, uap->namebuf, uap->bufsize);
13809 	}
13810 
13811 	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13812 
13813 	vnode_put(vp);
13814 	file_drop(uap->fd);
13815 	if (auio) {
13816 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13817 	} else {
13818 		*retval = (user_ssize_t)attrsize;
13819 	}
13820 	return error;
13821 }
13822 
13823 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13824 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13825     vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13826 {
13827 	int error;
13828 	struct mount *mp = NULL;
13829 	vnode_t vp;
13830 	int length;
13831 	int bpflags;
13832 	/* maximum number of times to retry build_path */
13833 	unsigned int retries = 0x10;
13834 
13835 	if (bufsize > MAXLONGPATHLEN) {
13836 		return EINVAL;
13837 	}
13838 
13839 	if (buf == NULL) {
13840 		return ENOMEM;
13841 	}
13842 
13843 retry:
13844 	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13845 		error = ENOTSUP;  /* unexpected failure */
13846 		return ENOTSUP;
13847 	}
13848 
13849 #if CONFIG_UNION_MOUNTS
13850 unionget:
13851 #endif /* CONFIG_UNION_MOUNTS */
13852 	if (objid == 2) {
13853 		struct vfs_attr vfsattr;
13854 		int use_vfs_root = TRUE;
13855 
13856 		VFSATTR_INIT(&vfsattr);
13857 		VFSATTR_WANTED(&vfsattr, f_capabilities);
13858 		if (!(options & FSOPT_ISREALFSID) &&
13859 		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13860 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13861 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13862 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13863 				use_vfs_root = FALSE;
13864 			}
13865 		}
13866 
13867 		if (use_vfs_root) {
13868 			error = VFS_ROOT(mp, &vp, ctx);
13869 		} else {
13870 			error = VFS_VGET(mp, objid, &vp, ctx);
13871 		}
13872 	} else {
13873 		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13874 	}
13875 
13876 #if CONFIG_UNION_MOUNTS
13877 	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13878 		/*
13879 		 * If the fileid isn't found and we're in a union
13880 		 * mount volume, then see if the fileid is in the
13881 		 * mounted-on volume.
13882 		 */
13883 		struct mount *tmp = mp;
13884 		mp = vnode_mount(tmp->mnt_vnodecovered);
13885 		vfs_unbusy(tmp);
13886 		if (vfs_busy(mp, LK_NOWAIT) == 0) {
13887 			goto unionget;
13888 		}
13889 	} else {
13890 		vfs_unbusy(mp);
13891 	}
13892 #else
13893 	vfs_unbusy(mp);
13894 #endif /* CONFIG_UNION_MOUNTS */
13895 
13896 	if (error) {
13897 		return error;
13898 	}
13899 
13900 #if CONFIG_MACF
13901 	error = mac_vnode_check_fsgetpath(ctx, vp);
13902 	if (error) {
13903 		vnode_put(vp);
13904 		return error;
13905 	}
13906 #endif
13907 
13908 	/* Obtain the absolute path to this vnode. */
13909 	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13910 	if (options & FSOPT_NOFIRMLINKPATH) {
13911 		bpflags |= BUILDPATH_NO_FIRMLINK;
13912 	}
13913 	bpflags |= BUILDPATH_CHECK_MOVED;
13914 	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13915 	vnode_put(vp);
13916 
13917 	if (error) {
13918 		/* there was a race building the path, try a few more times */
13919 		if (error == EAGAIN) {
13920 			--retries;
13921 			if (retries > 0) {
13922 				goto retry;
13923 			}
13924 
13925 			error = ENOENT;
13926 		}
13927 		goto out;
13928 	}
13929 
13930 	AUDIT_ARG(text, buf);
13931 
13932 	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13933 		kdebug_vfs_lookup(buf, length, vp, KDBG_VFSLKUP_LOOKUP);
13934 	}
13935 
13936 	*pathlen = length; /* may be superseded by error */
13937 
13938 out:
13939 	return error;
13940 }
13941 
13942 /*
13943  * Obtain the full pathname of a file system object by id.
13944  */
13945 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)13946 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
13947     uint32_t options, user_ssize_t *retval)
13948 {
13949 	vfs_context_t ctx = vfs_context_current();
13950 	fsid_t fsid;
13951 	char *realpath;
13952 	int length;
13953 	int error;
13954 
13955 	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
13956 		return EINVAL;
13957 	}
13958 
13959 	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
13960 		return error;
13961 	}
13962 	AUDIT_ARG(value32, fsid.val[0]);
13963 	AUDIT_ARG(value64, objid);
13964 	/* Restrict output buffer size for now. */
13965 
13966 	if (bufsize > MAXLONGPATHLEN || bufsize <= 0) {
13967 		return EINVAL;
13968 	}
13969 	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
13970 	if (realpath == NULL) {
13971 		return ENOMEM;
13972 	}
13973 
13974 	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
13975 	    options, &length);
13976 
13977 	if (error) {
13978 		goto out;
13979 	}
13980 
13981 	error = copyout((caddr_t)realpath, buf, length);
13982 
13983 	*retval = (user_ssize_t)length; /* may be superseded by error */
13984 out:
13985 	kfree_data(realpath, bufsize);
13986 	return error;
13987 }
13988 
/* fsgetpath(2): legacy entry point; forwards with no options set. */
int
fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           0, retval);
}
13995 
/* fsgetpath_ext(2): like fsgetpath(2) but with caller-supplied options. */
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           uap->options, retval);
}
14002 
14003 /*
14004  * Common routine to handle various flavors of statfs data heading out
14005  *	to user space.
14006  *
14007  * Returns:	0			Success
14008  *		EFAULT
14009  */
14010 static int
munge_statfs(struct mount * mp,struct vfsstatfs * sfsp,user_addr_t bufp,int * sizep,boolean_t is_64_bit,boolean_t partial_copy)14011 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
14012     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
14013     boolean_t partial_copy)
14014 {
14015 	int             error;
14016 	int             my_size, copy_size;
14017 
14018 	if (is_64_bit) {
14019 		struct user64_statfs sfs;
14020 		my_size = copy_size = sizeof(sfs);
14021 		bzero(&sfs, my_size);
14022 		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
14023 		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
14024 		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
14025 		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
14026 		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
14027 		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
14028 		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
14029 		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
14030 		sfs.f_files = (user64_long_t)sfsp->f_files;
14031 		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
14032 		sfs.f_fsid = sfsp->f_fsid;
14033 		sfs.f_owner = sfsp->f_owner;
14034 		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
14035 		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
14036 		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
14037 
14038 		if (partial_copy) {
14039 			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
14040 		}
14041 		error = copyout((caddr_t)&sfs, bufp, copy_size);
14042 	} else {
14043 		struct user32_statfs sfs;
14044 
14045 		my_size = copy_size = sizeof(sfs);
14046 		bzero(&sfs, my_size);
14047 
14048 		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
14049 		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
14050 		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
14051 
14052 		/*
14053 		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
14054 		 * have to fudge the numbers here in that case.   We inflate the blocksize in order
14055 		 * to reflect the filesystem size as best we can.
14056 		 */
14057 		if ((sfsp->f_blocks > INT_MAX)
14058 		    /* Hack for 4061702 . I think the real fix is for Carbon to
14059 		     * look for some volume capability and not depend on hidden
14060 		     * semantics agreed between a FS and carbon.
14061 		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
14062 		     * for Carbon to set bNoVolumeSizes volume attribute.
14063 		     * Without this the webdavfs files cannot be copied onto
14064 		     * disk as they look huge. This change should not affect
14065 		     * XSAN as they should not setting these to -1..
14066 		     */
14067 		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
14068 		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
14069 		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
14070 			int             shift;
14071 
14072 			/*
14073 			 * Work out how far we have to shift the block count down to make it fit.
14074 			 * Note that it's possible to have to shift so far that the resulting
14075 			 * blocksize would be unreportably large.  At that point, we will clip
14076 			 * any values that don't fit.
14077 			 *
14078 			 * For safety's sake, we also ensure that f_iosize is never reported as
14079 			 * being smaller than f_bsize.
14080 			 */
14081 			for (shift = 0; shift < 32; shift++) {
14082 				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
14083 					break;
14084 				}
14085 				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
14086 					break;
14087 				}
14088 			}
14089 #define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
14090 			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
14091 			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
14092 			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
14093 #undef __SHIFT_OR_CLIP
14094 			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
14095 			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
14096 		} else {
14097 			/* filesystem is small enough to be reported honestly */
14098 			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
14099 			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
14100 			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
14101 			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
14102 			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
14103 		}
14104 		sfs.f_files = (user32_long_t)sfsp->f_files;
14105 		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
14106 		sfs.f_fsid = sfsp->f_fsid;
14107 		sfs.f_owner = sfsp->f_owner;
14108 		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
14109 		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
14110 		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
14111 
14112 		if (partial_copy) {
14113 			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
14114 		}
14115 		error = copyout((caddr_t)&sfs, bufp, copy_size);
14116 	}
14117 
14118 	if (sizep != NULL) {
14119 		*sizep = my_size;
14120 	}
14121 	return error;
14122 }
14123 
14124 /*
14125  * copy stat structure into user_stat structure.
14126  */
14127 void
munge_user64_stat(struct stat * sbp,struct user64_stat * usbp)14128 munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
14129 {
14130 	bzero(usbp, sizeof(*usbp));
14131 
14132 	usbp->st_dev = sbp->st_dev;
14133 	usbp->st_ino = sbp->st_ino;
14134 	usbp->st_mode = sbp->st_mode;
14135 	usbp->st_nlink = sbp->st_nlink;
14136 	usbp->st_uid = sbp->st_uid;
14137 	usbp->st_gid = sbp->st_gid;
14138 	usbp->st_rdev = sbp->st_rdev;
14139 #ifndef _POSIX_C_SOURCE
14140 	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
14141 	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
14142 	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
14143 	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
14144 	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
14145 	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
14146 #else
14147 	usbp->st_atime = sbp->st_atime;
14148 	usbp->st_atimensec = sbp->st_atimensec;
14149 	usbp->st_mtime = sbp->st_mtime;
14150 	usbp->st_mtimensec = sbp->st_mtimensec;
14151 	usbp->st_ctime = sbp->st_ctime;
14152 	usbp->st_ctimensec = sbp->st_ctimensec;
14153 #endif
14154 	usbp->st_size = sbp->st_size;
14155 	usbp->st_blocks = sbp->st_blocks;
14156 	usbp->st_blksize = sbp->st_blksize;
14157 	usbp->st_flags = sbp->st_flags;
14158 	usbp->st_gen = sbp->st_gen;
14159 	usbp->st_lspare = sbp->st_lspare;
14160 	usbp->st_qspare[0] = sbp->st_qspare[0];
14161 	usbp->st_qspare[1] = sbp->st_qspare[1];
14162 }
14163 
/*
 * Copy the in-kernel stat structure into the 32-bit user ABI layout.
 * Time fields are explicitly narrowed; usbp is zeroed first so padding
 * never leaks kernel data.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* full-resolution timespec variant; narrowed to the 32-bit ABI types */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	/* POSIX variant: split seconds/nanoseconds fields */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14200 
14201 /*
14202  * copy stat64 structure into user_stat64 structure.
14203  */
14204 void
munge_user64_stat64(struct stat64 * sbp,struct user64_stat64 * usbp)14205 munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
14206 {
14207 	bzero(usbp, sizeof(*usbp));
14208 
14209 	usbp->st_dev = sbp->st_dev;
14210 	usbp->st_ino = sbp->st_ino;
14211 	usbp->st_mode = sbp->st_mode;
14212 	usbp->st_nlink = sbp->st_nlink;
14213 	usbp->st_uid = sbp->st_uid;
14214 	usbp->st_gid = sbp->st_gid;
14215 	usbp->st_rdev = sbp->st_rdev;
14216 #ifndef _POSIX_C_SOURCE
14217 	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
14218 	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
14219 	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
14220 	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
14221 	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
14222 	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
14223 	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
14224 	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
14225 #else
14226 	usbp->st_atime = sbp->st_atime;
14227 	usbp->st_atimensec = sbp->st_atimensec;
14228 	usbp->st_mtime = sbp->st_mtime;
14229 	usbp->st_mtimensec = sbp->st_mtimensec;
14230 	usbp->st_ctime = sbp->st_ctime;
14231 	usbp->st_ctimensec = sbp->st_ctimensec;
14232 	usbp->st_birthtime = sbp->st_birthtime;
14233 	usbp->st_birthtimensec = sbp->st_birthtimensec;
14234 #endif
14235 	usbp->st_size = sbp->st_size;
14236 	usbp->st_blocks = sbp->st_blocks;
14237 	usbp->st_blksize = sbp->st_blksize;
14238 	usbp->st_flags = sbp->st_flags;
14239 	usbp->st_gen = sbp->st_gen;
14240 	usbp->st_lspare = sbp->st_lspare;
14241 	usbp->st_qspare[0] = sbp->st_qspare[0];
14242 	usbp->st_qspare[1] = sbp->st_qspare[1];
14243 }
14244 
14245 void
munge_user32_stat64(struct stat64 * sbp,struct user32_stat64 * usbp)14246 munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
14247 {
14248 	bzero(usbp, sizeof(*usbp));
14249 
14250 	usbp->st_dev = sbp->st_dev;
14251 	usbp->st_ino = sbp->st_ino;
14252 	usbp->st_mode = sbp->st_mode;
14253 	usbp->st_nlink = sbp->st_nlink;
14254 	usbp->st_uid = sbp->st_uid;
14255 	usbp->st_gid = sbp->st_gid;
14256 	usbp->st_rdev = sbp->st_rdev;
14257 #ifndef _POSIX_C_SOURCE
14258 	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
14259 	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
14260 	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
14261 	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
14262 	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
14263 	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
14264 	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
14265 	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
14266 #else
14267 	usbp->st_atime = sbp->st_atime;
14268 	usbp->st_atimensec = sbp->st_atimensec;
14269 	usbp->st_mtime = sbp->st_mtime;
14270 	usbp->st_mtimensec = sbp->st_mtimensec;
14271 	usbp->st_ctime = sbp->st_ctime;
14272 	usbp->st_ctimensec = sbp->st_ctimensec;
14273 	usbp->st_birthtime = sbp->st_birthtime;
14274 	usbp->st_birthtimensec = sbp->st_birthtimensec;
14275 #endif
14276 	usbp->st_size = sbp->st_size;
14277 	usbp->st_blocks = sbp->st_blocks;
14278 	usbp->st_blksize = sbp->st_blksize;
14279 	usbp->st_flags = sbp->st_flags;
14280 	usbp->st_gen = sbp->st_gen;
14281 	usbp->st_lspare = sbp->st_lspare;
14282 	usbp->st_qspare[0] = sbp->st_qspare[0];
14283 	usbp->st_qspare[1] = sbp->st_qspare[1];
14284 }
14285 
14286 /*
14287  * Purge buffer cache for simulating cold starts
14288  */
14289 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)14290 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
14291 {
14292 	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
14293 
14294 	return VNODE_RETURNED;
14295 }
14296 
14297 static int
vfs_purge_callback(mount_t mp,__unused void * arg)14298 vfs_purge_callback(mount_t mp, __unused void * arg)
14299 {
14300 	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
14301 
14302 	return VFS_RETURNED;
14303 }
14304 
/*
 * vfs.purge_vm_pagers: when non-zero (the default), vfs_purge() also asks
 * the VM layer to purge file-backed pagers in addition to the buffer cache.
 */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
14307 
/*
 * vfs_purge() system call: flush and invalidate cached file data on every
 * mounted filesystem, approximating a cold-start state.  Superuser only.
 */
int
vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
{
	/* Purging caches system-wide is disruptive; restrict to superuser. */
	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	/* Walk every mount, pushing out and invalidating each vnode's pages. */
	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);

	/* also flush any VM pagers backed by files */
	if (vfs_purge_vm_pagers) {
		vm_purge_filebacked_pagers();
	}

	return 0;
}
14324 
/*
 * gets the vnode associated with the (unnamed) snapshot directory
 * for a Filesystem. The snapshot directory vnode is returned with
 * an iocount on it.
 */
int
vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
{
	/* Thin wrapper: the filesystem's VFS_VGET_SNAPDIR does the work. */
	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
}
14335 
/*
 * Get the snapshot vnode.
 *
 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
 * needs nameidone() on ndp.
 *
 * If the snapshot vnode exists it is returned in ndp->ni_vp.
 *
 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
 * not needed.
 *
 * dirfd must refer to a volume root; the snapshot name is copied in from
 * userspace and validated (non-empty, not "." or "..", no '/').
 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* Resolve dirfd to its vnode; returned with an iocount held. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* Snapshot operations are only meaningful on a volume root. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Obtain the unnamed snapshot directory (iocount on *sdvpp). */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; stopping early means one was found. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC policy hooks exist only for the create and delete operations. */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On any error, drop both iocounts and NULL out the out-params. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
14449 
/*
 * create a filesystem snapshot (for supporting filesystems)
 *
 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
 * We get to the (unnamed) snapshot directory vnode and create the vnode
 * for the snapshot in it.
 *
 * Restrictions:
 *
 *    a) Passed in name for snapshot cannot have slashes.
 *    b) name can't be "." or ".."
 *
 * Since this requires superuser privileges, vnode_authorize calls are not
 * made.
 */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large for the kernel stack; heap allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* On success: iocounts held on rvp and snapdvp, nameidone() owed. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* Snapshot already exists: O_EXCL-style failure. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		/* Snapshots are created as mode-0 regular files. */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* Caller was already vetted; skip authorization/inheritance. */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14511 
/*
 * Delete a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * delete the snapshot.
 */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large for the kernel stack; heap allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* DELETE lookup: on success ndp->ni_vp is the existing snapshot. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Snapshot deletion should not generate namespace events. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14546 
14547 /*
14548  * Revert a filesystem to a snapshot
14549  *
14550  * Marks the filesystem to revert to the given snapshot on next mount.
14551  */
14552 static int __attribute__((noinline))
snapshot_revert(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14553 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
14554     vfs_context_t ctx)
14555 {
14556 	int error;
14557 	vnode_t rvp;
14558 	mount_t mp;
14559 	struct fs_snapshot_revert_args revert_data;
14560 	struct componentname cnp;
14561 	caddr_t name_buf;
14562 	size_t name_len;
14563 
14564 	error = vnode_getfromfd(ctx, dirfd, &rvp);
14565 	if (error) {
14566 		return error;
14567 	}
14568 	mp = vnode_mount(rvp);
14569 
14570 	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14571 	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14572 	if (error) {
14573 		zfree(ZV_NAMEI, name_buf);
14574 		vnode_put(rvp);
14575 		return error;
14576 	}
14577 
14578 #if CONFIG_MACF
14579 	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
14580 	if (error) {
14581 		zfree(ZV_NAMEI, name_buf);
14582 		vnode_put(rvp);
14583 		return error;
14584 	}
14585 #endif
14586 
14587 	/*
14588 	 * Grab mount_iterref so that we can release the vnode,
14589 	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
14590 	 */
14591 	error = mount_iterref(mp, 0);
14592 	vnode_put(rvp);
14593 	if (error) {
14594 		zfree(ZV_NAMEI, name_buf);
14595 		return error;
14596 	}
14597 
14598 	memset(&cnp, 0, sizeof(cnp));
14599 	cnp.cn_pnbuf = (char *)name_buf;
14600 	cnp.cn_nameiop = LOOKUP;
14601 	cnp.cn_flags = ISLASTCN | HASBUF;
14602 	cnp.cn_pnlen = MAXPATHLEN;
14603 	cnp.cn_nameptr = cnp.cn_pnbuf;
14604 	cnp.cn_namelen = (int)name_len;
14605 	revert_data.sr_cnp = &cnp;
14606 
14607 	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
14608 	mount_iterdrop(mp);
14609 	zfree(ZV_NAMEI, name_buf);
14610 
14611 	if (error) {
14612 		/* If there was any error, try again using VNOP_IOCTL */
14613 
14614 		vnode_t snapdvp;
14615 		struct nameidata namend;
14616 
14617 		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
14618 		    OP_LOOKUP, ctx);
14619 		if (error) {
14620 			return error;
14621 		}
14622 
14623 
14624 		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
14625 		    0, ctx);
14626 
14627 		vnode_put(namend.ni_vp);
14628 		nameidone(&namend);
14629 		vnode_put(snapdvp);
14630 		vnode_put(rvp);
14631 	}
14632 
14633 	return error;
14634 }
14635 
/*
 * rename a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * rename the snapshot. This is a very specialised (and simple) case of
 * rename(2) (which has to deal with a lot more complications). It differs
 * slightly from rename(2) in that EEXIST is returned if the new name exists.
 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/* DELETE lookup of the old name; validates it and finds its vnode. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp  = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is suppossed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; stopping early means one was found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Renaming to the new name is, policy-wise, creating it. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Look up the destination name in the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Both names live in snapdvp; target vnode is NULL (checked above). */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14738 
/*
 * Mount a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * mount the snapshot.
 */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error, mount_flags = 0;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/* Look up the snapshot to be mounted. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp  = snapndp->ni_vp;
	/* Bail if the source volume has gone away underneath us. */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Convert snapshot_mount flags to mount flags */
	if (flags & SNAPSHOT_MNT_NOSUID) {
		mount_flags |= MNT_NOSUID;
	}
	if (flags & SNAPSHOT_MNT_NODEV) {
		mount_flags |= MNT_NODEV;
	}
	if (flags & SNAPSHOT_MNT_DONTBROWSE) {
		mount_flags |= MNT_DONTBROWSE;
	}
	if (flags & SNAPSHOT_MNT_IGNORE_OWNERSHIP) {
		mount_flags |= MNT_IGNORE_OWNERSHIP;
	}
	if (flags & SNAPSHOT_MNT_NOFOLLOW) {
		mount_flags |= MNT_NOFOLLOW;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	if (mount_flags & MNT_NOFOLLOW) {
		dirndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* Never allow covering the root of the root filesystem. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/* Same fstype as the source volume; mount_common does the rest. */
	smnt_data.sm_mp  = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), mount_flags,
	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);

out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
	return error;
}
14841 
/*
 * Root from a snapshot of the filesystem
 *
 * Marks the filesystem to root from the given snapshot on next boot.
 */
static int __attribute__((noinline))
snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Hand the snapshot name to the FS as a componentname. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	root_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	return error;
}
14902 
14903 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14904 vfs_context_can_snapshot(vfs_context_t ctx)
14905 {
14906 	static const char * const snapshot_entitlements[] = {
14907 		"com.apple.private.vfs.snapshot",
14908 		"com.apple.developer.vfs.snapshot",
14909 		"com.apple.private.apfs.arv.limited.snapshot",
14910 	};
14911 	static const size_t nentitlements =
14912 	    sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14913 	size_t i;
14914 
14915 	task_t task = vfs_context_task(ctx);
14916 	for (i = 0; i < nentitlements; i++) {
14917 		if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14918 			return TRUE;
14919 		}
14920 	}
14921 	return FALSE;
14922 }
14923 
/*
 * FS snapshot operations dispatcher
 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All snapshot operations require an entitlement. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Allow if any one of: superuser, write access to the backing
		 * device, or the user-snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		/* name2 is the mount point path; data is unused by mount. */
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
15015