xref: /xnu-8796.101.5/bsd/vfs/vfs_syscalls.c (revision aca3beaa3dfbd42498b42c5e5ce20a938e6554e5)
1 /*
2  * Copyright (c) 1995-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1989, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  * (c) UNIX System Laboratories, Inc.
32  * All or some portions of this file are derived from material licensed
33  * to the University of California by American Telephone and Telegraph
34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35  * the permission of UNIX System Laboratories, Inc.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.41 (Berkeley) 6/15/95
66  */
67 /*
68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69  * support for mandatory and extensible security protections.  This notice
70  * is included in support of clause 2.2 (b) of the Apple Public License,
71  * Version 2.0.
72  */
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112 
113 #include <vfs/vfs_disk_conditioner.h>
114 
115 #include <security/audit/audit.h>
116 #include <bsm/audit_kevents.h>
117 
118 #include <mach/mach_types.h>
119 #include <kern/kern_types.h>
120 #include <kern/kalloc.h>
121 #include <kern/task.h>
122 
123 #include <vm/vm_pageout.h>
124 #include <vm/vm_protos.h>
125 
126 #include <libkern/OSAtomic.h>
127 #include <os/atomic_private.h>
128 #include <pexpert/pexpert.h>
129 #include <IOKit/IOBSD.h>
130 
131 // deps for MIG call
132 #include <kern/host.h>
133 #include <kern/ipc_misc.h>
134 #include <mach/host_priv.h>
135 #include <mach/vfs_nspace.h>
136 #include <os/log.h>
137 
138 #include <nfs/nfs_conf.h>
139 
140 #if ROUTEFS
141 #include <miscfs/routefs/routefs.h>
142 #endif /* ROUTEFS */
143 
144 #if CONFIG_MACF
145 #include <security/mac.h>
146 #include <security/mac_framework.h>
147 #endif
148 
149 #if CONFIG_FSE
150 #define GET_PATH(x) \
151 	((x) = get_pathbuff())
152 #define RELEASE_PATH(x) \
153 	release_pathbuff(x)
154 #else
155 #define GET_PATH(x)     \
156 	((x) = zalloc(ZV_NAMEI))
157 #define RELEASE_PATH(x) \
158 	zfree(ZV_NAMEI, x)
159 #endif /* CONFIG_FSE */
160 
161 #ifndef HFS_GET_BOOT_INFO
162 #define HFS_GET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00004)
163 #endif
164 
165 #ifndef HFS_SET_BOOT_INFO
166 #define HFS_SET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00005)
167 #endif
168 
169 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
170 #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
171 #endif
172 
173 extern void disk_conditioner_unmount(mount_t mp);
174 
175 /* struct for checkdirs iteration */
176 struct cdirargs {
177 	vnode_t olddp;
178 	vnode_t newdp;
179 };
180 /* callback  for checkdirs iteration */
181 static int checkdirs_callback(proc_t p, void * arg);
182 
183 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
184 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
185 void enablequotas(struct mount *mp, vfs_context_t ctx);
186 static int getfsstat_callback(mount_t mp, void * arg);
187 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
188 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
189 static int sync_callback(mount_t, void *);
190 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
191     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
192     boolean_t partial_copy);
193 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
194 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
195     struct componentname *cnp, user_addr_t fsmountargs,
196     int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
197 void vfs_notify_mount(vnode_t pdvp);
198 
199 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
200 
201 struct fd_vn_data * fg_vn_data_alloc(void);
202 
203 /*
204  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
205  * Concurrent lookups (or lookups by ids) on hard links can cause the
206  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
207  * does) to return ENOENT as the path cannot be returned from the name cache
208  * alone. We have no option but to retry and hope to get one namei->reverse path
209  * generation done without an intervening lookup, lookup by id on the hard link
210  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
211  * which currently are the MAC hooks for rename, unlink and rmdir.
212  */
213 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
214 
215 /* Max retry limit for rename due to vnode recycling. */
216 #define MAX_RENAME_ERECYCLE_RETRIES 1024
217 
218 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
219     int unlink_flags);
220 
221 #ifdef CONFIG_IMGSRC_ACCESS
222 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
223 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
224 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
225 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
226 static void mount_end_update(mount_t mp);
227 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
228 #endif /* CONFIG_IMGSRC_ACCESS */
229 
230 //snapshot functions
231 #if CONFIG_MNT_ROOTSNAP
232 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
233 #else
234 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
235 #endif
236 
237 __private_extern__
238 int sync_internal(void);
239 
240 __private_extern__
241 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
242 
243 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
244 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
245 
246 /* vars for sync mutex */
247 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
248 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
249 
250 extern lck_rw_t rootvnode_rw_lock;
251 
252 VFS_SMR_DECLARE;
253 extern uint32_t nc_smr_enabled;
254 
255 /*
256  * incremented each time a mount or unmount operation occurs
257  * used to invalidate the cached value of the rootvp in the
258  * mount structure utilized by cache_lookup_path
259  */
260 uint32_t mount_generation = 0;
261 
262 /* counts number of mount and unmount operations */
263 unsigned int vfs_nummntops = 0;
264 
265 /* system-wide, per-boot unique mount ID */
266 static _Atomic uint64_t mount_unique_id = 1;
267 
268 extern const struct fileops vnops;
269 #if CONFIG_APPLEDOUBLE
270 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
271 #endif /* CONFIG_APPLEDOUBLE */
272 
273 /* Maximum buffer length supported by fsgetpath(2) */
274 #define FSGETPATH_MAXBUFLEN  8192
275 
276 /*
277  * Virtual File System System Calls
278  */
279 
280 /*
281  * Private in-kernel mounting spi (specific use-cases only)
282  */
283 boolean_t
vfs_iskernelmount(mount_t mp)284 vfs_iskernelmount(mount_t mp)
285 {
286 	return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
287 }
288 
289 __private_extern__
290 int
kernel_mount(const char * fstype,vnode_t pvp,vnode_t vp,const char * path,void * data,__unused size_t datalen,int syscall_flags,uint32_t kern_flags,vfs_context_t ctx)291 kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
292     void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
293     vfs_context_t ctx)
294 {
295 	struct nameidata nd;
296 	boolean_t did_namei;
297 	int error;
298 
299 	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
300 	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
301 
302 	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;
303 
304 	/*
305 	 * Get the vnode to be covered if it's not supplied
306 	 */
307 	if (vp == NULLVP) {
308 		error = namei(&nd);
309 		if (error) {
310 			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
311 				printf("failed to locate mount-on path: %s ", path);
312 			}
313 			return error;
314 		}
315 		vp = nd.ni_vp;
316 		pvp = nd.ni_dvp;
317 		did_namei = TRUE;
318 	} else {
319 		char *pnbuf = CAST_DOWN(char *, path);
320 
321 		nd.ni_cnd.cn_pnbuf = pnbuf;
322 		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
323 		did_namei = FALSE;
324 	}
325 
326 	kern_flags |= KERNEL_MOUNT_KMOUNT;
327 	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
328 	    syscall_flags, kern_flags, NULL, ctx);
329 
330 	if (did_namei) {
331 		vnode_put(vp);
332 		vnode_put(pvp);
333 		nameidone(&nd);
334 	}
335 
336 	return error;
337 }
338 
339 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)340 vfs_mount_at_path(const char *fstype, const char *path,
341     vnode_t pvp, vnode_t vp, void *data, size_t datalen,
342     int mnt_flags, int flags)
343 {
344 	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
345 	int error, km_flags = 0;
346 	vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
347 
348 	/*
349 	 * This call is currently restricted to specific use cases.
350 	 */
351 	if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
352 		return ENOTSUP;
353 	}
354 
355 #if !defined(XNU_TARGET_OS_OSX)
356 	if (strcmp(fstype, "lifs") == 0) {
357 		syscall_flags |= MNT_NOEXEC;
358 	}
359 #endif
360 
361 	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
362 		km_flags |= KERNEL_MOUNT_NOAUTH;
363 	}
364 	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
365 		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
366 	}
367 
368 	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
369 	    syscall_flags, km_flags, ctx);
370 	if (error) {
371 		printf("%s: mount on %s failed, error %d\n", __func__, path,
372 		    error);
373 	}
374 
375 	return error;
376 }
377 
378 int
vfs_mount_override_type_name(mount_t mp,const char * name)379 vfs_mount_override_type_name(mount_t mp, const char *name)
380 {
381 	if (mp == NULL || name == NULL) {
382 		return EINVAL;
383 	}
384 
385 	/* Override the FS type name. */
386 	mount_lock_spin(mp);
387 	strlcpy(mp->fstypename_override, name, sizeof(mp->fstypename_override));
388 	mp->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
389 	mount_unlock(mp);
390 
391 	return 0;
392 }
393 
394 /*
395  * Mount a file system.
396  */
397 /* ARGSUSED */
398 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)399 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
400 {
401 	struct __mac_mount_args muap;
402 
403 	muap.type = uap->type;
404 	muap.path = uap->path;
405 	muap.flags = uap->flags;
406 	muap.data = uap->data;
407 	muap.mac_p = USER_ADDR_NULL;
408 	return __mac_mount(p, &muap, retval);
409 }
410 
411 int
fmount(__unused proc_t p,struct fmount_args * uap,__unused int32_t * retval)412 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
413 {
414 	struct componentname    cn;
415 	vfs_context_t           ctx = vfs_context_current();
416 	size_t                  dummy = 0;
417 	int                     error;
418 	int                     flags = uap->flags;
419 	char                    fstypename[MFSNAMELEN];
420 	char                    *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
421 	vnode_t                 pvp;
422 	vnode_t                 vp;
423 
424 	AUDIT_ARG(fd, uap->fd);
425 	AUDIT_ARG(fflags, flags);
426 	/* fstypename will get audited by mount_common */
427 
428 	/* Sanity check the flags */
429 	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
430 		return ENOTSUP;
431 	}
432 
433 	if (flags & MNT_UNION) {
434 		return EPERM;
435 	}
436 
437 	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
438 	if (error) {
439 		return error;
440 	}
441 
442 	if ((error = file_vnode(uap->fd, &vp)) != 0) {
443 		return error;
444 	}
445 
446 	if ((error = vnode_getwithref(vp)) != 0) {
447 		file_drop(uap->fd);
448 		return error;
449 	}
450 
451 	pvp = vnode_getparent(vp);
452 	if (pvp == NULL) {
453 		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
454 			error = EBUSY;
455 		} else {
456 			error = EINVAL;
457 		}
458 		vnode_put(vp);
459 		file_drop(uap->fd);
460 		return error;
461 	}
462 
463 	memset(&cn, 0, sizeof(struct componentname));
464 	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
465 	cn.cn_pnlen = MAXPATHLEN;
466 
467 	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
468 		zfree(ZV_NAMEI, cn.cn_pnbuf);
469 		vnode_put(pvp);
470 		vnode_put(vp);
471 		file_drop(uap->fd);
472 		return error;
473 	}
474 
475 	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);
476 
477 	zfree(ZV_NAMEI, cn.cn_pnbuf);
478 	vnode_put(pvp);
479 	vnode_put(vp);
480 	file_drop(uap->fd);
481 
482 	return error;
483 }
484 
485 #define MAX_GRAFT_METADATA_SIZE             16384 /* bytes */
486 
487 /*
488  * Get the size of a graft file (a manifest or payload file).
489  * The vp should be an iocounted vnode.
490  */
491 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)492 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
493 {
494 	struct stat64 sb = {};
495 	int error;
496 
497 	*size = 0;
498 
499 	error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
500 	if (error) {
501 		return error;
502 	}
503 
504 	if (sb.st_size == 0) {
505 		error = ENODATA;
506 	} else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
507 		error = EFBIG;
508 	} else {
509 		*size = (size_t) sb.st_size;
510 	}
511 
512 	return error;
513 }
514 
515 /*
516  * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
517  * `size` must already be validated.
518  */
519 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)520 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
521 {
522 	return vn_rdwr(UIO_READ, graft_vp,
523 	           (caddr_t) buf, (int) size, /* offset */ 0,
524 	           UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
525 	           vfs_context_ucred(vctx), /* resid */ NULL,
526 	           vfs_context_proc(vctx));
527 }
528 
529 /*
530  * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
531  * and read it into `buf`.
532  */
533 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,size_t * size,void * buf)534 graft_secureboot_read_fd(int fd, vfs_context_t vctx, size_t *size, void *buf)
535 {
536 	vnode_t metadata_vp = NULLVP;
537 	int error;
538 
539 	// Convert this graft fd to a vnode.
540 	if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
541 		goto out;
542 	}
543 
544 	// Get (and validate) size information.
545 	if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
546 		goto out;
547 	}
548 
549 	// Read each file into the provided buffer - we must get the expected amount of bytes.
550 	if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
551 		goto out;
552 	}
553 
554 out:
555 	if (metadata_vp) {
556 		vnode_put(metadata_vp);
557 		metadata_vp = NULLVP;
558 	}
559 
560 	return error;
561 }
562 
563 /*
564  * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
565  * provided in `gfs`, saving the size of data read in `gfs`.
566  */
567 static int
graft_secureboot_read_metadata(secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)568 graft_secureboot_read_metadata(secure_boot_cryptex_args_t *sbc_args, vfs_context_t vctx,
569     fsioc_graft_fs_t *gfs)
570 {
571 	int error;
572 
573 	// Read the authentic manifest.
574 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
575 	    &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
576 		return error;
577 	}
578 
579 	// The user manifest is currently unused, but set its size.
580 	gfs->user_manifest_size = 0;
581 
582 	// Read the payload.
583 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
584 	    &gfs->payload_size, gfs->payload))) {
585 		return error;
586 	}
587 
588 	return 0;
589 }
590 
591 /*
592  * Call into the filesystem to verify and graft a cryptex.
593  */
594 static int
graft_secureboot_cryptex(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,vnode_t cryptex_vp,vnode_t mounton_vp)595 graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
596     vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
597 {
598 	fsioc_graft_fs_t gfs = {};
599 	uint64_t graft_dir_ino = 0;
600 	struct stat64 sb = {};
601 	int error;
602 
603 	// Pre-flight arguments.
604 	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
605 		// Make sure that this graft version matches what we support.
606 		return ENOTSUP;
607 	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
608 		// For this type, cryptex VP must live on same volume as the target of graft.
609 		return EXDEV;
610 	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
611 		// We cannot graft upon non-directories.
612 		return ENOTDIR;
613 	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
614 	    sbc_args->sbc_payload_fd < 0) {
615 		// We cannot graft without a manifest and payload.
616 		return EINVAL;
617 	}
618 
619 	if (mounton_vp) {
620 		// Get the mounton's inode number.
621 		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
622 		if (error) {
623 			return error;
624 		}
625 		graft_dir_ino = (uint64_t) sb.st_ino;
626 	}
627 
628 	// Create buffers (of our maximum-defined size) to store authentication info.
629 	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
630 	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
631 
632 	if (!gfs.authentic_manifest || !gfs.payload) {
633 		error = ENOMEM;
634 		goto out;
635 	}
636 
637 	// Read our fd's into our buffers.
638 	// (Note that this will set the buffer size fields in `gfs`.)
639 	error = graft_secureboot_read_metadata(sbc_args, vctx, &gfs);
640 	if (error) {
641 		goto out;
642 	}
643 
644 	gfs.graft_version = FSIOC_GRAFT_VERSION;
645 	gfs.graft_type = graft_type;
646 	gfs.graft_4cc = sbc_args->sbc_4cc;
647 	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
648 		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
649 	}
650 	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
651 		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
652 	}
653 	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
654 		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
655 	}
656 	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
657 		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
658 	}
659 	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
660 		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
661 	}
662 	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
663 		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
664 	}
665 	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)
666 
667 	// Call into the FS to perform the graft (and validation).
668 	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);
669 
670 out:
671 	if (gfs.authentic_manifest) {
672 		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
673 		gfs.authentic_manifest = NULL;
674 	}
675 	if (gfs.payload) {
676 		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
677 		gfs.payload = NULL;
678 	}
679 
680 	return error;
681 }
682 
683 #define GRAFTDMG_ENTITLEMENT  "com.apple.private.vfs.graftdmg"
684 
685 /*
686  * Graft a cryptex disk image (via FD) onto the appropriate mount-point
687  * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
688  */
689 int
graftdmg(__unused proc_t p,struct graftdmg_args * uap,__unused int32_t * retval)690 graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
691 {
692 	int ua_dmgfd = uap->dmg_fd;
693 	user_addr_t ua_mountdir = uap->mountdir;
694 	uint32_t ua_grafttype = uap->graft_type;
695 	user_addr_t ua_graftargs = uap->gda;
696 
697 	graftdmg_args_un kern_gda = {};
698 	int error = 0;
699 	secure_boot_cryptex_args_t *sbc_args = NULL;
700 
701 	vnode_t cryptex_vp = NULLVP;
702 	vnode_t mounton_vp = NULLVP;
703 	struct nameidata nd = {};
704 	vfs_context_t ctx = vfs_context_current();
705 
706 	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
707 		return EPERM;
708 	}
709 
710 	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
711 	if (error) {
712 		return error;
713 	}
714 
715 	// Copy mount dir in, if provided.
716 	if (ua_mountdir != USER_ADDR_NULL) {
717 		// Acquire vnode for mount-on path
718 		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
719 		    UIO_USERSPACE, ua_mountdir, ctx);
720 
721 		error = namei(&nd);
722 		if (error) {
723 			return error;
724 		}
725 		mounton_vp = nd.ni_vp;
726 	}
727 
728 	// Convert fd to vnode.
729 	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
730 	if (error) {
731 		goto graftout;
732 	}
733 
734 	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_DOWNLEVEL) {
735 		error = EINVAL;
736 	} else {
737 		sbc_args = &kern_gda.sbc_args;
738 		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
739 	}
740 
741 graftout:
742 	if (cryptex_vp) {
743 		vnode_put(cryptex_vp);
744 		cryptex_vp = NULLVP;
745 	}
746 	if (mounton_vp) {
747 		vnode_put(mounton_vp);
748 		mounton_vp = NULLVP;
749 	}
750 	if (ua_mountdir != USER_ADDR_NULL) {
751 		nameidone(&nd);
752 	}
753 
754 	return error;
755 }
756 
757 /*
758  * Ungraft a cryptex disk image (via mount dir FD)
759  * { int ungraftdmg(const char *mountdir, uint64_t flags); }
760  */
761 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)762 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
763 {
764 	int error = 0;
765 	user_addr_t ua_mountdir = uap->mountdir;
766 	fsioc_ungraft_fs_t ugfs;
767 	vnode_t mounton_vp = NULLVP;
768 	struct nameidata nd = {};
769 	vfs_context_t ctx = vfs_context_current();
770 
771 	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
772 		return EPERM;
773 	}
774 
775 	if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
776 		return EINVAL;
777 	}
778 
779 	ugfs.ungraft_flags = 0;
780 
781 	// Acquire vnode for mount-on path
782 	NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
783 	    UIO_USERSPACE, ua_mountdir, ctx);
784 
785 	error = namei(&nd);
786 	if (error) {
787 		return error;
788 	}
789 	mounton_vp = nd.ni_vp;
790 
791 	// Call into the FS to perform the ungraft
792 	error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
793 
794 	vnode_put(mounton_vp);
795 	nameidone(&nd);
796 
797 	return error;
798 }
799 
800 
/*
 * Notify interested parties that a mount has occurred: broadcast a
 * VQ_MOUNT vfs event system-wide, then post a NOTE_WRITE knote on
 * `pdvp` (the parent of the covered vnode) so watchers of that
 * directory see the change.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
807 
808 /*
809  * __mac_mount:
810  *	Mount a file system taking into account MAC label behavior.
811  *	See mount(2) man page for more information
812  *
813  * Parameters:    p                        Process requesting the mount
814  *                uap                      User argument descriptor (see below)
815  *                retval                   (ignored)
816  *
817  * Indirect:      uap->type                Filesystem type
818  *                uap->path                Path to mount
819  *                uap->data                Mount arguments
820  *                uap->mac_p               MAC info
821  *                uap->flags               Mount flags
822  *
823  *
824  * Returns:        0                       Success
825  *                !0                       Not success
826  */
827 boolean_t root_fs_upgrade_try = FALSE;
828 
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;
	vnode_t vp = NULL;
	int need_nameidone = 0;
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;          /* MAC label string copied in from user space */
	size_t labelsz = 0;             /* allocation size of labelstr, for kfree_data() */
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered (and its parent, via WANTPARENT).
	 * MNT_NOFOLLOW forbids symlinks anywhere in the path, not just
	 * at the last component.
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/*
	 * Mounting image source cannot be batched with other operations.
	 * Note the exact-equality test: MNT_IMGSRC_BY_INDEX must be the
	 * ONLY flag set for this path to be taken.
	 */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space, honoring the
	 * caller's pointer width for the struct mac layout.
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Bound the label size: at least a 1-char string + NUL, at most the cap. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	/* Union mounts are compiled out: reject them outright. */
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Special handling when the target is the root of the root filesystem. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mouting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/* kfree_data() tolerates a NULL labelstr / zero labelsz. */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	/* Release the iocounts and name buffer taken by namei(), if any. */
	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
989 
990 /*
991  * common mount implementation (final stage of mounting)
992  *
993  * Arguments:
994  *  fstypename	file system type (ie it's vfs name)
995  *  pvp		parent of covered vnode
996  *  vp		covered vnode
997  *  cnp		component name (ie path) of covered vnode
998  *  flags	generic mount flags
999  *  fsmountargs	file system specific data
1000  *  labelstr	optional MAC label
 *  internal_flags	KERNEL_MOUNT_* flags; KERNEL_MOUNT_KMOUNT marks a
 *		mount initiated from inside the kernel
1002  *  ctx		caller's context
1003  */
static int
mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
    struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
    char *labelstr, vfs_context_t ctx)
{
#if !CONFIG_MACF
#pragma unused(labelstr)
#endif
	struct vnode *devvp = NULLVP;           /* device vnode from namei(); iocount held until 'exit' */
	struct vnode *device_vnode = NULLVP;    /* device vnode actually handed to VFS_MOUNT */
#if CONFIG_MACF
	struct vnode *rvp;
#endif
	struct mount *mp = NULL;
	struct vfstable *vfsp = (struct vfstable *)0;
	struct proc *p = vfs_context_proc(ctx);
	int error, flag = 0;
	bool flag_set = false;                  /* true once 'flag' holds the pre-update mnt_flag for restore on failure */
	user_addr_t devpath = USER_ADDR_NULL;
	int ronly = 0;
	int mntalloc = 0;                       /* nonzero once 'mp' was allocated here (fresh mount path) */
	boolean_t vfsp_ref = FALSE;             /* took a reference on the vfstable entry */
	boolean_t is_rwlock_locked = FALSE;     /* holding mp->mnt_rwlock exclusive */
	boolean_t did_rele = FALSE;             /* already dropped the device vnode's usecount on the error path */
	boolean_t have_usecount = FALSE;        /* took a usecount on the covered vnode */
	boolean_t did_set_lmount = FALSE;       /* set MNT_LMOUNT in mnt_lflag; must be cleared before returning */
	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
	/* Check for mutually-exclusive flag bits */
	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
	int bitcount = 0;
	/* Kernighan's population count: each iteration clears the lowest set bit */
	while (checkflags != 0) {
		checkflags &= (checkflags - 1);
		bitcount++;
	}

	if (bitcount > 1) {
		//not allowed to request multiple mount-by-role flags
		error = EINVAL;
		goto out1;
	}
#endif

	/*
	 * Process an update for an existing mount
	 */
	if (flags & MNT_UPDATE) {
		/* updates may only target the root vnode of an already-mounted FS */
		if ((vp->v_flag & VROOT) == 0) {
			error = EINVAL;
			goto out1;
		}
		mp = vp->v_mount;

		/* if unmount or mount in progress, return error */
		mount_lock_spin(mp);
		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
			mount_unlock(mp);
			error = EBUSY;
			goto out1;
		}
		/* claim the mount so concurrent mount/unmount attempts back off */
		mp->mnt_lflag |= MNT_LMOUNT;
		did_set_lmount = TRUE;
		mount_unlock(mp);
		lck_rw_lock_exclusive(&mp->mnt_rwlock);
		is_rwlock_locked = TRUE;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * If content protection is enabled, update mounts are not
		 * allowed to turn it off.
		 */
		if ((mp->mnt_flag & MNT_CPROTECT) &&
		    ((flags & MNT_CPROTECT) == 0)) {
			error = EINVAL;
			goto out1;
		}

		/*
		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
		 * failure to return an error for this so we'll just silently
		 * add it if it is not passed in.
		 */
		if ((mp->mnt_flag & MNT_REMOVABLE) &&
		    ((flags & MNT_REMOVABLE) == 0)) {
			flags |= MNT_REMOVABLE;
		}

		/* Can't downgrade the backer of the root FS */
		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
			goto out1;
		}
#if CONFIG_MACF
		error = mac_mount_check_remount(ctx, mp);
		if (error != 0) {
			goto out1;
		}
#endif
		/*
		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
		 */
		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
			flags |= MNT_NOSUID | MNT_NODEV;
			if (mp->mnt_flag & MNT_NOEXEC) {
				flags |= MNT_NOEXEC;
			}
		}
		/* remember the pre-update flags so they can be restored if the update fails */
		flag = mp->mnt_flag;
		flag_set = true;



		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);

		vfsp = mp->mnt_vtable;
		goto update;
	} // MNT_UPDATE

	/*
	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
	 */
	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
		flags |= MNT_NOSUID | MNT_NODEV;
		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
			flags |= MNT_NOEXEC;
		}
	}

	/* XXXAUDIT: Should we capture the type on the error path as well? */
	/* XXX cast-away const (audit_arg_text() does not modify its input) */
	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
	/* Look up the requested file system type and pin its vfstable entry */
	mount_list_lock();
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
			vfsp->vfc_refcount++;
			vfsp_ref = TRUE;
			break;
		}
	}
	mount_list_unlock();
	if (vfsp == NULL) {
		error = ENODEV;
		goto out1;
	}

	/*
	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
	 * except in ROSV configs and for the initial BaseSystem root.
	 */
	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
		error = EINVAL;  /* unsupported request */
		goto out1;
	}

	/* flush the covered vnode and mark it VMOUNT (unwound on failure below) */
	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
	if (error != 0) {
		goto out1;
	}

	/*
	 * Allocate and initialize the filesystem (mount_t)
	 */
	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	mntalloc = 1;

	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;

	/* mark the new mount as in-progress; cleared at 'exit'/'out1' */
	mp->mnt_lflag |= MNT_LMOUNT;
	did_set_lmount = TRUE;

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);
	mount_lock_init(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	is_rwlock_locked = TRUE;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vtable = vfsp;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	do {
		size_t pathlen = MAXPATHLEN;

		/* prefer a fully resolved mount-on path; fall back to the caller-supplied name */
		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
		}
	} while (0);
	mp->mnt_vnodecovered = vp;
	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;
	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);

	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);

	if (kernelmount) {
		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
	}
	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
	}

	if (KERNEL_MOUNT_DEVFS & internal_flags) {
		// kernel mounted devfs
		mp->mnt_kern_flag |= MNTK_SYSTEM;
	}

update:
	/* Both the fresh-mount and MNT_UPDATE paths converge here. */

	/*
	 * Set the mount level flags.
	 */
	if (flags & MNT_RDONLY) {
		mp->mnt_flag |= MNT_RDONLY;
	} else if (mp->mnt_flag & MNT_RDONLY) {
		// disallow read/write upgrades of file systems that
		// had the TYPENAME_OVERRIDE feature set.
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			error = EPERM;
			goto out1;
		}
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	}
	/* clear the caller-settable flags, then re-apply the requested set below */
	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if SECURE_KERNEL
#if !CONFIG_MNT_SUID
	/*
	 * On release builds of iOS based platforms, always enforce NOSUID on
	 * all mounts. We do this here because we can catch update mounts as well as
	 * non-update mounts in this case.
	 */
	mp->mnt_flag |= (MNT_NOSUID);
#endif
#endif

	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if CONFIG_MACF
	if (flags & MNT_MULTILABEL) {
		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
			error = EINVAL;
			goto out1;
		}
		mp->mnt_flag |= MNT_MULTILABEL;
	}
#endif
	/*
	 * Process device path for local file systems if requested.
	 *
	 * Snapshot and mount-by-role mounts do not use this path; they are
	 * passing other opaque data in the device path field.
	 *
	 * Basesystemroot mounts pass a device path to be resolved here,
	 * but it's just a char * already inside the kernel, which
	 * kernel_mount() shoved into a user_addr_t to call us. So for such
	 * mounts we must skip copyin (both of the address and of the string
	 * (in NDINIT)).
	 */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
		boolean_t do_copyin_devpath = true;
#if CONFIG_BASESYSTEMROOT
		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
			// We have been passed fsmountargs, which is typed as a user_addr_t,
			// but is actually a char ** pointing to a (kernelspace) string.
			// We manually unpack it with a series of casts and dereferences
			// that reverses what was done just above us on the stack in
			// imageboot_pivot_image().
			// After retrieving the path to the dev node (which we will NDINIT
			// in a moment), we pass NULL fsmountargs on to the filesystem.
			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
			char **devnamepp = (char **)fsmountargs;
			char *devnamep = *devnamepp;
			devpath = CAST_USER_ADDR_T(devnamep);
			do_copyin_devpath = false;
			fsmountargs = USER_ADDR_NULL;

			//Now that we have a mp, denote that this mount is for the basesystem.
			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
		}
#endif // CONFIG_BASESYSTEMROOT

		if (do_copyin_devpath) {
			/* first pointer-sized word of fsmountargs is the userspace device path */
			if (vfs_context_is64bit(ctx)) {
				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
					goto out1;
				}
				fsmountargs += sizeof(devpath);
			} else {
				user32_addr_t tmp;
				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
					goto out1;
				}
				/* munge into LP64 addr */
				devpath = CAST_USER_ADDR_T(tmp);
				fsmountargs += sizeof(tmp);
			}
		}

		/* Lookup device and authorize access to it */
		if ((devpath)) {
			struct nameidata nd;

			enum uio_seg seg = UIO_USERSPACE;
#if CONFIG_BASESYSTEMROOT
			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
				/* path is a kernel string in this case; see block comment above */
				seg = UIO_SYSSPACE;
			}
#endif // CONFIG_BASESYSTEMROOT

			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
			if ((error = namei(&nd))) {
				goto out1;
			}

			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
			/* namei leaves an iocount on devvp; dropped at 'exit'/'out2' */
			devvp = nd.ni_vp;

			nameidone(&nd);

			if (devvp->v_type != VBLK) {
				error = ENOTBLK;
				goto out2;
			}
			if (major(devvp->v_rdev) >= nblkdev) {
				error = ENXIO;
				goto out2;
			}
			/*
			 * If mount by non-root, then verify that user has necessary
			 * permissions on the device.
			 */
			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
				mode_t accessmode = KAUTH_VNODE_READ_DATA;

				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
					accessmode |= KAUTH_VNODE_WRITE_DATA;
				}
				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
					goto out2;
				}
			}
		}
		/* On first mount, preflight and open device */
		if (devpath && ((flags & MNT_UPDATE) == 0)) {
			/* usecount on devvp persists across the mount; released at 'out3' on failure */
			if ((error = vnode_ref(devvp))) {
				goto out2;
			}
			/*
			 * Disallow multiple mounts of the same device.
			 * Disallow mounting of a device that is currently in use
			 * (except for root, which might share swap device for miniroot).
			 * Flush out any old buffers remaining from a previous use.
			 */
			if ((error = vfs_mountedon(devvp))) {
				goto out3;
			}

			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
				error = EBUSY;
				goto out3;
			}
			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
				error = ENOTBLK;
				goto out3;
			}
			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
				goto out3;
			}

			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
#if CONFIG_MACF
			error = mac_vnode_check_open(ctx,
			    devvp,
			    ronly ? FREAD : FREAD | FWRITE);
			if (error) {
				goto out3;
			}
#endif /* MAC */
			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
				goto out3;
			}

			mp->mnt_devvp = devvp;
			device_vnode = devvp;
		} else if ((mp->mnt_flag & MNT_RDONLY) &&
		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
		    (device_vnode = mp->mnt_devvp)) {
			dev_t dev;
			int maj;
			/*
			 * If upgrade to read-write by non-root, then verify
			 * that user has necessary permissions on the device.
			 */
			vnode_getalways(device_vnode);

			if (suser(vfs_context_ucred(ctx), NULL) &&
			    (error = vnode_authorize(device_vnode, NULL,
			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
			    ctx)) != 0) {
				vnode_put(device_vnode);
				goto out2;
			}

			/* Tell the device that we're upgrading */
			dev = (dev_t)device_vnode->v_rdev;
			maj = major(dev);

			if ((u_int)maj >= (u_int)nblkdev) {
				panic("Volume mounted on a device with invalid major number.");
			}

			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
			vnode_put(device_vnode);
			/* cleared so the upgrade path below does not treat this as a fresh device */
			device_vnode = NULLVP;
			if (error != 0) {
				goto out2;
			}
		}
	} // localargs && !(snapshot | data | vm)

#if CONFIG_MACF
	if ((flags & MNT_UPDATE) == 0) {
		mac_mount_label_init(mp);
		mac_mount_label_associate(ctx, mp);
	}
	if (labelstr) {
		if ((flags & MNT_UPDATE) != 0) {
			error = mac_mount_check_label_update(ctx, mp);
			if (error != 0) {
				goto out3;
			}
		}
	}
#endif
	/*
	 * Mount the filesystem.  We already asserted that internal_flags
	 * cannot have more than one mount-by-role bit set.
	 */
	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
		    (caddr_t)fsmountargs, 0, ctx);
	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
#if CONFIG_ROSV_STARTUP
		/* here fsmountargs carries a kernel mount_t, not a user pointer */
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
		} else {
			/* Mark volume associated with system volume */
			mp->mnt_kern_flag |= MNTK_SYSTEM;

			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);
					// now set `device_vnode` to the devvp that was acquired.
					// this is needed in order to ensure vfs_init_io_attributes is invoked.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
#if CONFIG_MOUNT_VM
		/* here fsmountargs carries a kernel mount_t, not a user pointer */
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
		} else {
			/* Mark volume associated with system volume and a swap mount */
			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);

					// now set `device_vnode` to the devvp that was acquired.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
#if CONFIG_MOUNT_PREBOOTRECOVERY
		/* here fsmountargs carries a kernel mount_t, not a user pointer */
		struct mount *origin_mp = (struct mount*)fsmountargs;
		uint32_t mount_role = 0;
		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
			mount_role = VFS_PREBOOT_ROLE;
		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
			mount_role = VFS_RECOVERY_ROLE;
		}

		if (mount_role != 0) {
			fs_role_mount_args_t frma = {origin_mp, mount_role};
			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
			if (error) {
				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
			} else {
				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
				/* Mark volume associated with system volume */
				//mp->mnt_kern_flag |= MNTK_SYSTEM;
				/* Attempt to acquire the mnt_devvp and set it up */
				struct vnode *mp_devvp = NULL;
				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
					    0, &mp_devvp, vfs_context_kernel());
					if (!lerr) {
						mp->mnt_devvp = mp_devvp;
						//vnode_lookup took an iocount, need to drop it.
						vnode_put(mp_devvp);

						// now set `device_vnode` to the devvp that was acquired.
						// note that though the iocount above was dropped, the mount acquires
						// an implicit reference against the device.
						device_vnode = mp_devvp;
					}
				}
			}
		} else {
			/* defensive: unreachable given the guarding 'else if' above */
			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
			error = EINVAL;
		}
#else
		error = EINVAL;
#endif
	} else {
		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
	}

	if (flags & MNT_UPDATE) {
		/* commit (or roll back) the update and finish without touching the mount list */
		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
			mp->mnt_flag &= ~MNT_RDONLY;
		}
		mp->mnt_flag &= ~
		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
		if (error) {
			mp->mnt_flag = flag;  /* restore flag value */
		}
		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;
		if (!error) {
			enablequotas(mp, ctx);
		}
		goto exit;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	if (error == 0) {
		struct vfs_attr vfsattr;
		if (device_vnode) {
			/*
			 *   cache the IO attributes for the underlying physical media...
			 *   an error return indicates the underlying driver doesn't
			 *   support all the queries necessary... however, reasonable
			 *   defaults will have been set, so no reason to bail or care
			 *
			 *   Need to do this before calling the MAC hook as it needs
			 *   information from this call.
			 */
			vfs_init_io_attributes(device_vnode, mp);
		}

#if CONFIG_MACF
		error = mac_mount_check_mount_late(ctx, mp);
		if (error != 0) {
			goto out4;
		}

		if (vfs_flags(mp) & MNT_MULTILABEL) {
			error = VFS_ROOT(mp, &rvp, ctx);
			if (error) {
				printf("%s() VFS_ROOT returned %d\n", __func__, error);
				goto out4;
			}
			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
			/*
			 * drop reference provided by VFS_ROOT
			 */
			vnode_put(rvp);

			if (error) {
				goto out4;
			}
		}
#endif  /* MAC */

		/* hook the new mount onto the covered vnode */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vp->v_mountedhere = mp;
		vnode_unlock(vp);

		/*
		 * taking the name_cache_lock exclusively will
		 * ensure that everyone is out of the fast path who
		 * might be trying to use a now stale copy of
		 * vp->v_mountedhere->mnt_realrootvp
		 * bumping mount_generation causes the cached values
		 * to be invalidated
		 */
		name_cache_lock();
		mount_generation++;
		name_cache_unlock();

		error = vnode_ref(vp);
		if (error != 0) {
			goto out4;
		}

		have_usecount = TRUE;

		error = checkdirs(vp, ctx);
		if (error != 0) {
			/* Unmount the filesystem as cdir/rdirs cannot be updated */
			goto out4;
		}
		/*
		 * there is no cleanup code here so I have made it void
		 * we need to revisit this
		 */
		(void)VFS_START(mp, 0, ctx);

		if (mount_list_add(mp) != 0) {
			/*
			 * The system is shutting down trying to umount
			 * everything, so fail with a plausible errno.
			 */
			error = EBUSY;
			goto out4;
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/* Check if this mounted file system supports EAs or named streams. */
		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
#if NAMEDSTREAMS
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
			}
#endif
			/* Check if this file system supports path from id lookups. */
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			} else if (mp->mnt_flag & MNT_DOVOLFS) {
				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			}

			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
			}
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
		}
		/* increment the operations count */
		OSAddAtomic(1, &vfs_nummntops);
		enablequotas(mp, ctx);

		if (device_vnode) {
			device_vnode->v_specflags |= SI_MOUNTEDON;
		}

		/* Now that mount is setup, notify the listeners */
		vfs_notify_mount(pvp);
		IOBSDMountChange(mp, kIOMountChangeMount);
	} else {
		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
		if (mp->mnt_vnodelist.tqh_first != NULL) {
			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
			    mp->mnt_vtable->vfc_name, error);
		}

		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
		mount_list_lock();
		mp->mnt_vtable->vfc_refcount--;
		mount_list_unlock();

		if (device_vnode) {
			vnode_rele(device_vnode);
			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		/*
		 * if we get here, we have a mount structure that needs to be freed,
		 * but since the coveredvp hasn't yet been updated to point at it,
		 * no need to worry about other threads holding a crossref on this mp
		 * so it's ok to just free it
		 */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		/* mp is gone; skip the MNT_LMOUNT clear below */
		did_set_lmount = false;
	}
exit:
	/*
	 * drop I/O count on the device vp if there was one
	 */
	if (devpath && devvp) {
		vnode_put(devvp);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	return error;

/* Error condition exits */
out4:
	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);

	/*
	 * If the mount has been placed on the covered vp,
	 * it may have been discovered by now, so we have
	 * to treat this just like an unmount
	 */
	mount_lock_spin(mp);
	mp->mnt_lflag |= MNT_LDEAD;
	mount_unlock(mp);

	if (device_vnode != NULLVP) {
		vnode_rele(device_vnode);
		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		did_rele = TRUE;
	}

	vnode_lock_spin(vp);

	mp->mnt_crossref++;
	vp->v_mountedhere = (mount_t) 0;

	vnode_unlock(vp);

	if (have_usecount) {
		vnode_rele(vp);
	}
out3:
	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
		vnode_rele(devvp);
	}
out2:
	if (devpath && devvp) {
		vnode_put(devvp);
	}
out1:
	/* Release mnt_rwlock only when it was taken */
	if (is_rwlock_locked == TRUE) {
		if (flag_set) {
			mp->mnt_flag = flag;  /* restore mnt_flag value */
		}
		lck_rw_done(&mp->mnt_rwlock);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	if (mntalloc) {
		if (mp->mnt_crossref) {
			mount_dropcrossref(mp, vp, 0);
		} else {
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		}
	}
	if (vfsp_ref) {
		mount_list_lock();
		vfsp->vfc_refcount--;
		mount_list_unlock();
	}

	return error;
}
1885 
1886 /*
1887  * Flush in-core data, check for competing mount attempts,
1888  * and set VMOUNT
1889  */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	/* KERNEL_MOUNT_NOAUTH: caller already authorized; skip the ownership check */
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	/* KERNEL_MOUNT_FMOUNT: fmount(2)-style mount; uses a stricter busy check below */
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_busy;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Flush dirty data for the covered vnode before it is hidden by the mount */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Can only mount on top of a directory */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	/*
	 * Competing-mount check: for fmount(2) either an in-progress mount
	 * (VMOUNT set) or a completed one (v_mountedhere != NULL) makes the
	 * vnode busy; for regular mounts both must hold to return EBUSY.
	 */
	vnode_lock_spin(vp);
	is_busy = is_fmount ?
	    (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
	    (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
	if (is_busy) {
		vnode_unlock(vp);
		error = EBUSY;
		goto out;
	}
	/* Claim the vnode: mark mount-in-progress */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	/* MAC policy may veto the mount; undo the VMOUNT claim on denial */
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
1955 
1956 #if CONFIG_IMGSRC_ACCESS
1957 
1958 #define DEBUG_IMGSRC 0
1959 
1960 #if DEBUG_IMGSRC
1961 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1962 #else
1963 #define IMGSRC_DEBUG(args...) do { } while(0)
1964 #endif
1965 
1966 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)1967 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1968 {
1969 	struct nameidata nd;
1970 	vnode_t vp, realdevvp;
1971 	mode_t accessmode;
1972 	int error;
1973 	enum uio_seg uio = UIO_USERSPACE;
1974 
1975 	if (ctx == vfs_context_kernel()) {
1976 		uio = UIO_SYSSPACE;
1977 	}
1978 
1979 	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1980 	if ((error = namei(&nd))) {
1981 		IMGSRC_DEBUG("namei() failed with %d\n", error);
1982 		return error;
1983 	}
1984 
1985 	vp = nd.ni_vp;
1986 
1987 	if (!vnode_isblk(vp)) {
1988 		IMGSRC_DEBUG("Not block device.\n");
1989 		error = ENOTBLK;
1990 		goto out;
1991 	}
1992 
1993 	realdevvp = mp->mnt_devvp;
1994 	if (realdevvp == NULLVP) {
1995 		IMGSRC_DEBUG("No device backs the mount.\n");
1996 		error = ENXIO;
1997 		goto out;
1998 	}
1999 
2000 	error = vnode_getwithref(realdevvp);
2001 	if (error != 0) {
2002 		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
2003 		goto out;
2004 	}
2005 
2006 	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
2007 		IMGSRC_DEBUG("Wrong dev_t.\n");
2008 		error = ENXIO;
2009 		goto out1;
2010 	}
2011 
2012 	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
2013 
2014 	/*
2015 	 * If mount by non-root, then verify that user has necessary
2016 	 * permissions on the device.
2017 	 */
2018 	if (!vfs_context_issuser(ctx)) {
2019 		accessmode = KAUTH_VNODE_READ_DATA;
2020 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2021 			accessmode |= KAUTH_VNODE_WRITE_DATA;
2022 		}
2023 		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2024 			IMGSRC_DEBUG("Access denied.\n");
2025 			goto out1;
2026 		}
2027 	}
2028 
2029 	*devvpp = vp;
2030 
2031 out1:
2032 	vnode_put(realdevvp);
2033 
2034 out:
2035 	nameidone(&nd);
2036 
2037 	if (error) {
2038 		vnode_put(vp);
2039 	}
2040 
2041 	return error;
2042 }
2043 
2044 /*
2045  * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2046  * and call checkdirs()
2047  */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Swap the VMOUNT (mount-in-progress) marker for the real v_mountedhere link */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Hold a usecount on the covered vnode for the lifetime of the mount */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	/* On failure, detach the covered vnode again so the caller can unwind */
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2093 
/*
 * Inverse of place_mount_and_checkdirs(): drop the usecount taken on the
 * covered vnode and sever the v_mountedhere / mnt_vnodecovered links.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2104 
2105 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2106 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2107 {
2108 	int error;
2109 
2110 	/* unmount in progress return error */
2111 	mount_lock_spin(mp);
2112 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2113 		mount_unlock(mp);
2114 		return EBUSY;
2115 	}
2116 	mount_unlock(mp);
2117 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2118 
2119 	/*
2120 	 * We only allow the filesystem to be reloaded if it
2121 	 * is currently mounted read-only.
2122 	 */
2123 	if ((flags & MNT_RELOAD) &&
2124 	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2125 		error = ENOTSUP;
2126 		goto out;
2127 	}
2128 
2129 	/*
2130 	 * Only root, or the user that did the original mount is
2131 	 * permitted to update it.
2132 	 */
2133 	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2134 	    (!vfs_context_issuser(ctx))) {
2135 		error = EPERM;
2136 		goto out;
2137 	}
2138 #if CONFIG_MACF
2139 	error = mac_mount_check_remount(ctx, mp);
2140 	if (error != 0) {
2141 		goto out;
2142 	}
2143 #endif
2144 
2145 out:
2146 	if (error) {
2147 		lck_rw_done(&mp->mnt_rwlock);
2148 	}
2149 
2150 	return error;
2151 }
2152 
/* Release the exclusive mnt_rwlock taken by a successful mount_begin_update() */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2158 
2159 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2160 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2161 {
2162 	vnode_t vp;
2163 
2164 	if (height >= MAX_IMAGEBOOT_NESTING) {
2165 		return EINVAL;
2166 	}
2167 
2168 	vp = imgsrc_rootvnodes[height];
2169 	if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2170 		*rvpp = vp;
2171 		return 0;
2172 	} else {
2173 		return ENOENT;
2174 	}
2175 }
2176 
/*
 * Relocate the imageboot source filesystem (at nesting level "height") so
 * that it is mounted on top of vp instead of its original location.  A
 * mount may only be moved once (MNTK_HAS_MOVED).  Errors unwind in strict
 * reverse order of the steps taken (see out0..out3 labels).
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		/* Modern form: explicit mnt_imgsrc_args with height/flags/devpath */
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flag bits are currently defined */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* Takes an iocount on rvp; released at out0 / success exit */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	/* Scratch buffer to restore f_mntonname if we fail after renaming it */
	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once.  Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the validation; drop the iocount right away */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Save the old mount-on name so out3 can restore it */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	/* Undo the rename and the moved-once marker */
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2397 
2398 #endif /* CONFIG_IMGSRC_ACCESS */
2399 
/*
 * Turn on disk quotas for a freshly mounted HFS filesystem when the
 * per-type quota trigger files are present.  All errors are ignored so
 * quota setup can never fail a mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue;           /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Trigger file exists: quota-on using the actual quota file path */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2433 
2434 
/*
 * Per-process callback for checkdirs(): if the process's current or root
 * directory is the covered vnode (olddp), retarget it to the new mount's
 * root (newdp), fixing up vnode usecounts accordingly.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;     /* this ref on newdp was consumed */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;     /* this ref on newdp was consumed */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2514 
2515 
2516 
2517 /*
2518  * Scan all active processes to see if any of them have a current
2519  * or root directory onto which the new filesystem has just been
2520  * mounted. If so, replace them with the new mount point.
2521  */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Only the mount's own usecount: no process can be using olddp as a dir */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the root of the filesystem just mounted on top of olddp */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root itself was just covered, swap it under the rw lock */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2559 
2560 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT        \
2561 	"com.apple.private.vfs.role-account-unmount"
2562 
2563 /*
2564  * Unmount a file system.
2565  *
2566  * Note: unmount takes a path to the vnode mounted on as argument,
2567  * not special file (as before).
2568  */
2569 /* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

#if CONFIG_MACF
	/* MAC policy may veto the unmount */
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
	/* Take a mount ref before dropping the vnode iocount */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2616 
/*
 * Unmount the filesystem identified by fsid.  Returns ENOENT if no such
 * mount exists; otherwise forwards to safedounmount(), which consumes the
 * mount ref taken here.
 */
int
vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
{
	mount_t mp;

	mp = mount_list_lookupby_fsid(fsid, 0, 1);
	if (mp == (mount_t)0) {
		return ENOENT;
	}
	/* Trade the iteration reference for a plain mount ref */
	mount_ref(mp, 0);
	mount_iterdrop(mp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, flags, ctx);
}
2631 
2632 /*
2633  * The mount struct comes with a mount ref which will be consumed.
2634  * Do the actual file system unmount, prevent some common foot shooting.
2635  */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* dounmount() takes over the mount ref (withref = 1) */
	return dounmount(mp, flags, 1, ctx);

out:
	/* Error path: drop the mount ref the caller handed us */
	mount_drop(mp, 0);
	return error;
}
2699 
2700 /*
2701  * Do the actual file system unmount.
2702  */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx);  /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Don't let remote-FS hangs block a MNT_NOBLOCK unmount; restored at out: */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	/* Publish unmount-in-progress; error paths below must clear these */
	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		/* Graceful unmount: flush everything and back out cleanly on failure */
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	/* Drop the rwlock around the list removal, then retake it */
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT);

		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);     /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* Common exit: reached with the mount lock held on both paths */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the  vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			if (pvp) {
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* Root mount has no covered vnode; tear the mount down here */
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2991 
2992 /*
2993  * Unmount any mounts in this filesystem.
2994  */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT: we hold mount_list_lock; best-effort on failure */
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				/* smp sits on top of a mount already slated for unmount */
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			/* dounmount() consumes this mount ref (withref = 1) */
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
3052 
/*
 * Drop one cross reference on mp taken through the covered vnode dp.
 * If this was the last cross reference and dp is no longer mounted-on by
 * mp (i.e. the unmount already disassociated them), this caller is the
 * one responsible for destroying the mount structure.
 * When need_put is set, an iocount on dp is also released.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	/* Holdcount keeps dp's memory valid across the unlock sequence below. */
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		/* Let any SMR readers finish before the mount is freed. */
		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3086 
3087 
3088 /*
3089  * Sync each mounted filesystem.
3090  */
#if DIAGNOSTIC
/* Debug knob: when non-zero, the sync paths also dump buffer statistics. */
int syncprt = 0;
#endif

/* Debug knob: when non-zero, the sync paths also report dirty-page counts. */
int print_vmpage_stat = 0;
3096 
3097 /*
3098  * sync_callback:	simple wrapper that calls VFS_SYNC() on volumes
3099  *			mounted read-write with the passed waitfor value.
3100  *
3101  * Parameters:	mp	mount-point descriptor per mounted file-system instance.
3102  *		arg	user argument (please see below)
3103  *
3104  * User argument is a pointer to 32 bit unsigned integer which describes the
3105  * type of waitfor value to set for calling VFS_SYNC().  If user argument is
3106  * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3107  * waitfor value.
3108  *
3109  * Returns:		VFS_RETURNED
3110  */
3111 static int
sync_callback(mount_t mp,void * arg)3112 sync_callback(mount_t mp, void *arg)
3113 {
3114 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3115 		int asyncflag = mp->mnt_flag & MNT_ASYNC;
3116 		unsigned waitfor = MNT_NOWAIT;
3117 
3118 		if (arg) {
3119 			waitfor = *(uint32_t*)arg;
3120 		}
3121 
3122 		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
3123 		if (waitfor != MNT_WAIT &&
3124 		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
3125 		    waitfor != MNT_NOWAIT &&
3126 		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3127 		    waitfor != MNT_DWAIT &&
3128 		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3129 			panic("Passed inappropriate waitfor %u to "
3130 			    "sync_callback()", waitfor);
3131 		}
3132 
3133 		mp->mnt_flag &= ~MNT_ASYNC;
3134 		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3135 		if (asyncflag) {
3136 			mp->mnt_flag |= MNT_ASYNC;
3137 		}
3138 	}
3139 
3140 	return VFS_RETURNED;
3141 }
3142 
3143 /* ARGSUSED */
3144 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)3145 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
3146 {
3147 	vfs_iterate(LK_NOWAIT, sync_callback, NULL);
3148 
3149 	if (print_vmpage_stat) {
3150 		vm_countdirtypages();
3151 	}
3152 
3153 #if DIAGNOSTIC
3154 	if (syncprt) {
3155 		vfs_bufstats();
3156 	}
3157 #endif /* DIAGNOSTIC */
3158 	return 0;
3159 }
3160 
/*
 * Media selector for sync_internal_callback(): "reliable" media is a
 * local, non-virtual device (see the is_reliable test in the callback).
 */
typedef enum {
	SYNC_ALL = 0,                   /* no filtering: sync every mount */
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* local, non-virtual devices only */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* everything else */
} sync_type_t;
3166 
3167 static int
sync_internal_callback(mount_t mp,void * arg)3168 sync_internal_callback(mount_t mp, void *arg)
3169 {
3170 	if (arg) {
3171 		int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3172 		    (mp->mnt_flag & MNT_LOCAL);
3173 		sync_type_t sync_type = *((sync_type_t *)arg);
3174 
3175 		if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3176 			return VFS_RETURNED;
3177 		} else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3178 			return VFS_RETURNED;
3179 		}
3180 	}
3181 
3182 	(void)sync_callback(mp, NULL);
3183 
3184 	return VFS_RETURNED;
3185 }
3186 
/* Protected by sync_mtx_lck; see sync_thread() and sync_internal(). */
int sync_thread_state = 0;
/* Upper bound, in seconds, that sync_internal() waits for the sync thread. */
int sync_timeout_seconds = 5;

#define SYNC_THREAD_RUN       0x0001    /* more work has been queued for the thread */
#define SYNC_THREAD_RUNNING   0x0002    /* a sync thread currently exists */

#if CONFIG_PHYS_WRITE_ACCT
/* Identifies the PM sync thread; set/cleared by sync_thread() itself. */
thread_t pm_sync_thread;
#endif /* CONFIG_PHYS_WRITE_ACCT */
3196 
/*
 * Body of the kernel thread spawned by sync_internal(): keeps syncing all
 * mounted filesystems for as long as SYNC_THREAD_RUN is re-asserted, then
 * wakes any waiters and exits.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		/* Sync reliable (local, non-virtual) media first, then the rest. */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	/* Optional debug instrumentation, same knobs as sync(2). */
	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3240 
/* Last time the "sync timed out" message was printed; used for rate limiting. */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3242 
3243 /*
3244  * An in-kernel sync for power management to call.
3245  * This function always returns within sync_timeout seconds.
3246  */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	/* Bounds the msleep() below so PM callers never block indefinitely. */
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	sync_thread_state |= SYNC_THREAD_RUN;
	/* Spawn a worker only if one is not already running. */
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/*
	 * Wait (bounded by ts) for the sync thread's wakeup; PDROP releases
	 * sync_mtx_lck on return.
	 */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* Rate-limit the timeout message to once every 120 seconds. */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	/* Drop the ref returned by kernel_thread_start(). */
	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
3289 
3290 /*
3291  * Change filesystem quotas.
3292  */
#if QUOTA
/*
 * quotactl(2): manipulate filesystem quotas on the mount containing
 * uap->path.  The quota sub-command (upper bits of uap->cmd) selects
 * what uap->arg points to; data is copied in before VFS_QUOTACTL() and
 * copied back out afterwards where the command requires it.
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* Pin the mount with a ref; the vnode itself is no longer needed. */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit user layout differs; munge into the kernel dqblk. */
			struct user_dqblk       my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	/* Only invoke the filesystem if the copyin phase succeeded. */
	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Copyout / cleanup phase, keyed on the same sub-command. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk       my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
#else
/* Quota support compiled out: fail all requests. */
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	return EOPNOTSUPP;
}
#endif /* QUOTA */
3407 
3408 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3409 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3410 {
3411 	int error;
3412 	vfs_context_t ctx = vfs_context_current();
3413 
3414 #if CONFIG_MACF
3415 	error = mac_mount_check_stat(ctx, mp);
3416 	if (error != 0) {
3417 		return error;
3418 	}
3419 #endif
3420 
3421 	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3422 	if (error != 0) {
3423 		return error;
3424 	}
3425 
3426 	return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3427 }
3428 
3429 /*
3430  * Get filesystem statistics.
3431  *
3432  * Returns:	0			Success
3433  *	namei:???
3434  *	vfs_update_vfsstat:???
3435  *	munge_statfs:EFAULT
3436  */
3437 /* ARGSUSED */
3438 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3439 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3440 {
3441 	int error;
3442 	struct mount *mp;
3443 	struct nameidata nd;
3444 	vfs_context_t ctx = vfs_context_current();
3445 	vnode_t vp;
3446 
3447 	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3448 	    UIO_USERSPACE, uap->path, ctx);
3449 	error = namei(&nd);
3450 	if (error != 0) {
3451 		return error;
3452 	}
3453 	vp = nd.ni_vp;
3454 	mp = vp->v_mount;
3455 	nameidone(&nd);
3456 
3457 	error = statfs_internal(p, mp, uap->buf);
3458 	vnode_put(vp);
3459 
3460 	return error;
3461 }
3462 
3463 /*
3464  * Get filesystem statistics.
3465  */
3466 /* ARGSUSED */
3467 int
fstatfs(proc_t p,struct fstatfs_args * uap,__unused int32_t * retval)3468 fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3469 {
3470 	int error;
3471 	vnode_t vp = NULL;
3472 	struct mount *mp;
3473 
3474 	AUDIT_ARG(fd, uap->fd);
3475 
3476 	if ((error = file_vnode(uap->fd, &vp)) ||
3477 	    (error = vnode_getwithref(vp))) {
3478 		goto out;
3479 	}
3480 
3481 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3482 
3483 	mp = vp->v_mount;
3484 	if (!mp) {
3485 		error = EBADF;
3486 		goto out_vnode;
3487 	}
3488 
3489 	error = statfs_internal(p, mp, uap->buf);
3490 
3491 out_vnode:
3492 	vnode_put(vp);
3493 
3494 out:
3495 	if (vp != NULL) {
3496 		file_drop(uap->fd);
3497 	}
3498 
3499 	return error;
3500 }
3501 
3502 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3503 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3504 {
3505 	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3506 
3507 	bzero(sfs, sizeof(*sfs));
3508 
3509 	sfs->f_bsize = vsfs->f_bsize;
3510 	sfs->f_iosize = (int32_t)vsfs->f_iosize;
3511 	sfs->f_blocks = vsfs->f_blocks;
3512 	sfs->f_bfree = vsfs->f_bfree;
3513 	sfs->f_bavail = vsfs->f_bavail;
3514 	sfs->f_files = vsfs->f_files;
3515 	sfs->f_ffree = vsfs->f_ffree;
3516 	sfs->f_fsid = vsfs->f_fsid;
3517 	sfs->f_owner = vsfs->f_owner;
3518 	sfs->f_type = mp->mnt_vtable->vfc_typenum;
3519 	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3520 	sfs->f_fssubtype = vsfs->f_fssubtype;
3521 	sfs->f_flags_ext = (mp->mnt_kern_flag & MNTK_SYSTEMDATA) ? MNT_EXT_ROOT_DATA_VOL : 0;
3522 	if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3523 		strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3524 	} else {
3525 		strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
3526 	}
3527 	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3528 	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3529 }
3530 
3531 /*
3532  * Get file system statistics in 64-bit mode
3533  */
int
statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error;
	struct nameidata *ndp;
	struct statfs64 *sfsp;
	vfs_context_t ctxp = vfs_context_current();
	vnode_t vp;
	/* nameidata + statfs64 are too large for the kernel stack; heap them. */
	struct {
		struct nameidata nd;
		struct statfs64 sfs;
	} *__nameidata_statfs64;

	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
	    Z_WAITOK);
	ndp = &__nameidata_statfs64->nd;

	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctxp);
	error = namei(ndp);
	if (error != 0) {
		goto out;
	}
	vp = ndp->ni_vp;
	mp = vp->v_mount;
	nameidone(ndp);

#if CONFIG_MACF
	error = mac_mount_check_stat(ctxp, mp);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}
#endif

	/* Refresh the cached vfsstat before reporting it. */
	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}

	sfsp = &__nameidata_statfs64->sfs;
	vfs_get_statfs64(mp, sfsp);
	/* NOTE(review): p is used here despite being declared __unused. */
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a separate data volume mountpoint */
		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
	vnode_put(vp);

out:
	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);

	return error;
}
3591 
3592 /*
3593  * Get file system statistics in 64-bit mode
3594  */
int
fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
{
	struct vnode *vp;
	struct mount *mp;
	struct statfs64 sfs;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		/* vnode no longer associated with a mount. */
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0) {
		goto out;
	}
#endif

	/* Refresh the cached vfsstat before reporting it. */
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	vfs_get_statfs64(mp, &sfs);
	/* NOTE(review): p is used here despite being declared __unused. */
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a separate data volume mountpoint */
		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(&sfs, uap->buf, sizeof(sfs));

out:
	file_drop(uap->fd);
	vnode_put(vp);

	return error;
}
3648 
/* Accumulator shared with the getfsstat*_callback() vfs_iterate callbacks. */
struct getfsstat_struct {
	user_addr_t     sfsp;           /* user buffer cursor, advanced per entry */
	user_addr_t     *mp;            /* optional per-mount MAC label pointers */
	int             count;          /* mounts seen so far (may exceed maxcount) */
	int             maxcount;       /* capacity of the user buffer, in entries */
	int             flags;          /* MNT_WAIT/MNT_NOWAIT/... from the caller */
	int             error;          /* first error encountered, if any */
};
3657 
3658 
/*
 * vfs_iterate() callback for getfsstat(2)/__mac_getfsstat(2): copies one
 * mount's statistics (and optionally its MAC label) out to user space.
 * Mounts beyond the buffer capacity are only counted, not copied.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* Dead or failing mounts are skipped, not fatal. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance the user cursor by the size munge_statfs() wrote. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	/* Counted even when not copied, so callers learn the true total. */
	fstp->count++;
	return VFS_RETURNED;
}
3712 
3713 /*
3714  * Get statistics on all filesystems.
3715  */
3716 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3717 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3718 {
3719 	struct __mac_getfsstat_args muap;
3720 
3721 	muap.buf = uap->buf;
3722 	muap.bufsize = uap->bufsize;
3723 	muap.mac = USER_ADDR_NULL;
3724 	muap.macsize = 0;
3725 	muap.flags = uap->flags;
3726 
3727 	return __mac_getfsstat(p, &muap, retval);
3728 }
3729 
3730 /*
3731  * __mac_getfsstat: Get MAC-related file system statistics
3732  *
3733  * Parameters:    p                        (ignored)
3734  *                uap                      User argument descriptor (see below)
3735  *                retval                   Count of file system statistics (N stats)
3736  *
3737  * Indirect:      uap->bufsize             Buffer size
3738  *                uap->macsize             MAC info size
3739  *                uap->buf                 Buffer where information will be returned
3740  *                uap->mac                 MAC info
3741  *                uap->flags               File system flags
3742  *
3743  *
3744  * Returns:        0                       Success
3745  *                !0                       Not success
3746  *
3747  */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject sizes that would overflow the int fields used below. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Capacity in entries depends on the caller's statfs layout. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* The label array must have exactly one entry per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	/* Include unmounting filesystems so counts stay consistent. */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* Report at most the buffer capacity when a buffer was supplied. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3841 
/*
 * vfs_iterate() callback for getfsstat64(2): copies one mount's
 * statistics out as a fixed-layout struct statfs64.  Mounts beyond the
 * buffer capacity are only counted, not copied.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			/* Dead or failing mounts are skipped, not fatal. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += sizeof(sfs);
	}
	/* Counted even when not copied, so callers learn the true total. */
	fstp->count++;
	return VFS_RETURNED;
}
3886 
3887 /*
3888  * Get statistics on all file systems in 64 bit mode.
3889  */
3890 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3891 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3892 {
3893 	user_addr_t sfsp;
3894 	int count, maxcount;
3895 	struct getfsstat_struct fst;
3896 
3897 	maxcount = uap->bufsize / sizeof(struct statfs64);
3898 
3899 	sfsp = uap->buf;
3900 	count = 0;
3901 
3902 	fst.sfsp = sfsp;
3903 	fst.flags = uap->flags;
3904 	fst.count = 0;
3905 	fst.error = 0;
3906 	fst.maxcount = maxcount;
3907 
3908 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3909 
3910 	if (fst.error) {
3911 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3912 		return fst.error;
3913 	}
3914 
3915 	if (fst.sfsp && fst.count > fst.maxcount) {
3916 		*retval = fst.maxcount;
3917 	} else {
3918 		*retval = fst.count;
3919 	}
3920 
3921 	return 0;
3922 }
3923 
3924 /*
3925  * gets the associated vnode with the file descriptor passed.
3926  * as input
3927  *
3928  * INPUT
3929  * ctx - vfs context of caller
3930  * fd - file descriptor for which vnode is required.
3931  * vpp - Pointer to pointer to vnode to be returned.
3932  *
3933  * The vnode is returned with an iocount so any vnode obtained
3934  * by this call needs a vnode_put
3935  *
3936  */
3937 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3938 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3939 {
3940 	int error;
3941 	vnode_t vp;
3942 	struct fileproc *fp;
3943 	proc_t p = vfs_context_proc(ctx);
3944 
3945 	*vpp =  NULLVP;
3946 
3947 	error = fp_getfvp(p, fd, &fp, &vp);
3948 	if (error) {
3949 		return error;
3950 	}
3951 
3952 	error = vnode_getwithref(vp);
3953 	if (error) {
3954 		(void)fp_drop(p, fd, fp, 0);
3955 		return error;
3956 	}
3957 
3958 	(void)fp_drop(p, fd, fp, 0);
3959 	*vpp = vp;
3960 	return error;
3961 }
3962 
3963 /*
3964  * Wrapper function around namei to start lookup from a directory
3965  * specified by a file descriptor ni_dirfd.
3966  *
3967  * In addition to all the errors returned by namei, this call can
3968  * return ENOTDIR if the file descriptor does not refer to a directory.
3969  * and EBADF if the file descriptor is not valid.
3970  */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only consult dirfd for a fresh lookup of a relative path that
	 * does not already carry a starting directory (USEDVP) and is not
	 * a continuation of an earlier lookup.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first path byte to see if the path is absolute. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Start the lookup at dvp_at instead of the cwd. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, AT_FDCWD, or continuation: plain namei(). */
	return namei(ndp);
}
4014 
4015 /*
4016  * Change current working directory to a given file descriptor.
4017  */
4018 /* ARGSUSED */
/*
 * Shared implementation of fchdir(2) and __pthread_fchdir(): change the
 * process (per_thread == 0) or calling thread's (per_thread != 0)
 * current directory to the directory open on uap->fd.  If the target is
 * a mount point, descend to the root of the mounted filesystem first.
 */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding.  The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* Caller must have search permission on the new directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If vp is a mount point, keep descending to the root of whatever
	 * is mounted there, swapping iocounts as we go.
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* The cwd holds a long-lived usecount; the iocount is dropped now. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount on the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(uap->fd);

	return error;
}
4131 
/* fchdir(2): change the per-process current directory to fd's directory. */
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, uap, 0);
}
4137 
/*
 * __pthread_fchdir(): change the calling thread's private current
 * directory (fd == -1 reverts to the per-process cwd).  The args struct
 * is layout-compatible with struct fchdir_args, hence the cast.
 */
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, (void *)uap, 1);
}
4143 
4144 
4145 /*
4146  * Change current working directory (".").
4147  *
4148  * Returns:	0			Success
4149  *	change_dir:ENOTDIR
4150  *	change_dir:???
4151  *	vnode_ref:ENOENT		No such file or directory
4152  */
4153 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/*
	 * Resolve the path and verify it is a searchable directory; on
	 * success change_dir returns with an iocount held on ndp->ni_vp.
	 */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Convert the iocount into a long-term usecount before storing it */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		/* Per-thread cwd: swap the uthread's cdir and flag the proc */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			/* No thread in this context: undo the usecount */
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/*
		 * Process-wide cwd: the dirs lock keeps lookup from using
		 * fd_cdir while it is being swapped; fdlock protects p_fd.
		 */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount held on the previous cwd, if any */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4199 
4200 
4201 /*
4202  * Change current working directory (".").
4203  *
4204  * Returns:	0			Success
4205  *	chdir_internal:ENOTDIR
4206  *	chdir_internal:ENOENT		No such file or directory
4207  *	chdir_internal:???
4208  */
4209 /* ARGSUSED */
4210 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4211 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4212 {
4213 	struct nameidata nd;
4214 	vfs_context_t ctx = vfs_context_current();
4215 
4216 	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4217 	    UIO_USERSPACE, uap->path, ctx);
4218 
4219 	return chdir_internal(p, ctx, &nd, per_thread);
4220 }
4221 
4222 
4223 /*
4224  * chdir
4225  *
4226  * Change current working directory (".") for the entire process
4227  *
4228  * Parameters:  p       Process requesting the call
4229  *              uap     User argument descriptor (see below)
4230  *              retval  (ignored)
4231  *
4232  * Indirect parameters:	uap->path	Directory path
4233  *
4234  * Returns:	0			Success
4235  *              common_chdir: ENOTDIR
4236  *              common_chdir: ENOENT	No such file or directory
4237  *              common_chdir: ???
4238  *
4239  */
4240 int
chdir(proc_t p,struct chdir_args * uap,__unused int32_t * retval)4241 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
4242 {
4243 	return common_chdir(p, (void *)uap, 0);
4244 }
4245 
4246 /*
4247  * __pthread_chdir
4248  *
4249  * Change current working directory (".") for a single thread
4250  *
4251  * Parameters:  p       Process requesting the call
4252  *              uap     User argument descriptor (see below)
4253  *              retval  (ignored)
4254  *
4255  * Indirect parameters:	uap->path	Directory path
4256  *
4257  * Returns:	0			Success
4258  *              common_chdir: ENOTDIR
4259  *		common_chdir: ENOENT	No such file or directory
4260  *		common_chdir: ???
4261  *
4262  */
4263 int
__pthread_chdir(proc_t p,struct __pthread_chdir_args * uap,__unused int32_t * retval)4264 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
4265 {
4266 	return common_chdir(p, (void *)uap, 1);
4267 }
4268 
4269 
4270 /*
4271  * Change notion of root (``/'') directory.
4272  */
4273 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Changing the root directory requires superuser privileges */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* Resolve the path; returns with an iocount held on nd.ni_vp */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Convert the iocount from change_dir into a long-term usecount */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Drop the usecount on the previous root, if there was one */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4331 
4332 #define PATHSTATICBUFLEN 256
4333 #define PIVOT_ROOT_ENTITLEMENT              \
4334        "com.apple.private.vfs.pivot-root"
4335 
#if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Small stack buffers for the common (short-path) case */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	/* Heap fallbacks, allocated only when a path exceeds PATHSTATICBUFLEN */
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the new-root path; retry into a MAXPATHLEN heap buffer if long */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same two-step copyin for the path where the old root will appear */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Point incoming/outgoing at whichever buffer actually holds each path */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	/* Single cleanup path: drop the vnode iocount and any heap buffers */
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
#else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root is only implemented on macOS targets */
	return nosys(p, NULL, retval);
}
#endif /* XNU_TARGET_OS_OSX */
4435 
4436 /*
4437  * Common routine for chroot and chdir.
4438  *
4439  * Returns:	0			Success
4440  *		ENOTDIR			Not a directory
4441  *		namei:???		[anything namei can return]
4442  *		vnode_authorize:???	[anything vnode_authorize can return]
4443  */
4444 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4445 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4446 {
4447 	vnode_t vp;
4448 	int error;
4449 
4450 	if ((error = namei(ndp))) {
4451 		return error;
4452 	}
4453 	nameidone(ndp);
4454 	vp = ndp->ni_vp;
4455 
4456 	if (vp->v_type != VDIR) {
4457 		vnode_put(vp);
4458 		return ENOTDIR;
4459 	}
4460 
4461 #if CONFIG_MACF
4462 	error = mac_vnode_check_chdir(ctx, vp);
4463 	if (error) {
4464 		vnode_put(vp);
4465 		return error;
4466 	}
4467 #endif
4468 
4469 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4470 	if (error) {
4471 		vnode_put(vp);
4472 		return error;
4473 	}
4474 
4475 	return error;
4476 }
4477 
/*
 * Allocate and initialize the per-fd vnode data (used for directories)
 * that hangs off a file glob.
 * (The previous comment here said "Free ..." — it was a copy of the
 * fg_vn_data_free header and did not match this allocator.)
 */
struct fd_vn_data *
fg_vn_data_alloc(void)
{
	struct fd_vn_data *fvdata;

	/* Allocate per fd vnode data */
	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
	return fvdata;
}
4491 
/*
 * Free the vnode data (for directories) associated with the file glob.
 */
void
fg_vn_data_free(void *fgvndata)
{
	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;

	/* fv_buf/fv_bufallocsiz may be NULL/0 if nothing was ever buffered */
	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
	kfree_type(struct fd_vn_data, fvdata);
}
4504 
4505 /*
4506  * Check permissions, allocate an open file structure,
4507  * and call the device open routine if any.
4508  *
4509  * Returns:	0			Success
4510  *		EINVAL
4511  *		EINTR
4512  *	falloc:ENFILE
4513  *	falloc:EMFILE
4514  *	falloc:ENOMEM
4515  *	vn_open_auth:???
4516  *	dupfdopen:???
4517  *	VNOP_ADVLOCK:???
4518  *	vnode_setsize:???
4519  *
4520  * XXX Need to implement uid, gid
4521  */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	/* Convert open(2) flags to in-kernel FFLAGS form; callers may not
	 * pass the raw-crypto bits directly */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a descriptor slot (indx) and a fileproc up front */
	if ((error = falloc_withinit(p, &fp, &indx, ctx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Optional authentication vnode from the caller-supplied authfd */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/*
		 * /dev/fd opens set uu_dupfd via fdesc_open; ENODEV/ENXIO
		 * here means "dup that descriptor instead".
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the opened vnode into the fileglob */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* O_EXLOCK/O_SHLOCK: take a flock-style advisory lock at open time */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	/* Policy: decide whether this file's pages may use the secluded pool */
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer  eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "mediaserverd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/*
	 * Drop the iocount from vn_open_auth; the fileglob keeps the
	 * vnode referenced from here on.
	 */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor: make indx visible and drop our fp ref */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Error unwind: close with the fileglob's credential, not ours */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4822 
4823 /*
4824  * While most of the *at syscall handlers can call nameiat() which
4825  * is a wrapper around namei, the use of namei and initialisation
4826  * of nameidata are far removed and in different functions  - namei
4827  * gets called in vn_open_auth for open1. So we'll just do here what
4828  * nameiat() does.
4829  */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
    int dirfd, int authfd)
{
	/* dirfd only matters for relative paths without a preset dvp */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first byte of the path to detect absolute paths */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Take an iocount on dirfd's vnode to anchor the lookup */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Use dirfd's vnode as the starting directory */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
			    retval, authfd);
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, AT_FDCWD, or preset dvp: plain open1 */
	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
}
4873 
4874 /*
4875  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4876  *
4877  * Parameters:	p			Process requesting the open
4878  *		uap			User argument descriptor (see below)
4879  *		retval			Pointer to an area to receive the
4880  *					return calue from the system call
4881  *
4882  * Indirect:	uap->path		Path to open (same as 'open')
4883  *		uap->flags		Flags to open (same as 'open'
4884  *		uap->uid		UID to set, if creating
4885  *		uap->gid		GID to set, if creating
4886  *		uap->mode		File mode, if creating (same as 'open')
4887  *		uap->xsecurity		ACL to set, if creating
4888  *
4889  * Returns:	0			Success
4890  *		!0			errno value
4891  *
4892  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
4893  *
4894  * XXX:		We should enummerate the possible errno values here, and where
4895  *		in the code they originated.
4896  */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied ACL, if any */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
		return ciferror;
	}

	VATTR_INIT(&va);
	/* Apply the umask and strip the sticky bit from the create mode */
	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != NULL) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
	/* open1 consumed the attributes; free the copied-in filesec */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}

	return ciferror;
}
4939 
4940 /*
4941  * Go through the data-protected atomically controlled open (2)
4942  *
4943  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4944  */
4945 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)4946 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4947     int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
4948 {
4949 	/*
4950 	 * Follow the same path as normal open(2)
4951 	 * Look up the item if it exists, and acquire the vnode.
4952 	 */
4953 	struct vnode_attr va;
4954 	struct nameidata nd;
4955 	int cmode;
4956 	int error;
4957 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4958 
4959 	VATTR_INIT(&va);
4960 	/* Mask off all but regular access permissions */
4961 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4962 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4963 
4964 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
4965 	    path, ctx);
4966 
4967 	/*
4968 	 * Initialize the extra fields in vnode_attr to pass down our
4969 	 * extra fields.
4970 	 * 1. target cprotect class.
4971 	 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4972 	 */
4973 	if (flags & O_CREAT) {
4974 		/* lower level kernel code validates that the class is valid before applying it. */
4975 		if (class != PROTECTION_CLASS_DEFAULT) {
4976 			/*
4977 			 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4978 			 * file behave the same as open (2)
4979 			 */
4980 			VATTR_SET(&va, va_dataprotect_class, class);
4981 		}
4982 	}
4983 
4984 	if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
4985 		if (flags & (O_RDWR | O_WRONLY)) {
4986 			/*
4987 			 * Not allowed to write raw encrypted bytes or when opening authenticated.
4988 			 */
4989 			return EINVAL;
4990 		}
4991 		if (dpflags & O_DP_GETRAWENCRYPTED) {
4992 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4993 		}
4994 		if (dpflags & O_DP_GETRAWUNENCRYPTED) {
4995 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4996 		}
4997 		if (dpflags & O_DP_AUTHENTICATE) {
4998 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
4999 		}
5000 	}
5001 
5002 	error = open1at(vfs_context_current(), &nd, flags, &va,
5003 	    NULL, NULL, retval, fd, authfd);
5004 
5005 	return error;
5006 }
5007 
5008 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)5009 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5010 {
5011 	if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5012 		return EINVAL;
5013 	}
5014 
5015 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5016 	           uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
5017 }
5018 
5019 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)5020 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5021 {
5022 	if (uap->dpflags & O_DP_AUTHENTICATE) {
5023 		return EINVAL;
5024 	}
5025 
5026 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5027 	           uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5028 }
5029 
5030 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval)5031 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5032     int fd, enum uio_seg segflg, int *retval)
5033 {
5034 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5035 	struct {
5036 		struct vnode_attr va;
5037 		struct nameidata nd;
5038 	} *__open_data;
5039 	struct vnode_attr *vap;
5040 	struct nameidata *ndp;
5041 	int cmode;
5042 	int error;
5043 
5044 	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
5045 	vap = &__open_data->va;
5046 	ndp = &__open_data->nd;
5047 
5048 	VATTR_INIT(vap);
5049 	/* Mask off all but regular access permissions */
5050 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5051 	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
5052 
5053 	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
5054 	    segflg, path, ctx);
5055 
5056 	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);
5057 
5058 	kfree_type(typeof(*__open_data), __open_data);
5059 
5060 	return error;
5061 }
5062 
5063 int
open(proc_t p,struct open_args * uap,int32_t * retval)5064 open(proc_t p, struct open_args *uap, int32_t *retval)
5065 {
5066 	__pthread_testcancel(1);
5067 	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
5068 }
5069 
5070 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)5071 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
5072     int32_t *retval)
5073 {
5074 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5075 	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
5076 }
5077 
5078 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)5079 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
5080     int32_t *retval)
5081 {
5082 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5083 	           uap->mode, uap->fd, UIO_USERSPACE, retval);
5084 }
5085 
5086 int
openat(proc_t p,struct openat_args * uap,int32_t * retval)5087 openat(proc_t p, struct openat_args *uap, int32_t *retval)
5088 {
5089 	__pthread_testcancel(1);
5090 	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
5091 }
5092 
5093 #define OPEN_BY_ID_ENTITLEMENT  "com.apple.private.vfs.open-by-id"
5094 
5095 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5096 vfs_context_can_open_by_id(vfs_context_t ctx)
5097 {
5098 	if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5099 		return TRUE;
5100 	}
5101 
5102 	return IOTaskHasEntitlement(vfs_context_task(ctx),
5103 	           OPEN_BY_ID_ENTITLEMENT);
5104 }
5105 
5106 /*
5107  * openbyid_np: open a file given a file system id and a file system object id
5108  *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
5109  *	file systems that don't support object ids it is a node id (uint64_t).
5110  *
5111  * Parameters:	p			Process requesting the open
5112  *		uap			User argument descriptor (see below)
5113  *		retval			Pointer to an area to receive the
5114  *					return calue from the system call
5115  *
5116  * Indirect:	uap->path		Path to open (same as 'open')
5117  *
5118  *		uap->fsid		id of target file system
5119  *		uap->objid		id of target file system object
5120  *		uap->flags		Flags to open (same as 'open')
5121  *
5122  * Returns:	0			Success
5123  *		!0			errno value
5124  *
5125  *
5126  * XXX:		We should enummerate the possible errno values here, and where
5127  *		in the code they originated.
5128  */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Platform binary or entitlement required */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*resolve path from fsis, objid*/
	/* Grow the buffer by MAXPATHLEN each time the path doesn't fit */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate the resolved path and open it as a kernel-space path */
	buf[pathlen] = 0;

	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5185 
5186 
5187 /*
5188  * Create a special file.
5189  */
5190 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5191     int fd);
5192 
/*
 * Common implementation for mknod() and mknodat(): create a character or
 * block special file at 'upath' (relative to 'fd' when the path is not
 * absolute).  FIFO creation is delegated to mkfifo1(); any file type in
 * 'mode' other than S_IFIFO, S_IFCHR or S_IFBLK fails with EINVAL.
 * Creating a device node requires superuser credentials.
 *
 * Returns:	0		Success
 *		EEXIST		An object already exists at 'upath'
 *		EINVAL		Unsupported file type in 'mode'
 *		suser:???
 *		nameiat:???
 *		vnode_authorize:???
 *		vn_create:???
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Device node creation is superuser-only. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	/* Look up the parent; LOCKPARENT keeps an iocount on dvp for us. */
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Map the requested S_IFMT type onto the vnode type to create. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	/* Authorize adding an entry to the parent directory. */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry modifies the directory; break any lease on it. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int     update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		/* Notify fseventsd listeners of the new node. */
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5295 
5296 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5297 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5298 {
5299 	struct vnode_attr va;
5300 
5301 	VATTR_INIT(&va);
5302 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5303 	VATTR_SET(&va, va_rdev, uap->dev);
5304 
5305 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5306 }
5307 
5308 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5309 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5310 {
5311 	struct vnode_attr va;
5312 
5313 	VATTR_INIT(&va);
5314 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5315 	VATTR_SET(&va, va_rdev, uap->dev);
5316 
5317 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5318 }
5319 
5320 /*
5321  * Create a named pipe.
5322  *
5323  * Returns:	0			Success
5324  *		EEXIST
5325  *	namei:???
5326  *	vnode_authorize:???
5327  *	vn_create:???
5328  */
5329 static int
mkfifo1(vfs_context_t ctx,user_addr_t upath,struct vnode_attr * vap,int fd)5330 mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
5331 {
5332 	vnode_t vp, dvp;
5333 	int error;
5334 	struct nameidata nd;
5335 
5336 	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
5337 	    UIO_USERSPACE, upath, ctx);
5338 	error = nameiat(&nd, fd);
5339 	if (error) {
5340 		return error;
5341 	}
5342 	dvp = nd.ni_dvp;
5343 	vp = nd.ni_vp;
5344 
5345 	/* check that this is a new file and authorize addition */
5346 	if (vp != NULL) {
5347 		error = EEXIST;
5348 		goto out;
5349 	}
5350 	VATTR_SET(vap, va_type, VFIFO);
5351 
5352 	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
5353 		goto out;
5354 	}
5355 
5356 	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
5357 out:
5358 	/*
5359 	 * nameidone has to happen before we vnode_put(dvp)
5360 	 * since it may need to release the fs_nodelock on the dvp
5361 	 */
5362 	nameidone(&nd);
5363 
5364 	if (vp) {
5365 		vnode_put(vp);
5366 	}
5367 	vnode_put(dvp);
5368 
5369 	return error;
5370 }
5371 
5372 
5373 /*
5374  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5375  *
5376  * Parameters:	p			Process requesting the open
5377  *		uap			User argument descriptor (see below)
5378  *		retval			(Ignored)
5379  *
5380  * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
5381  *		uap->uid		UID to set
5382  *		uap->gid		GID to set
5383  *		uap->mode		File mode to set (same as 'mkfifo')
5384  *		uap->xsecurity		ACL to set, if creating
5385  *
5386  * Returns:	0			Success
5387  *		!0			errno value
5388  *
5389  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5390  *
5391  * XXX:		We should enumerate the possible errno values here, and where
5392  *		in the code they originated.
5393  */
5394 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5395 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5396 {
5397 	int ciferror;
5398 	kauth_filesec_t xsecdst;
5399 	struct vnode_attr va;
5400 
5401 	AUDIT_ARG(owner, uap->uid, uap->gid);
5402 
5403 	xsecdst = KAUTH_FILESEC_NONE;
5404 	if (uap->xsecurity != USER_ADDR_NULL) {
5405 		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5406 			return ciferror;
5407 		}
5408 	}
5409 
5410 	VATTR_INIT(&va);
5411 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5412 	if (uap->uid != KAUTH_UID_NONE) {
5413 		VATTR_SET(&va, va_uid, uap->uid);
5414 	}
5415 	if (uap->gid != KAUTH_GID_NONE) {
5416 		VATTR_SET(&va, va_gid, uap->gid);
5417 	}
5418 	if (xsecdst != KAUTH_FILESEC_NONE) {
5419 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5420 		va.va_vaflags |= VA_FILESEC_ACL;
5421 	}
5422 
5423 	ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5424 
5425 	if (xsecdst != KAUTH_FILESEC_NONE) {
5426 		kauth_filesec_free(xsecdst);
5427 	}
5428 	return ciferror;
5429 }
5430 
5431 /* ARGSUSED */
5432 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5433 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5434 {
5435 	struct vnode_attr va;
5436 
5437 	VATTR_INIT(&va);
5438 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5439 
5440 	return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5441 }
5442 
5443 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5444 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5445 {
5446 	struct vnode_attr va;
5447 
5448 	VATTR_INIT(&va);
5449 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5450 
5451 	return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5452 }
5453 
5454 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5455 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5456 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5457 
/*
 * Build a best-effort path for 'dvp' (optionally with 'leafname' appended)
 * into 'path', never failing outright: on truncation or lookup failure the
 * function falls back to an ancestor's path, the mount point path, or "/",
 * and reports via *truncated_path.  'firmlink' selects whether firmlinks
 * are followed (vn_getpath) or not (vn_getpath_no_firmlink).
 *
 * Returns the length of the string placed in 'path', including the
 * terminating NUL.  *truncated_path is set to 1 if the result is not the
 * full, exact path.
 *
 * NOTE(review): the append path below bounds the copy with MAXPATHLEN
 * rather than the caller-supplied _len — this assumes callers always pass
 * a MAXPATHLEN-sized buffer; confirm against call sites.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* overwrite the NUL with '/' and append the leaf name */
			path[len - 1] = '/';
			/* strlcpy returns the length it tried to create */
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* path fit but is at/over MAXPATHLEN-1: treat as truncated */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Walk up the parent chain until some ancestor's path fits,
		 * falling back to the mount point path or "/" as a last resort.
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
		/*
		 * NOTE(review): on the mount-point fallback 'len' keeps the
		 * value from the last vn_getpath attempt, not the length of
		 * f_mntonname — callers appear to tolerate this; confirm.
		 */
	}

	return len;
}
5525 
/*
 * Firmlink-following variant of safe_getpath_new().
 */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int follow_firmlinks = 1;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, follow_firmlinks);
}
5531 
/*
 * Firmlink-ignoring variant of safe_getpath_new().
 */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int follow_firmlinks = 0;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, follow_firmlinks);
}
5537 
5538 /*
5539  * Make a hard file link.
5540  *
5541  * Returns:	0			Success
5542  *		EPERM
5543  *		EEXIST
5544  *		EXDEV
5545  *	namei:???
5546  *	vnode_authorize:???
5547  *	VNOP_LINK:???
5548  */
5549 /* ARGSUSED */
/*
 * Common implementation for link() and linkat(): create a hard link named
 * 'link' (relative to fd2) to the object at 'path' (relative to fd1).
 * 'flag' may contain AT_SYMLINK_FOLLOW to resolve a trailing symlink in
 * the source path.  Emits fsevents, audit records and kauth fileop
 * notifications as configured.
 */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char  *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node: reuse 'nd', switching it to a CREATE op */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	/* adding a directory entry: break any lease on the parent */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Only build path strings if someone (fsevents/kauth/audit) wants them. */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int  len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				/* drop the iocount taken by vnode_getparent_if_different */
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5763 
5764 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5765 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5766 {
5767 	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5768 	           AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5769 }
5770 
5771 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5772 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5773 {
5774 	if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5775 		return EINVAL;
5776 	}
5777 
5778 	return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5779 	           uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5780 }
5781 
5782 /*
5783  * Make a symbolic link.
5784  *
5785  * We could add support for ACLs here too...
5786  */
5787 /* ARGSUSED */
/*
 * Common implementation for symlink() and symlinkat(): create a symbolic
 * link at 'link' (relative to 'fd') whose contents are the string at
 * 'path_data'.  'segflg' selects user vs. kernel addressing for both
 * strings; a user-space link string is copied into a ZV_NAMEI buffer.
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		/* copy the link contents into a kernel buffer */
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	/* look up the parent of the new link; LOCKPARENT keeps dvp's iocount */
	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* the name must not already exist */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	/* adding a directory entry: break any lease on the parent */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int     update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			/* the FS did not return the new vnode; look it up */
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int             len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		/* notify fseventsd listeners of the new link */
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* free the ZV_NAMEI buffer iff we allocated one above */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
5951 
5952 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)5953 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5954 {
5955 	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5956 	           uap->link, UIO_USERSPACE);
5957 }
5958 
5959 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)5960 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5961     __unused int32_t *retval)
5962 {
5963 	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5964 	           uap->path2, UIO_USERSPACE);
5965 }
5966 
5967 /*
5968  * Delete a whiteout from the filesystem.
5969  * No longer supported.
5970  */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	/* Whiteout deletion is no longer supported; unconditionally fail. */
	return ENOTSUP;
}
5976 
5977 /*
5978  * Delete a name from the filesystem.
5979  */
5980 /* ARGSUSED */
/*
 * Common implementation for unlink(), unlinkat(), delete() and unlink1():
 * remove the name at 'path_arg' (relative to 'start_dvp' if non-NULL,
 * otherwise to 'fd').  'unlink_flags' carries VNODE_REMOVE_* modifiers
 * (Carbon busy-file semantics, namespace-event suppression, audit-path
 * suppression).  Supports compound-remove file systems (EKEEPLOOKING
 * redrive) and retries bounded by MAX_AUTHORIZE_ENOENT_RETRIES when
 * authorization races a hardlink lookup.
 */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/* heap-allocated working state: too large for the kernel stack */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char  *path = NULL;
	char  *no_firmlink_path = NULL;
	int  len_path = 0;
	int  len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* reset per-attempt state; we may loop back here on a redrive */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		/* swap files may only be removed by the kernel itself */
		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				if (error == ENOENT) {
					/* authorization raced a lookup; retry a bounded number of times */
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	/* only build path strings when fsevents or kauth listeners need them */
	if (need_event || has_listeners) {
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		/* removing a directory entry: break any lease on the parent */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			/* compound-remove FS asked us to continue the lookup */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name  cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid.  this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6262 
6263 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6264 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6265     enum uio_seg segflg, int unlink_flags)
6266 {
6267 	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6268 	           unlink_flags);
6269 }
6270 
6271 /*
6272  * Delete a name from the filesystem using Carbon semantics.
6273  */
6274 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6275 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6276 {
6277 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6278 	           uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6279 }
6280 
6281 /*
6282  * Delete a name from the filesystem using POSIX semantics.
6283  */
6284 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6285 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6286 {
6287 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6288 	           uap->path, UIO_USERSPACE, 0);
6289 }
6290 
6291 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6292 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6293 {
6294 	if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6295 		return EINVAL;
6296 	}
6297 
6298 	if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6299 		int unlink_flags = 0;
6300 
6301 		if (uap->flag & AT_REMOVEDIR_DATALESS) {
6302 			unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6303 		}
6304 		return rmdirat_internal(vfs_context_current(), uap->fd,
6305 		           uap->path, UIO_USERSPACE, unlink_flags);
6306 	} else {
6307 		return unlinkat_internal(vfs_context_current(), uap->fd,
6308 		           NULLVP, uap->path, UIO_USERSPACE, 0);
6309 	}
6310 }
6311 
6312 /*
6313  * Reposition read/write file offset.
6314  */
6315 int
lseek(proc_t p,struct lseek_args * uap,off_t * retval)6316 lseek(proc_t p, struct lseek_args *uap, off_t *retval)
6317 {
6318 	struct fileproc *fp;
6319 	vnode_t vp;
6320 	struct vfs_context *ctx;
6321 	off_t offset = uap->offset, file_size;
6322 	int error;
6323 
6324 	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
6325 		if (error == ENOTSUP) {
6326 			return ESPIPE;
6327 		}
6328 		return error;
6329 	}
6330 	if (vnode_isfifo(vp)) {
6331 		file_drop(uap->fd);
6332 		return ESPIPE;
6333 	}
6334 
6335 
6336 	ctx = vfs_context_current();
6337 #if CONFIG_MACF
6338 	if (uap->whence == L_INCR && uap->offset == 0) {
6339 		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
6340 		    fp->fp_glob);
6341 	} else {
6342 		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
6343 		    fp->fp_glob);
6344 	}
6345 	if (error) {
6346 		file_drop(uap->fd);
6347 		return error;
6348 	}
6349 #endif
6350 	if ((error = vnode_getwithref(vp))) {
6351 		file_drop(uap->fd);
6352 		return error;
6353 	}
6354 
6355 	switch (uap->whence) {
6356 	case L_INCR:
6357 		offset += fp->fp_glob->fg_offset;
6358 		break;
6359 	case L_XTND:
6360 		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
6361 			break;
6362 		}
6363 		offset += file_size;
6364 		break;
6365 	case L_SET:
6366 		break;
6367 	case SEEK_HOLE:
6368 		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
6369 		break;
6370 	case SEEK_DATA:
6371 		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
6372 		break;
6373 	default:
6374 		error = EINVAL;
6375 	}
6376 	if (error == 0) {
6377 		if (uap->offset > 0 && offset < 0) {
6378 			/* Incremented/relative move past max size */
6379 			error = EOVERFLOW;
6380 		} else {
6381 			/*
6382 			 * Allow negative offsets on character devices, per
6383 			 * POSIX 1003.1-2001.  Most likely for writing disk
6384 			 * labels.
6385 			 */
6386 			if (offset < 0 && vp->v_type != VCHR) {
6387 				/* Decremented/relative move before start */
6388 				error = EINVAL;
6389 			} else {
6390 				/* Success */
6391 				fp->fp_glob->fg_offset = offset;
6392 				*retval = fp->fp_glob->fg_offset;
6393 			}
6394 		}
6395 	}
6396 
6397 	/*
6398 	 * An lseek can affect whether data is "available to read."  Use
6399 	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
6400 	 */
6401 	post_event_if_success(vp, error, NOTE_NONE);
6402 	(void)vnode_put(vp);
6403 	file_drop(uap->fd);
6404 	return error;
6405 }
6406 
6407 
6408 /*
6409  * Check access permissions.
6410  *
6411  * Returns:	0			Success
6412  *		vnode_authorize:???
6413  */
6414 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6415 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6416 {
6417 	kauth_action_t action;
6418 	int error;
6419 
6420 	/*
6421 	 * If just the regular access bits, convert them to something
6422 	 * that vnode_authorize will understand.
6423 	 */
6424 	if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6425 		action = 0;
6426 		if (uflags & R_OK) {
6427 			action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
6428 		}
6429 		if (uflags & W_OK) {
6430 			if (vnode_isdir(vp)) {
6431 				action |= KAUTH_VNODE_ADD_FILE |
6432 				    KAUTH_VNODE_ADD_SUBDIRECTORY;
6433 				/* might want delete rights here too */
6434 			} else {
6435 				action |= KAUTH_VNODE_WRITE_DATA;
6436 			}
6437 		}
6438 		if (uflags & X_OK) {
6439 			if (vnode_isdir(vp)) {
6440 				action |= KAUTH_VNODE_SEARCH;
6441 			} else {
6442 				action |= KAUTH_VNODE_EXECUTE;
6443 			}
6444 		}
6445 	} else {
6446 		/* take advantage of definition of uflags */
6447 		action = uflags >> 8;
6448 	}
6449 
6450 #if CONFIG_MACF
6451 	error = mac_vnode_check_access(ctx, vp, uflags);
6452 	if (error) {
6453 		return error;
6454 	}
6455 #endif /* MAC */
6456 
6457 	/* action == 0 means only check for existence */
6458 	if (action != 0) {
6459 		error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6460 	} else {
6461 		error = 0;
6462 	}
6463 
6464 	return error;
6465 }
6466 
6467 
6468 
6469 /*
6470  * access_extended: Check access permissions in bulk.
6471  *
6472  * Description:	uap->entries		Pointer to an array of accessx
6473  *                                      descriptor structs, plus one or
6474  *                                      more NULL terminated strings (see
6475  *                                      "Notes" section below).
6476  *		uap->size		Size of the area pointed to by
6477  *					uap->entries.
6478  *		uap->results		Pointer to the results array.
6479  *
6480  * Returns:	0			Success
6481  *		ENOMEM			Insufficient memory
6482  *		EINVAL			Invalid arguments
6483  *		namei:EFAULT		Bad address
6484  *		namei:ENAMETOOLONG	Filename too long
6485  *		namei:ENOENT		No such file or directory
6486  *		namei:ELOOP		Too many levels of symbolic links
6487  *		namei:EBADF		Bad file descriptor
6488  *		namei:ENOTDIR		Not a directory
6489  *		namei:???
6490  *		access1:
6491  *
6492  * Implicit returns:
6493  *		uap->results		Array contents modified
6494  *
6495  * Notes:	The uap->entries are structured as an arbitrary length array
6496  *		of accessx descriptors, followed by one or more NULL terminated
6497  *		strings
6498  *
6499  *			struct accessx_descriptor[0]
6500  *			...
6501  *			struct accessx_descriptor[n]
6502  *			char name_data[0];
6503  *
6504  *		We determine the entry count by walking the buffer containing
6505  *		the uap->entries argument descriptor.  For each descriptor we
6506  *		see, the valid values for the offset ad_name_offset will be
6507  *		in the byte range:
6508  *
6509  *			[ uap->entries + sizeof(struct accessx_descriptor) ]
6510  *						to
6511  *				[ uap->entries + uap->size - 2 ]
6512  *
6513  *		since we must have at least one string, and the string must
6514  *		be at least one character plus the NULL terminator in length.
6515  *
6516  * XXX:		Need to support the check-as uid argument
6517  */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* NULL so the cleanup path can tell whether a credential was taken */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o	There must be sufficient room in the request for at least one
	 *	descriptor and a one byte NUL terminated string.
	 * o	The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* small requests are served from the stack to avoid an allocation */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit, allocate zeroed storage for
	 * the per-descriptor results; otherwise, fail the whole request.
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  Per-file failures are reported in
		 * the result slot; anything else aborts the whole call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6759 
6760 
6761 /*
6762  * Returns:	0			Success
6763  *		namei:EFAULT		Bad address
6764  *		namei:ENAMETOOLONG	Filename too long
6765  *		namei:ENOENT		No such file or directory
6766  *		namei:ELOOP		Too many levels of symbolic links
6767  *		namei:EBADF		Bad file descriptor
6768  *		namei:ENOTDIR		Not a directory
6769  *		namei:???
6770  *		access1:
6771  */
/*
 * Common implementation for access() and faccessat(): look up "path"
 * relative to "fd" and run the requested access check via access1().
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		/* takes a reference on the real credential; dropped at "out" */
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* refuse to traverse a symlink anywhere in the path */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* perform the actual permission check */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		/* WANTPARENT was set above; release the parent as well */
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6853 
6854 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6855 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6856 {
6857 	return faccessat_internal(vfs_context_current(), AT_FDCWD,
6858 	           uap->path, uap->flags, 0, UIO_USERSPACE);
6859 }
6860 
6861 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6862 faccessat(__unused proc_t p, struct faccessat_args *uap,
6863     __unused int32_t *retval)
6864 {
6865 	if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6866 		return EINVAL;
6867 	}
6868 
6869 	return faccessat_internal(vfs_context_current(), uap->fd,
6870 	           uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6871 }
6872 
6873 /*
6874  * Returns:	0			Success
6875  *		EFAULT
6876  *	copyout:EFAULT
6877  *	namei:???
6878  *	vn_stat:???
6879  */
6880 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6881 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6882     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6883     enum uio_seg segflg, int fd, int flag)
6884 {
6885 	struct nameidata nd;
6886 	int follow;
6887 	union {
6888 		struct stat sb;
6889 		struct stat64 sb64;
6890 	} source = {};
6891 	union {
6892 		struct user64_stat user64_sb;
6893 		struct user32_stat user32_sb;
6894 		struct user64_stat64 user64_sb64;
6895 		struct user32_stat64 user32_sb64;
6896 	} dest = {};
6897 	caddr_t sbp;
6898 	int error, my_size;
6899 	kauth_filesec_t fsec;
6900 	size_t xsecurity_bufsize;
6901 	void * statptr;
6902 	struct fileproc *fp = NULL;
6903 	int needsrealdev = 0;
6904 
6905 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6906 	NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6907 	    segflg, path, ctx);
6908 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6909 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
6910 	}
6911 
6912 #if NAMEDRSRCFORK
6913 	int is_namedstream = 0;
6914 	/* stat calls are allowed for resource forks. */
6915 	nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6916 #endif
6917 
6918 	if (flag & AT_FDONLY) {
6919 		vnode_t fvp;
6920 
6921 		error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6922 		if (error) {
6923 			return error;
6924 		}
6925 		if ((error = vnode_getwithref(fvp))) {
6926 			file_drop(fd);
6927 			return error;
6928 		}
6929 		nd.ni_vp = fvp;
6930 	} else {
6931 		error = nameiat(&nd, fd);
6932 		if (error) {
6933 			return error;
6934 		}
6935 	}
6936 	fsec = KAUTH_FILESEC_NONE;
6937 
6938 	statptr = (void *)&source;
6939 
6940 #if NAMEDRSRCFORK
6941 	/* Grab reference on the shadow stream file vnode to
6942 	 * force an inactive on release which will mark it
6943 	 * for recycle.
6944 	 */
6945 	if (vnode_isnamedstream(nd.ni_vp) &&
6946 	    (nd.ni_vp->v_parent != NULLVP) &&
6947 	    vnode_isshadow(nd.ni_vp)) {
6948 		is_namedstream = 1;
6949 		vnode_ref(nd.ni_vp);
6950 	}
6951 #endif
6952 
6953 	needsrealdev = flag & AT_REALDEV ? 1 : 0;
6954 	if (fp && (xsecurity == USER_ADDR_NULL)) {
6955 		/*
6956 		 * If the caller has the file open, and is not
6957 		 * requesting extended security information, we are
6958 		 * going to let them get the basic stat information.
6959 		 */
6960 		error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6961 		    fp->fp_glob->fg_cred);
6962 	} else {
6963 		error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6964 		    isstat64, needsrealdev, ctx);
6965 	}
6966 
6967 #if NAMEDRSRCFORK
6968 	if (is_namedstream) {
6969 		vnode_rele(nd.ni_vp);
6970 	}
6971 #endif
6972 	vnode_put(nd.ni_vp);
6973 	nameidone(&nd);
6974 	if (fp) {
6975 		file_drop(fd);
6976 		fp = NULL;
6977 	}
6978 
6979 	if (error) {
6980 		return error;
6981 	}
6982 	/* Zap spare fields */
6983 	if (isstat64 != 0) {
6984 		source.sb64.st_lspare = 0;
6985 		source.sb64.st_qspare[0] = 0LL;
6986 		source.sb64.st_qspare[1] = 0LL;
6987 		if (vfs_context_is64bit(ctx)) {
6988 			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6989 			my_size = sizeof(dest.user64_sb64);
6990 			sbp = (caddr_t)&dest.user64_sb64;
6991 		} else {
6992 			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6993 			my_size = sizeof(dest.user32_sb64);
6994 			sbp = (caddr_t)&dest.user32_sb64;
6995 		}
6996 		/*
6997 		 * Check if we raced (post lookup) against the last unlink of a file.
6998 		 */
6999 		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7000 			source.sb64.st_nlink = 1;
7001 		}
7002 	} else {
7003 		source.sb.st_lspare = 0;
7004 		source.sb.st_qspare[0] = 0LL;
7005 		source.sb.st_qspare[1] = 0LL;
7006 		if (vfs_context_is64bit(ctx)) {
7007 			munge_user64_stat(&source.sb, &dest.user64_sb);
7008 			my_size = sizeof(dest.user64_sb);
7009 			sbp = (caddr_t)&dest.user64_sb;
7010 		} else {
7011 			munge_user32_stat(&source.sb, &dest.user32_sb);
7012 			my_size = sizeof(dest.user32_sb);
7013 			sbp = (caddr_t)&dest.user32_sb;
7014 		}
7015 
7016 		/*
7017 		 * Check if we raced (post lookup) against the last unlink of a file.
7018 		 */
7019 		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7020 			source.sb.st_nlink = 1;
7021 		}
7022 	}
7023 	if ((error = copyout(sbp, ub, my_size)) != 0) {
7024 		goto out;
7025 	}
7026 
7027 	/* caller wants extended security information? */
7028 	if (xsecurity != USER_ADDR_NULL) {
7029 		/* did we get any? */
7030 		if (fsec == KAUTH_FILESEC_NONE) {
7031 			if (susize(xsecurity_size, 0) != 0) {
7032 				error = EFAULT;
7033 				goto out;
7034 			}
7035 		} else {
7036 			/* find the user buffer size */
7037 			xsecurity_bufsize = fusize(xsecurity_size);
7038 
7039 			/* copy out the actual data size */
7040 			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7041 				error = EFAULT;
7042 				goto out;
7043 			}
7044 
7045 			/* if the caller supplied enough room, copy out to it */
7046 			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7047 				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7048 			}
7049 		}
7050 	}
7051 out:
7052 	if (fsec != KAUTH_FILESEC_NONE) {
7053 		kauth_filesec_free(fsec);
7054 	}
7055 	return error;
7056 }
7057 
7058 /*
7059  * stat_extended: Get file status; with extended security (ACL).
7060  *
7061  * Parameters:    p                       (ignored)
7062  *                uap                     User argument descriptor (see below)
7063  *                retval                  (ignored)
7064  *
7065  * Indirect:      uap->path               Path of file to get status from
7066  *                uap->ub                 User buffer (holds file status info)
7067  *                uap->xsecurity          ACL to get (extended security)
7068  *                uap->xsecurity_size     Size of ACL
7069  *
7070  * Returns:        0                      Success
7071  *                !0                      errno value
7072  *
7073  */
7074 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7075 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7076     __unused int32_t *retval)
7077 {
7078 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7079 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7080 	           0);
7081 }
7082 
7083 /*
7084  * Returns:	0			Success
7085  *	fstatat_internal:???		[see fstatat_internal() in this file]
7086  */
7087 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7088 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7089 {
7090 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7091 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7092 }
7093 
7094 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7095 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7096 {
7097 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7098 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7099 }
7100 
7101 /*
7102  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7103  *
7104  * Parameters:    p                       (ignored)
7105  *                uap                     User argument descriptor (see below)
7106  *                retval                  (ignored)
7107  *
7108  * Indirect:      uap->path               Path of file to get status from
7109  *                uap->ub                 User buffer (holds file status info)
7110  *                uap->xsecurity          ACL to get (extended security)
7111  *                uap->xsecurity_size     Size of ACL
7112  *
7113  * Returns:        0                      Success
7114  *                !0                      errno value
7115  *
7116  */
7117 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7118 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7119 {
7120 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7121 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7122 	           0);
7123 }
7124 
7125 /*
7126  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7127  *
7128  * Parameters:    p                       (ignored)
7129  *                uap                     User argument descriptor (see below)
7130  *                retval                  (ignored)
7131  *
7132  * Indirect:      uap->path               Path of file to get status from
7133  *                uap->ub                 User buffer (holds file status info)
7134  *                uap->xsecurity          ACL to get (extended security)
7135  *                uap->xsecurity_size     Size of ACL
7136  *
7137  * Returns:        0                      Success
7138  *                !0                      errno value
7139  *
7140  */
7141 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7142 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7143 {
7144 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7145 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7146 	           AT_SYMLINK_NOFOLLOW);
7147 }
7148 
7149 /*
7150  * Get file status; this version does not follow links.
7151  */
7152 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7153 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7154 {
7155 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7156 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7157 }
7158 
7159 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7160 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7161 {
7162 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7163 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7164 }
7165 
7166 /*
7167  * lstat64_extended: Get file status; can handle large inode numbers; does not
7168  * follow links; with extended security (ACL).
7169  *
7170  * Parameters:    p                       (ignored)
7171  *                uap                     User argument descriptor (see below)
7172  *                retval                  (ignored)
7173  *
7174  * Indirect:      uap->path               Path of file to get status from
7175  *                uap->ub                 User buffer (holds file status info)
7176  *                uap->xsecurity          ACL to get (extended security)
7177  *                uap->xsecurity_size     Size of ACL
7178  *
7179  * Returns:        0                      Success
7180  *                !0                      errno value
7181  *
7182  */
7183 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7184 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7185 {
7186 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7187 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7188 	           AT_SYMLINK_NOFOLLOW);
7189 }
7190 
7191 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7192 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7193 {
7194 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7195 		return EINVAL;
7196 	}
7197 
7198 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7199 	           0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7200 }
7201 
7202 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7203 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7204     __unused int32_t *retval)
7205 {
7206 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7207 		return EINVAL;
7208 	}
7209 
7210 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7211 	           0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7212 }
7213 
7214 /*
7215  * Get configurable pathname variables.
7216  *
7217  * Returns:	0			Success
7218  *	namei:???
7219  *	vn_pathconf:???
7220  *
7221  * Notes:	Global implementation  constants are intended to be
7222  *		implemented in this function directly; all other constants
7223  *		are per-FS implementation, and therefore must be handled in
7224  *		each respective FS, instead.
7225  *
7226  * XXX We implement some things globally right now that should actually be
7227  * XXX per-FS; we will need to deal with this at some point.
7228  */
/* ARGSUSED */
int
pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
{
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/* Resolve the user path, following symlinks; audit the vnode path. */
	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/* Query the named pathconf variable; value is returned via *retval. */
	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);

	/* Release the iocount taken by namei, then tear down lookup state. */
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7250 
7251 /*
7252  * Return target name of a symbolic link.
7253  */
7254 /* ARGSUSED */
7255 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7256 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7257     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7258     int *retval)
7259 {
7260 	vnode_t vp;
7261 	uio_t auio;
7262 	int error;
7263 	struct nameidata nd;
7264 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
7265 	bool put_vnode;
7266 
7267 	if (bufsize > INT32_MAX) {
7268 		return EINVAL;
7269 	}
7270 
7271 	if (lnk_vp) {
7272 		vp = lnk_vp;
7273 		put_vnode = false;
7274 	} else {
7275 		NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7276 		    seg, path, ctx);
7277 
7278 		error = nameiat(&nd, fd);
7279 		if (error) {
7280 			return error;
7281 		}
7282 		vp = nd.ni_vp;
7283 		put_vnode = true;
7284 		nameidone(&nd);
7285 	}
7286 
7287 	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7288 	    &uio_buf[0], sizeof(uio_buf));
7289 	uio_addiov(auio, buf, bufsize);
7290 	if (vp->v_type != VLNK) {
7291 		error = EINVAL;
7292 	} else {
7293 #if CONFIG_MACF
7294 		error = mac_vnode_check_readlink(ctx, vp);
7295 #endif
7296 		if (error == 0) {
7297 			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7298 			    ctx);
7299 		}
7300 		if (error == 0) {
7301 			error = VNOP_READLINK(vp, auio, ctx);
7302 		}
7303 	}
7304 
7305 	if (put_vnode) {
7306 		vnode_put(vp);
7307 	}
7308 
7309 	*retval = (int)(bufsize - uio_resid(auio));
7310 	return error;
7311 }
7312 
/*
 * freadlink: read the target of a symlink that is already open as an fd.
 * Holds an fd reference and a vnode iocount across the read; both are
 * dropped on every exit path.
 */
int
freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
{
	enum uio_seg procseg;
	vnode_t vp;
	int error;

	/* Choose the user address-space flavor matching the caller's ABI. */
	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;

	AUDIT_ARG(fd, uap->fd);

	/* fd -> vnode; every return below must pair with file_drop(). */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	/* Take an iocount before touching the vnode. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* fd/path are unused (-1/0) because the vnode is passed directly. */
	error = readlinkat_internal(vfs_context_current(), -1,
	    vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
	    uap->bufsize, procseg, retval);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7340 
7341 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7342 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7343 {
7344 	enum uio_seg procseg;
7345 
7346 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7347 	return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7348 	           CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7349 	           uap->count, procseg, retval);
7350 }
7351 
7352 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7353 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7354 {
7355 	enum uio_seg procseg;
7356 
7357 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7358 	return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7359 	           CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7360 	           retval);
7361 }
7362 
7363 /*
7364  * Change file flags, the deep inner layer.
7365  */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	/* MAC policy gets first refusal on the proposed flag set. */
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	/* Apply the change via the caller-supplied setter (e.g. vnode_setattr). */
	error = (*setattr)(vp, arg, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only after a successful change. */
	if (error == 0) {
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
	}
#endif

out:
	return error;
}
7404 
7405 /*
7406  * Change file flags.
7407  *
7408  * NOTE: this will vnode_put() `vp'
7409  */
static int
chflags1(vnode_t vp, int flags, vfs_context_t ctx)
{
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_flags, flags);

	/* vnode_setattr is applied through chflags0's generic setter hook. */
	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
	/* This function consumes the caller's iocount on vp (see header note). */
	vnode_put(vp);

	/* A filesystem that silently ignored va_flags is reported as ENOTSUP. */
	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
		error = ENOTSUP;
	}

	return error;
}
7428 
7429 /*
7430  * Change flags of a file given a path name.
7431  */
7432 /* ARGSUSED */
int
chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also look up the parent so its directory lease can be broken. */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(fflags, uap->flags);
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory, then drop its iocount. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	nameidone(&nd);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, ctx);

	return error;
}
7467 
7468 /*
7469  * Change flags of a file given a file descriptor.
7470  */
7471 /* ARGSUSED */
int
fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);
	/* fd -> vnode; every return below must pair with file_drop(). */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* Take an iocount before using the vnode. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* Break any lease on the file's parent directory before the change. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, vfs_context_current());

	file_drop(uap->fd);
	return error;
}
7501 
7502 /*
7503  * Change security information on a filesystem object.
7504  *
7505  * Returns:	0			Success
7506  *		EPERM			Operation not permitted
7507  *		vnode_authattr:???	[anything vnode_authattr can return]
7508  *		vnode_authorize:???	[anything vnode_authorize can return]
7509  *		vnode_setattr:???	[anything vnode_setattr can return]
7510  *
7511  * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
7512  *		translated to EPERM before being returned.
7513  */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* One MAC check per attribute class being changed: mode, owner, ACL. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* Permission failures here report EPERM, not EACCES (see header). */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* Post-change MAC notifications, mirroring the checks above. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7581 
7582 
7583 /*
7584  * Change mode of a file given a path name.
7585  *
7586  * Returns:	0			Success
7587  *		namei:???		[anything namei can return]
7588  *		chmod_vnode:???		[anything chmod_vnode can return]
7589  */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also look up the parent so its directory lease can be broken. */
	wantparent = WANTPARENT;
#endif

	/* Either NOFOLLOW variant suppresses following the terminal symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse to traverse symlinks anywhere in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any parent-directory lease, then drop the parent's iocount. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7622 
/*
 * Build the vnode_attr set for the *chmod_extended() family from raw user
 * arguments.  On success with a real ACL, *pxsecdst holds a kauth_filesec
 * allocated by kauth_copyinfilesec(); the CALLER must free it with
 * kauth_filesec_free() after the attributes have been applied.
 */
static int
chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
    gid_t gid, user_addr_t xsecurity)
{
	int error;

	VATTR_INIT(pva);

	/* mode == -1 means "leave the mode alone". */
	if (mode != -1) {
		VATTR_SET(pva, va_mode, mode & ALLPERMS);
	} else {
		pva->va_mode = 0;
	}

	if (uid != KAUTH_UID_NONE) {
		VATTR_SET(pva, va_uid, uid);
	}

	if (gid != KAUTH_GID_NONE) {
		VATTR_SET(pva, va_gid, gid);
	}

	*pxsecdst = NULL;
	switch (xsecurity) {
	case USER_ADDR_NULL:
		/* No ACL change requested. */
		break;

	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
		/* Sentinel address 1: delete the existing ACL. */
		VATTR_SET(pva, va_acl, NULL);
		break;

	default:
		/* Copy the caller's filesec in from user space. */
		if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
			return error;
		}

		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
		pva->va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
		break;
	}

	return 0;
}
7667 
7668 /*
7669  * chmod_extended: Change the mode of a file given a path name; with extended
7670  * argument list (including extended security (ACL)).
7671  *
7672  * Parameters:	p			Process requesting the open
7673  *		uap			User argument descriptor (see below)
7674  *		retval			(ignored)
7675  *
7676  * Indirect:	uap->path		Path to object (same as 'chmod')
7677  *		uap->uid		UID to set
7678  *		uap->gid		GID to set
7679  *		uap->mode		File mode to set (same as 'chmod')
7680  *		uap->xsecurity		ACL to set (or delete)
7681  *
7682  * Returns:	0			Success
7683  *		!0			errno value
7684  *
7685  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
7686  *
7687  * XXX:		We should enummerate the possible errno values here, and where
7688  *		in the code they originated.
7689  */
int
chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst = NULL;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Build the attribute set (mode/uid/gid/ACL) from user arguments. */
	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
	    uap->gid, uap->xsecurity);

	if (error) {
		return error;
	}

	/* Apply relative to the CWD with no AT_* flags. */
	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
	    UIO_USERSPACE);

	/* chmod_extended_init may have allocated a filesec; release it. */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
7714 
7715 /*
7716  * Returns:	0			Success
7717  *		chmodat:???		[anything chmodat can return]
7718  */
7719 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7720 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7721     int flag, enum uio_seg segflg)
7722 {
7723 	struct vnode_attr va;
7724 
7725 	VATTR_INIT(&va);
7726 	VATTR_SET(&va, va_mode, mode & ALLPERMS);
7727 
7728 	return chmodat(ctx, path, &va, fd, flag, segflg);
7729 }
7730 
7731 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7732 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7733 {
7734 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7735 	           AT_FDCWD, 0, UIO_USERSPACE);
7736 }
7737 
7738 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7739 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7740 {
7741 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7742 		return EINVAL;
7743 	}
7744 
7745 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7746 	           uap->fd, uap->flag, UIO_USERSPACE);
7747 }
7748 
7749 /*
7750  * Change mode of a file given a file descriptor.
7751  */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, fd);

	/* fd -> vnode; every return below must pair with file_drop(). */
	if ((error = file_vnode(fd, &vp)) != 0) {
		return error;
	}
	/* Take an iocount before using the vnode. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* Break any lease on the file's parent directory before the change. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = chmod_vnode(vfs_context_current(), vp, vap);
	(void)vnode_put(vp);
	file_drop(fd);

	return error;
}
7779 
7780 /*
7781  * fchmod_extended: Change mode of a file given a file descriptor; with
7782  * extended argument list (including extended security (ACL)).
7783  *
7784  * Parameters:    p                       Process requesting to change file mode
7785  *                uap                     User argument descriptor (see below)
7786  *                retval                  (ignored)
7787  *
7788  * Indirect:      uap->mode               File mode to set (same as 'chmod')
7789  *                uap->uid                UID to set
7790  *                uap->gid                GID to set
7791  *                uap->xsecurity          ACL to set (or delete)
7792  *                uap->fd                 File descriptor of file to change mode
7793  *
7794  * Returns:        0                      Success
7795  *                !0                      errno value
7796  *
7797  */
int
fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst = NULL;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Build the attribute set (mode/uid/gid/ACL) from user arguments. */
	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
	    uap->gid, uap->xsecurity);

	if (error) {
		return error;
	}

	error = fchmod1(p, uap->fd, &va);

	/* chmod_extended_init may have allocated a filesec; release it. */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
7821 
7822 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7823 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7824 {
7825 	struct vnode_attr va;
7826 
7827 	VATTR_INIT(&va);
7828 	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7829 
7830 	return fchmod1(p, uap->fd, &va);
7831 }
7832 
7833 
7834 /*
7835  * Set ownership given a path name.
7836  */
7837 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also look up the parent so its directory lease can be broken. */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(owner, uid, gid);

	/* Either NOFOLLOW variant suppresses following the terminal symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent, segflg,
	    path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse to traverse symlinks anywhere in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	/* VNOVAL for uid/gid means "leave that id unchanged". */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory before mutating the file. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only after a successful change. */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES) {
		error = EPERM;
	}

#if CONFIG_FILE_LEASES
	/* Drop the parent iocount taken via WANTPARENT. */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(vp);
	return error;
}
7919 
7920 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)7921 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7922 {
7923 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7924 	           uap->uid, uap->gid, 0, UIO_USERSPACE);
7925 }
7926 
7927 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)7928 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7929 {
7930 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7931 	           uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7932 }
7933 
7934 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)7935 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7936 {
7937 	if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7938 		return EINVAL;
7939 	}
7940 
7941 	return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7942 	           uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7943 }
7944 
7945 /*
7946  * Set ownership given a file descriptor.
7947  */
7948 /* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* fd -> vnode; every return below must pair with file_drop(). */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* Take an iocount before using the vnode. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* VNOVAL for uid/gid means "leave that id unchanged". */
	VATTR_INIT(&va);
	if (uap->uid != VNOVAL) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != VNOVAL) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* Permission failures on chown report EPERM, not EACCES. */
		if (error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the file's parent directory before the change. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only after a successful change. */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
	}
#endif

out:
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8022 
/*
 * Copy in the [access, modify] timeval pair from usrtvp (laid out per the
 * current process's ABI) and convert it into tsp[0..1] timespecs.
 * USER_ADDR_NULL means "use the current time for both".
 */
static int
getutimes(user_addr_t usrtvp, struct timespec *tsp)
{
	int error;

	if (usrtvp == USER_ADDR_NULL) {
		struct timeval old_tv;
		/* XXX Y2038 bug because of microtime argument */
		microtime(&old_tv);
		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
		tsp[1] = tsp[0];
	} else {
		if (IS_64BIT_PROCESS(current_proc())) {
			/* 64-bit ABI layout of struct timeval. */
			struct user64_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
		} else {
			/* 32-bit ABI layout of struct timeval. */
			struct user32_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		}
	}
	return 0;
}
8055 
/*
 * Apply access (ts[0]) and modification (ts[1]) times to vp.  'nullflag'
 * indicates the caller passed a NULL times pointer (i.e. "set to now"),
 * which relaxes the permission requirement via VA_UTIMES_NULL.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	/* When explicit times are given, permission failure is EPERM. */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only after a successful change. */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8112 
8113 /*
8114  * Set the access and modification times of a file.
8115  */
8116 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also look up the parent so its directory lease can be broken. */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory before mutating the file. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* Drop the parent iocount (if WANTPARENT), then the target's. */
#if CONFIG_FILE_LEASES
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8165 
8166 /*
8167  * Set the access and modification times of a file.
8168  */
8169 /* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	/* Fetch the user times (or "now") before touching the file table. */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	/* fd -> vnode; every return below must pair with file_drop(). */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	/* Take an iocount before using the vnode. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the file's parent directory before the change. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8201 
8202 static int
truncate_validate_common(proc_t p,off_t length)8203 truncate_validate_common(proc_t p, off_t length)
8204 {
8205 	rlim_t fsize_limit;
8206 
8207 	if (length < 0) {
8208 		return EINVAL;
8209 	}
8210 
8211 	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8212 	if ((rlim_t)length > fsize_limit) {
8213 		psignal(p, SIGXFSZ);
8214 		return EFBIG;
8215 	}
8216 
8217 	return 0;
8218 }
8219 
/*
 * Core of truncate(2)/ftruncate(2): set va_data_size on vp after MAC and
 * (optionally) kauth checks.  'need_auth' is false on the ftruncate path,
 * where authorization was already performed at open time.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open.  We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only after a successful truncate. */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8270 
8271 /*
8272  * Truncate a file given its path name.
8273  */
8274 /* ARGSUSED */
int
truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	struct nameidata nd;

	/* Validate length sign and RLIMIT_FSIZE before doing any lookup. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	if ((error = namei(&nd))) {
		return error;
	}

	vp = nd.ni_vp;
	nameidone(&nd);

	/* Path-based truncate must authorize (need_auth == true), no cred. */
	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
	vnode_put(vp);

	return error;
}
8302 
8303 /*
8304  * Truncate a file given a file descriptor.
8305  */
8306 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Validate length sign and RLIMIT_FSIZE before touching the fd. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	/* Dispatch on the descriptor type: shm objects and vnodes only. */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* need_auth == false: write access was authorized at open time. */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	vnode_put(vp);

out:
	file_drop(uap->fd);
	return error;
}
8357 
8358 
8359 /*
8360  * Sync an open file with synchronized I/O _file_ integrity completion
8361  */
8362 /* ARGSUSED */
8363 int
fsync(proc_t p,struct fsync_args * uap,__unused int32_t * retval)8364 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8365 {
8366 	__pthread_testcancel(1);
8367 	return fsync_common(p, uap, MNT_WAIT);
8368 }
8369 
8370 
8371 /*
8372  * Sync an open file with synchronized I/O _file_ integrity completion
8373  *
8374  * Notes:	This is a legacy support function that does not test for
8375  *		thread cancellation points.
8376  */
8377 /* ARGSUSED */
int
fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
{
	/*
	 * Identical to fsync(2) except that it does not test for thread
	 * cancellation; the argument struct layout matches fsync_args.
	 */
	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
}
8383 
8384 
8385 /*
8386  * Sync an open file with synchronized I/O _data_ integrity completion
8387  */
8388 /* ARGSUSED */
8389 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)8390 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8391 {
8392 	__pthread_testcancel(1);
8393 	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
8394 }
8395 
8396 
8397 /*
8398  * fsync_common
8399  *
8400  * Common fsync code to support both synchronized I/O file integrity completion
8401  * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8402  *
8403  * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8404  * will only guarantee that the file data contents are retrievable.  If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8406  * includes additional metadata unnecessary for retrieving the file data
8407  * contents, such as atime, mtime, ctime, etc., also be committed to stable
8408  * storage.
8409  *
8410  * Parameters:	p				The process
8411  *		uap->fd				The descriptor to synchronize
8412  *		flags				The data integrity flags
8413  *
8414  * Returns:	int				Success
8415  *	fp_getfvp:EBADF				Bad file descriptor
8416  *	fp_getfvp:ENOTSUP			fd does not refer to a vnode
8417  *	VNOP_FSYNC:???				unspecified
8418  *
8419  * Notes:	We use struct fsync_args because it is a short name, and all
8420  *		caller argument structures are otherwise identical.
8421  */
8422 static int
fsync_common(proc_t p,struct fsync_args * uap,int flags)8423 fsync_common(proc_t p, struct fsync_args *uap, int flags)
8424 {
8425 	vnode_t vp;
8426 	struct fileproc *fp;
8427 	vfs_context_t ctx = vfs_context_current();
8428 	int error;
8429 
8430 	AUDIT_ARG(fd, uap->fd);
8431 
8432 	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
8433 		return error;
8434 	}
8435 	if ((error = vnode_getwithref(vp))) {
8436 		file_drop(uap->fd);
8437 		return error;
8438 	}
8439 
8440 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8441 
8442 	error = VNOP_FSYNC(vp, flags, ctx);
8443 
8444 #if NAMEDRSRCFORK
8445 	/* Sync resource fork shadow file if necessary. */
8446 	if ((error == 0) &&
8447 	    (vp->v_flag & VISNAMEDSTREAM) &&
8448 	    (vp->v_parent != NULLVP) &&
8449 	    vnode_isshadow(vp) &&
8450 	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
8451 		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
8452 	}
8453 #endif
8454 
8455 	(void)vnode_put(vp);
8456 	file_drop(uap->fd);
8457 	return error;
8458 }
8459 
8460 /*
8461  * Duplicate files.  Source must be a file, target must be a file or
8462  * must not exist.
8463  *
8464  * XXX Copyfile authorisation checking is woefully inadequate, and will not
8465  *     perform inheritance correctly.
8466  */
8467 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Resolve the source; on success we hold an iocount on fvp. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/*
	 * Resolve the target for creation.  SAVESTART keeps the lookup's
	 * start directory referenced in tond.ni_startdir; it is released
	 * as 'sdvp' in the cleanup path below.
	 */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only permitted with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Neither source nor target may be a directory. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* Sockets are unsupported, except for fdesc-backed vnodes. */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Require: read on source, delete on existing target, add-file on dir. */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* Copying a file onto its own parent directory is invalid. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 *
	 * -1 is an internal sentinel, converted to success at the end.
	 */
	if (fvp == tvp) {
		error = -1;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	/* Only perform the copy if none of the checks above set an error. */
	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* Map the fvp == tvp sentinel to success for the caller. */
	if (error == -1) {
		return 0;
	}
	return error;
}
8574 
8575 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8576 
8577 /*
8578  * Helper function for doing clones. The caller is expected to provide an
8579  * iocounted source vnode and release it.
8580  */
/*
 * Clone 'fvp' to the path 'dst' (resolved relative to 'dst_dirfd').
 * 'data_read_authorised' indicates the caller already holds read access
 * to the source data (fclonefileat path), so that check is skipped.
 * Returns 0 on success or an errno value.
 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;		/* must free va.va_acl on exit */
	boolean_t attr_cleanup;		/* must call vn_attribute_cleanup on exit */
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted = 0;
	struct vnode_attr va;		/* attributes read from the source */
	struct vnode_attr nva;		/* attributes applied to the clone */
	uint32_t vnop_flags;

	/* Only regular files, symlinks and (non-root, non-mount) dirs clone. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
	/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Resolve the destination; WANTPARENT gives us tdvp as well. */
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* The destination must not already exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Cloning only works within a single mount. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	/* Add-file/add-subdirectory permission on the destination dir. */
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/* Read access on the source; data-read may be pre-authorised. */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACLs unless the CLONE_ACL
	 * flag is specified. By default, the clone file will inherit the target
	 * directory's ACLs unless the CLONE_ACL flag is specified then it
	 * will inherit the source file's ACLs instead.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	if (flags & CLONE_ACL) {
		VATTR_WANTED(&va, va_acl);
	}

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		/*
		 * NOTE(review): the error from vnode_authattr_new() is not
		 * checked here; it is overwritten by VNOP_CLONEFILE below.
		 * Verify this is intentional for the symlink path.
		 */
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		/* DATAVAULT/RESTRICTED flags come from the destination side. */
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int     update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&nva)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
		/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8810 
8811 /*
8812  * clone files or directories, target must not exist.
8813  */
8814 /* ARGSUSED */
8815 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8816 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8817     __unused int32_t *retval)
8818 {
8819 	vnode_t fvp;
8820 	struct nameidata fromnd;
8821 	int follow;
8822 	int error;
8823 	vfs_context_t ctx = vfs_context_current();
8824 
8825 	/* Check that the flags are valid. */
8826 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8827 		return EINVAL;
8828 	}
8829 
8830 	AUDIT_ARG(fd, uap->src_dirfd);
8831 
8832 	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8833 	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8834 	    UIO_USERSPACE, uap->src, ctx);
8835 	if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8836 		return error;
8837 	}
8838 
8839 	fvp = fromnd.ni_vp;
8840 	nameidone(&fromnd);
8841 
8842 	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8843 	    uap->flags, ctx);
8844 
8845 	vnode_put(fvp);
8846 	return error;
8847 }
8848 
8849 int
fclonefileat(__unused proc_t p,struct fclonefileat_args * uap,__unused int32_t * retval)8850 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
8851     __unused int32_t *retval)
8852 {
8853 	vnode_t fvp;
8854 	struct fileproc *fp;
8855 	int error;
8856 	vfs_context_t ctx = vfs_context_current();
8857 
8858 	/* Check that the flags are valid. */
8859 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8860 		return EINVAL;
8861 	}
8862 
8863 	AUDIT_ARG(fd, uap->src_fd);
8864 	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
8865 	if (error) {
8866 		return error;
8867 	}
8868 
8869 	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
8870 		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
8871 		error = EBADF;
8872 		goto out;
8873 	}
8874 
8875 	if ((error = vnode_getwithref(fvp))) {
8876 		goto out;
8877 	}
8878 
8879 	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
8880 
8881 	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
8882 	    uap->flags, ctx);
8883 
8884 	vnode_put(fvp);
8885 out:
8886 	file_drop(uap->src_fd);
8887 	return error;
8888 }
8889 
8890 static int
rename_submounts_callback(mount_t mp,void * arg)8891 rename_submounts_callback(mount_t mp, void *arg)
8892 {
8893 	int error = 0;
8894 	mount_t pmp = (mount_t)arg;
8895 	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8896 
8897 	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8898 		return 0;
8899 	}
8900 
8901 	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8902 		return 0;
8903 	}
8904 
8905 	if ((error = vfs_busy(mp, LK_NOWAIT))) {
8906 		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8907 		return -1;
8908 	}
8909 
8910 	size_t pathlen = MAXPATHLEN;
8911 	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8912 		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8913 	}
8914 
8915 	vfs_unbusy(mp);
8916 
8917 	return error;
8918 }
8919 
8920 /*
8921  * Rename files.  Source and destination must either both be directories,
8922  * or both not be directories.  If target is a directory, it must be empty.
8923  */
8924 /* ARGSUSED */
8925 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)8926 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8927     int tofd, user_addr_t to, int segflg, u_int uflags)
8928 {
8929 	vnode_t tvp, tdvp;
8930 	vnode_t fvp, fdvp;
8931 	vnode_t mnt_fvp;
8932 	struct nameidata *fromnd, *tond;
8933 	int error = 0;
8934 	int do_retry;
8935 	int retry_count;
8936 	int mntrename;
8937 	int need_event;
8938 	int need_kpath2;
8939 	int has_listeners;
8940 	const char *oname = NULL;
8941 	char *from_name = NULL, *to_name = NULL;
8942 	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8943 	int from_len = 0, to_len = 0;
8944 	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8945 	int holding_mntlock;
8946 	int vn_authorize_skipped;
8947 	mount_t locked_mp = NULL;
8948 	vnode_t oparent = NULLVP;
8949 #if CONFIG_FSE
8950 	fse_info from_finfo = {}, to_finfo;
8951 #endif
8952 	int from_truncated = 0, to_truncated = 0;
8953 	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8954 	int batched = 0;
8955 	struct vnode_attr *fvap, *tvap;
8956 	int continuing = 0;
8957 	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
8958 	int32_t nofollow_any = 0;
8959 	/* carving out a chunk for structs that are too big to be on stack. */
8960 	struct {
8961 		struct nameidata from_node, to_node;
8962 		struct vnode_attr fv_attr, tv_attr;
8963 	} * __rename_data;
8964 
8965 	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
8966 	fromnd = &__rename_data->from_node;
8967 	tond = &__rename_data->to_node;
8968 
8969 	holding_mntlock = 0;
8970 	do_retry = 0;
8971 	retry_count = 0;
8972 retry:
8973 	fvp = tvp = NULL;
8974 	fdvp = tdvp = NULL;
8975 	fvap = tvap = NULL;
8976 	mnt_fvp = NULLVP;
8977 	mntrename = FALSE;
8978 	vn_authorize_skipped = FALSE;
8979 
8980 	if (uflags & RENAME_NOFOLLOW_ANY) {
8981 		nofollow_any = NAMEI_NOFOLLOW_ANY;
8982 	}
8983 	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8984 	    segflg, from, ctx);
8985 	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8986 
8987 	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8988 	    segflg, to, ctx);
8989 	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8990 
8991 continue_lookup:
8992 	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8993 		if ((error = nameiat(fromnd, fromfd))) {
8994 			goto out1;
8995 		}
8996 		fdvp = fromnd->ni_dvp;
8997 		fvp  = fromnd->ni_vp;
8998 
8999 		if (fvp && fvp->v_type == VDIR) {
9000 			tond->ni_cnd.cn_flags |= WILLBEDIR;
9001 		}
9002 	}
9003 
9004 	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9005 		if ((error = nameiat(tond, tofd))) {
9006 			/*
9007 			 * Translate error code for rename("dir1", "dir2/.").
9008 			 */
9009 			if (error == EISDIR && fvp->v_type == VDIR) {
9010 				error = EINVAL;
9011 			}
9012 			goto out1;
9013 		}
9014 		tdvp = tond->ni_dvp;
9015 		tvp  = tond->ni_vp;
9016 	}
9017 
9018 #if DEVELOPMENT || DEBUG
9019 	/*
9020 	 * XXX VSWAP: Check for entitlements or special flag here
9021 	 * so we can restrict access appropriately.
9022 	 */
9023 #else /* DEVELOPMENT || DEBUG */
9024 
9025 	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
9026 		error = EPERM;
9027 		goto out1;
9028 	}
9029 
9030 	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
9031 		error = EPERM;
9032 		goto out1;
9033 	}
9034 #endif /* DEVELOPMENT || DEBUG */
9035 
9036 	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
9037 		error = ENOENT;
9038 		goto out1;
9039 	}
9040 
9041 	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
9042 		int32_t pval = 0;
9043 		int err = 0;
9044 
9045 		/*
9046 		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
9047 		 * has the same name as target iff the following conditions are met:
9048 		 * 1. the target file system is case insensitive
9049 		 * 2. source and target directories are the same
9050 		 * 3. source and target files are the same
9051 		 * 4. name only differs in case (determined by underlying filesystem)
9052 		 */
9053 		if (fvp != tvp || fdvp != tdvp) {
9054 			error = EEXIST;
9055 			goto out1;
9056 		}
9057 
9058 		/*
9059 		 * Assume that the target file system is case sensitive if
9060 		 * _PC_CASE_SENSITIVE selector isn't supported.
9061 		 */
9062 		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
9063 		if (err != 0 || pval != 0) {
9064 			error = EEXIST;
9065 			goto out1;
9066 		}
9067 	}
9068 
9069 	batched = vnode_compound_rename_available(fdvp);
9070 
9071 #if CONFIG_FSE
9072 	need_event = need_fsevent(FSE_RENAME, fdvp);
9073 	if (need_event) {
9074 		if (fvp) {
9075 			get_fse_info(fvp, &from_finfo, ctx);
9076 		} else {
9077 			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
9078 			if (error) {
9079 				goto out1;
9080 			}
9081 
9082 			fvap = &__rename_data->fv_attr;
9083 		}
9084 
9085 		if (tvp) {
9086 			get_fse_info(tvp, &to_finfo, ctx);
9087 		} else if (batched) {
9088 			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
9089 			if (error) {
9090 				goto out1;
9091 			}
9092 
9093 			tvap = &__rename_data->tv_attr;
9094 		}
9095 	}
9096 #else
9097 	need_event = 0;
9098 #endif /* CONFIG_FSE */
9099 
9100 	has_listeners = kauth_authorize_fileop_has_listeners();
9101 
9102 	need_kpath2 = 0;
9103 #if CONFIG_AUDIT
9104 	if (AUDIT_RECORD_EXISTS()) {
9105 		need_kpath2 = 1;
9106 	}
9107 #endif
9108 
9109 	if (need_event || has_listeners) {
9110 		if (from_name == NULL) {
9111 			GET_PATH(from_name);
9112 		}
9113 
9114 		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
9115 
9116 		if (from_name_no_firmlink == NULL) {
9117 			GET_PATH(from_name_no_firmlink);
9118 		}
9119 
9120 		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
9121 	}
9122 
9123 	if (need_event || need_kpath2 || has_listeners) {
9124 		if (to_name == NULL) {
9125 			GET_PATH(to_name);
9126 		}
9127 
9128 		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
9129 
9130 		if (to_name_no_firmlink == NULL) {
9131 			GET_PATH(to_name_no_firmlink);
9132 		}
9133 
9134 		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
9135 		if (to_name && need_kpath2) {
9136 			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9137 		}
9138 	}
9139 	if (!fvp) {
9140 		/*
9141 		 * Claim: this check will never reject a valid rename.
9142 		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9143 		 * Suppose fdvp and tdvp are not on the same mount.
9144 		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
9145 		 *      then you can't move it to within another dir on the same mountpoint.
9146 		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9147 		 *
9148 		 * If this check passes, then we are safe to pass these vnodes to the same FS.
9149 		 */
9150 		if (fdvp->v_mount != tdvp->v_mount) {
9151 			error = EXDEV;
9152 			goto out1;
9153 		}
9154 		goto skipped_lookup;
9155 	}
9156 
9157 	/*
9158 	 * If the source and destination are the same (i.e. they're
9159 	 * links to the same vnode) and the target file system is
9160 	 * case sensitive, then there is nothing to do.
9161 	 *
9162 	 * XXX Come back to this.
9163 	 */
9164 	if (fvp == tvp) {
9165 		int pathconf_val;
9166 
9167 		/*
9168 		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9169 		 * then assume that this file system is case sensitive.
9170 		 */
9171 		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9172 		    pathconf_val != 0) {
9173 			vn_authorize_skipped = TRUE;
9174 			goto out1;
9175 		}
9176 	}
9177 
9178 	/*
9179 	 * Allow the renaming of mount points.
9180 	 * - target must not exist
9181 	 * - target must reside in the same directory as source
9182 	 * - union mounts cannot be renamed
9183 	 * - the root fs, and tightly-linked system volumes, cannot be renamed
9184 	 *
9185 	 * XXX Handle this in VFS after a continued lookup (if we missed
9186 	 * in the cache to start off)
9187 	 *
9188 	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9189 	 * we'll skip past here.  The file system is responsible for
9190 	 * checking that @tvp is not a descendent of @fvp and vice versa
9191 	 * so it should always return EINVAL if either @tvp or @fvp is the
9192 	 * root of a volume.
9193 	 */
9194 	if ((fvp->v_flag & VROOT) &&
9195 	    (fvp->v_type == VDIR) &&
9196 	    (tvp == NULL) &&
9197 	    (fvp->v_mountedhere == NULL) &&
9198 	    (fdvp == tdvp) &&
9199 	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9200 	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9201 	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9202 		vnode_t coveredvp;
9203 
9204 		/* switch fvp to the covered vnode */
9205 		coveredvp = fvp->v_mount->mnt_vnodecovered;
9206 		if ((vnode_getwithref(coveredvp))) {
9207 			error = ENOENT;
9208 			goto out1;
9209 		}
9210 		/*
9211 		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9212 		 * later.
9213 		 */
9214 		mnt_fvp = fvp;
9215 
9216 		fvp = coveredvp;
9217 		mntrename = TRUE;
9218 	}
9219 	/*
9220 	 * Check for cross-device rename.
9221 	 */
9222 	if ((fvp->v_mount != tdvp->v_mount) ||
9223 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
9224 		error = EXDEV;
9225 		goto out1;
9226 	}
9227 
9228 	/*
9229 	 * If source is the same as the destination (that is the
9230 	 * same inode number) then there is nothing to do...
9231 	 * EXCEPT if the underlying file system supports case
9232 	 * insensitivity and is case preserving.  In this case
9233 	 * the file system needs to handle the special case of
9234 	 * getting the same vnode as target (fvp) and source (tvp).
9235 	 *
9236 	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9237 	 * and _PC_CASE_PRESERVING can have this exception, and they need to
9238 	 * handle the special case of getting the same vnode as target and
9239 	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
9240 	 * so not to cause locking problems. There is a single reference on tvp.
9241 	 *
9242 	 * NOTE - that fvp == tvp also occurs if they are hard linked and
9243 	 * that correct behaviour then is just to return success without doing
9244 	 * anything.
9245 	 *
9246 	 * XXX filesystem should take care of this itself, perhaps...
9247 	 */
9248 	if (fvp == tvp && fdvp == tdvp) {
9249 		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9250 		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
9251 		    fromnd->ni_cnd.cn_namelen)) {
9252 			vn_authorize_skipped = TRUE;
9253 			goto out1;
9254 		}
9255 	}
9256 
9257 	if (holding_mntlock && fvp->v_mount != locked_mp) {
9258 		/*
9259 		 * we're holding a reference and lock
9260 		 * on locked_mp, but it no longer matches
9261 		 * what we want to do... so drop our hold
9262 		 */
9263 		mount_unlock_renames(locked_mp);
9264 		mount_drop(locked_mp, 0);
9265 		holding_mntlock = 0;
9266 	}
9267 	if (tdvp != fdvp && fvp->v_type == VDIR) {
9268 		/*
9269 		 * serialize renames that re-shape
9270 		 * the tree... if holding_mntlock is
9271 		 * set, then we're ready to go...
9272 		 * otherwise we
9273 		 * first need to drop the iocounts
9274 		 * we picked up, second take the
9275 		 * lock to serialize the access,
9276 		 * then finally start the lookup
9277 		 * process over with the lock held
9278 		 */
9279 		if (!holding_mntlock) {
9280 			/*
9281 			 * need to grab a reference on
9282 			 * the mount point before we
9283 			 * drop all the iocounts... once
9284 			 * the iocounts are gone, the mount
9285 			 * could follow
9286 			 */
9287 			locked_mp = fvp->v_mount;
9288 			mount_ref(locked_mp, 0);
9289 
9290 			/*
9291 			 * nameidone has to happen before we vnode_put(tvp)
9292 			 * since it may need to release the fs_nodelock on the tvp
9293 			 */
9294 			nameidone(tond);
9295 
9296 			if (tvp) {
9297 				vnode_put(tvp);
9298 			}
9299 			vnode_put(tdvp);
9300 
9301 			/*
9302 			 * nameidone has to happen before we vnode_put(fdvp)
9303 			 * since it may need to release the fs_nodelock on the fvp
9304 			 */
9305 			nameidone(fromnd);
9306 
9307 			vnode_put(fvp);
9308 			vnode_put(fdvp);
9309 
9310 			if (mnt_fvp != NULLVP) {
9311 				vnode_put(mnt_fvp);
9312 			}
9313 
9314 			mount_lock_renames(locked_mp);
9315 			holding_mntlock = 1;
9316 
9317 			goto retry;
9318 		}
9319 	} else {
9320 		/*
9321 		 * when we dropped the iocounts to take
9322 		 * the lock, we allowed the identity of
9323 		 * the various vnodes to change... if they did,
9324 		 * we may no longer be dealing with a rename
9325 		 * that reshapes the tree... once we're holding
9326 		 * the iocounts, the vnodes can't change type
9327 		 * so we're free to drop the lock at this point
9328 		 * and continue on
9329 		 */
9330 		if (holding_mntlock) {
9331 			mount_unlock_renames(locked_mp);
9332 			mount_drop(locked_mp, 0);
9333 			holding_mntlock = 0;
9334 		}
9335 	}
9336 
9337 	if (!batched) {
9338 		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
9339 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9340 		    flags, NULL);
9341 		if (error) {
9342 			if (error == ENOENT) {
9343 				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9344 					/*
9345 					 * We encountered a race where after doing the namei,
9346 					 * tvp stops being valid. If so, simply re-drive the rename
9347 					 * call from the top.
9348 					 */
9349 					do_retry = 1;
9350 					retry_count += 1;
9351 				}
9352 			}
9353 			goto out1;
9354 		}
9355 	}
9356 
9357 	/* Release the 'mnt_fvp' now that it is no longer needed. */
9358 	if (mnt_fvp != NULLVP) {
9359 		vnode_put(mnt_fvp);
9360 		mnt_fvp = NULLVP;
9361 	}
9362 
9363 	// save these off so we can later verify that fvp is the same
9364 	oname   = fvp->v_name;
9365 	oparent = fvp->v_parent;
9366 
9367 skipped_lookup:
9368 #if CONFIG_FILE_LEASES
9369 	/* Lease break needed for source's parent dir? */
9370 	vnode_breakdirlease(fdvp, false, O_WRONLY);
9371 
9372 	/* Lease break needed for target's parent dir? */
9373 	vnode_breakdirlease(tdvp, false, O_WRONLY);
9374 #endif
9375 
9376 	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
9377 	    tdvp, &tvp, &tond->ni_cnd, tvap,
9378 	    flags, ctx);
9379 
9380 	if (holding_mntlock) {
9381 		/*
9382 		 * we can drop our serialization
9383 		 * lock now
9384 		 */
9385 		mount_unlock_renames(locked_mp);
9386 		mount_drop(locked_mp, 0);
9387 		holding_mntlock = 0;
9388 	}
9389 	if (error) {
9390 		if (error == EDATALESS) {
9391 			/*
9392 			 * If we've been here before, something has gone
9393 			 * horribly wrong and we should just get out lest
9394 			 * we spiral around the drain forever.
9395 			 */
9396 			if (flags & VFS_RENAME_DATALESS) {
9397 				error = EIO;
9398 				goto out1;
9399 			}
9400 
9401 			/*
9402 			 * The object we're renaming is dataless (or has a
9403 			 * dataless descendent) and requires materialization
9404 			 * before the rename occurs.  But we're holding the
9405 			 * mount point's rename lock, so it's not safe to
9406 			 * make the upcall.
9407 			 *
9408 			 * In this case, we release the lock, perform the
9409 			 * materialization, and start the whole thing over.
9410 			 */
9411 			error = vnode_materialize_dataless_file(fvp,
9412 			    NAMESPACE_HANDLER_RENAME_OP);
9413 
9414 			if (error == 0) {
9415 				/*
9416 				 * The next time around we need to tell the
9417 				 * file system that the materializtaion has
9418 				 * been performed.
9419 				 */
9420 				flags |= VFS_RENAME_DATALESS;
9421 				do_retry = 1;
9422 			}
9423 			goto out1;
9424 		}
9425 		if (error == EKEEPLOOKING) {
9426 			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9427 				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9428 					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
9429 				}
9430 			}
9431 
9432 			fromnd->ni_vp = fvp;
9433 			tond->ni_vp = tvp;
9434 
9435 			goto continue_lookup;
9436 		}
9437 
9438 		/*
9439 		 * We may encounter a race in the VNOP where the destination didn't
9440 		 * exist when we did the namei, but it does by the time we go and
9441 		 * try to create the entry. In this case, we should re-drive this rename
9442 		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
9443 		 * but other filesystems susceptible to this race could return it, too.
9444 		 */
9445 		if (error == ERECYCLE) {
9446 			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
9447 				do_retry = 1;
9448 				retry_count += 1;
9449 			} else {
9450 				printf("rename retry limit due to ERECYCLE reached\n");
9451 				error = ENOENT;
9452 			}
9453 		}
9454 
9455 		/*
9456 		 * For compound VNOPs, the authorization callback may return
9457 		 * ENOENT in case of racing hardlink lookups hitting the name
9458 		 * cache, redrive the lookup.
9459 		 */
9460 		if (batched && error == ENOENT) {
9461 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9462 				do_retry = 1;
9463 				retry_count += 1;
9464 			}
9465 		}
9466 
9467 		goto out1;
9468 	}
9469 
9470 	/* call out to allow 3rd party notification of rename.
9471 	 * Ignore result of kauth_authorize_fileop call.
9472 	 */
9473 	kauth_authorize_fileop(vfs_context_ucred(ctx),
9474 	    KAUTH_FILEOP_RENAME,
9475 	    (uintptr_t)from_name, (uintptr_t)to_name);
9476 	if (flags & VFS_RENAME_SWAP) {
9477 		kauth_authorize_fileop(vfs_context_ucred(ctx),
9478 		    KAUTH_FILEOP_RENAME,
9479 		    (uintptr_t)to_name, (uintptr_t)from_name);
9480 	}
9481 
9482 #if CONFIG_FSE
9483 	if (from_name != NULL && to_name != NULL) {
9484 		if (from_truncated || to_truncated) {
9485 			// set it here since only the from_finfo gets reported up to user space
9486 			from_finfo.mode |= FSE_TRUNCATED_PATH;
9487 		}
9488 
9489 		if (tvap && tvp) {
9490 			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
9491 		}
9492 		if (fvap) {
9493 			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
9494 		}
9495 
9496 		if (tvp) {
9497 			add_fsevent(FSE_RENAME, ctx,
9498 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9499 			    FSE_ARG_FINFO, &from_finfo,
9500 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9501 			    FSE_ARG_FINFO, &to_finfo,
9502 			    FSE_ARG_DONE);
9503 			if (flags & VFS_RENAME_SWAP) {
9504 				/*
9505 				 * Strictly speaking, swap is the equivalent of
9506 				 * *three* renames.  FSEvents clients should only take
9507 				 * the events as a hint, so we only bother reporting
9508 				 * two.
9509 				 */
9510 				add_fsevent(FSE_RENAME, ctx,
9511 				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9512 				    FSE_ARG_FINFO, &to_finfo,
9513 				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9514 				    FSE_ARG_FINFO, &from_finfo,
9515 				    FSE_ARG_DONE);
9516 			}
9517 		} else {
9518 			add_fsevent(FSE_RENAME, ctx,
9519 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9520 			    FSE_ARG_FINFO, &from_finfo,
9521 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9522 			    FSE_ARG_DONE);
9523 		}
9524 	}
9525 #endif /* CONFIG_FSE */
9526 
9527 	/*
9528 	 * update filesystem's mount point data
9529 	 */
9530 	if (mntrename) {
9531 		char *cp, *pathend, *mpname;
9532 		char * tobuf;
9533 		struct mount *mp;
9534 		int maxlen;
9535 		size_t len = 0;
9536 
9537 		mp = fvp->v_mountedhere;
9538 
9539 		if (vfs_busy(mp, LK_NOWAIT)) {
9540 			error = EBUSY;
9541 			goto out1;
9542 		}
9543 		tobuf = zalloc(ZV_NAMEI);
9544 
9545 		if (UIO_SEG_IS_USER_SPACE(segflg)) {
9546 			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
9547 		} else {
9548 			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
9549 		}
9550 		if (!error) {
9551 			/* find current mount point prefix */
9552 			pathend = &mp->mnt_vfsstat.f_mntonname[0];
9553 			for (cp = pathend; *cp != '\0'; ++cp) {
9554 				if (*cp == '/') {
9555 					pathend = cp + 1;
9556 				}
9557 			}
9558 			/* find last component of target name */
9559 			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
9560 				if (*cp == '/') {
9561 					mpname = cp + 1;
9562 				}
9563 			}
9564 
9565 			/* Update f_mntonname of sub mounts */
9566 			vfs_iterate(0, rename_submounts_callback, (void *)mp);
9567 
9568 			/* append name to prefix */
9569 			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
9570 			bzero(pathend, maxlen);
9571 
9572 			strlcpy(pathend, mpname, maxlen);
9573 		}
9574 		zfree(ZV_NAMEI, tobuf);
9575 
9576 		vfs_unbusy(mp);
9577 
9578 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
9579 	}
9580 	/*
9581 	 * fix up name & parent pointers.  note that we first
9582 	 * check that fvp has the same name/parent pointers it
9583 	 * had before the rename call... this is a 'weak' check
9584 	 * at best...
9585 	 *
9586 	 * XXX oparent and oname may not be set in the compound vnop case
9587 	 */
9588 	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
9589 		int update_flags;
9590 
9591 		update_flags = VNODE_UPDATE_NAME;
9592 
9593 		if (fdvp != tdvp) {
9594 			update_flags |= VNODE_UPDATE_PARENT;
9595 		}
9596 
9597 		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
9598 	}
9599 out1:
9600 	/*
9601 	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
9602 	 * skipped earlier as no actual rename was performed.
9603 	 */
9604 	if (vn_authorize_skipped && error == 0) {
9605 		error = vn_authorize_renamex_with_paths(fdvp, fvp,
9606 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9607 		    flags, NULL);
9608 		if (error && error == ENOENT) {
9609 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9610 				do_retry = 1;
9611 				retry_count += 1;
9612 			}
9613 		}
9614 	}
9615 	if (to_name != NULL) {
9616 		RELEASE_PATH(to_name);
9617 		to_name = NULL;
9618 	}
9619 	if (to_name_no_firmlink != NULL) {
9620 		RELEASE_PATH(to_name_no_firmlink);
9621 		to_name_no_firmlink = NULL;
9622 	}
9623 	if (from_name != NULL) {
9624 		RELEASE_PATH(from_name);
9625 		from_name = NULL;
9626 	}
9627 	if (from_name_no_firmlink != NULL) {
9628 		RELEASE_PATH(from_name_no_firmlink);
9629 		from_name_no_firmlink = NULL;
9630 	}
9631 	if (holding_mntlock) {
9632 		mount_unlock_renames(locked_mp);
9633 		mount_drop(locked_mp, 0);
9634 		holding_mntlock = 0;
9635 	}
9636 	if (tdvp) {
9637 		/*
9638 		 * nameidone has to happen before we vnode_put(tdvp)
9639 		 * since it may need to release the fs_nodelock on the tdvp
9640 		 */
9641 		nameidone(tond);
9642 
9643 		if (tvp) {
9644 			vnode_put(tvp);
9645 		}
9646 		vnode_put(tdvp);
9647 	}
9648 	if (fdvp) {
9649 		/*
9650 		 * nameidone has to happen before we vnode_put(fdvp)
9651 		 * since it may need to release the fs_nodelock on the fdvp
9652 		 */
9653 		nameidone(fromnd);
9654 
9655 		if (fvp) {
9656 			vnode_put(fvp);
9657 		}
9658 		vnode_put(fdvp);
9659 	}
9660 	if (mnt_fvp != NULLVP) {
9661 		vnode_put(mnt_fvp);
9662 	}
9663 	/*
9664 	 * If things changed after we did the namei, then we will re-drive
9665 	 * this rename call from the top.
9666 	 */
9667 	if (do_retry) {
9668 		do_retry = 0;
9669 		goto retry;
9670 	}
9671 
9672 	kfree_type(typeof(*__rename_data), __rename_data);
9673 	return error;
9674 }
9675 
int
rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
{
	/*
	 * rename(2): rename uap->from to uap->to.  Both paths are resolved
	 * relative to the current working directory (AT_FDCWD) and no
	 * extended rename flags are passed.
	 */
	return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
	           AT_FDCWD, uap->to, UIO_USERSPACE, 0);
}
9682 
9683 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9684 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9685 {
9686 	if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9687 		return EINVAL;
9688 	}
9689 
9690 	if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9691 		return EINVAL;
9692 	}
9693 
9694 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9695 	           uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9696 }
9697 
int
renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
{
	/*
	 * renameat(2): like rename(2), but each path is resolved relative to
	 * its respective directory file descriptor.  No extended flags.
	 */
	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
	           uap->tofd, uap->to, UIO_USERSPACE, 0);
}
9704 
9705 /*
9706  * Make a directory file.
9707  *
9708  * Returns:	0			Success
9709  *		EEXIST
9710  *	namei:???
9711  *	vnode_authorize:???
9712  *	vn_create:???
9713  */
9714 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/*
	 * Set up a CREATE lookup for the new directory; WILLBEDIR tells the
	 * lookup the last component will be a directory, and
	 * NAMEI_COMPOUNDMKDIR allows a filesystem that supports the compound
	 * mkdir VNOP to perform lookup+mkdir in one operation.
	 */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* If the lookup returned an existing vnode, the target already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any directory lease on the parent before modifying it. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		/*
		 * EKEEPLOOKING: the compound VNOP needs the lookup continued;
		 * re-drive nameiat() with the state left in nd.
		 */
		if (error == EKEEPLOOKING) {
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9830 
9831 /*
9832  * mkdir_extended: Create a directory; with extended security (ACL).
9833  *
9834  * Parameters:    p                       Process requesting to create the directory
9835  *                uap                     User argument descriptor (see below)
9836  *                retval                  (ignored)
9837  *
9838  * Indirect:      uap->path               Path of directory to create
9839  *                uap->mode               Access permissions to set
9840  *                uap->xsecurity          ACL to set
9841  *
9842  * Returns:        0                      Success
9843  *                !0                      Not success
9844  *
9845  */
9846 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9847 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9848 {
9849 	int ciferror;
9850 	kauth_filesec_t xsecdst;
9851 	struct vnode_attr va;
9852 
9853 	AUDIT_ARG(owner, uap->uid, uap->gid);
9854 
9855 	xsecdst = NULL;
9856 	if ((uap->xsecurity != USER_ADDR_NULL) &&
9857 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9858 		return ciferror;
9859 	}
9860 
9861 	VATTR_INIT(&va);
9862 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9863 	if (xsecdst != NULL) {
9864 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9865 		va.va_vaflags |= VA_FILESEC_ACL;
9866 	}
9867 
9868 	ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9869 	    UIO_USERSPACE);
9870 	if (xsecdst != NULL) {
9871 		kauth_filesec_free(xsecdst);
9872 	}
9873 	return ciferror;
9874 }
9875 
int
mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;

	/*
	 * mkdir(2): create a directory at uap->path (relative to the CWD)
	 * with the requested mode masked by the process umask.
	 */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);

	return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
	           UIO_USERSPACE);
}
9887 
int
mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;

	/*
	 * mkdirat(2): like mkdir(2), but uap->path is resolved relative to
	 * the directory file descriptor uap->fd.
	 */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);

	return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
	           UIO_USERSPACE);
}
9899 
/*
 * Common implementation for rmdir(2)/unlinkat(2)-with-AT_REMOVEDIR.
 * Looks up 'dirpath' (relative to 'fd'), authorizes and performs the
 * directory removal, generating fsevents/kauth notifications as needed.
 * May restart internally (see the do/while loop) to handle orphaned
 * AppleDouble files and racing compound-VNOP lookups.
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated to keep the large nameidata off the kernel stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char     *path = NULL;
	char     *no_firmlink_path = NULL;
	int       len_path = 0;
	int       len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;
	int batched;

	int restart_flag;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Swap-backing vnodes may only be removed by the kernel itself. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete.  Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					/*
					 * ENOENT from authorization can be a transient race;
					 * re-drive the lookup a bounded number of times.
					 */
					if (error == ENOENT) {
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: the filesystem will do lookup+rmdir as a compound VNOP. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info  finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Compound case: ask the FS which attrs to capture post-op. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			/* Build the paths (with and without firmlinks) for notification. */
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Break any directory lease on the parent before modifying it. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * If this fails, we want to keep the original
			 * error.
			 */
			if (vn_remove(dvp, &vp, ndp,
			    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
				error = 0;
			}
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		/*
		 * On restart, sleep briefly on vp's address so competing
		 * removers of the same directory take turns; otherwise wake
		 * one waiter and finish.
		 */
		if (restart_flag == 0) {
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10181 
10182 /*
10183  * Remove a directory file.
10184  */
10185 /* ARGSUSED */
int
rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
{
	/*
	 * rmdir(2): remove the directory at uap->path, resolved relative to
	 * the current working directory, with no extra unlink flags.
	 */
	return rmdirat_internal(vfs_context_current(), AT_FDCWD,
	           CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
}
10192 
/* Get direntry length padded to 8 byte alignment (d_name sized to namlen) */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/* Get dirent length padded to 4 byte alignment (d_name sized to namelen+NUL) */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent (address of its last byte, per d_reclen) */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10204 
10205 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10206 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10207     int *numdirent, vfs_context_t ctxp)
10208 {
10209 	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
10210 	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10211 	    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10212 		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10213 	} else {
10214 		size_t bufsize;
10215 		void * bufptr;
10216 		uio_t auio;
10217 		struct direntry *entry64;
10218 		struct dirent *dep;
10219 		size_t bytesread;
10220 		int error;
10221 
10222 		/*
10223 		 * We're here because the underlying file system does not
10224 		 * support direnties or we mounted denying support so we must
10225 		 * fall back to dirents and convert them to direntries.
10226 		 *
10227 		 * Our kernel buffer needs to be smaller since re-packing will
10228 		 * expand each dirent.  The worse case (when the name length
10229 		 * is 3 or less) corresponds to a struct direntry size of 32
10230 		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10231 		 * (4-byte aligned).  So having a buffer that is 3/8 the size
10232 		 * will prevent us from reading more than we can pack.
10233 		 *
10234 		 * Since this buffer is wired memory, we will limit the
10235 		 * buffer size to a maximum of 32K. We would really like to
10236 		 * use 32K in the MIN(), but we use magic number 87371 to
10237 		 * prevent uio_resid() * 3 / 8 from overflowing.
10238 		 */
10239 		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10240 		bufptr = kalloc_data(bufsize, Z_WAITOK);
10241 		if (bufptr == NULL) {
10242 			return ENOMEM;
10243 		}
10244 
10245 		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10246 		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10247 		auio->uio_offset = uio->uio_offset;
10248 
10249 		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10250 
10251 		dep = (struct dirent *)bufptr;
10252 		bytesread = bufsize - uio_resid(auio);
10253 
10254 		entry64 = kalloc_type(struct direntry, Z_WAITOK);
10255 		/*
10256 		 * Convert all the entries and copy them out to user's buffer.
10257 		 */
10258 		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10259 			/* First check that the dirent struct up to d_name is within the buffer */
10260 			if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10261 			    /* Check that the length of the entire dirent is within the buffer */
10262 			    DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10263 			    /* Check that the actual length including the name doesn't exceed d_reclen */
10264 			    DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10265 				printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10266 				    vp->v_mount->mnt_vfsstat.f_mntonname,
10267 				    vp->v_name ? vp->v_name : "<unknown>");
10268 				error = EIO;
10269 				break;
10270 			}
10271 
10272 			size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
10273 
10274 			bzero(entry64, enbufsize);
10275 			/* Convert a dirent to a dirent64. */
10276 			entry64->d_ino = dep->d_ino;
10277 			entry64->d_seekoff = 0;
10278 			entry64->d_reclen = (uint16_t)enbufsize;
10279 			entry64->d_namlen = dep->d_namlen;
10280 			entry64->d_type = dep->d_type;
10281 			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10282 
10283 			/* Move to next entry. */
10284 			dep = (struct dirent *)((char *)dep + dep->d_reclen);
10285 
10286 			/* Copy entry64 to user's buffer. */
10287 			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10288 		}
10289 
10290 		/* Update the real offset using the offset we got from VNOP_READDIR. */
10291 		if (error == 0) {
10292 			uio->uio_offset = auio->uio_offset;
10293 		}
10294 		uio_free(auio);
10295 		kfree_data(bufptr, bufsize);
10296 		kfree_type(struct direntry, entry64);
10297 		return error;
10298 	}
10299 }
10300 
10301 #define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)
10302 
10303 /*
10304  * Read a block of directory entries in a file system independent format.
10305  */
/*
 * Common back end for the getdirentries syscalls: read up to 'bufsize'
 * bytes of directory entries from the directory open on 'fd' into the
 * user buffer 'bufp', advancing the file offset.  Reports bytes read,
 * the starting offset, and EOF status through the out parameters.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Take the per-file offset lock, then verify the fd still refers to
	 * the same vnode; it can be swapped (e.g. by the union-mount
	 * traversal below, which calls fp_set_data()).  If it changed,
	 * drop everything and retry from the fd.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The descriptor must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read starting at the file's current offset. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If nothing was read and this is a union mount, descend to the
	 * lower layer and continue reading from it, replacing the fd's
	 * vnode with the lower vnode.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10419 
10420 
10421 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10422 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10423 {
10424 	off_t offset;
10425 	ssize_t bytesread;
10426 	int error, eofflag;
10427 
10428 	AUDIT_ARG(fd, uap->fd);
10429 	error = getdirentries_common(uap->fd, uap->buf, uap->count,
10430 	    &bytesread, &offset, &eofflag, 0);
10431 
10432 	if (error == 0) {
10433 		if (proc_is64bit(p)) {
10434 			user64_long_t base = (user64_long_t)offset;
10435 			error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10436 		} else {
10437 			user32_long_t base = (user32_long_t)offset;
10438 			error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10439 		}
10440 		*retval = (int)bytesread;
10441 	}
10442 	return error;
10443 }
10444 
10445 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10446 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10447 {
10448 	off_t offset;
10449 	ssize_t bytesread;
10450 	int error, eofflag;
10451 	user_size_t bufsize;
10452 
10453 	AUDIT_ARG(fd, uap->fd);
10454 
10455 	/*
10456 	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10457 	 * then the kernel carves out the last 4 bytes to return extended
10458 	 * information to userspace (namely whether we reached EOF with this call).
10459 	 */
10460 	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10461 		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10462 	} else {
10463 		bufsize = uap->bufsize;
10464 	}
10465 
10466 	error = getdirentries_common(uap->fd, uap->buf, bufsize,
10467 	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10468 
10469 	if (error == 0) {
10470 		*retval = bytesread;
10471 		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10472 
10473 		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10474 			getdirentries64_flags_t flags = 0;
10475 			if (eofflag) {
10476 				flags |= GETDIRENTRIES64_EOF;
10477 			}
10478 			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10479 			    sizeof(flags));
10480 		}
10481 	}
10482 	return error;
10483 }
10484 
10485 
10486 /*
10487  * Set the mode mask for creation of filesystem nodes.
10488  * XXX implement xsecurity
10489  */
10490 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
10491 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10492 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10493 {
10494 	AUDIT_ARG(mask, newmask);
10495 	proc_fdlock(p);
10496 	*retval = p->p_fd.fd_cmask;
10497 	p->p_fd.fd_cmask = newmask & ALLPERMS;
10498 	proc_fdunlock(p);
10499 	return 0;
10500 }
10501 
10502 /*
10503  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10504  *
10505  * Parameters:    p                       Process requesting to set the umask
10506  *                uap                     User argument descriptor (see below)
10507  *                retval                  umask of the process (parameter p)
10508  *
10509  * Indirect:      uap->newmask            umask to set
10510  *                uap->xsecurity          ACL to set
10511  *
10512  * Returns:        0                      Success
10513  *                !0                      Not success
10514  *
10515  */
int
umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
{
	/* uap->xsecurity is not consumed here; umask1() ignores its fsec arg. */
	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
}
10521 
/*
 * umask: set the file creation mode mask, returning the previous mask
 * through *retval; existing extended security is left untouched.
 */
int
umask(proc_t p, struct umask_args *uap, int32_t *retval)
{
	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
}
10527 
10528 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT                               \
10529 	"com.apple.private.vfs.revoke-mounted-device"
10530 
10531 /*
10532  * Void all references to file by ripping underlying filesystem
10533  * away from vnode.
10534  */
10535 /* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Only character and block special files may be revoked. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to revoke a block device that currently backs a mount. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Caller must own the node or pass the superuser check. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only issue the revoke if someone actually holds the node open. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10588 
10589 
10590 /*
 *  HFS/HFS Plus SPECIFIC SYSTEM CALLS
10592  *  The following system calls are designed to support features
10593  *  which are specific to the HFS & HFS Plus volume formats
10594  */
10595 
10596 
10597 /*
10598  * Obtain attribute information on objects in a directory while enumerating
10599  * the directory.
10600  */
10601 /* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the caller's count; it is restored on union-mount descent. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Take the offset lock and re-check that the fd still maps to the
	 * vnode we resolved; a concurrent union-mount descent may have
	 * replaced the fd's backing vnode, in which case retry.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else {                                                // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* on success *retval carries the eofflag (0 or 1) */
} /* end of getdirentriesattr system call */
10765 
10766 /*
10767  * Exchange data between two files
10768  */
10769 
10770 /* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int   flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* Resolve the first path. */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/* Resolve the second path. */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* Caller needs read and write access to both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Capture both paths up front if anyone (fsevents or fileop
	 * listeners) will want notification afterward.
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}
		/*
		 * The filesystem exchanged the files' contents, so swap the
		 * cached names (and parents, if different) to match.
		 */
		name_cache_lock();

		tmpname     = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp           = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10921 
10922 /*
10923  * Return (in MB) the amount of freespace on the given vnode's volume.
10924  */
10925 uint32_t freespace_mb(vnode_t vp);
10926 
10927 uint32_t
freespace_mb(vnode_t vp)10928 freespace_mb(vnode_t vp)
10929 {
10930 	vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10931 	return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10932 	       vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10933 }
10934 
10935 #if CONFIG_SEARCHFS
10936 
10937 /* ARGSUSED */
10938 
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	size_t mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to go into local memory and it's not that big so we might as well  put it all together. */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block.                                                                                             */
	/*												      */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
	/*       assumes the size is still 556 bytes it will continue to work				      */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		/* The referenced name must lie entirely within searchparams1. */
		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * All right, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (uint32_t)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (uint32_t)uap->scriptcode,
	    (uint32_t)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;        // search next layer down
		fserror = EAGAIN;
	}
#endif /* CONFIG_UNION_MOUNTS */

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 *  search state.  Everything was already put into the return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	/* Propagate the filesystem's verdict (0, EAGAIN, or a real error). */
	error = fserror;

freeandexit:

	kfree_data(searchparams1, mallocsize);

	return error;
} /* end of searchfs system call */
11221 
11222 #else /* CONFIG_SEARCHFS */
11223 
/* searchfs is not supported in this kernel configuration. */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11229 
11230 #endif /* CONFIG_SEARCHFS */
11231 
11232 
11233 #if CONFIG_DATALESS_FILES
11234 
11235 /*
11236  * === Namespace Resolver Up-call Mechanism ===
11237  *
11238  * When I/O is performed to a dataless file or directory (read, write,
11239  * lookup-in, etc.), the file system performs an upcall to the namespace
11240  * resolver (filecoordinationd) to materialize the object.
11241  *
11242  * We need multiple up-calls to be in flight at once, and we need these
11243  * up-calls to be interruptible, thus the following implementation:
11244  *
11245  * => The nspace_resolver_request represents the in-kernel request state.
11246  *    It contains a request ID, storage space for the errno code returned
11247  *    by filecoordinationd, and flags.
11248  *
11249  * => The request ID is simply a global monotonically incrementing 32-bit
11250  *    number.  Outstanding requests are stored in a hash table, and the
11251  *    hash function is extremely simple.
11252  *
11253  * => When an upcall is to be made to filecoordinationd, a request structure
11254  *    is allocated on the stack (it is small, and needs to live only during
11255  *    the duration of the call to resolve_nspace_item_ext()).  It is
11256  *    initialized and inserted into the table.  Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11258  *    can be inserted into the table (and thus limiting the number of
11259  *    outstanding requests issued to filecoordinationd); waiting for an
11260  *    available slot is interruptible.
11261  *
11262  * => Once the request has been inserted into the table, the up-call is made
11263  *    to filecoordinationd via a MiG-generated stub.  The up-call returns
11264  *    immediately and filecoordinationd processes the request asynchronously.
11265  *
 * => The caller now waits for the request to complete.  This is achieved by
11267  *    sleeping on the address of the request structure and waiting for
11268  *    filecoordinationd to mark the request structure as complete.  This
11269  *    is an interruptible sleep call; if interrupted, the request structure
11270  *    is removed from the table and EINTR is returned to the caller.  If
11271  *    this occurs, an advisory up-call is made to filecoordinationd with
11272  *    the request ID to indicate that the request can be aborted or
11273  *    de-prioritized at the discretion of filecoordinationd.
11274  *
11275  * => When filecoordinationd has completed the request, it signals completion
11276  *    by writing to the vfs.nspace.complete sysctl node.  Only a process
11277  *    decorated as a namespace resolver can write to this sysctl node.  The
11278  *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11279  *    The request ID is looked up in the table, and if the request is found,
11280  *    the error code is stored in the request structure and a wakeup()
11281  *    issued on the address of the request structure.  If the request is not
11282  *    found, we simply drop the completion notification, assuming that the
11283  *    caller was interrupted.
11284  *
11285  * => When the waiting thread wakes up, it extracts the error code from the
11286  *    request structure, removes the request from the table, and returns the
11287  *    error code to the calling function.  Fini!
11288  */
11289 
/*
 * An in-flight namespace (dataless file) resolution request.
 *
 * Instances are allocated on the stack of the requesting thread and
 * linked into the global request hash table for the duration of the
 * up-call to filecoordinationd (see the block comment above).
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* request table linkage */
	vnode_t         r_vp;           /* vnode being materialized */
	uint32_t        r_req_id;       /* unique request ID; hash key */
	int             r_resolver_error; /* errno reported by the resolver */
	int             r_flags;        /* RRF_* flags below */
};

#define RRF_COMPLETE    0x0001  /* resolver has completed this request */
11299 
11300 static uint32_t
next_nspace_req_id(void)11301 next_nspace_req_id(void)
11302 {
11303 	static uint32_t next_req_id;
11304 
11305 	return OSAddAtomic(1, &next_req_id);
11306 }
11307 
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (backpressure accounting). */
static u_int nspace_resolver_request_count;
/* True when some thread is sleeping for a free request slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Protects the request table and all of the state above. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket (hashmask is a power-of-2 - 1). */
#define NSPACE_RESOLVER_HASH(req_id)    \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	 nspace_resolver_request_hashmask])
11328 
11329 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id)11330 nspace_resolver_req_lookup(uint32_t req_id)
11331 {
11332 	struct nspace_resolver_requesthead *bucket;
11333 	struct nspace_resolver_request *req;
11334 
11335 	bucket = NSPACE_RESOLVER_HASH(req_id);
11336 	LIST_FOREACH(req, bucket, r_hashlink) {
11337 		if (req->r_req_id == req_id) {
11338 			return req;
11339 		}
11340 	}
11341 
11342 	return NULL;
11343 }
11344 
/*
 * Insert a resolver request into the request table.
 *
 * Caller must hold the request table lock (msleep() below sleeps on
 * nspace_resolver_request_hash_mutex, dropping and re-acquiring it).
 * Backpressure on filecoordinationd is applied here: if the table is
 * full, we sleep interruptibly (PCATCH) until a slot frees up.
 *
 * Returns 0 on success, or the error from an interrupted sleep.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		/* Ask nspace_resolver_req_remove() to wake us on free. */
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	return 0;
}
11371 
/*
 * Remove a resolver request from the request table and wake any thread
 * waiting in nspace_resolver_req_add() for a free slot.
 *
 * Caller must hold the request table lock.
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
#endif /* DIAGNOSTIC */
	LIST_REMOVE(req, r_hashlink);
	nspace_resolver_request_count--;

	if (nspace_resolver_request_wait_slot) {
		nspace_resolver_request_wait_slot = false;
		wakeup(&nspace_resolver_request_count);
	}
}
11389 
11390 static void
nspace_resolver_req_cancel(uint32_t req_id)11391 nspace_resolver_req_cancel(uint32_t req_id)
11392 {
11393 	kern_return_t kr;
11394 	mach_port_t mp;
11395 
11396 	// Failures here aren't fatal -- the cancellation message
11397 	// sent to the resolver is merely advisory.
11398 
11399 	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11400 	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11401 		return;
11402 	}
11403 
11404 	kr = send_nspace_resolve_cancel(mp, req_id);
11405 	if (kr != KERN_SUCCESS) {
11406 		os_log_error(OS_LOG_DEFAULT,
11407 		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11408 	}
11409 
11410 	ipc_port_release_send(mp);
11411 }
11412 
/*
 * Wait (interruptibly) for the resolver to complete the given request.
 *
 * On return the request has been removed from the table.  If the sleep
 * was interrupted (other than ERESTART, which just re-sleeps), the
 * request's error is forced to EINTR/ETIMEDOUT and an advisory cancel
 * is sent to filecoordinationd -- after dropping the table lock, since
 * the cancel path performs Mach IPC.
 *
 * Returns the request's resolver error (0 on success).
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted: give up and tell the resolver. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11442 
11443 static void
nspace_resolver_req_mark_complete(struct nspace_resolver_request * req,int resolver_error)11444 nspace_resolver_req_mark_complete(
11445 	struct nspace_resolver_request *req,
11446 	int resolver_error)
11447 {
11448 	req->r_resolver_error = resolver_error;
11449 	req->r_flags |= RRF_COMPLETE;
11450 	wakeup(req);
11451 }
11452 
/*
 * Handle a completion notification from filecoordinationd (written via
 * the vfs.nspace.complete sysctl).
 *
 * If the request ID is not found in the table, the notification is
 * silently dropped -- the requester was likely interrupted and has
 * already removed its request.  Otherwise the resolver's error (and an
 * optional generation-count consistency check) is recorded and the
 * waiting thread is woken.
 */
static void
nspace_resolver_req_completed(uint32_t req_id, int resolver_error, uint64_t orig_gencount)
{
	struct nspace_resolver_request *req;

	NSPACE_REQ_LOCK();

	// If we don't find the request corresponding to our req_id,
	// just drop the completion signal on the floor; it's likely
	// that the requester interrupted with a signal.

	req = nspace_resolver_req_lookup(req_id);
	if (req) {
		mount_t locked_mp = NULL;

		/* Hold off renames on the volume while we validate. */
		locked_mp = req->r_vp->v_mount;
		mount_ref(locked_mp, 0);
		mount_lock_renames(locked_mp);

		//
		// if the resolver isn't already returning an error and we have an
		// orig_gencount, then get an iocount on the request vnode and check
		// that the gencount on req->r_vp has not changed.
		//
		// note: a ref was taken on req->r_vp when the request was created
		// and that ref will be dropped by that thread when it wakes up.
		//
		if (resolver_error == 0 &&
		    orig_gencount != 0 &&
		    vnode_getwithref(req->r_vp) == 0) {
			struct vnode_attr va;
			uint64_t cur_gencount;

			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_recursive_gencount);

			/* cur_gencount == 0 means "couldn't read it"; skip check. */
			if (vnode_getattr(req->r_vp, &va, vfs_context_kernel()) == 0) {
				cur_gencount = va.va_recursive_gencount;
			} else {
				cur_gencount = 0;
			}

			// NOTE(review): the resolver_error == 0 re-check is redundant
			// here (guaranteed by the enclosing if), but harmless.
			if (resolver_error == 0 && cur_gencount && orig_gencount && cur_gencount != orig_gencount) {
				printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n", orig_gencount, cur_gencount);

				// this error will be returned to the thread that initiated the
				// materialization of req->r_vp.
				resolver_error = EBUSY;

				// note: we explicitly do not return an error to the caller (i.e.
				// the thread that did the materialization) because they said they
				// don't want one.
			}

			vnode_put(req->r_vp);
		}

		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);

		nspace_resolver_req_mark_complete(req, resolver_error);
	}

	NSPACE_REQ_UNLOCK();

	return;
}
11520 
/* The process currently decorated as the dataless-file resolver, or NULL. */
static struct proc *nspace_resolver_proc;
11522 
11523 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)11524 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11525 {
11526 	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11527 	    p == nspace_resolver_proc) ? 1 : 0;
11528 	return 0;
11529 }
11530 
11531 static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
11532 
/*
 * Register or unregister the calling process as the namespace resolver.
 *
 * Requires root credentials plus the dataless-resolver entitlement.
 * Only one resolver may be registered at a time (EBUSY otherwise).
 * Unregistering goes through nspace_resolver_exited(), which also
 * fails all outstanding requests.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			/* Decorate the proc, then publish it as the resolver. */
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11572 
11573 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11574 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11575 {
11576 	if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11577 	    (p->p_vfs_iopolicy &
11578 	    P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11579 		*is_prevented = 1;
11580 	} else {
11581 		*is_prevented = 0;
11582 	}
11583 	return 0;
11584 }
11585 
11586 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)11587 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
11588 {
11589 	if (p->p_lflag & P_LNSPACE_RESOLVER) {
11590 		return is_prevented ? 0 : EBUSY;
11591 	}
11592 
11593 	if (is_prevented) {
11594 		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
11595 	} else {
11596 		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
11597 	}
11598 	return 0;
11599 }
11600 
11601 static int
nspace_materialization_get_thread_state(int * is_prevented)11602 nspace_materialization_get_thread_state(int *is_prevented)
11603 {
11604 	uthread_t ut = current_uthread();
11605 
11606 	*is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11607 	return 0;
11608 }
11609 
11610 static int
nspace_materialization_set_thread_state(int is_prevented)11611 nspace_materialization_set_thread_state(int is_prevented)
11612 {
11613 	uthread_t ut = current_uthread();
11614 
11615 	if (is_prevented) {
11616 		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11617 	} else {
11618 		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11619 	}
11620 	return 0;
11621 }
11622 
11623 /* the vfs.nspace branch */
11624 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11625 
11626 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11627 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11628     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11629 {
11630 	struct proc *p = req->p;
11631 	int new_value, old_value, changed = 0;
11632 	int error;
11633 
11634 	error = nspace_resolver_get_proc_state(p, &old_value);
11635 	if (error) {
11636 		return error;
11637 	}
11638 
11639 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11640 	    &changed);
11641 	if (error == 0 && changed) {
11642 		error = nspace_resolver_set_proc_state(p, new_value);
11643 	}
11644 	return error;
11645 }
11646 
11647 /* decorate this process as the dataless file resolver */
11648 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
11649     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11650     0, 0, sysctl_nspace_resolver, "I", "");
11651 
11652 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11653 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11654     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11655 {
11656 	struct proc *p = req->p;
11657 	int new_value, old_value, changed = 0;
11658 	int error;
11659 
11660 	error = nspace_materialization_get_proc_state(p, &old_value);
11661 	if (error) {
11662 		return error;
11663 	}
11664 
11665 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11666 	    &changed);
11667 	if (error == 0 && changed) {
11668 		error = nspace_materialization_set_proc_state(p, new_value);
11669 	}
11670 	return error;
11671 }
11672 
11673 /* decorate this process as not wanting to materialize dataless files */
11674 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
11675     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11676     0, 0, sysctl_nspace_prevent_materialization, "I", "");
11677 
11678 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11679 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11680     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11681 {
11682 	int new_value, old_value, changed = 0;
11683 	int error;
11684 
11685 	error = nspace_materialization_get_thread_state(&old_value);
11686 	if (error) {
11687 		return error;
11688 	}
11689 
11690 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11691 	    &changed);
11692 	if (error == 0 && changed) {
11693 		error = nspace_materialization_set_thread_state(new_value);
11694 	}
11695 	return error;
11696 }
11697 
11698 /* decorate this thread as not wanting to materialize dataless files */
11699 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
11700     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11701     0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11702 
/*
 * Handler for vfs.nspace.complete: the resolver reports request
 * completions here.  Writable only by the registered resolver process.
 *
 * The write payload is a { req_id, errno } pair of uint32_t's,
 * optionally followed by a uint64_t generation count.  The gencount is
 * read with a second sysctl_io_opaque() call against the same request,
 * which picks up where the first read left off; failure there just
 * means the resolver sent the short (gencount-less) form.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	int error, is_resolver, changed = 0, gencount_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	// get the gencount if it was passed
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &gencount_changed);
	if (error) {
		gencount = 0;
		// we ignore the error because the gencount was optional
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		nspace_resolver_req_completed(req_status[0],
		    (int)req_status[1], gencount);
	}
	return error;
}
11747 
11748 /* Resolver reports completed reqs here. */
11749 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
11750     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11751     0, 0, sysctl_nspace_complete, "-", "");
11752 
11753 #endif /* CONFIG_DATALESS_FILES */
11754 
11755 #if CONFIG_DATALESS_FILES
11756 #define __no_dataless_unused    /* nothing */
11757 #else
11758 #define __no_dataless_unused    __unused
11759 #endif
11760 
/*
 * Decide whether dataless materialization is prevented for the given
 * vfs context.
 *
 * Returns:
 *   0           materialization may proceed
 *   EDEADLK     materialization is prevented (kernel context, thread
 *               decoration, or the process lacks the materialize
 *               iopolicy bit)
 *   EJUSTRETURN the caller holds the dataless-manipulation entitlement;
 *               proceed as if the object were not dataless
 *
 * Note the precedence order below is deliberate: kernel context, then
 * manipulation entitlement, then per-thread decoration, then the
 * process iopolicy.
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11817 
/*
 * One-time initialization of the namespace resolver request hash table.
 * Called during VFS startup.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	/* hashinit() sets nspace_resolver_request_hashmask as a side effect. */
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11827 
/*
 * Called when the resolver process exits (or voluntarily unregisters).
 *
 * If the exiting process is the registered resolver, every outstanding
 * request in the table is failed with ETIMEDOUT (waking its requester)
 * and the resolver registration is cleared.  For any other process this
 * is a no-op.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Fail every outstanding request; requesters unlink them. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11853 
/*
 * Legacy entry point: resolve a namespace item with no extension
 * argument.  Thin wrapper around resolve_nspace_item_ext().
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
11859 
11860 #define DATALESS_RESOLVER_ENTITLEMENT     \
11861 	"com.apple.private.vfs.dataless-resolver"
11862 #define DATALESS_MANIPULATION_ENTITLEMENT \
11863 	"com.apple.private.vfs.dataless-manipulation"
11864 
11865 #if CONFIG_DATALESS_FILES
11866 /*
11867  * Return TRUE if the vfs context is associated with the dataless
11868  * resolver.
11869  */
11870 static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)11871 vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
11872 {
11873 	return IOTaskHasEntitlement(vfs_context_task(ctx),
11874 	           DATALESS_RESOLVER_ENTITLEMENT);
11875 }
11876 #endif /* CONFIG_DATALESS_FILES */
11877 
11878 /*
11879  * Return TRUE if the vfs context is associated with a process entitled
11880  * for dataless manipulation.
11881  *
11882  * XXX Arguably belongs in vfs_subr.c, but is here because of the
11883  * complication around CONFIG_DATALESS_FILES.
11884  */
11885 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)11886 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
11887 {
11888 #if CONFIG_DATALESS_FILES
11889 	task_t task = vfs_context_task(ctx);
11890 	return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
11891 	       IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
11892 #else
11893 	return false;
11894 #endif /* CONFIG_DATALESS_FILES */
11895 }
11896 
11897 #if CONFIG_DATALESS_FILES
11898 static void
log_materialization_prevented(vnode_t vp,uint64_t op)11899 log_materialization_prevented(vnode_t vp, uint64_t op)
11900 {
11901 	char p_name[MAXCOMLEN + 1];
11902 	char *vntype;
11903 	proc_selfname(&p_name[0], sizeof(p_name));
11904 
11905 	if (vp->v_type == VREG) {
11906 		vntype = "File";
11907 	} else if (vp->v_type == VDIR) {
11908 		vntype = "Dir";
11909 	} else if (vp->v_type == VLNK) {
11910 		vntype = "SymLink";
11911 	} else {
11912 		vntype = "Other";
11913 	}
11914 
11915 #if DEVELOPMENT
11916 	char *path = NULL;
11917 	int   len;
11918 
11919 	path = get_pathbuff();
11920 	len = MAXPATHLEN;
11921 	if (path) {
11922 		vn_getpath(vp, path, &len);
11923 	}
11924 
11925 	os_log_debug(OS_LOG_DEFAULT,
11926 	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
11927 	    p_name, proc_selfpid(),
11928 	    op, vntype, path ? path : "<unknown-path>");
11929 	if (path) {
11930 		release_pathbuff(path);
11931 	}
11932 #else
11933 	os_log_debug(OS_LOG_DEFAULT,
11934 	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
11935 	    p_name, proc_selfpid(),
11936 	    op, vntype);
11937 #endif
11938 }
11939 #endif /* CONFIG_DATALESS_FILES */
11940 
11941 static int
vfs_materialize_item(struct vnode * vp __no_dataless_unused,uint64_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused)11942 vfs_materialize_item(
11943 	struct vnode *vp __no_dataless_unused,
11944 	uint64_t op __no_dataless_unused,
11945 	int64_t offset __no_dataless_unused,
11946 	int64_t size __no_dataless_unused,
11947 	char *lookup_name __no_dataless_unused,
11948 	size_t const namelen __no_dataless_unused)
11949 {
11950 #if CONFIG_DATALESS_FILES
11951 	struct nspace_resolver_request req;
11952 	kern_return_t kern_ret;
11953 	mach_port_t mach_port;
11954 	char *path = NULL;
11955 	vfs_context_t context;
11956 	int path_len;
11957 	int error;
11958 	audit_token_t atoken;
11959 
11960 	/*
11961 	 * If this is a snapshot event and the vnode is on a disk image just
11962 	 * pretend nothing happened since any change to the disk image will
11963 	 * cause the disk image itself to get backed up and this avoids multi-
11964 	 * way deadlocks between the snapshot handler and the ever popular
11965 	 * diskimages-helper process. The variable nspace_allow_virtual_devs
11966 	 * allows this behavior to be overridden (for use by the Mobile
11967 	 * TimeMachine testing infrastructure which uses disk images).
11968 	 */
11969 	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
11970 		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
11971 		return ENOTSUP;
11972 	}
11973 
11974 	context = vfs_context_current();
11975 
11976 	error = vfs_context_dataless_materialization_is_prevented(context);
11977 	if (error) {
11978 		log_materialization_prevented(vp, op);
11979 		return error;
11980 	}
11981 
11982 	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
11983 	    &mach_port);
11984 	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
11985 		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
11986 		/*
11987 		 * Treat this like being unable to access the backing store
11988 		 * server.
11989 		 */
11990 		return ETIMEDOUT;
11991 	}
11992 
11993 	int path_alloc_len = MAXPATHLEN;
11994 	do {
11995 		path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
11996 		if (path == NULL) {
11997 			return ENOMEM;
11998 		}
11999 
12000 		path_len = path_alloc_len;
12001 		error = vn_getpath(vp, path, &path_len);
12002 		if (error == 0) {
12003 			break;
12004 		} else if (error == ENOSPC) {
12005 			kfree_data(path, path_alloc_len);
12006 			path = NULL;
12007 		} else {
12008 			goto out_release_port;
12009 		}
12010 	} while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);
12011 
12012 	error = vfs_context_copy_audit_token(context, &atoken);
12013 	if (error) {
12014 		goto out_release_port;
12015 	}
12016 
12017 	req.r_req_id = next_nspace_req_id();
12018 	req.r_resolver_error = 0;
12019 	req.r_flags = 0;
12020 	req.r_vp = vp;
12021 
12022 	NSPACE_REQ_LOCK();
12023 	error = nspace_resolver_req_add(&req);
12024 	NSPACE_REQ_UNLOCK();
12025 	if (error) {
12026 		goto out_release_port;
12027 	}
12028 
12029 	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12030 	if (vp->v_type == VDIR) {
12031 		char *tmpname = NULL;
12032 
12033 		/*
12034 		 * If the caller provided a lookup_name *and* a name length,
12035 		 * then we assume the lookup_name is not NUL-terminated.
12036 		 * Allocate a temporary buffer in this case to provide
12037 		 * a NUL-terminated path name to the IPC call.
12038 		 */
12039 		if (lookup_name != NULL && namelen != 0) {
12040 			if (namelen >= PATH_MAX) {
12041 				error = EINVAL;
12042 				goto out_release_port;
12043 			}
12044 			tmpname = zalloc(ZV_NAMEI);
12045 			strlcpy(tmpname, lookup_name, namelen + 1);
12046 			lookup_name = tmpname;
12047 		} else if (lookup_name != NULL) {
12048 			/*
12049 			 * If the caller provided a lookup_name with a
12050 			 * zero name length, then we assume it's NUL-
12051 			 * terminated.  Verify it has a valid length.
12052 			 */
12053 			if (strlen(lookup_name) >= PATH_MAX) {
12054 				error = EINVAL;
12055 				goto out_release_port;
12056 			}
12057 		}
12058 
12059 		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12060 		    req.r_req_id, (uint32_t)(op & 0xffffffff),
12061 		    lookup_name == NULL ? "" : lookup_name, path, atoken);
12062 
12063 		if (tmpname != NULL) {
12064 			zfree(ZV_NAMEI, tmpname);
12065 
12066 			/*
12067 			 * Poison lookup_name rather than reference
12068 			 * freed memory.
12069 			 */
12070 			lookup_name = NULL;
12071 		}
12072 	} else {
12073 		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12074 		    req.r_req_id, (uint32_t)(op & 0xffffffff),
12075 		    offset, size, path, atoken);
12076 	}
12077 	if (kern_ret != KERN_SUCCESS) {
12078 		/*
12079 		 * Also treat this like being unable to access the backing
12080 		 * store server.
12081 		 */
12082 		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12083 		    kern_ret);
12084 		error = ETIMEDOUT;
12085 
12086 		NSPACE_REQ_LOCK();
12087 		nspace_resolver_req_remove(&req);
12088 		NSPACE_REQ_UNLOCK();
12089 		goto out_release_port;
12090 	}
12091 
12092 	/*
12093 	 * Give back the memory we allocated earlier while we wait; we
12094 	 * no longer need it.
12095 	 */
12096 	kfree_data(path, path_alloc_len);
12097 	path = NULL;
12098 
12099 	/*
12100 	 * Request has been submitted to the resolver. Now (interruptibly)
12101 	 * wait for completion. Upon requrn, the request will have been
12102 	 * removed from the lookup table.
12103 	 */
12104 	error = nspace_resolver_req_wait(&req);
12105 
12106 out_release_port:
12107 	if (path != NULL) {
12108 		kfree_data(path, path_alloc_len);
12109 		path = NULL;
12110 	}
12111 	ipc_port_release_send(mach_port);
12112 
12113 	return error;
12114 #else
12115 	return ENOTSUP;
12116 #endif /* CONFIG_DATALESS_FILES */
12117 }
12118 
12119 /*
12120  * vfs_materialize_file: Materialize a regular file.
12121  *
12122  * Inputs:
12123  * vp		The dataless file to be materialized.
12124  *
12125  * op		What kind of operation is being performed:
12126  *		-> NAMESPACE_HANDLER_READ_OP
12127  *		-> NAMESPACE_HANDLER_WRITE_OP
12128  *		-> NAMESPACE_HANDLER_LINK_CREATE
12129  *		-> NAMESPACE_HANDLER_DELETE_OP
12130  *		-> NAMESPACE_HANDLER_TRUNCATE_OP
12131  *		-> NAMESPACE_HANDLER_RENAME_OP
12132  *
12133  * offset	offset of I/O for READ or WRITE.  Ignored for
12134  *		other ops.
12135  *
12136  * size		size of I/O for READ or WRITE  Ignored for
12137  *		other ops.
12138  *
 * If offset or size are -1 for a READ or WRITE, then the resolver should
12140  * consider the range to be unknown.
12141  *
12142  * Upon successful return, the caller may proceed with the operation.
12143  * N.B. the file may still be "dataless" in this case.
12144  */
12145 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12146 vfs_materialize_file(
12147 	struct vnode *vp,
12148 	uint64_t op,
12149 	int64_t offset,
12150 	int64_t size)
12151 {
12152 	if (vp->v_type != VREG) {
12153 		return EFTYPE;
12154 	}
12155 	return vfs_materialize_item(vp, op, offset, size, NULL, 0);
12156 }
12157 
12158 /*
12159  * vfs_materialize_dir:
12160  *
12161  * Inputs:
12162  * vp		The dataless directory to be materialized.
12163  *
12164  * op		What kind of operation is being performed:
12165  *		-> NAMESPACE_HANDLER_READ_OP
12166  *		-> NAMESPACE_HANDLER_WRITE_OP
12167  *		-> NAMESPACE_HANDLER_DELETE_OP
12168  *		-> NAMESPACE_HANDLER_RENAME_OP
12169  *		-> NAMESPACE_HANDLER_LOOKUP_OP
12170  *
12171  * lookup_name	Name being looked up for a LOOKUP op.  Ignored for
12172  *		other ops.  May or may not be NUL-terminated; see below.
12173  *
12174  * namelen	If non-zero, then lookup_name is assumed to not be NUL-
12175  *		terminated and namelen is the number of valid bytes in
12176  *		lookup_name. If zero, then lookup_name is assumed to be
12177  *		NUL-terminated.
12178  *
12179  * Upon successful return, the caller may proceed with the operation.
12180  * N.B. the directory may still be "dataless" in this case.
12181  */
12182 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12183 vfs_materialize_dir(
12184 	struct vnode *vp,
12185 	uint64_t op,
12186 	char *lookup_name,
12187 	size_t namelen)
12188 {
12189 	if (vp->v_type != VDIR) {
12190 		return EFTYPE;
12191 	}
12192 	if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12193 		return EINVAL;
12194 	}
12195 	return vfs_materialize_item(vp, op, 0, 0, lookup_name, namelen);
12196 }
12197 
/*
 * resolve_nspace_item_ext: hand a namespace event for vp off to the
 * namespace resolver (filecoordinationd) and wait (interruptibly) for it
 * to complete.  Returns 0 on success or an errno; inability to reach the
 * resolver is reported as ETIMEDOUT.  Snapshot events are not handled.
 */
int
resolve_nspace_item_ext(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	void *arg __unused)
{
#if CONFIG_DATALESS_FILES
	int error;
	mach_port_t mp;
	char *path = NULL;
	int path_len;
	kern_return_t kr;
	struct nspace_resolver_request req;

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return EFTYPE;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process.  the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	// NOTE: the code below now rejects snapshot events outright;
	// the paragraph above describes the historical rationale.
	//
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	/* Honor per-context opt-outs of dataless-file materialization. */
	error = vfs_context_dataless_materialization_is_prevented(
		vfs_context_current());
	if (error) {
		log_materialization_prevented(vp, op);
		return error;
	}

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		// Treat this like being unable to access the backing
		// store server.
		return ETIMEDOUT;
	}

	path = zalloc(ZV_NAMEI);
	path_len = MAXPATHLEN;

	error = vn_getpath(vp, path, &path_len);
	if (error == 0) {
		int xxx_rdar44371223;   /* XXX Mig bug */
		req.r_req_id = next_nspace_req_id();
		req.r_resolver_error = 0;
		req.r_flags = 0;

		if ((error = vnode_ref(vp)) == 0) {     // take a ref so that the vnode doesn't go away
			req.r_vp = vp;
		} else {
			goto out_release_port;
		}

		/* Publish the request so the resolver's reply can find it. */
		NSPACE_REQ_LOCK();
		error = nspace_resolver_req_add(&req);
		NSPACE_REQ_UNLOCK();
		if (error) {
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
		kr = send_nspace_resolve_path(mp, req.r_req_id,
		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
		    path, &xxx_rdar44371223);
		if (kr != KERN_SUCCESS) {
			// Also treat this like being unable to access
			// the backing store server.
			os_log_error(OS_LOG_DEFAULT,
			    "NSPACE resolve_path failure: %d", kr);
			error = ETIMEDOUT;

			NSPACE_REQ_LOCK();
			nspace_resolver_req_remove(&req);
			NSPACE_REQ_UNLOCK();
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		// Give back the memory we allocated earlier while
		// we wait; we no longer need it.
		zfree(ZV_NAMEI, path);
		path = NULL;

		// Request has been submitted to the resolver.
		// Now (interruptibly) wait for completion.
		// Upon return, the request will have been removed
		// from the lookup table.
		error = nspace_resolver_req_wait(&req);

		vnode_rele(req.r_vp);
	}

out_release_port:
	if (path != NULL) {
		zfree(ZV_NAMEI, path);
	}
	ipc_port_release_send(mp);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
12315 
/*
 * nspace_snapshot_event: legacy snapshot-event hook.  Now a no-op that
 * always reports success; all parameters are ignored.
 */
int
nspace_snapshot_event(__unused vnode_t vp, __unused  time_t ctime,
    __unused uint64_t op_type, __unused void *arg)
{
	return 0;
}
12322 
#if 0
/*
 * build_volfs_path: render a "/.vol/<fsid>/<fileid>" path for vp into
 * 'path' (capacity *len), setting *len to the formatted length + 1.
 * Returns 0 on success, -1 if the vnode's attributes can't be fetched
 * (in which case a sentinel non-existent path is written instead).
 * Currently compiled out.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12345 
12346 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12347 fsctl_bogus_command_compat(unsigned long cmd)
12348 {
12349 	switch (cmd) {
12350 	case IOCBASECMD(FSIOC_SYNC_VOLUME):
12351 		return FSIOC_SYNC_VOLUME;
12352 	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12353 		return FSIOC_ROUTEFS_SETROUTEID;
12354 	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12355 		return FSIOC_SET_PACKAGE_EXTS;
12356 	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12357 		return FSIOC_SET_FSTYPENAME_OVERRIDE;
12358 	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12359 		return DISK_CONDITIONER_IOC_GET;
12360 	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12361 		return DISK_CONDITIONER_IOC_SET;
12362 	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12363 		return FSIOC_FIOSEEKHOLE;
12364 	case IOCBASECMD(FSIOC_FIOSEEKDATA):
12365 		return FSIOC_FIOSEEKDATA;
12366 	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12367 		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12368 	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12369 		return SPOTLIGHT_IOC_GET_LAST_MTIME;
12370 	}
12371 
12372 	return cmd;
12373 }
12374 
12375 static int
cas_bsdflags_setattr(vnode_t vp,void * arg,vfs_context_t ctx)12376 cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
12377 {
12378 	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
12379 }
12380 
/*
 * handle_sync_volume: implement FSIOC_SYNC_VOLUME.  Syncs the mount that
 * vp belongs to, optionally waiting (FSCTL_SYNC_WAIT) and optionally
 * following up with F_FULLFSYNC on vp.  On return *arg_vp is NULLed to
 * tell the caller that vp's iocount has already been dropped.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	/* Hold keeps the vnode memory alive while the iocount is dropped. */
	vnode_hold(vp);
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (for e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests the MNT_* sync argument, not the user's
	 * request word, against FSCTL_SYNC_FULLSYNC; it appears to rely on
	 * FSCTL_SYNC_FULLSYNC numerically aliasing MNT_WAIT — confirm intent.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12445 
12446 #if ROUTEFS
12447 static int __attribute__((noinline))
handle_routes(user_addr_t udata)12448 handle_routes(user_addr_t udata)
12449 {
12450 	char routepath[MAXPATHLEN];
12451 	size_t len = 0;
12452 	int error;
12453 
12454 	if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
12455 		return error;
12456 	}
12457 	bzero(routepath, MAXPATHLEN);
12458 	error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
12459 	if (error) {
12460 		return error;
12461 	}
12462 	error = routefs_kernel_mount(routepath);
12463 	return error;
12464 }
12465 #endif
12466 
12467 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12468 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12469 {
12470 	struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12471 	struct vnode_attr va;
12472 	int error;
12473 
12474 	VATTR_INIT(&va);
12475 	VATTR_SET(&va, va_flags, cas->new_flags);
12476 
12477 	error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12478 
12479 #if CONFIG_FSE
12480 	if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12481 		add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12482 	}
12483 #endif
12484 
12485 	return error;
12486 }
12487 
12488 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12489 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12490 {
12491 	struct mount *mp = NULL;
12492 	errno_t rootauth = 0;
12493 
12494 	mp = vp->v_mount;
12495 
12496 	/*
12497 	 * query the underlying FS and see if it reports something
12498 	 * sane for this vnode. If volume is authenticated via
12499 	 * chunklist, leave that for the caller to determine.
12500 	 */
12501 	rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12502 
12503 	return rootauth;
12504 }
12505 
12506 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12507 	"com.apple.private.kernel.set-package-extensions"
12508 
/*
 * Make a filesystem-specific control call:
 *
 * Marshals the ioctl argument (stack buffer for small arguments, heap
 * otherwise), dispatches generic FSIOC commands locally, and forwards
 * everything else to the filesystem via VNOP_IOCTL.  Some commands
 * (FSIOC_SYNC_VOLUME) drop the iocount on *arg_vp and NULL it out so
 * the caller knows not to vnode_put() it again.
 */
/* ARGSUSED */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* fsctl is not an ioctl; reject device nodes. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* Map legacy "base command" encodings onto the real values. */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Small arguments live on the stack; larger ones are heap-allocated. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			/* Zero-length IOC_IN: the "argument" is the pointer itself. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* May release vp's iocount and NULL out *arg_vp. */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t    num_entries;
		uint32_t    max_width;

		/* Entitlement-gated: updates the global package-extension table. */
		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct.  otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width   = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width   = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		/* Superuser only: overrides the reported filesystem type name. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if (vp->v_mount) {
			mount_lock(vp->v_mount);
			if (data[0] != 0) {
				int i;
				/* Require a NUL within the first MFSTYPENAMELEN bytes. */
				for (i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data string which has no
				 * NULL termination in its first MFSTYPENAMELEN bytes.
				 * This is bogus, let's avoid strlcpy-ing the read data and
				 * return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
				}
			} else {
				/* Empty string clears any existing override. */
				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
				}
				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
				vp->v_mount->fstypename_override[0] = '\0';
			}
unlock:
			mount_unlock(vp->v_mount);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/* Report EBUSY if anyone else holds a usecount on this vnode. */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
		break;

	default: {
		/* other, known commands shouldn't be passed down here */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
12759 
12760 /* ARGSUSED */
12761 int
fsctl(proc_t p,struct fsctl_args * uap,__unused int32_t * retval)12762 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
12763 {
12764 	int error;
12765 	struct nameidata nd;
12766 	uint32_t nameiflags;
12767 	vnode_t vp = NULL;
12768 	vfs_context_t ctx = vfs_context_current();
12769 
12770 	AUDIT_ARG(cmd, (int)uap->cmd);
12771 	AUDIT_ARG(value32, uap->options);
12772 	/* Get the vnode for the file we are getting info on:  */
12773 	nameiflags = 0;
12774 	//
12775 	// if we come through fsctl() then the file is by definition not open.
12776 	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
12777 	// lest the caller mistakenly thinks the only open is their own (but in
12778 	// reality it's someone elses).
12779 	//
12780 	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
12781 		return EINVAL;
12782 	}
12783 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
12784 		nameiflags |= FOLLOW;
12785 	}
12786 	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
12787 		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
12788 	}
12789 	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
12790 	    UIO_USERSPACE, uap->path, ctx);
12791 	if ((error = namei(&nd))) {
12792 		goto done;
12793 	}
12794 	vp = nd.ni_vp;
12795 	nameidone(&nd);
12796 
12797 #if CONFIG_MACF
12798 	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
12799 	if (error) {
12800 		goto done;
12801 	}
12802 #endif
12803 
12804 	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12805 
12806 done:
12807 	if (vp) {
12808 		vnode_put(vp);
12809 	}
12810 	return error;
12811 }
12812 /* ARGSUSED */
12813 int
ffsctl(proc_t p,struct ffsctl_args * uap,__unused int32_t * retval)12814 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
12815 {
12816 	int error;
12817 	vnode_t vp = NULL;
12818 	vfs_context_t ctx = vfs_context_current();
12819 	int fd = -1;
12820 
12821 	AUDIT_ARG(fd, uap->fd);
12822 	AUDIT_ARG(cmd, (int)uap->cmd);
12823 	AUDIT_ARG(value32, uap->options);
12824 
12825 	/* Get the vnode for the file we are getting info on:  */
12826 	if ((error = file_vnode(uap->fd, &vp))) {
12827 		return error;
12828 	}
12829 	fd = uap->fd;
12830 	if ((error = vnode_getwithref(vp))) {
12831 		file_drop(fd);
12832 		return error;
12833 	}
12834 
12835 #if CONFIG_MACF
12836 	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
12837 		file_drop(fd);
12838 		vnode_put(vp);
12839 		return error;
12840 	}
12841 #endif
12842 
12843 	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12844 
12845 	file_drop(fd);
12846 
12847 	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
12848 	if (vp) {
12849 		vnode_put(vp);
12850 	}
12851 
12852 	return error;
12853 }
12854 /* end of fsctl system call */
12855 
12856 #define FILESEC_ACCESS_ENTITLEMENT              \
12857 	"com.apple.private.vfs.filesec-access"
12858 
12859 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)12860 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
12861 {
12862 	if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
12863 		/*
12864 		 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
12865 		 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
12866 		 */
12867 		if ((!setting && vfs_context_issuser(ctx)) ||
12868 		    IOTaskHasEntitlement(vfs_context_task(ctx),
12869 		    FILESEC_ACCESS_ENTITLEMENT)) {
12870 			return 0;
12871 		}
12872 	}
12873 
12874 	return EPERM;
12875 }
12876 
/*
 *  Retrieve the data of an extended attribute (path-based).
 *  Returns (via *retval) the bytes copied when a buffer was supplied,
 *  otherwise the attribute's size.
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* XATTR_NOSECURITY / XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attributes require root or the filesec entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned...  in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app.  we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* Cap kernel-wired buffer demand at XATTR_MAXSIZE. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* With a buffer: bytes copied out; without: the attribute's size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
12962 
12963 /*
12964  * Retrieve the data of an extended attribute.
12965  */
12966 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)12967 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
12968 {
12969 	vnode_t vp;
12970 	char attrname[XATTR_MAXNAMELEN + 1];
12971 	vfs_context_t ctx = vfs_context_current();
12972 	uio_t auio = NULL;
12973 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12974 	size_t attrsize = 0;
12975 	size_t namelen;
12976 	int error;
12977 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12978 
12979 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12980 		return EINVAL;
12981 	}
12982 
12983 	if ((error = file_vnode(uap->fd, &vp))) {
12984 		return error;
12985 	}
12986 	if ((error = vnode_getwithref(vp))) {
12987 		file_drop(uap->fd);
12988 		return error;
12989 	}
12990 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12991 	if (error != 0) {
12992 		goto out;
12993 	}
12994 	if (xattr_protected(attrname) &&
12995 	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
12996 		goto out;
12997 	}
12998 	if (uap->value && uap->size > 0) {
12999 		if (uap->size > (size_t)XATTR_MAXSIZE) {
13000 			uap->size = XATTR_MAXSIZE;
13001 		}
13002 
13003 		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13004 		    &uio_buf[0], sizeof(uio_buf));
13005 		uio_addiov(auio, uap->value, uap->size);
13006 	}
13007 
13008 	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13009 out:
13010 	(void)vnode_put(vp);
13011 	file_drop(uap->fd);
13012 
13013 	if (auio) {
13014 		*retval = uap->size - uio_resid(auio);
13015 	} else {
13016 		*retval = (user_ssize_t)attrsize;
13017 	}
13018 	return error;
13019 }
13020 
/*
 * Heap-allocated scratch state for setxattr(): the nameidata, attribute
 * name, and uio backing buffer are too large to keep on the kernel stack.
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
};
13027 
/*
 * Set the data of an extended attribute (path-based).
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	/* XATTR_NOSECURITY / XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* Heap-allocate the bulky nameidata/name/uio state (stack relief). */
	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	/* A non-zero size requires a buffer to read the value from. */
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory before modifying the child. */
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13107 
13108 /*
13109  * Set the data of an extended attribute.
13110  */
13111 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13112 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13113 {
13114 	vnode_t vp;
13115 	char attrname[XATTR_MAXNAMELEN + 1];
13116 	vfs_context_t ctx = vfs_context_current();
13117 	uio_t auio = NULL;
13118 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13119 	size_t namelen;
13120 	int error;
13121 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13122 
13123 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13124 		return EINVAL;
13125 	}
13126 
13127 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13128 	if (error != 0) {
13129 		if (error == EPERM) {
13130 			/* if the string won't fit in attrname, copyinstr emits EPERM */
13131 			return ENAMETOOLONG;
13132 		}
13133 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13134 		return error;
13135 	}
13136 	if (xattr_protected(attrname) &&
13137 	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13138 		return error;
13139 	}
13140 	if (uap->size != 0 && uap->value == 0) {
13141 		return EINVAL;
13142 	}
13143 	if (uap->size > INT_MAX) {
13144 		return E2BIG;
13145 	}
13146 	if ((error = file_vnode(uap->fd, &vp))) {
13147 		return error;
13148 	}
13149 	if ((error = vnode_getwithref(vp))) {
13150 		file_drop(uap->fd);
13151 		return error;
13152 	}
13153 
13154 #if CONFIG_FILE_LEASES
13155 	vnode_breakdirlease(vp, true, O_WRONLY);
13156 #endif
13157 
13158 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13159 	    &uio_buf[0], sizeof(uio_buf));
13160 	uio_addiov(auio, uap->value, uap->size);
13161 
13162 	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13163 #if CONFIG_FSE
13164 	if (error == 0) {
13165 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
13166 		    FSE_ARG_VNODE, vp,
13167 		    FSE_ARG_DONE);
13168 	}
13169 #endif
13170 	vnode_put(vp);
13171 	file_drop(uap->fd);
13172 	*retval = 0;
13173 	return error;
13174 }
13175 
13176 /*
13177  * Remove an extended attribute.
13178  * XXX Code duplication here.
13179  */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal; reject from user space. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* Copy in the attribute name (NUL-terminated, bounded by XATTR_MAXNAMELEN). */
	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* Protected attribute names may not be removed via this syscall. */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Also look up the parent so its directory lease can be broken below. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory before the write-like operation. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	/* Notify fsevents listeners only on successful removal. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
13230 
13231 /*
13232  * Remove an extended attribute.
13233  * XXX Code duplication here.
13234  */
int
fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	size_t namelen;
	int error;
#if CONFIG_FSE
	vfs_context_t ctx = vfs_context_current();
#endif

	/* XATTR_NOFOLLOW is meaningless on an fd; the other two are kernel-internal. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* Copy in the attribute name (NUL-terminated, bounded by XATTR_MAXNAMELEN). */
	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* Protected attribute names may not be removed via this syscall. */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	/* Resolve the fd to a vnode; file_drop() must balance this on every path. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the containing directory before the write-like op. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
#if CONFIG_FSE
	/* Notify fsevents listeners only on successful removal. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
13282 
13283 /*
13284  * Retrieve the list of extended attribute names.
13285  * XXX Code duplication here.
13286  */
int
listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	u_int32_t nameiflags;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal; reject from user space. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);
	/*
	 * A NULL/zero-length buffer is the "size query" mode: auio stays NULL
	 * and vn_listxattr() only reports the required size via attrsize.
	 */
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->namebuf, uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);

	vnode_put(vp);
	if (auio) {
		/* Return the number of bytes actually copied out. */
		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
	} else {
		/* Size query: return the space needed for the full name list. */
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13327 
13328 /*
13329  * Retrieve the list of extended attribute names.
13330  * XXX Code duplication here.
13331  */
13332 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)13333 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13334 {
13335 	vnode_t vp;
13336 	uio_t auio = NULL;
13337 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13338 	size_t attrsize = 0;
13339 	int error;
13340 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13341 
13342 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13343 		return EINVAL;
13344 	}
13345 
13346 	if ((error = file_vnode(uap->fd, &vp))) {
13347 		return error;
13348 	}
13349 	if ((error = vnode_getwithref(vp))) {
13350 		file_drop(uap->fd);
13351 		return error;
13352 	}
13353 	if (uap->namebuf != 0 && uap->bufsize > 0) {
13354 		auio = uio_createwithbuffer(1, 0, spacetype,
13355 		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
13356 		uio_addiov(auio, uap->namebuf, uap->bufsize);
13357 	}
13358 
13359 	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13360 
13361 	vnode_put(vp);
13362 	file_drop(uap->fd);
13363 	if (auio) {
13364 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13365 	} else {
13366 		*retval = (user_ssize_t)attrsize;
13367 	}
13368 	return error;
13369 }
13370 
13371 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13372 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13373     vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13374 {
13375 	int error;
13376 	struct mount *mp = NULL;
13377 	vnode_t vp;
13378 	int length;
13379 	int bpflags;
13380 	/* maximum number of times to retry build_path */
13381 	unsigned int retries = 0x10;
13382 
13383 	if (bufsize > FSGETPATH_MAXBUFLEN) {
13384 		return EINVAL;
13385 	}
13386 
13387 	if (buf == NULL) {
13388 		return ENOMEM;
13389 	}
13390 
13391 retry:
13392 	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13393 		error = ENOTSUP;  /* unexpected failure */
13394 		return ENOTSUP;
13395 	}
13396 
13397 #if CONFIG_UNION_MOUNTS
13398 unionget:
13399 #endif /* CONFIG_UNION_MOUNTS */
13400 	if (objid == 2) {
13401 		struct vfs_attr vfsattr;
13402 		int use_vfs_root = TRUE;
13403 
13404 		VFSATTR_INIT(&vfsattr);
13405 		VFSATTR_WANTED(&vfsattr, f_capabilities);
13406 		if (!(options & FSOPT_ISREALFSID) &&
13407 		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13408 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13409 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13410 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13411 				use_vfs_root = FALSE;
13412 			}
13413 		}
13414 
13415 		if (use_vfs_root) {
13416 			error = VFS_ROOT(mp, &vp, ctx);
13417 		} else {
13418 			error = VFS_VGET(mp, objid, &vp, ctx);
13419 		}
13420 	} else {
13421 		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13422 	}
13423 
13424 #if CONFIG_UNION_MOUNTS
13425 	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13426 		/*
13427 		 * If the fileid isn't found and we're in a union
13428 		 * mount volume, then see if the fileid is in the
13429 		 * mounted-on volume.
13430 		 */
13431 		struct mount *tmp = mp;
13432 		mp = vnode_mount(tmp->mnt_vnodecovered);
13433 		vfs_unbusy(tmp);
13434 		if (vfs_busy(mp, LK_NOWAIT) == 0) {
13435 			goto unionget;
13436 		}
13437 	} else {
13438 		vfs_unbusy(mp);
13439 	}
13440 #else
13441 	vfs_unbusy(mp);
13442 #endif /* CONFIG_UNION_MOUNTS */
13443 
13444 	if (error) {
13445 		return error;
13446 	}
13447 
13448 #if CONFIG_MACF
13449 	error = mac_vnode_check_fsgetpath(ctx, vp);
13450 	if (error) {
13451 		vnode_put(vp);
13452 		return error;
13453 	}
13454 #endif
13455 
13456 	/* Obtain the absolute path to this vnode. */
13457 	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13458 	if (options & FSOPT_NOFIRMLINKPATH) {
13459 		bpflags |= BUILDPATH_NO_FIRMLINK;
13460 	}
13461 	bpflags |= BUILDPATH_CHECK_MOVED;
13462 	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13463 	vnode_put(vp);
13464 
13465 	if (error) {
13466 		/* there was a race building the path, try a few more times */
13467 		if (error == EAGAIN) {
13468 			--retries;
13469 			if (retries > 0) {
13470 				goto retry;
13471 			}
13472 
13473 			error = ENOENT;
13474 		}
13475 		goto out;
13476 	}
13477 
13478 	AUDIT_ARG(text, buf);
13479 
13480 	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13481 		unsigned long path_words[NUMPARMS];
13482 		size_t path_len = sizeof(path_words);
13483 
13484 		if ((size_t)length < path_len) {
13485 			memcpy((char *)path_words, buf, length);
13486 			memset((char *)path_words + length, 0, path_len - length);
13487 
13488 			path_len = length;
13489 		} else {
13490 			memcpy((char *)path_words, buf + (length - path_len), path_len);
13491 		}
13492 
13493 		kdebug_vfs_lookup(path_words, (int)path_len, vp,
13494 		    KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13495 	}
13496 
13497 	*pathlen = length; /* may be superseded by error */
13498 
13499 out:
13500 	return error;
13501 }
13502 
13503 /*
13504  * Obtain the full pathname of a file system object by id.
13505  */
static int
fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
    uint32_t options, user_ssize_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	fsid_t fsid;
	char *realpath;
	int length;
	int error;

	/* Only these two option bits are supported by this interface. */
	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
		return EINVAL;
	}

	/* Copy in the fsid from user space. */
	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}
	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);
	/* Restrict output buffer size for now. */

	if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
		return EINVAL;
	}
	/* Build the path into a zeroed kernel buffer, then copy it out. */
	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
	if (realpath == NULL) {
		return ENOMEM;
	}

	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
	    options, &length);

	if (error) {
		goto out;
	}

	error = copyout((caddr_t)realpath, buf, length);

	*retval = (user_ssize_t)length; /* may be superseded by error */
out:
	kfree_data(realpath, bufsize);
	return error;
}
13549 
13550 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)13551 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
13552 {
13553 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13554 	           0, retval);
13555 }
13556 
13557 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)13558 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
13559 {
13560 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13561 	           uap->options, retval);
13562 }
13563 
13564 /*
13565  * Common routine to handle various flavors of statfs data heading out
13566  *	to user space.
13567  *
13568  * Returns:	0			Success
13569  *		EFAULT
13570  */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int             error;
	int             my_size, copy_size;

	if (is_64_bit) {
		/* 64-bit user process: fields copy over without narrowing. */
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* The mount may override the reported filesystem type name. */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* partial_copy omits the trailing reserved fields from the copyout. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case.   We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int             shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large.  At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* The mount may override the reported filesystem type name. */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* partial_copy omits the trailing reserved fields from the copyout. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	/* Report the full (un-truncated) structure size to the caller if asked. */
	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
13692 
13693 /*
13694  * copy stat structure into user_stat structure.
13695  */
13696 void
munge_user64_stat(struct stat * sbp,struct user64_stat * usbp)13697 munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
13698 {
13699 	bzero(usbp, sizeof(*usbp));
13700 
13701 	usbp->st_dev = sbp->st_dev;
13702 	usbp->st_ino = sbp->st_ino;
13703 	usbp->st_mode = sbp->st_mode;
13704 	usbp->st_nlink = sbp->st_nlink;
13705 	usbp->st_uid = sbp->st_uid;
13706 	usbp->st_gid = sbp->st_gid;
13707 	usbp->st_rdev = sbp->st_rdev;
13708 #ifndef _POSIX_C_SOURCE
13709 	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
13710 	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
13711 	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
13712 	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
13713 	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
13714 	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
13715 #else
13716 	usbp->st_atime = sbp->st_atime;
13717 	usbp->st_atimensec = sbp->st_atimensec;
13718 	usbp->st_mtime = sbp->st_mtime;
13719 	usbp->st_mtimensec = sbp->st_mtimensec;
13720 	usbp->st_ctime = sbp->st_ctime;
13721 	usbp->st_ctimensec = sbp->st_ctimensec;
13722 #endif
13723 	usbp->st_size = sbp->st_size;
13724 	usbp->st_blocks = sbp->st_blocks;
13725 	usbp->st_blksize = sbp->st_blksize;
13726 	usbp->st_flags = sbp->st_flags;
13727 	usbp->st_gen = sbp->st_gen;
13728 	usbp->st_lspare = sbp->st_lspare;
13729 	usbp->st_qspare[0] = sbp->st_qspare[0];
13730 	usbp->st_qspare[1] = sbp->st_qspare[1];
13731 }
13732 
13733 void
munge_user32_stat(struct stat * sbp,struct user32_stat * usbp)13734 munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
13735 {
13736 	bzero(usbp, sizeof(*usbp));
13737 
13738 	usbp->st_dev = sbp->st_dev;
13739 	usbp->st_ino = sbp->st_ino;
13740 	usbp->st_mode = sbp->st_mode;
13741 	usbp->st_nlink = sbp->st_nlink;
13742 	usbp->st_uid = sbp->st_uid;
13743 	usbp->st_gid = sbp->st_gid;
13744 	usbp->st_rdev = sbp->st_rdev;
13745 #ifndef _POSIX_C_SOURCE
13746 	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
13747 	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
13748 	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
13749 	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
13750 	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
13751 	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
13752 #else
13753 	usbp->st_atime = sbp->st_atime;
13754 	usbp->st_atimensec = sbp->st_atimensec;
13755 	usbp->st_mtime = sbp->st_mtime;
13756 	usbp->st_mtimensec = sbp->st_mtimensec;
13757 	usbp->st_ctime = sbp->st_ctime;
13758 	usbp->st_ctimensec = sbp->st_ctimensec;
13759 #endif
13760 	usbp->st_size = sbp->st_size;
13761 	usbp->st_blocks = sbp->st_blocks;
13762 	usbp->st_blksize = sbp->st_blksize;
13763 	usbp->st_flags = sbp->st_flags;
13764 	usbp->st_gen = sbp->st_gen;
13765 	usbp->st_lspare = sbp->st_lspare;
13766 	usbp->st_qspare[0] = sbp->st_qspare[0];
13767 	usbp->st_qspare[1] = sbp->st_qspare[1];
13768 }
13769 
13770 /*
13771  * copy stat64 structure into user_stat64 structure.
13772  */
13773 void
munge_user64_stat64(struct stat64 * sbp,struct user64_stat64 * usbp)13774 munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
13775 {
13776 	bzero(usbp, sizeof(*usbp));
13777 
13778 	usbp->st_dev = sbp->st_dev;
13779 	usbp->st_ino = sbp->st_ino;
13780 	usbp->st_mode = sbp->st_mode;
13781 	usbp->st_nlink = sbp->st_nlink;
13782 	usbp->st_uid = sbp->st_uid;
13783 	usbp->st_gid = sbp->st_gid;
13784 	usbp->st_rdev = sbp->st_rdev;
13785 #ifndef _POSIX_C_SOURCE
13786 	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
13787 	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
13788 	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
13789 	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
13790 	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
13791 	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
13792 	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
13793 	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
13794 #else
13795 	usbp->st_atime = sbp->st_atime;
13796 	usbp->st_atimensec = sbp->st_atimensec;
13797 	usbp->st_mtime = sbp->st_mtime;
13798 	usbp->st_mtimensec = sbp->st_mtimensec;
13799 	usbp->st_ctime = sbp->st_ctime;
13800 	usbp->st_ctimensec = sbp->st_ctimensec;
13801 	usbp->st_birthtime = sbp->st_birthtime;
13802 	usbp->st_birthtimensec = sbp->st_birthtimensec;
13803 #endif
13804 	usbp->st_size = sbp->st_size;
13805 	usbp->st_blocks = sbp->st_blocks;
13806 	usbp->st_blksize = sbp->st_blksize;
13807 	usbp->st_flags = sbp->st_flags;
13808 	usbp->st_gen = sbp->st_gen;
13809 	usbp->st_lspare = sbp->st_lspare;
13810 	usbp->st_qspare[0] = sbp->st_qspare[0];
13811 	usbp->st_qspare[1] = sbp->st_qspare[1];
13812 }
13813 
13814 void
munge_user32_stat64(struct stat64 * sbp,struct user32_stat64 * usbp)13815 munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
13816 {
13817 	bzero(usbp, sizeof(*usbp));
13818 
13819 	usbp->st_dev = sbp->st_dev;
13820 	usbp->st_ino = sbp->st_ino;
13821 	usbp->st_mode = sbp->st_mode;
13822 	usbp->st_nlink = sbp->st_nlink;
13823 	usbp->st_uid = sbp->st_uid;
13824 	usbp->st_gid = sbp->st_gid;
13825 	usbp->st_rdev = sbp->st_rdev;
13826 #ifndef _POSIX_C_SOURCE
13827 	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
13828 	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
13829 	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
13830 	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
13831 	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
13832 	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
13833 	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
13834 	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
13835 #else
13836 	usbp->st_atime = sbp->st_atime;
13837 	usbp->st_atimensec = sbp->st_atimensec;
13838 	usbp->st_mtime = sbp->st_mtime;
13839 	usbp->st_mtimensec = sbp->st_mtimensec;
13840 	usbp->st_ctime = sbp->st_ctime;
13841 	usbp->st_ctimensec = sbp->st_ctimensec;
13842 	usbp->st_birthtime = sbp->st_birthtime;
13843 	usbp->st_birthtimensec = sbp->st_birthtimensec;
13844 #endif
13845 	usbp->st_size = sbp->st_size;
13846 	usbp->st_blocks = sbp->st_blocks;
13847 	usbp->st_blksize = sbp->st_blksize;
13848 	usbp->st_flags = sbp->st_flags;
13849 	usbp->st_gen = sbp->st_gen;
13850 	usbp->st_lspare = sbp->st_lspare;
13851 	usbp->st_qspare[0] = sbp->st_qspare[0];
13852 	usbp->st_qspare[1] = sbp->st_qspare[1];
13853 }
13854 
13855 /*
13856  * Purge buffer cache for simulating cold starts
13857  */
13858 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)13859 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
13860 {
13861 	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
13862 
13863 	return VNODE_RETURNED;
13864 }
13865 
13866 static int
vfs_purge_callback(mount_t mp,__unused void * arg)13867 vfs_purge_callback(mount_t mp, __unused void * arg)
13868 {
13869 	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
13870 
13871 	return VFS_RETURNED;
13872 }
13873 
13874 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)13875 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
13876 {
13877 	if (!kauth_cred_issuser(kauth_cred_get())) {
13878 		return EPERM;
13879 	}
13880 
13881 	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
13882 
13883 	return 0;
13884 }
13885 
13886 /*
13887  * gets the vnode associated with the (unnamed) snapshot directory
13888  * for a Filesystem. The snapshot directory vnode is returned with
13889  * an iocount on it.
13890  */
13891 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)13892 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
13893 {
13894 	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
13895 }
13896 
13897 /*
13898  * Get the snapshot vnode.
13899  *
13900  * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
13901  * needs nameidone() on ndp.
13902  *
13903  * If the snapshot vnode exists it is returned in ndp->ni_vp.
13904  *
13905  * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
13906  * not needed.
13907  */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Pre-NULL the outputs so the error path can unwind unconditionally. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* dirfd must refer to a volume root. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Get the (unnamed) snapshot directory; returns with an iocount. */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; snapshot names must be single path components. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC hooks exist only for the create/delete operations. */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On error, release both iocounts and NULL the outputs for the caller. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
14010 
14011 /*
14012  * create a filesystem snapshot (for supporting filesystems)
14013  *
14014  * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
14015  * We get to the (unnamed) snapshot directory vnode and create the vnode
14016  * for the snapshot in it.
14017  *
14018  * Restrictions:
14019  *
14020  *    a) Passed in name for snapshot cannot have slashes.
14021  *    b) name can't be "." or ".."
14022  *
14023  * Since this requires superuser privileges, vnode_authorize calls are not
14024  * made.
14025  */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* Heap-allocate the nameidata to keep this frame small. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* On success: iocounts held on rvp and snapdvp, nameidone(ndp) owed. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* The snapshot name already exists (O_EXCL semantics). */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		/* Snapshots are created as mode-0 regular files. */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* Superuser-only path: skip authorization and ACL inheritance. */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14072 
14073 /*
14074  * Delete a Filesystem snapshot
14075  *
14076  * get the vnode for the unnamed snapshot directory and the snapshot and
14077  * delete the snapshot.
14078  */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* Heap-allocate the nameidata to keep this frame small. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* On success: iocounts held on rvp, snapdvp and ndp->ni_vp (the snapshot). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Remove the snapshot file without generating a namespace event. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14107 
14108 /*
14109  * Revert a filesystem to a snapshot
14110  *
14111  * Marks the filesystem to revert to the given snapshot on next mount.
14112  */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy in the snapshot name. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Build a componentname that wraps the copied-in snapshot name. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/* Look up the snapshot vnode itself and issue the ioctl on it. */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
14196 
14197 /*
14198  * rename a Filesystem snapshot
14199  *
14200  * get the vnode for the unnamed snapshot directory and the snapshot and
14201  * rename the snapshot. This is a very specialised (and simple) case of
14202  * rename(2) (which has to deal with a lot more complications). It differs
14203  * slightly from rename(2) in that EEXIST is returned if the new name exists.
14204  */
14205 static int __attribute__((noinline))
snapshot_rename(int dirfd,user_addr_t old,user_addr_t new,__unused uint32_t flags,vfs_context_t ctx)14206 snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
14207     __unused uint32_t flags, vfs_context_t ctx)
14208 {
14209 	vnode_t rvp, snapdvp;
14210 	int error, i;
14211 	caddr_t newname_buf;
14212 	size_t name_len;
14213 	vnode_t fvp;
14214 	struct nameidata *fromnd, *tond;
14215 	/* carving out a chunk for structs that are too big to be on stack. */
14216 	struct {
14217 		struct nameidata from_node;
14218 		struct nameidata to_node;
14219 	} * __rename_data;
14220 
14221 	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
14222 	fromnd = &__rename_data->from_node;
14223 	tond = &__rename_data->to_node;
14224 
14225 	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
14226 	    OP_UNLINK, ctx);
14227 	if (error) {
14228 		goto out;
14229 	}
14230 	fvp  = fromnd->ni_vp;
14231 
14232 	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14233 	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
14234 	if (error) {
14235 		goto out1;
14236 	}
14237 
14238 	/*
14239 	 * Some sanity checks- new name can't be empty, "." or ".." or have
14240 	 * slashes.
14241 	 * (the length returned by copyinstr includes the terminating NUL)
14242 	 *
14243 	 * The FS rename VNOP is suppossed to handle this but we'll pick it
14244 	 * off here itself.
14245 	 */
14246 	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
14247 	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
14248 		error = EINVAL;
14249 		goto out1;
14250 	}
14251 	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
14252 		;
14253 	}
14254 	if (i < (int)name_len) {
14255 		error = EINVAL;
14256 		goto out1;
14257 	}
14258 
14259 #if CONFIG_MACF
14260 	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
14261 	    newname_buf);
14262 	if (error) {
14263 		goto out1;
14264 	}
14265 #endif
14266 
14267 	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
14268 	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
14269 	tond->ni_dvp = snapdvp;
14270 
14271 	error = namei(tond);
14272 	if (error) {
14273 		goto out2;
14274 	} else if (tond->ni_vp) {
14275 		/*
14276 		 * snapshot rename behaves differently than rename(2) - if the
14277 		 * new name exists, EEXIST is returned.
14278 		 */
14279 		vnode_put(tond->ni_vp);
14280 		error = EEXIST;
14281 		goto out2;
14282 	}
14283 
14284 	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
14285 	    &tond->ni_cnd, ctx);
14286 
14287 out2:
14288 	nameidone(tond);
14289 out1:
14290 	zfree(ZV_NAMEI, newname_buf);
14291 	vnode_put(fvp);
14292 	vnode_put(snapdvp);
14293 	vnode_put(rvp);
14294 	nameidone(fromnd);
14295 out:
14296 	kfree_type(typeof(*__rename_data), __rename_data);
14297 	return error;
14298 }
14299 
14300 /*
14301  * Mount a Filesystem snapshot
14302  *
14303  * get the vnode for the unnamed snapshot directory and the snapshot and
14304  * mount the snapshot.
14305  */
14306 static int __attribute__((noinline))
snapshot_mount(int dirfd,user_addr_t name,user_addr_t directory,__unused user_addr_t mnt_data,__unused uint32_t flags,vfs_context_t ctx)14307 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
14308     __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
14309 {
14310 	mount_t mp;
14311 	vnode_t rvp, snapdvp, snapvp, vp, pvp;
14312 	struct fs_snapshot_mount_args smnt_data;
14313 	int error;
14314 	struct nameidata *snapndp, *dirndp;
14315 	/* carving out a chunk for structs that are too big to be on stack. */
14316 	struct {
14317 		struct nameidata snapnd;
14318 		struct nameidata dirnd;
14319 	} * __snapshot_mount_data;
14320 
14321 	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
14322 	snapndp = &__snapshot_mount_data->snapnd;
14323 	dirndp = &__snapshot_mount_data->dirnd;
14324 
14325 	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
14326 	    OP_LOOKUP, ctx);
14327 	if (error) {
14328 		goto out;
14329 	}
14330 
14331 	snapvp  = snapndp->ni_vp;
14332 	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
14333 		error = EIO;
14334 		goto out1;
14335 	}
14336 
14337 	/* Get the vnode to be covered */
14338 	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
14339 	    UIO_USERSPACE, directory, ctx);
14340 	error = namei(dirndp);
14341 	if (error) {
14342 		goto out1;
14343 	}
14344 
14345 	vp = dirndp->ni_vp;
14346 	pvp = dirndp->ni_dvp;
14347 	mp = vnode_mount(rvp);
14348 
14349 	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
14350 		error = EINVAL;
14351 		goto out2;
14352 	}
14353 
14354 #if CONFIG_MACF
14355 	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
14356 	    mp->mnt_vfsstat.f_fstypename);
14357 	if (error) {
14358 		goto out2;
14359 	}
14360 #endif
14361 
14362 	smnt_data.sm_mp  = mp;
14363 	smnt_data.sm_cnp = &snapndp->ni_cnd;
14364 	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
14365 	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
14366 	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);
14367 
14368 out2:
14369 	vnode_put(vp);
14370 	vnode_put(pvp);
14371 	nameidone(dirndp);
14372 out1:
14373 	vnode_put(snapvp);
14374 	vnode_put(snapdvp);
14375 	vnode_put(rvp);
14376 	nameidone(snapndp);
14377 out:
14378 	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
14379 	return error;
14380 }
14381 
14382 /*
14383  * Root from a snapshot of the filesystem
14384  *
14385  * Marks the filesystem to root from the given snapshot on next boot.
14386  */
14387 static int __attribute__((noinline))
snapshot_root(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14388 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
14389     vfs_context_t ctx)
14390 {
14391 	int error;
14392 	vnode_t rvp;
14393 	mount_t mp;
14394 	struct fs_snapshot_root_args root_data;
14395 	struct componentname cnp;
14396 	caddr_t name_buf;
14397 	size_t name_len;
14398 
14399 	error = vnode_getfromfd(ctx, dirfd, &rvp);
14400 	if (error) {
14401 		return error;
14402 	}
14403 	mp = vnode_mount(rvp);
14404 
14405 	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14406 	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14407 	if (error) {
14408 		zfree(ZV_NAMEI, name_buf);
14409 		vnode_put(rvp);
14410 		return error;
14411 	}
14412 
14413 	// XXX MAC checks ?
14414 
14415 	/*
14416 	 * Grab mount_iterref so that we can release the vnode,
14417 	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
14418 	 */
14419 	error = mount_iterref(mp, 0);
14420 	vnode_put(rvp);
14421 	if (error) {
14422 		zfree(ZV_NAMEI, name_buf);
14423 		return error;
14424 	}
14425 
14426 	memset(&cnp, 0, sizeof(cnp));
14427 	cnp.cn_pnbuf = (char *)name_buf;
14428 	cnp.cn_nameiop = LOOKUP;
14429 	cnp.cn_flags = ISLASTCN | HASBUF;
14430 	cnp.cn_pnlen = MAXPATHLEN;
14431 	cnp.cn_nameptr = cnp.cn_pnbuf;
14432 	cnp.cn_namelen = (int)name_len;
14433 	root_data.sr_cnp = &cnp;
14434 
14435 	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
14436 
14437 	mount_iterdrop(mp);
14438 	zfree(ZV_NAMEI, name_buf);
14439 
14440 	return error;
14441 }
14442 
14443 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14444 vfs_context_can_snapshot(vfs_context_t ctx)
14445 {
14446 	static const char * const snapshot_entitlements[] = {
14447 		"com.apple.private.vfs.snapshot",
14448 		"com.apple.developer.vfs.snapshot",
14449 		"com.apple.private.apfs.arv.limited.snapshot",
14450 	};
14451 	static const size_t nentitlements =
14452 	    sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14453 	size_t i;
14454 
14455 	task_t task = vfs_context_task(ctx);
14456 	for (i = 0; i < nentitlements; i++) {
14457 		if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14458 			return TRUE;
14459 		}
14460 	}
14461 	return FALSE;
14462 }
14463 
14464 /*
14465  * FS snapshot operations dispatcher
14466  */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* Every snapshot operation requires one of the snapshot entitlements. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			/* No device vnode: look up the mounted-from name instead. */
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Permit the operation if any of: caller is superuser, caller
		 * is authorized to write the backing device vnode, or caller
		 * holds the user snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		/* Drop both iocounts regardless of the authorization outcome. */
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		/* name1 = old snapshot name, name2 = new snapshot name */
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		/* name1 = snapshot name, name2 = directory to cover */
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14555