xref: /xnu-8796.121.2/bsd/vfs/vfs_syscalls.c (revision c54f35ca767986246321eb901baf8f5ff7923f6a)
1 /*
2  * Copyright (c) 1995-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1989, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  * (c) UNIX System Laboratories, Inc.
32  * All or some portions of this file are derived from material licensed
33  * to the University of California by American Telephone and Telegraph
34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35  * the permission of UNIX System Laboratories, Inc.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.41 (Berkeley) 6/15/95
66  */
67 /*
68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69  * support for mandatory and extensible security protections.  This notice
70  * is included in support of clause 2.2 (b) of the Apple Public License,
71  * Version 2.0.
72  */
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112 
113 #include <vfs/vfs_disk_conditioner.h>
114 
115 #include <security/audit/audit.h>
116 #include <bsm/audit_kevents.h>
117 
118 #include <mach/mach_types.h>
119 #include <kern/kern_types.h>
120 #include <kern/kalloc.h>
121 #include <kern/task.h>
122 
123 #include <vm/vm_pageout.h>
124 #include <vm/vm_protos.h>
125 
126 #include <libkern/OSAtomic.h>
127 #include <os/atomic_private.h>
128 #include <pexpert/pexpert.h>
129 #include <IOKit/IOBSD.h>
130 
131 // deps for MIG call
132 #include <kern/host.h>
133 #include <kern/ipc_misc.h>
134 #include <mach/host_priv.h>
135 #include <mach/vfs_nspace.h>
136 #include <os/log.h>
137 
138 #include <nfs/nfs_conf.h>
139 
140 #if ROUTEFS
141 #include <miscfs/routefs/routefs.h>
142 #endif /* ROUTEFS */
143 
144 #if CONFIG_MACF
145 #include <security/mac.h>
146 #include <security/mac_framework.h>
147 #endif
148 
149 #if CONFIG_FSE
150 #define GET_PATH(x) \
151 	((x) = get_pathbuff())
152 #define RELEASE_PATH(x) \
153 	release_pathbuff(x)
154 #else
155 #define GET_PATH(x)     \
156 	((x) = zalloc(ZV_NAMEI))
157 #define RELEASE_PATH(x) \
158 	zfree(ZV_NAMEI, x)
159 #endif /* CONFIG_FSE */
160 
161 #ifndef HFS_GET_BOOT_INFO
162 #define HFS_GET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00004)
163 #endif
164 
165 #ifndef HFS_SET_BOOT_INFO
166 #define HFS_SET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00005)
167 #endif
168 
169 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
170 #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
171 #endif
172 
173 extern void disk_conditioner_unmount(mount_t mp);
174 
175 /* struct for checkdirs iteration */
176 struct cdirargs {
177 	vnode_t olddp;
178 	vnode_t newdp;
179 };
180 /* callback  for checkdirs iteration */
181 static int checkdirs_callback(proc_t p, void * arg);
182 
183 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
184 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
185 void enablequotas(struct mount *mp, vfs_context_t ctx);
186 static int getfsstat_callback(mount_t mp, void * arg);
187 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
188 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
189 static int sync_callback(mount_t, void *);
190 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
191     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
192     boolean_t partial_copy);
193 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
194 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
195     struct componentname *cnp, user_addr_t fsmountargs,
196     int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
197 void vfs_notify_mount(vnode_t pdvp);
198 
199 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
200 
201 struct fd_vn_data * fg_vn_data_alloc(void);
202 
203 /*
204  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
205  * Concurrent lookups (or lookups by ids) on hard links can cause the
206  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
207  * does) to return ENOENT as the path cannot be returned from the name cache
208  * alone. We have no option but to retry and hope to get one namei->reverse path
209  * generation done without an intervening lookup, lookup by id on the hard link
210  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
211  * which currently are the MAC hooks for rename, unlink and rmdir.
212  */
213 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
214 
215 /* Max retry limit for rename due to vnode recycling. */
216 #define MAX_RENAME_ERECYCLE_RETRIES 1024
217 
218 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
219     int unlink_flags);
220 
221 #ifdef CONFIG_IMGSRC_ACCESS
222 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
223 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
224 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
225 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
226 static void mount_end_update(mount_t mp);
227 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
228 #endif /* CONFIG_IMGSRC_ACCESS */
229 
230 //snapshot functions
231 #if CONFIG_MNT_ROOTSNAP
232 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
233 #else
234 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
235 #endif
236 
237 __private_extern__
238 int sync_internal(void);
239 
240 __private_extern__
241 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
242 
243 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
244 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
245 
246 /* vars for sync mutex */
247 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
248 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
249 
250 extern lck_rw_t rootvnode_rw_lock;
251 
252 VFS_SMR_DECLARE;
253 extern uint32_t nc_smr_enabled;
254 
255 /*
256  * incremented each time a mount or unmount operation occurs
257  * used to invalidate the cached value of the rootvp in the
258  * mount structure utilized by cache_lookup_path
259  */
260 uint32_t mount_generation = 0;
261 
262 /* counts number of mount and unmount operations */
263 unsigned int vfs_nummntops = 0;
264 
265 /* system-wide, per-boot unique mount ID */
266 static _Atomic uint64_t mount_unique_id = 1;
267 
268 extern const struct fileops vnops;
269 #if CONFIG_APPLEDOUBLE
270 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
271 #endif /* CONFIG_APPLEDOUBLE */
272 
273 /* Maximum buffer length supported by fsgetpath(2) */
274 #define FSGETPATH_MAXBUFLEN  8192
275 
276 /*
277  * Virtual File System System Calls
278  */
279 
280 /*
281  * Private in-kernel mounting spi (specific use-cases only)
282  */
283 boolean_t
vfs_iskernelmount(mount_t mp)284 vfs_iskernelmount(mount_t mp)
285 {
286 	return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
287 }
288 
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	/*
	 * In-kernel mount entry point: mount filesystem `fstype` on `path`.
	 * If the caller supplies `vp` (and `pvp`) they are used directly and
	 * their iocounts remain owned by the caller; otherwise `path` is
	 * resolved here and the resulting iocounts are dropped on the way out.
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Strip any kernel-mount flag bits callers may not pass in. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Only log for snapshot/role-based mounts, which expect the path to exist. */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		char *pnbuf = CAST_DOWN(char *, path);

		/* Caller supplied the vnodes; hand-build the componentname for mount_common(). */
		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	/* Tag this as a kernel-initiated mount for mount_common(). */
	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Release the iocounts taken by namei() above; caller-supplied vnodes are untouched. */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
338 
339 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)340 vfs_mount_at_path(const char *fstype, const char *path,
341     vnode_t pvp, vnode_t vp, void *data, size_t datalen,
342     int mnt_flags, int flags)
343 {
344 	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
345 	int error, km_flags = 0;
346 	vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
347 
348 	/*
349 	 * This call is currently restricted to specific use cases.
350 	 */
351 	if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
352 		return ENOTSUP;
353 	}
354 
355 #if !defined(XNU_TARGET_OS_OSX)
356 	if (strcmp(fstype, "lifs") == 0) {
357 		syscall_flags |= MNT_NOEXEC;
358 	}
359 #endif
360 
361 	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
362 		km_flags |= KERNEL_MOUNT_NOAUTH;
363 	}
364 	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
365 		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
366 	}
367 
368 	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
369 	    syscall_flags, km_flags, ctx);
370 	if (error) {
371 		printf("%s: mount on %s failed, error %d\n", __func__, path,
372 		    error);
373 	}
374 
375 	return error;
376 }
377 
378 int
vfs_mount_override_type_name(mount_t mp,const char * name)379 vfs_mount_override_type_name(mount_t mp, const char *name)
380 {
381 	if (mp == NULL || name == NULL) {
382 		return EINVAL;
383 	}
384 
385 	/* Override the FS type name. */
386 	mount_lock_spin(mp);
387 	strlcpy(mp->fstypename_override, name, sizeof(mp->fstypename_override));
388 	mp->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
389 	mount_unlock(mp);
390 
391 	return 0;
392 }
393 
394 /*
395  * Mount a file system.
396  */
397 /* ARGSUSED */
398 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)399 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
400 {
401 	struct __mac_mount_args muap;
402 
403 	muap.type = uap->type;
404 	muap.path = uap->path;
405 	muap.flags = uap->flags;
406 	muap.data = uap->data;
407 	muap.mac_p = USER_ADDR_NULL;
408 	return __mac_mount(p, &muap, retval);
409 }
410 
411 int
fmount(__unused proc_t p,struct fmount_args * uap,__unused int32_t * retval)412 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
413 {
414 	struct componentname    cn;
415 	vfs_context_t           ctx = vfs_context_current();
416 	size_t                  dummy = 0;
417 	int                     error;
418 	int                     flags = uap->flags;
419 	char                    fstypename[MFSNAMELEN];
420 	char                    *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
421 	vnode_t                 pvp;
422 	vnode_t                 vp;
423 
424 	AUDIT_ARG(fd, uap->fd);
425 	AUDIT_ARG(fflags, flags);
426 	/* fstypename will get audited by mount_common */
427 
428 	/* Sanity check the flags */
429 	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
430 		return ENOTSUP;
431 	}
432 
433 	if (flags & MNT_UNION) {
434 		return EPERM;
435 	}
436 
437 	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
438 	if (error) {
439 		return error;
440 	}
441 
442 	if ((error = file_vnode(uap->fd, &vp)) != 0) {
443 		return error;
444 	}
445 
446 	if ((error = vnode_getwithref(vp)) != 0) {
447 		file_drop(uap->fd);
448 		return error;
449 	}
450 
451 	pvp = vnode_getparent(vp);
452 	if (pvp == NULL) {
453 		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
454 			error = EBUSY;
455 		} else {
456 			error = EINVAL;
457 		}
458 		vnode_put(vp);
459 		file_drop(uap->fd);
460 		return error;
461 	}
462 
463 	memset(&cn, 0, sizeof(struct componentname));
464 	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
465 	cn.cn_pnlen = MAXPATHLEN;
466 
467 	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
468 		zfree(ZV_NAMEI, cn.cn_pnbuf);
469 		vnode_put(pvp);
470 		vnode_put(vp);
471 		file_drop(uap->fd);
472 		return error;
473 	}
474 
475 	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);
476 
477 	zfree(ZV_NAMEI, cn.cn_pnbuf);
478 	vnode_put(pvp);
479 	vnode_put(vp);
480 	file_drop(uap->fd);
481 
482 	return error;
483 }
484 
485 #define MAX_GRAFT_METADATA_SIZE             16384 /* bytes */
486 
487 /*
488  * Get the size of a graft file (a manifest or payload file).
489  * The vp should be an iocounted vnode.
490  */
491 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)492 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
493 {
494 	struct stat64 sb = {};
495 	int error;
496 
497 	*size = 0;
498 
499 	error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
500 	if (error) {
501 		return error;
502 	}
503 
504 	if (sb.st_size == 0) {
505 		error = ENODATA;
506 	} else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
507 		error = EFBIG;
508 	} else {
509 		*size = (size_t) sb.st_size;
510 	}
511 
512 	return error;
513 }
514 
515 /*
516  * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
517  * `size` must already be validated.
518  */
519 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)520 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
521 {
522 	return vn_rdwr(UIO_READ, graft_vp,
523 	           (caddr_t) buf, (int) size, /* offset */ 0,
524 	           UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
525 	           vfs_context_ucred(vctx), /* resid */ NULL,
526 	           vfs_context_proc(vctx));
527 }
528 
529 /*
530  * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
531  * and read it into `buf`.
532  */
533 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,size_t * size,void * buf)534 graft_secureboot_read_fd(int fd, vfs_context_t vctx, size_t *size, void *buf)
535 {
536 	vnode_t metadata_vp = NULLVP;
537 	int error;
538 
539 	// Convert this graft fd to a vnode.
540 	if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
541 		goto out;
542 	}
543 
544 	// Get (and validate) size information.
545 	if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
546 		goto out;
547 	}
548 
549 	// Read each file into the provided buffer - we must get the expected amount of bytes.
550 	if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
551 		goto out;
552 	}
553 
554 out:
555 	if (metadata_vp) {
556 		vnode_put(metadata_vp);
557 		metadata_vp = NULLVP;
558 	}
559 
560 	return error;
561 }
562 
563 /*
564  * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
565  * provided in `gfs`, saving the size of data read in `gfs`.
566  */
567 static int
graft_secureboot_read_metadata(secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)568 graft_secureboot_read_metadata(secure_boot_cryptex_args_t *sbc_args, vfs_context_t vctx,
569     fsioc_graft_fs_t *gfs)
570 {
571 	int error;
572 
573 	// Read the authentic manifest.
574 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
575 	    &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
576 		return error;
577 	}
578 
579 	// The user manifest is currently unused, but set its size.
580 	gfs->user_manifest_size = 0;
581 
582 	// Read the payload.
583 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
584 	    &gfs->payload_size, gfs->payload))) {
585 		return error;
586 	}
587 
588 	return 0;
589 }
590 
591 /*
592  * Call into the filesystem to verify and graft a cryptex.
593  */
static int
graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
{
	fsioc_graft_fs_t gfs = {};
	uint64_t graft_dir_ino = 0;
	struct stat64 sb = {};
	int error;

	// Pre-flight arguments.
	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
		// Make sure that this graft version matches what we support.
		return ENOTSUP;
	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
		// For this type, cryptex VP must live on same volume as the target of graft.
		return EXDEV;
	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
		// We cannot graft upon non-directories.
		return ENOTDIR;
	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
	    sbc_args->sbc_payload_fd < 0) {
		// We cannot graft without a manifest and payload.
		return EINVAL;
	}

	if (mounton_vp) {
		// Get the mounton's inode number.
		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
		if (error) {
			return error;
		}
		graft_dir_ino = (uint64_t) sb.st_ino;
	}

	// Create buffers (of our maximum-defined size) to store authentication info.
	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);

	if (!gfs.authentic_manifest || !gfs.payload) {
		error = ENOMEM;
		goto out;
	}

	// Read our fd's into our buffers.
	// (Note that this will set the buffer size fields in `gfs`.)
	error = graft_secureboot_read_metadata(sbc_args, vctx, &gfs);
	if (error) {
		goto out;
	}

	// Fill in the graft request proper, translating each SBC_* flag bit
	// supplied by userspace into its FSCTL_GRAFT_* counterpart.
	gfs.graft_version = FSIOC_GRAFT_VERSION;
	gfs.graft_type = graft_type;
	gfs.graft_4cc = sbc_args->sbc_4cc;
	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
	}
	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
	}
	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
	}
	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
	}
	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
	}
	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
	}
	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)

	// Call into the FS to perform the graft (and validation).
	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);

out:
	// Release the metadata buffers on every path (kfree_data is guarded
	// so a NULL from a failed allocation is never freed).
	if (gfs.authentic_manifest) {
		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
		gfs.authentic_manifest = NULL;
	}
	if (gfs.payload) {
		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
		gfs.payload = NULL;
	}

	return error;
}
682 
683 #define GRAFTDMG_ENTITLEMENT  "com.apple.private.vfs.graftdmg"
684 
685 /*
686  * Graft a cryptex disk image (via FD) onto the appropriate mount-point
687  * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
688  */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	// Grafting is gated on a private entitlement.
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	// Copy in the union of graft arguments before interpreting it.
	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(&nd);
		if (error) {
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		goto graftout;
	}

	// Valid graft types are 1..GRAFTDMG_CRYPTEX_DOWNLEVEL inclusive.
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_DOWNLEVEL) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
	}

graftout:
	// Drop the iocounts taken above; nameidone only if namei() ran
	// (i.e. a mount directory was supplied).
	if (cryptex_vp) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(mounton_vp);
		mounton_vp = NULLVP;
	}
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
756 
757 /*
758  * Ungraft a cryptex disk image (via mount dir FD)
759  * { int ungraftdmg(const char *mountdir, uint64_t flags); }
760  */
761 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)762 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
763 {
764 	int error = 0;
765 	user_addr_t ua_mountdir = uap->mountdir;
766 	fsioc_ungraft_fs_t ugfs;
767 	vnode_t mounton_vp = NULLVP;
768 	struct nameidata nd = {};
769 	vfs_context_t ctx = vfs_context_current();
770 
771 	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
772 		return EPERM;
773 	}
774 
775 	if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
776 		return EINVAL;
777 	}
778 
779 	ugfs.ungraft_flags = 0;
780 
781 	// Acquire vnode for mount-on path
782 	NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
783 	    UIO_USERSPACE, ua_mountdir, ctx);
784 
785 	error = namei(&nd);
786 	if (error) {
787 		return error;
788 	}
789 	mounton_vp = nd.ni_vp;
790 
791 	// Call into the FS to perform the ungraft
792 	error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
793 
794 	vnode_put(mounton_vp);
795 	nameidone(&nd);
796 
797 	return error;
798 }
799 
800 
void
vfs_notify_mount(vnode_t pdvp)
{
	/*
	 * Announce a new mount: broadcast a VQ_MOUNT vfs event, then post
	 * NOTE_WRITE on the covering vnode's parent so directory watchers
	 * observe the namespace change.
	 */
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
807 
808 /*
809  * __mac_mount:
810  *	Mount a file system taking into account MAC label behavior.
811  *	See mount(2) man page for more information
812  *
813  * Parameters:    p                        Process requesting the mount
814  *                uap                      User argument descriptor (see below)
815  *                retval                   (ignored)
816  *
817  * Indirect:      uap->type                Filesystem type
818  *                uap->path                Path to mount
819  *                uap->data                Mount arguments
820  *                uap->mac_p               MAC info
821  *                uap->flags               Mount flags
822  *
823  *
824  * Returns:        0                       Success
825  *                !0                       Not success
826  */
827 boolean_t root_fs_upgrade_try = FALSE;
828 
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;
	vnode_t vp = NULL;
	int need_nameidone = 0;
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;       /* MAC label copied in from user space, if any */
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		/* MNT_NOFOLLOW: refuse to traverse any symlink in the path */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* The user_mac layout differs between 32- and 64-bit callers. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Label must be at least one char + NUL and within the cap. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	/* Union mounts are disabled in this configuration. */
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Special handling when the mount target is the root of the root FS. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mouting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/* kfree_data is NULL-safe: labelstr may never have been allocated. */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	/* Release the namei iocounts/state if the lookup succeeded. */
	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
989 
990 /*
991  * common mount implementation (final stage of mounting)
992  *
993  * Arguments:
 *  fstypename	file system type (i.e. its VFS name)
995  *  pvp		parent of covered vnode
996  *  vp		covered vnode
997  *  cnp		component name (ie path) of covered vnode
998  *  flags	generic mount flags
999  *  fsmountargs	file system specific data
1000  *  labelstr	optional MAC label
1001  *  kernelmount	TRUE for mounts initiated from inside the kernel
1002  *  ctx		caller's context
1003  */
static int
mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
    struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
    char *labelstr, vfs_context_t ctx)
{
#if !CONFIG_MACF
#pragma unused(labelstr)
#endif
	struct vnode *devvp = NULLVP;
	struct vnode *device_vnode = NULLVP;
#if CONFIG_MACF
	struct vnode *rvp;
#endif
	struct mount *mp = NULL;
	struct vfstable *vfsp = (struct vfstable *)0;
	struct proc *p = vfs_context_proc(ctx);
	int error, flag = 0;
	bool flag_set = false;
	user_addr_t devpath = USER_ADDR_NULL;
	int ronly = 0;
	int mntalloc = 0;
	boolean_t vfsp_ref = FALSE;
	boolean_t is_rwlock_locked = FALSE;
	boolean_t did_rele = FALSE;
	boolean_t have_usecount = FALSE;
	boolean_t did_set_lmount = FALSE;
	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
	/* Check for mutually-exclusive flag bits */
	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
	int bitcount = 0;
	/* Kernighan popcount: each pass clears the lowest set bit. */
	while (checkflags != 0) {
		checkflags &= (checkflags - 1);
		bitcount++;
	}

	if (bitcount > 1) {
		//not allowed to request multiple mount-by-role flags
		error = EINVAL;
		goto out1;
	}
#endif

	/*
	 * Process an update for an existing mount
	 */
	if (flags & MNT_UPDATE) {
		/* Updates are only permitted on the root vnode of a mount. */
		if ((vp->v_flag & VROOT) == 0) {
			error = EINVAL;
			goto out1;
		}
		mp = vp->v_mount;

		/* if unmount or mount in progress, return error */
		mount_lock_spin(mp);
		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
			mount_unlock(mp);
			error = EBUSY;
			goto out1;
		}
		/*
		 * Claim the mount for this update; MNT_LMOUNT is cleared
		 * again on every exit path (see did_set_lmount).
		 */
		mp->mnt_lflag |= MNT_LMOUNT;
		did_set_lmount = TRUE;
		mount_unlock(mp);
		lck_rw_lock_exclusive(&mp->mnt_rwlock);
		is_rwlock_locked = TRUE;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * If content protection is enabled, update mounts are not
		 * allowed to turn it off.
		 */
		if ((mp->mnt_flag & MNT_CPROTECT) &&
		    ((flags & MNT_CPROTECT) == 0)) {
			error = EINVAL;
			goto out1;
		}

		/*
		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
		 * failure to return an error for this so we'll just silently
		 * add it if it is not passed in.
		 */
		if ((mp->mnt_flag & MNT_REMOVABLE) &&
		    ((flags & MNT_REMOVABLE) == 0)) {
			flags |= MNT_REMOVABLE;
		}

		/* Can't downgrade the backer of the root FS */
		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
			goto out1;
		}
#if CONFIG_MACF
		error = mac_mount_check_remount(ctx, mp);
		if (error != 0) {
			goto out1;
		}
#endif
		/*
		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
		 */
		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
			flags |= MNT_NOSUID | MNT_NODEV;
			if (mp->mnt_flag & MNT_NOEXEC) {
				flags |= MNT_NOEXEC;
			}
		}
		/*
		 * Snapshot the pre-update flags so they can be restored
		 * if the update fails (see "mp->mnt_flag = flag" below
		 * and in out1).
		 */
		flag = mp->mnt_flag;
		flag_set = true;



		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);

		vfsp = mp->mnt_vtable;
		goto update;
	} // MNT_UPDATE

	/*
	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
	 */
	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
		flags |= MNT_NOSUID | MNT_NODEV;
		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
			flags |= MNT_NOEXEC;
		}
	}

	/* XXXAUDIT: Should we capture the type on the error path as well? */
	/* XXX cast-away const (audit_arg_text() does not modify its input) */
	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
	/*
	 * Look up the filesystem type by name and take a reference on its
	 * vfstable entry so it cannot be unregistered while we mount.
	 */
	mount_list_lock();
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
			vfsp->vfc_refcount++;
			vfsp_ref = TRUE;
			break;
		}
	}
	mount_list_unlock();
	if (vfsp == NULL) {
		error = ENODEV;
		goto out1;
	}

	/*
	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
	 * except in ROSV configs and for the initial BaseSystem root.
	 */
	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
		error = EINVAL;  /* unsupported request */
		goto out1;
	}

	/* Flushes the covered vnode and marks it VMOUNT (claims it for us). */
	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
	if (error != 0) {
		goto out1;
	}

	/*
	 * Allocate and initialize the filesystem (mount_t)
	 */
	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	mntalloc = 1;

	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;

	/* Mark the mount in progress; cleared on all exit paths. */
	mp->mnt_lflag |= MNT_LMOUNT;
	did_set_lmount = TRUE;

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);
	mount_lock_init(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	is_rwlock_locked = TRUE;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vtable = vfsp;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	/*
	 * Record the mount-on path; fall back to the lookup's pathname
	 * buffer if the vnode's path can't be reconstructed.
	 */
	do {
		size_t pathlen = MAXPATHLEN;

		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
		}
	} while (0);
	mp->mnt_vnodecovered = vp;
	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;
	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);

	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);

	if (kernelmount) {
		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
	}
	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
	}

	if (KERNEL_MOUNT_DEVFS & internal_flags) {
		// kernel mounted devfs
		mp->mnt_kern_flag |= MNTK_SYSTEM;
	}

/*
 * Common path for both fresh mounts and MNT_UPDATE: at this point mp is
 * either the newly allocated mount or the existing one being updated,
 * and mnt_rwlock is held exclusive in both cases.
 */
update:

	/*
	 * Set the mount level flags.
	 */
	if (flags & MNT_RDONLY) {
		mp->mnt_flag |= MNT_RDONLY;
	} else if (mp->mnt_flag & MNT_RDONLY) {
		// disallow read/write upgrades of file systems that
		// had the TYPENAME_OVERRIDE feature set.
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			error = EPERM;
			goto out1;
		}
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	}
	/* Clear the user-settable flags, then re-apply the requested set. */
	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if SECURE_KERNEL
#if !CONFIG_MNT_SUID
	/*
	 * On release builds of iOS based platforms, always enforce NOSUID on
	 * all mounts. We do this here because we can catch update mounts as well as
	 * non-update mounts in this case.
	 */
	mp->mnt_flag |= (MNT_NOSUID);
#endif
#endif

	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if CONFIG_MACF
	if (flags & MNT_MULTILABEL) {
		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
			error = EINVAL;
			goto out1;
		}
		mp->mnt_flag |= MNT_MULTILABEL;
	}
#endif
	/*
	 * Process device path for local file systems if requested.
	 *
	 * Snapshot and mount-by-role mounts do not use this path; they are
	 * passing other opaque data in the device path field.
	 *
	 * Basesystemroot mounts pass a device path to be resolved here,
	 * but it's just a char * already inside the kernel, which
	 * kernel_mount() shoved into a user_addr_t to call us. So for such
	 * mounts we must skip copyin (both of the address and of the string
	 * (in NDINIT).
	 */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
		boolean_t do_copyin_devpath = true;
#if CONFIG_BASESYSTEMROOT
		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
			// We have been passed fsmountargs, which is typed as a user_addr_t,
			// but is actually a char ** pointing to a (kernelspace) string.
			// We manually unpack it with a series of casts and dereferences
			// that reverses what was done just above us on the stack in
			// imageboot_pivot_image().
			// After retrieving the path to the dev node (which we will NDINIT
			// in a moment), we pass NULL fsmountargs on to the filesystem.
			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
			char **devnamepp = (char **)fsmountargs;
			char *devnamep = *devnamepp;
			devpath = CAST_USER_ADDR_T(devnamep);
			do_copyin_devpath = false;
			fsmountargs = USER_ADDR_NULL;

			//Now that we have a mp, denote that this mount is for the basesystem.
			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
		}
#endif // CONFIG_BASESYSTEMROOT

		if (do_copyin_devpath) {
			/*
			 * The first pointer-sized field of fsmountargs is the
			 * device path; advance fsmountargs past it so the
			 * remainder can be handed to the filesystem.
			 */
			if (vfs_context_is64bit(ctx)) {
				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
					goto out1;
				}
				fsmountargs += sizeof(devpath);
			} else {
				user32_addr_t tmp;
				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
					goto out1;
				}
				/* munge into LP64 addr */
				devpath = CAST_USER_ADDR_T(tmp);
				fsmountargs += sizeof(tmp);
			}
		}

		/* Lookup device and authorize access to it */
		if ((devpath)) {
			struct nameidata nd;

			enum uio_seg seg = UIO_USERSPACE;
#if CONFIG_BASESYSTEMROOT
			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
				seg = UIO_SYSSPACE;
			}
#endif // CONFIG_BASESYSTEMROOT

			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
			if ((error = namei(&nd))) {
				goto out1;
			}

			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
			/* namei took an iocount on devvp; dropped at "exit"/out2. */
			devvp = nd.ni_vp;

			nameidone(&nd);

			if (devvp->v_type != VBLK) {
				error = ENOTBLK;
				goto out2;
			}
			if (major(devvp->v_rdev) >= nblkdev) {
				error = ENXIO;
				goto out2;
			}
			/*
			 * If mount by non-root, then verify that user has necessary
			 * permissions on the device.
			 */
			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
				mode_t accessmode = KAUTH_VNODE_READ_DATA;

				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
					accessmode |= KAUTH_VNODE_WRITE_DATA;
				}
				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
					goto out2;
				}
			}
		}
		/* On first mount, preflight and open device */
		if (devpath && ((flags & MNT_UPDATE) == 0)) {
			/* usecount (not iocount) held for the life of the mount */
			if ((error = vnode_ref(devvp))) {
				goto out2;
			}
			/*
			 * Disallow multiple mounts of the same device.
			 * Disallow mounting of a device that is currently in use
			 * (except for root, which might share swap device for miniroot).
			 * Flush out any old buffers remaining from a previous use.
			 */
			if ((error = vfs_mountedon(devvp))) {
				goto out3;
			}

			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
				error = EBUSY;
				goto out3;
			}
			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
				error = ENOTBLK;
				goto out3;
			}
			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
				goto out3;
			}

			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
#if CONFIG_MACF
			error = mac_vnode_check_open(ctx,
			    devvp,
			    ronly ? FREAD : FREAD | FWRITE);
			if (error) {
				goto out3;
			}
#endif /* MAC */
			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
				goto out3;
			}

			mp->mnt_devvp = devvp;
			device_vnode = devvp;
		} else if ((mp->mnt_flag & MNT_RDONLY) &&
		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
		    (device_vnode = mp->mnt_devvp)) {
			dev_t dev;
			int maj;
			/*
			 * If upgrade to read-write by non-root, then verify
			 * that user has necessary permissions on the device.
			 */
			vnode_getalways(device_vnode);

			if (suser(vfs_context_ucred(ctx), NULL) &&
			    (error = vnode_authorize(device_vnode, NULL,
			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
			    ctx)) != 0) {
				vnode_put(device_vnode);
				goto out2;
			}

			/* Tell the device that we're upgrading */
			dev = (dev_t)device_vnode->v_rdev;
			maj = major(dev);

			if ((u_int)maj >= (u_int)nblkdev) {
				panic("Volume mounted on a device with invalid major number.");
			}

			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
			vnode_put(device_vnode);
			device_vnode = NULLVP;
			if (error != 0) {
				goto out2;
			}
		}
	} // localargs && !(snapshot | data | vm)

#if CONFIG_MACF
	if ((flags & MNT_UPDATE) == 0) {
		mac_mount_label_init(mp);
		mac_mount_label_associate(ctx, mp);
	}
	if (labelstr) {
		if ((flags & MNT_UPDATE) != 0) {
			error = mac_mount_check_label_update(ctx, mp);
			if (error != 0) {
				goto out3;
			}
		}
	}
#endif
	/*
	 * Mount the filesystem.  We already asserted that internal_flags
	 * cannot have more than one mount-by-role bit set.
	 */
	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
		    (caddr_t)fsmountargs, 0, ctx);
	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
#if CONFIG_ROSV_STARTUP
		/* For mount-by-role, fsmountargs carries the origin mount_t. */
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
		} else {
			/* Mark volume associated with system volume */
			mp->mnt_kern_flag |= MNTK_SYSTEM;

			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);
					// now set `device_vnode` to the devvp that was acquired.
					// this is needed in order to ensure vfs_init_io_attributes is invoked.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
#if CONFIG_MOUNT_VM
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
		} else {
			/* Mark volume associated with system volume and a swap mount */
			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);

					// now set `device_vnode` to the devvp that was acquired.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
#if CONFIG_MOUNT_PREBOOTRECOVERY
		struct mount *origin_mp = (struct mount*)fsmountargs;
		uint32_t mount_role = 0;
		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
			mount_role = VFS_PREBOOT_ROLE;
		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
			mount_role = VFS_RECOVERY_ROLE;
		}

		if (mount_role != 0) {
			fs_role_mount_args_t frma = {origin_mp, mount_role};
			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
			if (error) {
				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
			} else {
				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
				/* Mark volume associated with system volume */
				//mp->mnt_kern_flag |= MNTK_SYSTEM;
				/* Attempt to acquire the mnt_devvp and set it up */
				struct vnode *mp_devvp = NULL;
				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
					    0, &mp_devvp, vfs_context_kernel());
					if (!lerr) {
						mp->mnt_devvp = mp_devvp;
						//vnode_lookup took an iocount, need to drop it.
						vnode_put(mp_devvp);

						// now set `device_vnode` to the devvp that was acquired.
						// note that though the iocount above was dropped, the mount acquires
						// an implicit reference against the device.
						device_vnode = mp_devvp;
					}
				}
			}
		} else {
			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
			error = EINVAL;
		}
#else
		error = EINVAL;
#endif
	} else {
		/* The common case: hand the request to the filesystem itself. */
		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
	}

	if (flags & MNT_UPDATE) {
		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
			mp->mnt_flag &= ~MNT_RDONLY;
		}
		mp->mnt_flag &= ~
		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
		if (error) {
			mp->mnt_flag = flag;  /* restore flag value */
		}
		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;
		if (!error) {
			enablequotas(mp, ctx);
		}
		goto exit;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	if (error == 0) {
		struct vfs_attr vfsattr;
		if (device_vnode) {
			/*
			 *   cache the IO attributes for the underlying physical media...
			 *   an error return indicates the underlying driver doesn't
			 *   support all the queries necessary... however, reasonable
			 *   defaults will have been set, so no reason to bail or care
			 *
			 *   Need to do this before calling the MAC hook as it needs
			 *   information from this call.
			 */
			vfs_init_io_attributes(device_vnode, mp);
		}

#if CONFIG_MACF
		error = mac_mount_check_mount_late(ctx, mp);
		if (error != 0) {
			goto out4;
		}

		if (vfs_flags(mp) & MNT_MULTILABEL) {
			error = VFS_ROOT(mp, &rvp, ctx);
			if (error) {
				printf("%s() VFS_ROOT returned %d\n", __func__, error);
				goto out4;
			}
			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
			/*
			 * drop reference provided by VFS_ROOT
			 */
			vnode_put(rvp);

			if (error) {
				goto out4;
			}
		}
#endif  /* MAC */

		/* Publish the mount on the covered vnode. */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vp->v_mountedhere = mp;
		vnode_unlock(vp);

		/*
		 * taking the name_cache_lock exclusively will
		 * ensure that everyone is out of the fast path who
		 * might be trying to use a now stale copy of
		 * vp->v_mountedhere->mnt_realrootvp
		 * bumping mount_generation causes the cached values
		 * to be invalidated
		 */
		name_cache_lock();
		mount_generation++;
		name_cache_unlock();

		/* Keep a usecount on the covered vnode for the mount's lifetime. */
		error = vnode_ref(vp);
		if (error != 0) {
			goto out4;
		}

		have_usecount = TRUE;

		error = checkdirs(vp, ctx);
		if (error != 0) {
			/* Unmount the filesystem as cdir/rdirs cannot be updated */
			goto out4;
		}
		/*
		 * there is no cleanup code here so I have made it void
		 * we need to revisit this
		 */
		(void)VFS_START(mp, 0, ctx);

		if (mount_list_add(mp) != 0) {
			/*
			 * The system is shutting down trying to umount
			 * everything, so fail with a plausible errno.
			 */
			error = EBUSY;
			goto out4;
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/* Check if this mounted file system supports EAs or named streams. */
		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
#if NAMEDSTREAMS
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
			}
#endif
			/* Check if this file system supports path from id lookups. */
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			} else if (mp->mnt_flag & MNT_DOVOLFS) {
				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			}

			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
			}
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
		}
		/* increment the operations count */
		OSAddAtomic(1, &vfs_nummntops);
		enablequotas(mp, ctx);

		if (device_vnode) {
			device_vnode->v_specflags |= SI_MOUNTEDON;
		}

		/* Now that mount is setup, notify the listeners */
		vfs_notify_mount(pvp);
		IOBSDMountChange(mp, kIOMountChangeMount);
	} else {
		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
		if (mp->mnt_vnodelist.tqh_first != NULL) {
			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
			    mp->mnt_vtable->vfc_name, error);
		}

		/* Undo the VMOUNT claim taken by prepare_coveredvp(). */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
		mount_list_lock();
		mp->mnt_vtable->vfc_refcount--;
		mount_list_unlock();

		if (device_vnode) {
			vnode_rele(device_vnode);
			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		/*
		 * if we get here, we have a mount structure that needs to be freed,
		 * but since the coveredvp hasn't yet been updated to point at it,
		 * no need to worry about other threads holding a crossref on this mp
		 * so it's ok to just free it
		 */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		/* mp is gone; suppress the MNT_LMOUNT clear below. */
		did_set_lmount = false;
	}
exit:
	/*
	 * drop I/O count on the device vp if there was one
	 */
	if (devpath && devvp) {
		vnode_put(devvp);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	return error;

/* Error condition exits */
out4:
	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);

	/*
	 * If the mount has been placed on the covered vp,
	 * it may have been discovered by now, so we have
	 * to treat this just like an unmount
	 */
	mount_lock_spin(mp);
	mp->mnt_lflag |= MNT_LDEAD;
	mount_unlock(mp);

	if (device_vnode != NULLVP) {
		vnode_rele(device_vnode);
		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		did_rele = TRUE;
	}

	vnode_lock_spin(vp);

	mp->mnt_crossref++;
	vp->v_mountedhere = (mount_t) 0;

	vnode_unlock(vp);

	if (have_usecount) {
		vnode_rele(vp);
	}
out3:
	/* drop the usecount taken by vnode_ref(devvp) on first mount */
	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
		vnode_rele(devvp);
	}
out2:
	/* drop the iocount taken by namei() on the device path */
	if (devpath && devvp) {
		vnode_put(devvp);
	}
out1:
	/* Release mnt_rwlock only when it was taken */
	if (is_rwlock_locked == TRUE) {
		if (flag_set) {
			mp->mnt_flag = flag;  /* restore mnt_flag value */
		}
		lck_rw_done(&mp->mnt_rwlock);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	if (mntalloc) {
		if (mp->mnt_crossref) {
			mount_dropcrossref(mp, vp, 0);
		} else {
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		}
	}
	if (vfsp_ref) {
		mount_list_lock();
		vfsp->vfc_refcount--;
		mount_list_unlock();
	}

	return error;
}
1885 
1886 /*
1887  * Flush in-core data, check for competing mount attempts,
1888  * and set VMOUNT
1889  */
1890 int
prepare_coveredvp(vnode_t vp,vfs_context_t ctx,struct componentname * cnp,const char * fsname,uint32_t internal_flags)1891 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
1892 {
1893 #if !CONFIG_MACF
1894 #pragma unused(cnp,fsname)
1895 #endif
1896 	struct vnode_attr va;
1897 	int error;
1898 	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
1899 	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
1900 	boolean_t is_busy;
1901 
1902 	if (!skip_auth) {
1903 		/*
1904 		 * If the user is not root, ensure that they own the directory
1905 		 * onto which we are attempting to mount.
1906 		 */
1907 		VATTR_INIT(&va);
1908 		VATTR_WANTED(&va, va_uid);
1909 		if ((error = vnode_getattr(vp, &va, ctx)) ||
1910 		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1911 		    (!vfs_context_issuser(ctx)))) {
1912 			error = EPERM;
1913 			goto out;
1914 		}
1915 	}
1916 
1917 	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
1918 		goto out;
1919 	}
1920 
1921 	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
1922 		goto out;
1923 	}
1924 
1925 	if (vp->v_type != VDIR) {
1926 		error = ENOTDIR;
1927 		goto out;
1928 	}
1929 
1930 	vnode_lock_spin(vp);
1931 	is_busy = is_fmount ?
1932 	    (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
1933 	    (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
1934 	if (is_busy) {
1935 		vnode_unlock(vp);
1936 		error = EBUSY;
1937 		goto out;
1938 	}
1939 	SET(vp->v_flag, VMOUNT);
1940 	vnode_unlock(vp);
1941 
1942 #if CONFIG_MACF
1943 	error = mac_mount_check_mount(ctx, vp,
1944 	    cnp, fsname);
1945 	if (error != 0) {
1946 		vnode_lock_spin(vp);
1947 		CLR(vp->v_flag, VMOUNT);
1948 		vnode_unlock(vp);
1949 	}
1950 #endif
1951 
1952 out:
1953 	return error;
1954 }
1955 
1956 #if CONFIG_IMGSRC_ACCESS
1957 
1958 #define DEBUG_IMGSRC 0
1959 
1960 #if DEBUG_IMGSRC
1961 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1962 #else
1963 #define IMGSRC_DEBUG(args...) do { } while(0)
1964 #endif
1965 
1966 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)1967 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1968 {
1969 	struct nameidata nd;
1970 	vnode_t vp, realdevvp;
1971 	mode_t accessmode;
1972 	int error;
1973 	enum uio_seg uio = UIO_USERSPACE;
1974 
1975 	if (ctx == vfs_context_kernel()) {
1976 		uio = UIO_SYSSPACE;
1977 	}
1978 
1979 	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1980 	if ((error = namei(&nd))) {
1981 		IMGSRC_DEBUG("namei() failed with %d\n", error);
1982 		return error;
1983 	}
1984 
1985 	vp = nd.ni_vp;
1986 
1987 	if (!vnode_isblk(vp)) {
1988 		IMGSRC_DEBUG("Not block device.\n");
1989 		error = ENOTBLK;
1990 		goto out;
1991 	}
1992 
1993 	realdevvp = mp->mnt_devvp;
1994 	if (realdevvp == NULLVP) {
1995 		IMGSRC_DEBUG("No device backs the mount.\n");
1996 		error = ENXIO;
1997 		goto out;
1998 	}
1999 
2000 	error = vnode_getwithref(realdevvp);
2001 	if (error != 0) {
2002 		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
2003 		goto out;
2004 	}
2005 
2006 	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
2007 		IMGSRC_DEBUG("Wrong dev_t.\n");
2008 		error = ENXIO;
2009 		goto out1;
2010 	}
2011 
2012 	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
2013 
2014 	/*
2015 	 * If mount by non-root, then verify that user has necessary
2016 	 * permissions on the device.
2017 	 */
2018 	if (!vfs_context_issuser(ctx)) {
2019 		accessmode = KAUTH_VNODE_READ_DATA;
2020 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2021 			accessmode |= KAUTH_VNODE_WRITE_DATA;
2022 		}
2023 		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2024 			IMGSRC_DEBUG("Access denied.\n");
2025 			goto out1;
2026 		}
2027 	}
2028 
2029 	*devvpp = vp;
2030 
2031 out1:
2032 	vnode_put(realdevvp);
2033 
2034 out:
2035 	nameidone(&nd);
2036 
2037 	if (error) {
2038 		vnode_put(vp);
2039 	}
2040 
2041 	return error;
2042 }
2043 
2044 /*
2045  * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2046  * and call checkdirs()
2047  */
2048 static int
place_mount_and_checkdirs(mount_t mp,vnode_t vp,vfs_context_t ctx)2049 place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
2050 {
2051 	int error;
2052 
2053 	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
2054 
2055 	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
2056 	    mp->mnt_vtable->vfc_name, vnode_getname(vp));
2057 
2058 	vnode_lock_spin(vp);
2059 	CLR(vp->v_flag, VMOUNT);
2060 	vp->v_mountedhere = mp;
2061 	vnode_unlock(vp);
2062 
2063 	/*
2064 	 * taking the name_cache_lock exclusively will
2065 	 * insure that everyone is out of the fast path who
2066 	 * might be trying to use a now stale copy of
2067 	 * vp->v_mountedhere->mnt_realrootvp
2068 	 * bumping mount_generation causes the cached values
2069 	 * to be invalidated
2070 	 */
2071 	name_cache_lock();
2072 	mount_generation++;
2073 	name_cache_unlock();
2074 
2075 	error = vnode_ref(vp);
2076 	if (error != 0) {
2077 		goto out;
2078 	}
2079 
2080 	error = checkdirs(vp, ctx);
2081 	if (error != 0) {
2082 		/* Unmount the filesystem as cdir/rdirs cannot be updated */
2083 		vnode_rele(vp);
2084 		goto out;
2085 	}
2086 
2087 out:
2088 	if (error != 0) {
2089 		mp->mnt_vnodecovered = NULLVP;
2090 	}
2091 	return error;
2092 }
2093 
/*
 * Reverse place_mount_and_checkdirs(): drop the usecount taken on the
 * covered vnode and detach mp from it (v_mountedhere and
 * mnt_vnodecovered both cleared).  VMOUNT is not touched here — it was
 * already cleared when the mount was placed.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2104 
2105 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2106 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2107 {
2108 	int error;
2109 
2110 	/* unmount in progress return error */
2111 	mount_lock_spin(mp);
2112 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2113 		mount_unlock(mp);
2114 		return EBUSY;
2115 	}
2116 	mount_unlock(mp);
2117 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2118 
2119 	/*
2120 	 * We only allow the filesystem to be reloaded if it
2121 	 * is currently mounted read-only.
2122 	 */
2123 	if ((flags & MNT_RELOAD) &&
2124 	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2125 		error = ENOTSUP;
2126 		goto out;
2127 	}
2128 
2129 	/*
2130 	 * Only root, or the user that did the original mount is
2131 	 * permitted to update it.
2132 	 */
2133 	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2134 	    (!vfs_context_issuser(ctx))) {
2135 		error = EPERM;
2136 		goto out;
2137 	}
2138 #if CONFIG_MACF
2139 	error = mac_mount_check_remount(ctx, mp);
2140 	if (error != 0) {
2141 		goto out;
2142 	}
2143 #endif
2144 
2145 out:
2146 	if (error) {
2147 		lck_rw_done(&mp->mnt_rwlock);
2148 	}
2149 
2150 	return error;
2151 }
2152 
/* Release the exclusive mnt_rwlock taken by mount_begin_update(). */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2158 
2159 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2160 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2161 {
2162 	vnode_t vp;
2163 
2164 	if (height >= MAX_IMAGEBOOT_NESTING) {
2165 		return EINVAL;
2166 	}
2167 
2168 	vp = imgsrc_rootvnodes[height];
2169 	if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2170 		*rvpp = vp;
2171 		return 0;
2172 	} else {
2173 		return ENOENT;
2174 	}
2175 }
2176 
/*
 * "Relocate" an imageboot source filesystem: take the already-mounted
 * image filesystem at nesting level `height` and splice it into the
 * namespace on top of `vp`, as if it had been mounted there.  Only
 * root may do this, and each image mount can be moved at most once
 * (guarded by MNTK_HAS_MOVED, set under the mount rwlock).
 *
 * Arguments are copied in from user space; `by_index` selects the
 * struct-based argument format, otherwise a bare devpath pointer is
 * read for binary compatibility (one level of nesting assumed).
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flag bits are defined yet; reject anything nonzero */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* rvp comes back with an iocount; released at out0/success */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once (unlocked pre-check; rechecked under the rwlock) */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once.  Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 * NOTE(review): this path leaves error == 0, so a losing racer
	 * returns success without doing anything — confirm intended.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Swap in the new mount-on name, preserving the old one for rollback */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	/* Roll back the mount-on name and the move flag */
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2397 
2398 #endif /* CONFIG_IMGSRC_ACCESS */
2399 
/*
 * Turn on disk quotas for mp where the per-type quota trigger files
 * exist.  Only acts on "hfs" filesystems; all errors are deliberately
 * ignored so quota setup cannot interfere with completing the mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Trigger path: "<mnton>/<QUOTAOPSNAME>.<type-extension>" */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue;           /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Build the quota data file path and turn quotas on; result ignored */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2433 
2434 
/*
 * Per-process callback for checkdirs(): if p's current or root
 * directory is `olddp` (the newly covered vnode), repoint it at
 * `newdp` (the root of the new mount), transferring usecounts
 * accordingly.  Always returns PROC_RETURNED so iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* newly-taken refs on newdp; NULL'ed when consumed by fd_cdir/fd_rdir */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	/* refs on olddp that become ours to release once swapped out */
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;    /* this ref on newdp is now owned by fd_cdir */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;    /* this ref on newdp is now owned by fd_rdir */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2514 
2515 
2516 
2517 /*
2518  * Scan all active processes to see if any of them have a current
2519  * or root directory onto which the new filesystem has just been
2520  * mounted. If so, replace them with the new mount point.
2521  */
2522 static int
checkdirs(vnode_t olddp,vfs_context_t ctx)2523 checkdirs(vnode_t olddp, vfs_context_t ctx)
2524 {
2525 	vnode_t newdp;
2526 	vnode_t tvp;
2527 	int err;
2528 	struct cdirargs cdr;
2529 
2530 	if (olddp->v_usecount == 1) {
2531 		return 0;
2532 	}
2533 	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);
2534 
2535 	if (err != 0) {
2536 #if DIAGNOSTIC
2537 		panic("mount: lost mount: error %d", err);
2538 #endif
2539 		return err;
2540 	}
2541 
2542 	cdr.olddp = olddp;
2543 	cdr.newdp = newdp;
2544 	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
2545 	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);
2546 
2547 	if (rootvnode == olddp) {
2548 		vnode_ref(newdp);
2549 		lck_rw_lock_exclusive(&rootvnode_rw_lock);
2550 		tvp = rootvnode;
2551 		rootvnode = newdp;
2552 		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
2553 		vnode_rele(tvp);
2554 	}
2555 
2556 	vnode_put(newdp);
2557 	return 0;
2558 }
2559 
2560 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT        \
2561 	"com.apple.private.vfs.role-account-unmount"
2562 
2563 /*
2564  * Unmount a file system.
2565  *
2566  * Note: unmount takes a path to the vnode mounted on as argument,
2567  * not special file (as before).
2568  */
2569 /* ARGSUSED */
2570 int
unmount(__unused proc_t p,struct unmount_args * uap,__unused int32_t * retval)2571 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2572 {
2573 	vnode_t vp;
2574 	struct mount *mp;
2575 	int error;
2576 	struct nameidata nd;
2577 	vfs_context_t ctx;
2578 
2579 	/*
2580 	 * If the process has the entitlement, use the kernel's context when
2581 	 * performing lookup on the mount path as the process might lack proper
2582 	 * permission to access the directory.
2583 	 */
2584 	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
2585 	    vfs_context_kernel() : vfs_context_current();
2586 
2587 	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2588 	    UIO_USERSPACE, uap->path, ctx);
2589 	error = namei(&nd);
2590 	if (error) {
2591 		return error;
2592 	}
2593 	vp = nd.ni_vp;
2594 	mp = vp->v_mount;
2595 	nameidone(&nd);
2596 
2597 #if CONFIG_MACF
2598 	error = mac_mount_check_umount(ctx, mp);
2599 	if (error != 0) {
2600 		vnode_put(vp);
2601 		return error;
2602 	}
2603 #endif
2604 	/*
2605 	 * Must be the root of the filesystem
2606 	 */
2607 	if ((vp->v_flag & VROOT) == 0) {
2608 		vnode_put(vp);
2609 		return EINVAL;
2610 	}
2611 	mount_ref(mp, 0);
2612 	vnode_put(vp);
2613 	/* safedounmount consumes the mount ref */
2614 	return safedounmount(mp, uap->flags, ctx);
2615 }
2616 
2617 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2618 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2619 {
2620 	mount_t mp;
2621 
2622 	mp = mount_list_lookupby_fsid(fsid, 0, 1);
2623 	if (mp == (mount_t)0) {
2624 		return ENOENT;
2625 	}
2626 	mount_ref(mp, 0);
2627 	mount_iterdrop(mp);
2628 	/* safedounmount consumes the mount ref */
2629 	return safedounmount(mp, flags, ctx);
2630 }
2631 
2632 /*
2633  * The mount struct comes with a mount ref which will be consumed.
2634  * Do the actual file system unmount, prevent some common foot shooting.
2635  */
2636 int
safedounmount(struct mount * mp,int flags,vfs_context_t ctx)2637 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
2638 {
2639 	int error;
2640 	proc_t p = vfs_context_proc(ctx);
2641 
2642 	/*
2643 	 * If the file system is not responding and MNT_NOBLOCK
2644 	 * is set and not a forced unmount then return EBUSY.
2645 	 */
2646 	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
2647 	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
2648 		error = EBUSY;
2649 		goto out;
2650 	}
2651 
2652 	/*
2653 	 * Skip authorization in two cases:
2654 	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
2655 	 *   This entitlement allows non-root processes unmount volumes mounted by
2656 	 *   other processes.
2657 	 * - If the mount is tagged as permissive and this is not a forced-unmount
2658 	 *   attempt.
2659 	 */
2660 	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
2661 	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
2662 		/*
2663 		 * Only root, or the user that did the original mount is
2664 		 * permitted to unmount this filesystem.
2665 		 */
2666 		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
2667 		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
2668 			goto out;
2669 		}
2670 	}
2671 	/*
2672 	 * Don't allow unmounting the root file system, or other volumes
2673 	 * associated with it (for example, the associated VM or DATA mounts) .
2674 	 */
2675 	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
2676 		if (!(mp->mnt_flag & MNT_ROOTFS)) {
2677 			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
2678 			    mp->mnt_vfsstat.f_mntonname);
2679 		}
2680 		error = EBUSY; /* the root (or associated volumes) is always busy */
2681 		goto out;
2682 	}
2683 
2684 	/*
2685 	 * If the mount is providing the root filesystem's disk image
2686 	 * (i.e. imageboot), don't allow unmounting
2687 	 */
2688 	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
2689 		error = EBUSY;
2690 		goto out;
2691 	}
2692 
2693 	return dounmount(mp, flags, 1, ctx);
2694 
2695 out:
2696 	mount_drop(mp, 0);
2697 	return error;
2698 }
2699 
2700 /*
2701  * Do the actual file system unmount.
2702  */
2703 int
dounmount(struct mount * mp,int flags,int withref,vfs_context_t ctx)2704 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
2705 {
2706 	vnode_t coveredvp = (vnode_t)0;
2707 	int error;
2708 	int needwakeup = 0;
2709 	int forcedunmount = 0;
2710 	int lflags = 0;
2711 	struct vnode *devvp = NULLVP;
2712 #if CONFIG_TRIGGERS
2713 	proc_t p = vfs_context_proc(ctx);
2714 	int did_vflush = 0;
2715 	int pflags_save = 0;
2716 #endif /* CONFIG_TRIGGERS */
2717 
2718 #if CONFIG_FSE
2719 	if (!(flags & MNT_FORCE)) {
2720 		fsevent_unmount(mp, ctx);  /* has to come first! */
2721 	}
2722 #endif
2723 
2724 	mount_lock(mp);
2725 
2726 	/*
2727 	 * If already an unmount in progress just return EBUSY.
2728 	 * Even a forced unmount cannot override.
2729 	 */
2730 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2731 		if (withref != 0) {
2732 			mount_drop(mp, 1);
2733 		}
2734 		mount_unlock(mp);
2735 		return EBUSY;
2736 	}
2737 
2738 	if (flags & MNT_FORCE) {
2739 		forcedunmount = 1;
2740 		mp->mnt_lflag |= MNT_LFORCE;
2741 	}
2742 
2743 #if CONFIG_TRIGGERS
2744 	if (flags & MNT_NOBLOCK && p != kernproc) {
2745 		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
2746 	}
2747 #endif
2748 
2749 	mp->mnt_kern_flag |= MNTK_UNMOUNT;
2750 	mp->mnt_lflag |= MNT_LUNMOUNT;
2751 	mp->mnt_flag &= ~MNT_ASYNC;
2752 	/*
2753 	 * anyone currently in the fast path that
2754 	 * trips over the cached rootvp will be
2755 	 * dumped out and forced into the slow path
2756 	 * to regenerate a new cached value
2757 	 */
2758 	mp->mnt_realrootvp = NULLVP;
2759 	mount_unlock(mp);
2760 
2761 	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2762 		/*
2763 		 * Force unmount any mounts in this filesystem.
2764 		 * If any unmounts fail - just leave them dangling.
2765 		 * Avoids recursion.
2766 		 */
2767 		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2768 	}
2769 
2770 	/*
2771 	 * taking the name_cache_lock exclusively will
2772 	 * insure that everyone is out of the fast path who
2773 	 * might be trying to use a now stale copy of
2774 	 * vp->v_mountedhere->mnt_realrootvp
2775 	 * bumping mount_generation causes the cached values
2776 	 * to be invalidated
2777 	 */
2778 	name_cache_lock();
2779 	mount_generation++;
2780 	name_cache_unlock();
2781 
2782 
2783 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2784 	if (withref != 0) {
2785 		mount_drop(mp, 0);
2786 	}
2787 	error = 0;
2788 	if (forcedunmount == 0) {
2789 		ubc_umount(mp); /* release cached vnodes */
2790 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2791 			error = VFS_SYNC(mp, MNT_WAIT, ctx);
2792 			if (error) {
2793 				mount_lock(mp);
2794 				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2795 				mp->mnt_lflag &= ~MNT_LUNMOUNT;
2796 				mp->mnt_lflag &= ~MNT_LFORCE;
2797 				goto out;
2798 			}
2799 		}
2800 	}
2801 
2802 	IOBSDMountChange(mp, kIOMountChangeUnmount);
2803 
2804 #if CONFIG_TRIGGERS
2805 	vfs_nested_trigger_unmounts(mp, flags, ctx);
2806 	did_vflush = 1;
2807 #endif
2808 	if (forcedunmount) {
2809 		lflags |= FORCECLOSE;
2810 	}
2811 	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
2812 	if ((forcedunmount == 0) && error) {
2813 		mount_lock(mp);
2814 		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2815 		mp->mnt_lflag &= ~MNT_LUNMOUNT;
2816 		mp->mnt_lflag &= ~MNT_LFORCE;
2817 		goto out;
2818 	}
2819 
2820 	/* make sure there are no one in the mount iterations or lookup */
2821 	mount_iterdrain(mp);
2822 
2823 	error = VFS_UNMOUNT(mp, flags, ctx);
2824 	if (error) {
2825 		mount_iterreset(mp);
2826 		mount_lock(mp);
2827 		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2828 		mp->mnt_lflag &= ~MNT_LUNMOUNT;
2829 		mp->mnt_lflag &= ~MNT_LFORCE;
2830 		goto out;
2831 	}
2832 
2833 	/* increment the operations count */
2834 	if (!error) {
2835 		OSAddAtomic(1, &vfs_nummntops);
2836 	}
2837 
2838 	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2839 		/* hold an io reference and drop the usecount before close */
2840 		devvp = mp->mnt_devvp;
2841 		vnode_getalways(devvp);
2842 		vnode_rele(devvp);
2843 		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
2844 		    ctx);
2845 		vnode_clearmountedon(devvp);
2846 		vnode_put(devvp);
2847 	}
2848 	lck_rw_done(&mp->mnt_rwlock);
2849 	mount_list_remove(mp);
2850 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2851 
2852 	/* mark the mount point hook in the vp but not drop the ref yet */
2853 	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2854 		/*
2855 		 * The covered vnode needs special handling. Trying to get an
2856 		 * iocount must not block here as this may lead to deadlocks
2857 		 * if the Filesystem to which the covered vnode belongs is
2858 		 * undergoing forced unmounts. Since we hold a usecount, the
2859 		 * vnode cannot be reused (it can, however, still be terminated)
2860 		 */
2861 		vnode_getalways(coveredvp);
2862 		vnode_lock_spin(coveredvp);
2863 
2864 		mp->mnt_crossref++;
2865 		coveredvp->v_mountedhere = (struct mount *)0;
2866 		CLR(coveredvp->v_flag, VMOUNT);
2867 
2868 		vnode_unlock(coveredvp);
2869 		vnode_put(coveredvp);
2870 	}
2871 
2872 	mount_list_lock();
2873 	mp->mnt_vtable->vfc_refcount--;
2874 	mount_list_unlock();
2875 
2876 	cache_purgevfs(mp);     /* remove cache entries for this file sys */
2877 	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
2878 	mount_lock(mp);
2879 	mp->mnt_lflag |= MNT_LDEAD;
2880 
2881 	if (mp->mnt_lflag & MNT_LWAIT) {
2882 		/*
2883 		 * do the wakeup here
2884 		 * in case we block in mount_refdrain
2885 		 * which will drop the mount lock
2886 		 * and allow anyone blocked in vfs_busy
2887 		 * to wakeup and see the LDEAD state
2888 		 */
2889 		mp->mnt_lflag &= ~MNT_LWAIT;
2890 		wakeup((caddr_t)mp);
2891 	}
2892 	mount_refdrain(mp);
2893 
2894 	/* free disk_conditioner_info structure for this mount */
2895 	disk_conditioner_unmount(mp);
2896 
2897 out:
2898 	if (mp->mnt_lflag & MNT_LWAIT) {
2899 		mp->mnt_lflag &= ~MNT_LWAIT;
2900 		needwakeup = 1;
2901 	}
2902 
2903 #if CONFIG_TRIGGERS
2904 	if (flags & MNT_NOBLOCK && p != kernproc) {
2905 		// Restore P_NOREMOTEHANG bit to its previous value
2906 		if ((pflags_save & P_NOREMOTEHANG) == 0) {
2907 			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2908 		}
2909 	}
2910 
2911 	/*
2912 	 * Callback and context are set together under the mount lock, and
2913 	 * never cleared, so we're safe to examine them here, drop the lock,
2914 	 * and call out.
2915 	 */
2916 	if (mp->mnt_triggercallback != NULL) {
2917 		mount_unlock(mp);
2918 		if (error == 0) {
2919 			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2920 		} else if (did_vflush) {
2921 			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2922 		}
2923 	} else {
2924 		mount_unlock(mp);
2925 	}
2926 #else
2927 	mount_unlock(mp);
2928 #endif /* CONFIG_TRIGGERS */
2929 
2930 	lck_rw_done(&mp->mnt_rwlock);
2931 
2932 	if (needwakeup) {
2933 		wakeup((caddr_t)mp);
2934 	}
2935 
2936 	if (!error) {
2937 		if ((coveredvp != NULLVP)) {
2938 			vnode_t pvp = NULLVP;
2939 
2940 			/*
2941 			 * The covered vnode needs special handling. Trying to
2942 			 * get an iocount must not block here as this may lead
2943 			 * to deadlocks if the Filesystem to which the covered
2944 			 * vnode belongs is undergoing forced unmounts. Since we
2945 			 * hold a usecount, the  vnode cannot be reused
2946 			 * (it can, however, still be terminated).
2947 			 */
2948 			vnode_getalways(coveredvp);
2949 
2950 			mount_dropcrossref(mp, coveredvp, 0);
2951 			/*
2952 			 * We'll _try_ to detect if this really needs to be
2953 			 * done. The coveredvp can only be in termination (or
2954 			 * terminated) if the coveredvp's mount point is in a
2955 			 * forced unmount (or has been) since we still hold the
2956 			 * ref.
2957 			 */
2958 			if (!vnode_isrecycled(coveredvp)) {
2959 				pvp = vnode_getparent(coveredvp);
2960 #if CONFIG_TRIGGERS
2961 				if (coveredvp->v_resolve) {
2962 					vnode_trigger_rearm(coveredvp, ctx);
2963 				}
2964 #endif
2965 			}
2966 
2967 			vnode_rele(coveredvp);
2968 			vnode_put(coveredvp);
2969 			coveredvp = NULLVP;
2970 
2971 			if (pvp) {
2972 				lock_vnode_and_post(pvp, NOTE_WRITE);
2973 				vnode_put(pvp);
2974 			}
2975 		} else if (mp->mnt_flag & MNT_ROOTFS) {
2976 			if (nc_smr_enabled) {
2977 				vfs_smr_synchronize();
2978 			}
2979 
2980 			mount_lock_destroy(mp);
2981 #if CONFIG_MACF
2982 			mac_mount_label_destroy(mp);
2983 #endif
2984 			zfree(mount_zone, mp);
2985 		} else {
2986 			panic("dounmount: no coveredvp");
2987 		}
2988 	}
2989 	return error;
2990 }
2991 
2992 /*
2993  * Unmount any mounts in this filesystem.
2994  */
2995 void
dounmount_submounts(struct mount * mp,int flags,vfs_context_t ctx)2996 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
2997 {
2998 	mount_t smp;
2999 	fsid_t *fsids, fsid;
3000 	int fsids_sz;
3001 	int count = 0, i, m = 0;
3002 	vnode_t vp;
3003 
3004 	mount_list_lock();
3005 
3006 	// Get an array to hold the submounts fsids.
3007 	TAILQ_FOREACH(smp, &mountlist, mnt_list)
3008 	count++;
3009 	fsids_sz = count * sizeof(fsid_t);
3010 	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
3011 	if (fsids == NULL) {
3012 		mount_list_unlock();
3013 		goto out;
3014 	}
3015 	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump
3016 
3017 	/*
3018 	 * Fill the array with submount fsids.
3019 	 * Since mounts are always added to the tail of the mount list, the
3020 	 * list is always in mount order.
3021 	 * For each mount check if the mounted-on vnode belongs to a
3022 	 * mount that's already added to our array of mounts to be unmounted.
3023 	 */
3024 	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
3025 		vp = smp->mnt_vnodecovered;
3026 		if (vp == NULL) {
3027 			continue;
3028 		}
3029 		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
3030 		for (i = 0; i <= m; i++) {
3031 			if (fsids[i].val[0] == fsid.val[0] &&
3032 			    fsids[i].val[1] == fsid.val[1]) {
3033 				fsids[++m] = smp->mnt_vfsstat.f_fsid;
3034 				break;
3035 			}
3036 		}
3037 	}
3038 	mount_list_unlock();
3039 
3040 	// Unmount the submounts in reverse order. Ignore errors.
3041 	for (i = m; i > 0; i--) {
3042 		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
3043 		if (smp) {
3044 			mount_ref(smp, 0);
3045 			mount_iterdrop(smp);
3046 			(void) dounmount(smp, flags, 1, ctx);
3047 		}
3048 	}
3049 out:
3050 	kfree_data(fsids, fsids_sz);
3051 }
3052 
/*
 * Drop one cross reference on mount 'mp'.  If this was the last cross
 * reference and 'mp' is no longer mounted on 'dp', the mount structure
 * is torn down here.  When 'need_put' is set, an iocount held on 'dp'
 * is released as well.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	/* Hold dp so it cannot be freed while we manipulate it under its lock. */
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/*
	 * Last cross reference is gone and mp is no longer what dp is
	 * mounted on: nothing can reach mp anymore, so destroy it.
	 */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		/* Let any SMR readers drain before the mount is freed. */
		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3086 
3087 
3088 /*
3089  * Sync each mounted filesystem.
3090  */
#if DIAGNOSTIC
int syncprt = 0;	/* when non-zero, sync paths print buffer stats via vfs_bufstats() */
#endif

int print_vmpage_stat = 0;	/* when non-zero, sync paths report dirty pages via vm_countdirtypages() */
3096 
3097 /*
3098  * sync_callback:	simple wrapper that calls VFS_SYNC() on volumes
3099  *			mounted read-write with the passed waitfor value.
3100  *
3101  * Parameters:	mp	mount-point descriptor per mounted file-system instance.
3102  *		arg	user argument (please see below)
3103  *
3104  * User argument is a pointer to 32 bit unsigned integer which describes the
3105  * type of waitfor value to set for calling VFS_SYNC().  If user argument is
3106  * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3107  * waitfor value.
3108  *
3109  * Returns:		VFS_RETURNED
3110  */
static int
sync_callback(mount_t mp, void *arg)
{
	/* Read-only mounts have nothing to flush. */
	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
		/* Remember MNT_ASYNC so it can be restored after the sync. */
		int asyncflag = mp->mnt_flag & MNT_ASYNC;
		unsigned waitfor = MNT_NOWAIT;

		if (arg) {
			waitfor = *(uint32_t*)arg;
		}

		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
		if (waitfor != MNT_WAIT &&
		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
		    waitfor != MNT_NOWAIT &&
		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
		    waitfor != MNT_DWAIT &&
		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
			panic("Passed inappropriate waitfor %u to "
			    "sync_callback()", waitfor);
		}

		/* Temporarily force synchronous writes for the duration of VFS_SYNC. */
		mp->mnt_flag &= ~MNT_ASYNC;
		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
		if (asyncflag) {
			mp->mnt_flag |= MNT_ASYNC;
		}
	}

	return VFS_RETURNED;
}
3142 
3143 /* ARGSUSED */
int
sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
{
	/*
	 * sync(2): flush every read-write filesystem.  A NULL arg makes
	 * sync_callback() use its MNT_NOWAIT default, so we do not wait
	 * for the writes to complete.
	 */
	vfs_iterate(LK_NOWAIT, sync_callback, NULL);

	if (print_vmpage_stat) {
		/* Debug aid: report dirty-page counts after the pass. */
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
	return 0;
}
3160 
/*
 * Selects which class of mounted media sync_internal_callback() acts on.
 * "Reliable" means local and not a virtual device (see is_reliable there).
 */
typedef enum {
	SYNC_ALL = 0,			/* no filtering: sync every mount */
	SYNC_ONLY_RELIABLE_MEDIA = 1,	/* only local, non-virtual devices */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2	/* only virtual or non-local devices */
} sync_type_t;
3166 
3167 static int
sync_internal_callback(mount_t mp,void * arg)3168 sync_internal_callback(mount_t mp, void *arg)
3169 {
3170 	if (arg) {
3171 		int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3172 		    (mp->mnt_flag & MNT_LOCAL);
3173 		sync_type_t sync_type = *((sync_type_t *)arg);
3174 
3175 		if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3176 			return VFS_RETURNED;
3177 		} else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3178 			return VFS_RETURNED;
3179 		}
3180 	}
3181 
3182 	(void)sync_callback(mp, NULL);
3183 
3184 	return VFS_RETURNED;
3185 }
3186 
int sync_thread_state = 0;	/* SYNC_THREAD_* bits, manipulated under sync_mtx_lck */
int sync_timeout_seconds = 5;	/* how long sync_internal() waits for the sync thread */

#define SYNC_THREAD_RUN       0x0001	/* a(nother) sync pass has been requested */
#define SYNC_THREAD_RUNNING   0x0002	/* a sync thread currently exists */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;	/* set to the sync thread while it runs, NULL otherwise */
#endif /* CONFIG_PHYS_WRITE_ACCT */
3196 
/*
 * Body of the kernel thread started by sync_internal().  Loops as long as
 * new sync requests (SYNC_THREAD_RUN) keep arriving, syncing reliable
 * media first, then unreliable media, and wakes any waiters before
 * exiting.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the request; the lock is dropped while syncing. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		/* Reliable (local, non-virtual) media first, then the rest. */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3240 
/* Last time sync_internal() logged a timeout; rate-limits the message to one per 120s. */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3242 
3243 /*
3244  * An in-kernel sync for power management to call.
3245  * This function always returns within sync_timeout seconds.
3246  */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	/* Bound the wait so this call returns within sync_timeout_seconds. */
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* Request a sync pass; start a worker thread if none is running. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/*
	 * Wait (with timeout) for the sync thread's wakeup; PDROP releases
	 * sync_mtx_lck when msleep returns.
	 */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* Timed out (or interrupted); log at most once every 120s. */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		/* Drop the reference kernel_thread_start() gave us. */
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
3289 
3290 /*
3291  * Change filesystem quotas.
3292  */
3293 #if QUOTA
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* Only the mount is needed; take a ref and release the vnode now. */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit layout differs; munge into the kernel dqblk. */
			struct user_dqblk       my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	/* Only issue the filesystem call if the copyin phase succeeded. */
	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Copyout / cleanup phase, mirroring the copyin switch above. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk       my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3400 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support compiled out (QUOTA not configured). */
	return EOPNOTSUPP;
}
3406 #endif /* QUOTA */
3407 
3408 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3409 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3410 {
3411 	int error;
3412 	vfs_context_t ctx = vfs_context_current();
3413 
3414 #if CONFIG_MACF
3415 	error = mac_mount_check_stat(ctx, mp);
3416 	if (error != 0) {
3417 		return error;
3418 	}
3419 #endif
3420 
3421 	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3422 	if (error != 0) {
3423 		return error;
3424 	}
3425 
3426 	return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3427 }
3428 
3429 /*
3430  * Get filesystem statistics.
3431  *
3432  * Returns:	0			Success
3433  *	namei:???
3434  *	vfs_update_vfsstat:???
3435  *	munge_statfs:EFAULT
3436  */
3437 /* ARGSUSED */
int
statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
{
	int error;
	struct mount *mp;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;

	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error != 0) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

	/*
	 * Hold the vnode iocount across statfs_internal() so the mount
	 * stays referenced while it is being stat'ed.
	 */
	error = statfs_internal(p, mp, uap->buf);
	vnode_put(vp);

	return error;
}
3462 
3463 /*
3464  * Get filesystem statistics.
3465  */
3466 /* ARGSUSED */
int
fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	struct mount *mp;

	AUDIT_ARG(fd, uap->fd);

	/*
	 * vp stays NULL if file_vnode() fails; the "out" label uses that
	 * to decide whether an fd reference needs to be dropped.
	 */
	if ((error = file_vnode(uap->fd, &vp)) ||
	    (error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out_vnode;
	}

	error = statfs_internal(p, mp, uap->buf);

out_vnode:
	vnode_put(vp);

out:
	if (vp != NULL) {
		file_drop(uap->fd);
	}

	return error;
}
3501 
3502 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3503 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3504 {
3505 	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3506 
3507 	bzero(sfs, sizeof(*sfs));
3508 
3509 	sfs->f_bsize = vsfs->f_bsize;
3510 	sfs->f_iosize = (int32_t)vsfs->f_iosize;
3511 	sfs->f_blocks = vsfs->f_blocks;
3512 	sfs->f_bfree = vsfs->f_bfree;
3513 	sfs->f_bavail = vsfs->f_bavail;
3514 	sfs->f_files = vsfs->f_files;
3515 	sfs->f_ffree = vsfs->f_ffree;
3516 	sfs->f_fsid = vsfs->f_fsid;
3517 	sfs->f_owner = vsfs->f_owner;
3518 	sfs->f_type = mp->mnt_vtable->vfc_typenum;
3519 	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3520 	sfs->f_fssubtype = vsfs->f_fssubtype;
3521 	sfs->f_flags_ext = (mp->mnt_kern_flag & MNTK_SYSTEMDATA) ? MNT_EXT_ROOT_DATA_VOL : 0;
3522 	if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3523 		strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3524 	} else {
3525 		strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
3526 	}
3527 	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3528 	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3529 }
3530 
3531 /*
3532  * Get file system statistics in 64-bit mode
3533  */
3534 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3535 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3536 {
3537 	struct mount *mp;
3538 	int error;
3539 	struct nameidata *ndp;
3540 	struct statfs64 *sfsp;
3541 	vfs_context_t ctxp = vfs_context_current();
3542 	vnode_t vp;
3543 	struct {
3544 		struct nameidata nd;
3545 		struct statfs64 sfs;
3546 	} *__nameidata_statfs64;
3547 
3548 	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3549 	    Z_WAITOK);
3550 	ndp = &__nameidata_statfs64->nd;
3551 
3552 	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3553 	    UIO_USERSPACE, uap->path, ctxp);
3554 	error = namei(ndp);
3555 	if (error != 0) {
3556 		goto out;
3557 	}
3558 	vp = ndp->ni_vp;
3559 	mp = vp->v_mount;
3560 	nameidone(ndp);
3561 
3562 #if CONFIG_MACF
3563 	error = mac_mount_check_stat(ctxp, mp);
3564 	if (error != 0) {
3565 		vnode_put(vp);
3566 		goto out;
3567 	}
3568 #endif
3569 
3570 	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3571 	if (error != 0) {
3572 		vnode_put(vp);
3573 		goto out;
3574 	}
3575 
3576 	sfsp = &__nameidata_statfs64->sfs;
3577 	vfs_get_statfs64(mp, sfsp);
3578 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3579 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3580 		/* This process does not want to see a seperate data volume mountpoint */
3581 		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3582 	}
3583 	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3584 	vnode_put(vp);
3585 
3586 out:
3587 	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3588 
3589 	return error;
3590 }
3591 
3592 /*
3593  * Get file system statistics in 64-bit mode
3594  */
3595 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3596 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3597 {
3598 	struct vnode *vp;
3599 	struct mount *mp;
3600 	struct statfs64 sfs;
3601 	int error;
3602 
3603 	AUDIT_ARG(fd, uap->fd);
3604 
3605 	if ((error = file_vnode(uap->fd, &vp))) {
3606 		return error;
3607 	}
3608 
3609 	error = vnode_getwithref(vp);
3610 	if (error) {
3611 		file_drop(uap->fd);
3612 		return error;
3613 	}
3614 
3615 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3616 
3617 	mp = vp->v_mount;
3618 	if (!mp) {
3619 		error = EBADF;
3620 		goto out;
3621 	}
3622 
3623 #if CONFIG_MACF
3624 	error = mac_mount_check_stat(vfs_context_current(), mp);
3625 	if (error != 0) {
3626 		goto out;
3627 	}
3628 #endif
3629 
3630 	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3631 		goto out;
3632 	}
3633 
3634 	vfs_get_statfs64(mp, &sfs);
3635 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3636 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3637 		/* This process does not want to see a seperate data volume mountpoint */
3638 		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3639 	}
3640 	error = copyout(&sfs, uap->buf, sizeof(sfs));
3641 
3642 out:
3643 	file_drop(uap->fd);
3644 	vnode_put(vp);
3645 
3646 	return error;
3647 }
3648 
/*
 * Iteration state shared by the getfsstat*_callback() functions.
 */
struct getfsstat_struct {
	user_addr_t     sfsp;		/* user-buffer cursor for the next statfs record */
	user_addr_t     *mp;		/* optional array of user MAC-label buffers, or NULL */
	int             count;		/* mounts visited so far */
	int             maxcount;	/* records that fit in the user buffer */
	int             flags;		/* caller's MNT_WAIT/MNT_NOWAIT/MNT_DWAIT flags */
	int             error;		/* first error encountered, 0 if none */
};
3657 
3658 
/*
 * vfs_iterate() callback for __mac_getfsstat(): copies one mount's
 * statfs record (and optionally its MAC label) out to user space.
 * Mounts are still counted once the user buffer is full so the caller
 * can report the total number of mounts.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* Dead mount or refresh failure: skip without counting it. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* my_size is the user-layout record size munge_statfs wrote. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	fstp->count++;
	return VFS_RETURNED;
}
3712 
3713 /*
3714  * Get statistics on all filesystems.
3715  */
3716 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3717 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3718 {
3719 	struct __mac_getfsstat_args muap;
3720 
3721 	muap.buf = uap->buf;
3722 	muap.bufsize = uap->bufsize;
3723 	muap.mac = USER_ADDR_NULL;
3724 	muap.macsize = 0;
3725 	muap.flags = uap->flags;
3726 
3727 	return __mac_getfsstat(p, &muap, retval);
3728 }
3729 
3730 /*
3731  * __mac_getfsstat: Get MAC-related file system statistics
3732  *
3733  * Parameters:    p                        (ignored)
3734  *                uap                      User argument descriptor (see below)
3735  *                retval                   Count of file system statistics (N stats)
3736  *
3737  * Indirect:      uap->bufsize             Buffer size
3738  *                uap->macsize             MAC info size
3739  *                uap->buf                 Buffer where information will be returned
3740  *                uap->mac                 MAC info
3741  *                uap->flags               File system flags
3742  *
3743  *
3744  * Returns:        0                       Success
3745  *                !0                       Not success
3746  *
3747  */
3748 int
__mac_getfsstat(__unused proc_t p,struct __mac_getfsstat_args * uap,int * retval)3749 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
3750 {
3751 	user_addr_t sfsp;
3752 	user_addr_t *mp;
3753 	size_t count, maxcount, bufsize, macsize;
3754 	struct getfsstat_struct fst;
3755 
3756 	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
3757 		return EINVAL;
3758 	}
3759 
3760 	bufsize = (size_t) uap->bufsize;
3761 	macsize = (size_t) uap->macsize;
3762 
3763 	if (IS_64BIT_PROCESS(p)) {
3764 		maxcount = bufsize / sizeof(struct user64_statfs);
3765 	} else {
3766 		maxcount = bufsize / sizeof(struct user32_statfs);
3767 	}
3768 	sfsp = uap->buf;
3769 	count = 0;
3770 
3771 	mp = NULL;
3772 
3773 #if CONFIG_MACF
3774 	if (uap->mac != USER_ADDR_NULL) {
3775 		u_int32_t *mp0;
3776 		int error;
3777 		unsigned int i;
3778 
3779 		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
3780 		if (count != maxcount) {
3781 			return EINVAL;
3782 		}
3783 
3784 		/* Copy in the array */
3785 		mp0 = kalloc_data(macsize, Z_WAITOK);
3786 		if (mp0 == NULL) {
3787 			return ENOMEM;
3788 		}
3789 
3790 		error = copyin(uap->mac, mp0, macsize);
3791 		if (error) {
3792 			kfree_data(mp0, macsize);
3793 			return error;
3794 		}
3795 
3796 		/* Normalize to an array of user_addr_t */
3797 		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
3798 		if (mp == NULL) {
3799 			kfree_data(mp0, macsize);
3800 			return ENOMEM;
3801 		}
3802 
3803 		for (i = 0; i < count; i++) {
3804 			if (IS_64BIT_PROCESS(p)) {
3805 				mp[i] = ((user_addr_t *)mp0)[i];
3806 			} else {
3807 				mp[i] = (user_addr_t)mp0[i];
3808 			}
3809 		}
3810 		kfree_data(mp0, macsize);
3811 	}
3812 #endif
3813 
3814 
3815 	fst.sfsp = sfsp;
3816 	fst.mp = mp;
3817 	fst.flags = uap->flags;
3818 	fst.count = 0;
3819 	fst.error = 0;
3820 	fst.maxcount = (int)maxcount;
3821 
3822 
3823 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);
3824 
3825 	if (mp) {
3826 		kfree_data(mp, count * sizeof(user_addr_t));
3827 	}
3828 
3829 	if (fst.error) {
3830 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3831 		return fst.error;
3832 	}
3833 
3834 	if (fst.sfsp && fst.count > fst.maxcount) {
3835 		*retval = fst.maxcount;
3836 	} else {
3837 		*retval = fst.count;
3838 	}
3839 	return 0;
3840 }
3841 
/*
 * vfs_iterate() callback for getfsstat64(): copies one mount's
 * struct statfs64 out to user space.  Mounts are still counted once
 * the buffer is full so the total can be reported.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			/* Dead mount or refresh failure: skip without counting it. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += sizeof(sfs);
	}
	fstp->count++;
	return VFS_RETURNED;
}
3886 
3887 /*
3888  * Get statistics on all file systems in 64 bit mode.
3889  */
3890 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3891 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3892 {
3893 	user_addr_t sfsp;
3894 	int count, maxcount;
3895 	struct getfsstat_struct fst;
3896 
3897 	maxcount = uap->bufsize / sizeof(struct statfs64);
3898 
3899 	sfsp = uap->buf;
3900 	count = 0;
3901 
3902 	fst.sfsp = sfsp;
3903 	fst.flags = uap->flags;
3904 	fst.count = 0;
3905 	fst.error = 0;
3906 	fst.maxcount = maxcount;
3907 
3908 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3909 
3910 	if (fst.error) {
3911 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3912 		return fst.error;
3913 	}
3914 
3915 	if (fst.sfsp && fst.count > fst.maxcount) {
3916 		*retval = fst.maxcount;
3917 	} else {
3918 		*retval = fst.count;
3919 	}
3920 
3921 	return 0;
3922 }
3923 
3924 /*
3925  * gets the associated vnode with the file descriptor passed.
3926  * as input
3927  *
3928  * INPUT
3929  * ctx - vfs context of caller
3930  * fd - file descriptor for which vnode is required.
3931  * vpp - Pointer to pointer to vnode to be returned.
3932  *
3933  * The vnode is returned with an iocount so any vnode obtained
3934  * by this call needs a vnode_put
3935  *
3936  */
3937 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3938 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3939 {
3940 	int error;
3941 	vnode_t vp;
3942 	struct fileproc *fp;
3943 	proc_t p = vfs_context_proc(ctx);
3944 
3945 	*vpp =  NULLVP;
3946 
3947 	error = fp_getfvp(p, fd, &fp, &vp);
3948 	if (error) {
3949 		return error;
3950 	}
3951 
3952 	error = vnode_getwithref(vp);
3953 	if (error) {
3954 		(void)fp_drop(p, fd, fp, 0);
3955 		return error;
3956 	}
3957 
3958 	(void)fp_drop(p, fd, fp, 0);
3959 	*vpp = vp;
3960 	return error;
3961 }
3962 
3963 /*
3964  * Wrapper function around namei to start lookup from a directory
3965  * specified by a file descriptor ni_dirfd.
3966  *
3967  * In addition to all the errors returned by namei, this call can
3968  * return ENOTDIR if the file descriptor does not refer to a directory.
3969  * and EBADF if the file descriptor is not valid.
3970  */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only a relative path on a fresh (non-continued, no caller-supplied
	 * dvp) lookup is anchored at dirfd; everything else falls through
	 * to plain namei().
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first path byte to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Anchor the lookup at dirfd's directory via USEDVP. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	return namei(ndp);
}
4014 
4015 /*
4016  * Change current working directory to a given file descriptor.
4017  */
4018 /* ARGSUSED */
/*
 * Shared implementation of fchdir(2) and __pthread_fchdir().  With
 * per_thread set, the new directory becomes the calling thread's cwd
 * (uu_cdir) instead of the process's; fd == -1 then clears the
 * per-thread cwd.  The new cwd holds a usecount (vnode_ref); the
 * previous one's usecount is released.
 */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding.  The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs search permission on the new directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If something is mounted on this directory, descend to the root
	 * of the covering filesystem (repeatedly, for stacked mounts).
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the iocount for a long-lived usecount on the new cwd. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Swap the process cwd under the fd and dirs locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount held by the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(uap->fd);

	return error;
}
4131 
/* fchdir(2): change the process's current directory to the one open on fd. */
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, uap, 0);
}
4137 
/*
 * Per-thread variant of fchdir(2).  The uap cast relies on
 * __pthread_fchdir_args and fchdir_args sharing the same layout
 * (a single fd argument).
 */
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, (void *)uap, 1);
}
4143 
4144 
4145 /*
4146  * Change current working directory (".").
4147  *
4148  * Returns:	0			Success
4149  *	change_dir:ENOTDIR
4150  *	change_dir:???
4151  *	vnode_ref:ENOENT		No such file or directory
4152  */
4153 /* ARGSUSED */
4154 int
chdir_internal(proc_t p,vfs_context_t ctx,struct nameidata * ndp,int per_thread)4155 chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
4156 {
4157 	int error;
4158 	vnode_t tvp;
4159 
4160 	error = change_dir(ndp, ctx);
4161 	if (error) {
4162 		return error;
4163 	}
4164 	if ((error = vnode_ref(ndp->ni_vp))) {
4165 		vnode_put(ndp->ni_vp);
4166 		return error;
4167 	}
4168 	/*
4169 	 * drop the iocount we picked up in change_dir
4170 	 */
4171 	vnode_put(ndp->ni_vp);
4172 
4173 	if (per_thread) {
4174 		thread_t th = vfs_context_thread(ctx);
4175 		if (th) {
4176 			uthread_t uth = get_bsdthread_info(th);
4177 			tvp = uth->uu_cdir;
4178 			uth->uu_cdir = ndp->ni_vp;
4179 			OSBitOrAtomic(P_THCWD, &p->p_flag);
4180 		} else {
4181 			vnode_rele(ndp->ni_vp);
4182 			return ENOENT;
4183 		}
4184 	} else {
4185 		proc_dirs_lock_exclusive(p);
4186 		proc_fdlock(p);
4187 		tvp = p->p_fd.fd_cdir;
4188 		p->p_fd.fd_cdir = ndp->ni_vp;
4189 		proc_fdunlock(p);
4190 		proc_dirs_unlock_exclusive(p);
4191 	}
4192 
4193 	if (tvp) {
4194 		vnode_rele(tvp);
4195 	}
4196 
4197 	return 0;
4198 }
4199 
4200 
4201 /*
4202  * Change current working directory (".").
4203  *
4204  * Returns:	0			Success
4205  *	chdir_internal:ENOTDIR
4206  *	chdir_internal:ENOENT		No such file or directory
4207  *	chdir_internal:???
4208  */
4209 /* ARGSUSED */
4210 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4211 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4212 {
4213 	struct nameidata nd;
4214 	vfs_context_t ctx = vfs_context_current();
4215 
4216 	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4217 	    UIO_USERSPACE, uap->path, ctx);
4218 
4219 	return chdir_internal(p, ctx, &nd, per_thread);
4220 }
4221 
4222 
4223 /*
4224  * chdir
4225  *
4226  * Change current working directory (".") for the entire process
4227  *
4228  * Parameters:  p       Process requesting the call
4229  *              uap     User argument descriptor (see below)
4230  *              retval  (ignored)
4231  *
4232  * Indirect parameters:	uap->path	Directory path
4233  *
4234  * Returns:	0			Success
4235  *              common_chdir: ENOTDIR
4236  *              common_chdir: ENOENT	No such file or directory
4237  *              common_chdir: ???
4238  *
4239  */
4240 int
chdir(proc_t p,struct chdir_args * uap,__unused int32_t * retval)4241 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
4242 {
4243 	return common_chdir(p, (void *)uap, 0);
4244 }
4245 
4246 /*
4247  * __pthread_chdir
4248  *
4249  * Change current working directory (".") for a single thread
4250  *
4251  * Parameters:  p       Process requesting the call
4252  *              uap     User argument descriptor (see below)
4253  *              retval  (ignored)
4254  *
4255  * Indirect parameters:	uap->path	Directory path
4256  *
4257  * Returns:	0			Success
4258  *              common_chdir: ENOTDIR
4259  *		common_chdir: ENOENT	No such file or directory
4260  *		common_chdir: ???
4261  *
4262  */
4263 int
__pthread_chdir(proc_t p,struct __pthread_chdir_args * uap,__unused int32_t * retval)4264 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
4265 {
4266 	return common_chdir(p, (void *)uap, 1);
4267 }
4268 
4269 
4270 /*
4271  * Change notion of root (``/'') directory.
4272  */
4273 /* ARGSUSED */
4274 int
chroot(proc_t p,struct chroot_args * uap,__unused int32_t * retval)4275 chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
4276 {
4277 	struct filedesc *fdp = &p->p_fd;
4278 	int error;
4279 	struct nameidata nd;
4280 	vnode_t tvp;
4281 	vfs_context_t ctx = vfs_context_current();
4282 
4283 	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
4284 		return error;
4285 	}
4286 
4287 	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
4288 	    UIO_USERSPACE, uap->path, ctx);
4289 	error = change_dir(&nd, ctx);
4290 	if (error) {
4291 		return error;
4292 	}
4293 
4294 #if CONFIG_MACF
4295 	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
4296 	    &nd.ni_cnd);
4297 	if (error) {
4298 		vnode_put(nd.ni_vp);
4299 		return error;
4300 	}
4301 #endif
4302 
4303 	if ((error = vnode_ref(nd.ni_vp))) {
4304 		vnode_put(nd.ni_vp);
4305 		return error;
4306 	}
4307 	vnode_put(nd.ni_vp);
4308 
4309 	/*
4310 	 * This lock provides the guarantee that as long as you hold the lock
4311 	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
4312 	 * on a referenced vnode in namei when determining the rootvnode for
4313 	 * a process.
4314 	 */
4315 	/* needed for synchronization with lookup */
4316 	proc_dirs_lock_exclusive(p);
4317 	/* needed for setting the flag and other activities on the fd itself */
4318 	proc_fdlock(p);
4319 	tvp = fdp->fd_rdir;
4320 	fdp->fd_rdir = nd.ni_vp;
4321 	fdt_flag_set(fdp, FD_CHROOT);
4322 	proc_fdunlock(p);
4323 	proc_dirs_unlock_exclusive(p);
4324 
4325 	if (tvp != NULL) {
4326 		vnode_rele(tvp);
4327 	}
4328 
4329 	return 0;
4330 }
4331 
4332 #define PATHSTATICBUFLEN 256
4333 #define PIVOT_ROOT_ENTITLEMENT              \
4334        "com.apple.private.vfs.pivot-root"
4335 
#if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Small on-stack path buffers; long paths fall back to ZV_NAMEI heap buffers. */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the new-root path; retry with a MAXPATHLEN heap buffer if long. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same two-step copyin for where the old root should appear afterwards. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Point incoming/outgoing at whichever buffer (static or heap) was used. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	/* Single exit: drop the lookup iocount and any heap path buffers. */
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
#else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root(2) is macOS-only; everywhere else it is not implemented. */
	return nosys(p, NULL, retval);
}
#endif /* XNU_TARGET_OS_OSX */
4435 
4436 /*
4437  * Common routine for chroot and chdir.
4438  *
4439  * Returns:	0			Success
4440  *		ENOTDIR			Not a directory
4441  *		namei:???		[anything namei can return]
4442  *		vnode_authorize:???	[anything vnode_authorize can return]
4443  */
4444 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4445 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4446 {
4447 	vnode_t vp;
4448 	int error;
4449 
4450 	if ((error = namei(ndp))) {
4451 		return error;
4452 	}
4453 	nameidone(ndp);
4454 	vp = ndp->ni_vp;
4455 
4456 	if (vp->v_type != VDIR) {
4457 		vnode_put(vp);
4458 		return ENOTDIR;
4459 	}
4460 
4461 #if CONFIG_MACF
4462 	error = mac_vnode_check_chdir(ctx, vp);
4463 	if (error) {
4464 		vnode_put(vp);
4465 		return error;
4466 	}
4467 #endif
4468 
4469 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4470 	if (error) {
4471 		vnode_put(vp);
4472 		return error;
4473 	}
4474 
4475 	return error;
4476 }
4477 
4478 /*
4479  * Free the vnode data (for directories) associated with the file glob.
4480  */
4481 struct fd_vn_data *
fg_vn_data_alloc(void)4482 fg_vn_data_alloc(void)
4483 {
4484 	struct fd_vn_data *fvdata;
4485 
4486 	/* Allocate per fd vnode data */
4487 	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4488 	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4489 	return fvdata;
4490 }
4491 
4492 /*
4493  * Free the vnode data (for directories) associated with the file glob.
4494  */
4495 void
fg_vn_data_free(void * fgvndata)4496 fg_vn_data_free(void *fgvndata)
4497 {
4498 	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4499 
4500 	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4501 	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4502 	kfree_type(struct fd_vn_data, fvdata);
4503 }
4504 
4505 /*
4506  * Check permissions, allocate an open file structure,
4507  * and call the device open routine if any.
4508  *
4509  * Returns:	0			Success
4510  *		EINVAL
4511  *		EINTR
4512  *	falloc:ENFILE
4513  *	falloc:EMFILE
4514  *	falloc:ENOMEM
4515  *	vn_open_auth:???
4516  *	dupfdopen:???
4517  *	VNOP_ADVLOCK:???
4518  *	vnode_setsize:???
4519  *
4520  * XXX Need to implement uid, gid
4521  */
4522 int
open1(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval,int authfd)4523 open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4524     struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
4525 {
4526 	proc_t p = vfs_context_proc(ctx);
4527 	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
4528 	struct fileproc *fp;
4529 	vnode_t vp;
4530 	int flags, oflags, amode;
4531 	int type, indx, error;
4532 	struct vfs_context context;
4533 	vnode_t authvp = NULLVP;
4534 
4535 	oflags = uflags;
4536 
4537 	amode = oflags & O_ACCMODE;
4538 	/*
4539 	 * Because O_RDONLY is 0, it is not possible to distinguish between
4540 	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
4541 	 * with FREAD/FWRITE.
4542 	 */
4543 	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
4544 		return EINVAL;
4545 	}
4546 
4547 	flags = FFLAGS(uflags);
4548 	CLR(flags, FENCRYPTED);
4549 	CLR(flags, FUNENCRYPTED);
4550 
4551 	AUDIT_ARG(fflags, oflags);
4552 	AUDIT_ARG(mode, vap->va_mode);
4553 
4554 	if ((error = falloc_withinit(p, &fp, &indx, ctx, fp_init, initarg)) != 0) {
4555 		return error;
4556 	}
4557 	if (flags & O_CLOEXEC) {
4558 		fp->fp_flags |= FP_CLOEXEC;
4559 	}
4560 	if (flags & O_CLOFORK) {
4561 		fp->fp_flags |= FP_CLOFORK;
4562 	}
4563 
4564 	/* setup state to recognize when fdesc_open was called */
4565 	uu->uu_dupfd = -1;
4566 
4567 	/*
4568 	 * Disable read/write access if file is opened with O_EVTONLY and
4569 	 * the process has requested to deny read/write access.
4570 	 */
4571 	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
4572 		flags &= ~(FREAD | FWRITE);
4573 	}
4574 
4575 	if (authfd != AUTH_OPEN_NOAUTHFD) {
4576 		error = vnode_getfromfd(ctx, authfd, &authvp);
4577 		if (error) {
4578 			fp_free(p, indx, fp);
4579 			return error;
4580 		}
4581 	}
4582 
4583 	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
4584 		if (authvp != NULLVP) {
4585 			vnode_put(authvp);
4586 		}
4587 		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
4588 			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
4589 				*retval = indx;
4590 				return 0;
4591 			}
4592 		}
4593 		if (error == ERESTART) {
4594 			error = EINTR;
4595 		}
4596 		fp_free(p, indx, fp);
4597 		return error;
4598 	}
4599 
4600 	if (authvp != NULLVP) {
4601 		vnode_put(authvp);
4602 	}
4603 
4604 	uu->uu_dupfd = 0;
4605 	vp = ndp->ni_vp;
4606 
4607 	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
4608 	fp->fp_glob->fg_ops = &vnops;
4609 	fp_set_data(fp, vp);
4610 
4611 #if CONFIG_FILE_LEASES
4612 	/*
4613 	 * If we are creating a file or open with truncate, we need to break the
4614 	 * lease if there is a read lease placed on the parent dir.
4615 	 */
4616 	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
4617 		vnode_breakdirlease(vp, true, oflags);
4618 	}
4619 	/* Now check if there is a lease placed on the file itself. */
4620 	error = vnode_breaklease(vp, oflags, ctx);
4621 	if (error) {
4622 		goto bad;
4623 	}
4624 #endif /* CONFIG_FILE_LEASES */
4625 
4626 	if (flags & (O_EXLOCK | O_SHLOCK)) {
4627 		struct flock lf = {
4628 			.l_whence = SEEK_SET,
4629 		};
4630 
4631 		if (flags & O_EXLOCK) {
4632 			lf.l_type = F_WRLCK;
4633 		} else {
4634 			lf.l_type = F_RDLCK;
4635 		}
4636 		type = F_FLOCK;
4637 		if ((flags & FNONBLOCK) == 0) {
4638 			type |= F_WAIT;
4639 		}
4640 #if CONFIG_MACF
4641 		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
4642 		    F_SETLK, &lf);
4643 		if (error) {
4644 			goto bad;
4645 		}
4646 #endif
4647 		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
4648 			goto bad;
4649 		}
4650 		fp->fp_glob->fg_flag |= FWASLOCKED;
4651 	}
4652 
4653 	/* try to truncate by setting the size attribute */
4654 	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
4655 		goto bad;
4656 	}
4657 
4658 	/*
4659 	 * For directories we hold some additional information in the fd.
4660 	 */
4661 	if (vnode_vtype(vp) == VDIR) {
4662 		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
4663 	} else {
4664 		fp->fp_glob->fg_vn_data = NULL;
4665 	}
4666 
4667 #if CONFIG_SECLUDED_MEMORY
4668 	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
4669 		memory_object_control_t moc;
4670 		const char *v_name;
4671 
4672 		moc = ubc_getobject(vp, UBC_FLAGS_NONE);
4673 
4674 		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
4675 			/* nothing to do... */
4676 		} else if (fp->fp_glob->fg_flag & FWRITE) {
4677 			/* writable -> no longer  eligible for secluded pages */
4678 			memory_object_mark_eligible_for_secluded(moc,
4679 			    FALSE);
4680 		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
4681 			char pathname[32] = { 0, };
4682 			size_t copied;
4683 			/* XXX FBDP: better way to detect /Applications/ ? */
4684 			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4685 				(void)copyinstr(ndp->ni_dirp,
4686 				    pathname,
4687 				    sizeof(pathname),
4688 				    &copied);
4689 			} else {
4690 				copystr(CAST_DOWN(void *, ndp->ni_dirp),
4691 				    pathname,
4692 				    sizeof(pathname),
4693 				    &copied);
4694 			}
4695 			pathname[sizeof(pathname) - 1] = '\0';
4696 			if (strncmp(pathname,
4697 			    "/Applications/",
4698 			    strlen("/Applications/")) == 0 &&
4699 			    strncmp(pathname,
4700 			    "/Applications/Camera.app/",
4701 			    strlen("/Applications/Camera.app/")) != 0) {
4702 				/*
4703 				 * not writable
4704 				 * AND from "/Applications/"
4705 				 * AND not from "/Applications/Camera.app/"
4706 				 * ==> eligible for secluded
4707 				 */
4708 				memory_object_mark_eligible_for_secluded(moc,
4709 				    TRUE);
4710 			}
4711 		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
4712 		    (v_name = vnode_getname(vp))) {
4713 			size_t len = strlen(v_name);
4714 
4715 			if (!strncmp(v_name, "dyld", len) ||
4716 			    !strncmp(v_name, "launchd", len) ||
4717 			    !strncmp(v_name, "Camera", len) ||
4718 			    !strncmp(v_name, "SpringBoard", len) ||
4719 			    !strncmp(v_name, "backboardd", len)) {
4720 				/*
4721 				 * This file matters when launching Camera:
4722 				 * do not store its contents in the secluded
4723 				 * pool that will be drained on Camera launch.
4724 				 */
4725 				memory_object_mark_eligible_for_secluded(moc,
4726 				    FALSE);
4727 			} else if (!strncmp(v_name, "mediaserverd", len)) {
4728 				memory_object_mark_eligible_for_secluded(moc,
4729 				    FALSE);
4730 				memory_object_mark_for_realtime(moc,
4731 				    true);
4732 			} else if (!strncmp(v_name, "bluetoothd", len)) {
4733 				/*
4734 				 * bluetoothd might be needed for realtime audio
4735 				 * playback.
4736 				 */
4737 				memory_object_mark_eligible_for_secluded(moc,
4738 				    FALSE);
4739 				memory_object_mark_for_realtime(moc,
4740 				    true);
4741 			} else {
4742 				char pathname[64] = { 0, };
4743 				size_t copied;
4744 				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4745 					(void)copyinstr(ndp->ni_dirp,
4746 					    pathname,
4747 					    sizeof(pathname),
4748 					    &copied);
4749 				} else {
4750 					copystr(CAST_DOWN(void *, ndp->ni_dirp),
4751 					    pathname,
4752 					    sizeof(pathname),
4753 					    &copied);
4754 				}
4755 				pathname[sizeof(pathname) - 1] = '\0';
4756 				if (strncmp(pathname,
4757 				    "/Library/Audio/Plug-Ins/",
4758 				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
4759 				    strncmp(pathname,
4760 				    "/System/Library/Audio/Plug-Ins/",
4761 				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
4762 					/*
4763 					 * This may be an audio plugin required
4764 					 * for realtime playback.
4765 					 * ==> NOT eligible for secluded.
4766 					 */
4767 					memory_object_mark_eligible_for_secluded(moc,
4768 					    FALSE);
4769 					memory_object_mark_for_realtime(moc,
4770 					    true);
4771 				}
4772 			}
4773 			vnode_putname(v_name);
4774 		}
4775 	}
4776 #endif /* CONFIG_SECLUDED_MEMORY */
4777 
4778 	vnode_put(vp);
4779 
4780 	/*
4781 	 * The first terminal open (without a O_NOCTTY) by a session leader
4782 	 * results in it being set as the controlling terminal.
4783 	 */
4784 	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
4785 	    !(flags & O_NOCTTY)) {
4786 		int tmp = 0;
4787 
4788 		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
4789 		    (caddr_t)&tmp, ctx);
4790 	}
4791 
4792 	proc_fdlock(p);
4793 	procfdtbl_releasefd(p, indx, NULL);
4794 
4795 	fp_drop(p, indx, fp, 1);
4796 	proc_fdunlock(p);
4797 
4798 	*retval = indx;
4799 
4800 	return 0;
4801 bad:
4802 	context = *vfs_context_current();
4803 	context.vc_ucred = fp->fp_glob->fg_cred;
4804 
4805 	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
4806 	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
4807 		struct flock lf = {
4808 			.l_whence = SEEK_SET,
4809 			.l_type = F_UNLCK,
4810 		};
4811 
4812 		(void)VNOP_ADVLOCK(
4813 			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
4814 	}
4815 
4816 	vn_close(vp, fp->fp_glob->fg_flag, &context);
4817 	vnode_put(vp);
4818 	fp_free(p, indx, fp);
4819 
4820 	return error;
4821 }
4822 
4823 /*
4824  * While most of the *at syscall handlers can call nameiat() which
4825  * is a wrapper around namei, the use of namei and initialisation
4826  * of nameidata are far removed and in different functions  - namei
4827  * gets called in vn_open_auth for open1. So we'll just do here what
4828  * nameiat() does.
4829  */
4830 static int
open1at(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval,int dirfd,int authfd)4831 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4832     struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
4833     int dirfd, int authfd)
4834 {
4835 	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
4836 		int error;
4837 		char c;
4838 
4839 		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4840 			error = copyin(ndp->ni_dirp, &c, sizeof(char));
4841 			if (error) {
4842 				return error;
4843 			}
4844 		} else {
4845 			c = *((char *)(ndp->ni_dirp));
4846 		}
4847 
4848 		if (c != '/') {
4849 			vnode_t dvp_at;
4850 
4851 			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
4852 			    &dvp_at);
4853 			if (error) {
4854 				return error;
4855 			}
4856 
4857 			if (vnode_vtype(dvp_at) != VDIR) {
4858 				vnode_put(dvp_at);
4859 				return ENOTDIR;
4860 			}
4861 
4862 			ndp->ni_dvp = dvp_at;
4863 			ndp->ni_cnd.cn_flags |= USEDVP;
4864 			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
4865 			    retval, authfd);
4866 			vnode_put(dvp_at);
4867 			return error;
4868 		}
4869 	}
4870 
4871 	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
4872 }
4873 
4874 /*
4875  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4876  *
4877  * Parameters:	p			Process requesting the open
4878  *		uap			User argument descriptor (see below)
4879  *		retval			Pointer to an area to receive the
4880  *					return calue from the system call
4881  *
4882  * Indirect:	uap->path		Path to open (same as 'open')
4883  *		uap->flags		Flags to open (same as 'open'
4884  *		uap->uid		UID to set, if creating
4885  *		uap->gid		GID to set, if creating
4886  *		uap->mode		File mode, if creating (same as 'open')
4887  *		uap->xsecurity		ACL to set, if creating
4888  *
4889  * Returns:	0			Success
4890  *		!0			errno value
4891  *
4892  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
4893  *
4894  * XXX:		We should enummerate the possible errno values here, and where
4895  *		in the code they originated.
4896  */
4897 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)4898 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4899 {
4900 	int ciferror;
4901 	kauth_filesec_t xsecdst;
4902 	struct vnode_attr va;
4903 	struct nameidata nd;
4904 	int cmode;
4905 
4906 	AUDIT_ARG(owner, uap->uid, uap->gid);
4907 
4908 	xsecdst = NULL;
4909 	if ((uap->xsecurity != USER_ADDR_NULL) &&
4910 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
4911 		return ciferror;
4912 	}
4913 
4914 	VATTR_INIT(&va);
4915 	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4916 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4917 	if (uap->uid != KAUTH_UID_NONE) {
4918 		VATTR_SET(&va, va_uid, uap->uid);
4919 	}
4920 	if (uap->gid != KAUTH_GID_NONE) {
4921 		VATTR_SET(&va, va_gid, uap->gid);
4922 	}
4923 	if (xsecdst != NULL) {
4924 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4925 		va.va_vaflags |= VA_FILESEC_ACL;
4926 	}
4927 
4928 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4929 	    uap->path, vfs_context_current());
4930 
4931 	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
4932 	    NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
4933 	if (xsecdst != NULL) {
4934 		kauth_filesec_free(xsecdst);
4935 	}
4936 
4937 	return ciferror;
4938 }
4939 
4940 /*
4941  * Go through the data-protected atomically controlled open (2)
4942  *
4943  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4944  */
4945 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)4946 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4947     int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
4948 {
4949 	/*
4950 	 * Follow the same path as normal open(2)
4951 	 * Look up the item if it exists, and acquire the vnode.
4952 	 */
4953 	struct vnode_attr va;
4954 	struct nameidata nd;
4955 	int cmode;
4956 	int error;
4957 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4958 
4959 	VATTR_INIT(&va);
4960 	/* Mask off all but regular access permissions */
4961 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4962 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4963 
4964 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
4965 	    path, ctx);
4966 
4967 	/*
4968 	 * Initialize the extra fields in vnode_attr to pass down our
4969 	 * extra fields.
4970 	 * 1. target cprotect class.
4971 	 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4972 	 */
4973 	if (flags & O_CREAT) {
4974 		/* lower level kernel code validates that the class is valid before applying it. */
4975 		if (class != PROTECTION_CLASS_DEFAULT) {
4976 			/*
4977 			 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4978 			 * file behave the same as open (2)
4979 			 */
4980 			VATTR_SET(&va, va_dataprotect_class, class);
4981 		}
4982 	}
4983 
4984 	if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
4985 		if (flags & (O_RDWR | O_WRONLY)) {
4986 			/*
4987 			 * Not allowed to write raw encrypted bytes or when opening authenticated.
4988 			 */
4989 			return EINVAL;
4990 		}
4991 		if (dpflags & O_DP_GETRAWENCRYPTED) {
4992 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4993 		}
4994 		if (dpflags & O_DP_GETRAWUNENCRYPTED) {
4995 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4996 		}
4997 		if (dpflags & O_DP_AUTHENTICATE) {
4998 			VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
4999 		}
5000 	}
5001 
5002 	error = open1at(vfs_context_current(), &nd, flags, &va,
5003 	    NULL, NULL, retval, fd, authfd);
5004 
5005 	return error;
5006 }
5007 
5008 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)5009 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5010 {
5011 	if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5012 		return EINVAL;
5013 	}
5014 
5015 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5016 	           uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
5017 }
5018 
5019 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)5020 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5021 {
5022 	if (uap->dpflags & O_DP_AUTHENTICATE) {
5023 		return EINVAL;
5024 	}
5025 
5026 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5027 	           uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5028 }
5029 
5030 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval)5031 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5032     int fd, enum uio_seg segflg, int *retval)
5033 {
5034 	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5035 	struct {
5036 		struct vnode_attr va;
5037 		struct nameidata nd;
5038 	} *__open_data;
5039 	struct vnode_attr *vap;
5040 	struct nameidata *ndp;
5041 	int cmode;
5042 	int error;
5043 
5044 	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
5045 	vap = &__open_data->va;
5046 	ndp = &__open_data->nd;
5047 
5048 	VATTR_INIT(vap);
5049 	/* Mask off all but regular access permissions */
5050 	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5051 	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
5052 
5053 	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
5054 	    segflg, path, ctx);
5055 
5056 	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);
5057 
5058 	kfree_type(typeof(*__open_data), __open_data);
5059 
5060 	return error;
5061 }
5062 
5063 int
open(proc_t p,struct open_args * uap,int32_t * retval)5064 open(proc_t p, struct open_args *uap, int32_t *retval)
5065 {
5066 	__pthread_testcancel(1);
5067 	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
5068 }
5069 
5070 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)5071 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
5072     int32_t *retval)
5073 {
5074 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5075 	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
5076 }
5077 
5078 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)5079 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
5080     int32_t *retval)
5081 {
5082 	return openat_internal(vfs_context_current(), uap->path, uap->flags,
5083 	           uap->mode, uap->fd, UIO_USERSPACE, retval);
5084 }
5085 
5086 int
openat(proc_t p,struct openat_args * uap,int32_t * retval)5087 openat(proc_t p, struct openat_args *uap, int32_t *retval)
5088 {
5089 	__pthread_testcancel(1);
5090 	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
5091 }
5092 
5093 #define OPEN_BY_ID_ENTITLEMENT  "com.apple.private.vfs.open-by-id"
5094 
5095 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5096 vfs_context_can_open_by_id(vfs_context_t ctx)
5097 {
5098 	if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5099 		return TRUE;
5100 	}
5101 
5102 	return IOTaskHasEntitlement(vfs_context_task(ctx),
5103 	           OPEN_BY_ID_ENTITLEMENT);
5104 }
5105 
5106 /*
5107  * openbyid_np: open a file given a file system id and a file system object id
5108  *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
5109  *	file systems that don't support object ids it is a node id (uint64_t).
5110  *
5111  * Parameters:	p			Process requesting the open
5112  *		uap			User argument descriptor (see below)
5113  *		retval			Pointer to an area to receive the
5114  *					return calue from the system call
5115  *
5116  * Indirect:	uap->path		Path to open (same as 'open')
5117  *
5118  *		uap->fsid		id of target file system
5119  *		uap->objid		id of target file system object
5120  *		uap->flags		Flags to open (same as 'open')
5121  *
5122  * Returns:	0			Success
5123  *		!0			errno value
5124  *
5125  *
5126  * XXX:		We should enummerate the possible errno values here, and where
5127  *		in the code they originated.
5128  */
5129 int
openbyid_np(__unused proc_t p,struct openbyid_np_args * uap,int * retval)5130 openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
5131 {
5132 	fsid_t fsid;
5133 	uint64_t objid;
5134 	int error;
5135 	char *buf = NULL;
5136 	int buflen = MAXPATHLEN;
5137 	int pathlen = 0;
5138 	vfs_context_t ctx = vfs_context_current();
5139 
5140 	if (!vfs_context_can_open_by_id(ctx)) {
5141 		return EPERM;
5142 	}
5143 
5144 	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
5145 		return error;
5146 	}
5147 
5148 	/*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
5149 	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
5150 		return error;
5151 	}
5152 
5153 	AUDIT_ARG(value32, fsid.val[0]);
5154 	AUDIT_ARG(value64, objid);
5155 
5156 	/*resolve path from fsis, objid*/
5157 	do {
5158 		buf = kalloc_data(buflen + 1, Z_WAITOK);
5159 		if (buf == NULL) {
5160 			return ENOMEM;
5161 		}
5162 
5163 		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
5164 		    buf, FSOPT_ISREALFSID, &pathlen);
5165 
5166 		if (error) {
5167 			kfree_data(buf, buflen + 1);
5168 			buf = NULL;
5169 		}
5170 	} while (error == ENOSPC && (buflen += MAXPATHLEN));
5171 
5172 	if (error) {
5173 		return error;
5174 	}
5175 
5176 	buf[pathlen] = 0;
5177 
5178 	error = openat_internal(
5179 		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);
5180 
5181 	kfree_data(buf, buflen + 1);
5182 
5183 	return error;
5184 }
5185 
5186 
5187 /*
5188  * Create a special file.
5189  */
5190 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5191     int fd);
5192 
/*
 * Common implementation for mknod(2)/mknodat(2): create a character or
 * block special file at 'upath' (resolved relative to 'fd'), or divert
 * to mkfifo1() when the caller asked for a FIFO.  Requires superuser
 * for the device-node cases.
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes requires superuser. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block special files are handled here. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int     update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5295 
5296 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5297 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5298 {
5299 	struct vnode_attr va;
5300 
5301 	VATTR_INIT(&va);
5302 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5303 	VATTR_SET(&va, va_rdev, uap->dev);
5304 
5305 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5306 }
5307 
5308 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5309 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5310 {
5311 	struct vnode_attr va;
5312 
5313 	VATTR_INIT(&va);
5314 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5315 	VATTR_SET(&va, va_rdev, uap->dev);
5316 
5317 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5318 }
5319 
5320 /*
5321  * Create a named pipe.
5322  *
5323  * Returns:	0			Success
5324  *		EEXIST
5325  *	namei:???
5326  *	vnode_authorize:???
5327  *	vn_create:???
5328  */
/*
 * Create a FIFO at 'upath' (resolved relative to 'fd') with the
 * attributes in 'vap'.  Shared by mkfifo(2)/mkfifoat(2), by
 * mkfifo_extended(2), and by mknodat_internal() for S_IFIFO modes.
 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5371 
5372 
5373 /*
5374  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5375  *
5376  * Parameters:	p			Process requesting the open
5377  *		uap			User argument descriptor (see below)
5378  *		retval			(Ignored)
5379  *
5380  * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
5381  *		uap->uid		UID to set
5382  *		uap->gid		GID to set
5383  *		uap->mode		File mode to set (same as 'mkfifo')
5384  *		uap->xsecurity		ACL to set, if creating
5385  *
5386  * Returns:	0			Success
5387  *		!0			errno value
5388  *
5389  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5390  *
 * XXX:		We should enumerate the possible errno values here, and where
5392  *		in the code they originated.
5393  */
5394 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5395 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5396 {
5397 	int ciferror;
5398 	kauth_filesec_t xsecdst;
5399 	struct vnode_attr va;
5400 
5401 	AUDIT_ARG(owner, uap->uid, uap->gid);
5402 
5403 	xsecdst = KAUTH_FILESEC_NONE;
5404 	if (uap->xsecurity != USER_ADDR_NULL) {
5405 		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5406 			return ciferror;
5407 		}
5408 	}
5409 
5410 	VATTR_INIT(&va);
5411 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5412 	if (uap->uid != KAUTH_UID_NONE) {
5413 		VATTR_SET(&va, va_uid, uap->uid);
5414 	}
5415 	if (uap->gid != KAUTH_GID_NONE) {
5416 		VATTR_SET(&va, va_gid, uap->gid);
5417 	}
5418 	if (xsecdst != KAUTH_FILESEC_NONE) {
5419 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5420 		va.va_vaflags |= VA_FILESEC_ACL;
5421 	}
5422 
5423 	ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5424 
5425 	if (xsecdst != KAUTH_FILESEC_NONE) {
5426 		kauth_filesec_free(xsecdst);
5427 	}
5428 	return ciferror;
5429 }
5430 
5431 /* ARGSUSED */
5432 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5433 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5434 {
5435 	struct vnode_attr va;
5436 
5437 	VATTR_INIT(&va);
5438 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5439 
5440 	return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5441 }
5442 
5443 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5444 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5445 {
5446 	struct vnode_attr va;
5447 
5448 	VATTR_INIT(&va);
5449 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5450 
5451 	return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5452 }
5453 
5454 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5455 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5456 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5457 
/*
 * Build the path to 'dvp' (optionally with 'leafname' appended) into the
 * caller-supplied buffer 'path' of size '_len', degrading gracefully when
 * the full path cannot be obtained.  'firmlink' selects whether firmlinks
 * are followed.  Returns the length of the string placed in 'path'
 * (including the terminating NUL) and sets *truncated_path when the
 * result does not reflect the complete path.
 *
 * NOTE(review): the leafname-append logic below bounds itself with
 * MAXPATHLEN rather than '_len'; callers in this file always pass
 * MAXPATHLEN -- confirm before calling with a smaller buffer.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the NUL with a separator, then append the leaf. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path fit the buffer but is at/over MAXPATHLEN-1: flag it truncated. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/* Walk up the parent chain until some ancestor's path fits. */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5525 
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Firmlink-following flavor of safe_getpath_new(). */
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1 /* firmlink */);
}
5531 
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Non-firmlink-following flavor of safe_getpath_new(). */
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0 /* firmlink */);
}
5537 
5538 /*
5539  * Make a hard file link.
5540  *
5541  * Returns:	0			Success
5542  *		EPERM
5543  *		EEXIST
5544  *		EXDEV
5545  *	namei:???
5546  *	vnode_authorize:???
5547  *	VNOP_LINK:???
5548  */
5549 /* ARGSUSED */
/*
 * Common implementation for link(2)/linkat(2): create a hard link at
 * 'link' (relative to 'fd2') to the object named by 'path' (relative to
 * 'fd1').  AT_SYMLINK_FOLLOW in 'flag' controls whether a symlink source
 * is followed.  Emits fsevents, kauth fileop notifications, and audit
 * records as configured.
 */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char  *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node; 'nd' is re-used for the second lookup */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Path strings are only built when someone will consume them. */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int  len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5763 
5764 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5765 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5766 {
5767 	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5768 	           AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5769 }
5770 
5771 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5772 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5773 {
5774 	if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5775 		return EINVAL;
5776 	}
5777 
5778 	return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5779 	           uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5780 }
5781 
5782 /*
5783  * Make a symbolic link.
5784  *
5785  * We could add support for ACLs here too...
5786  */
5787 /* ARGSUSED */
/*
 * Common implementation for symlink(2)/symlinkat(2).
 *
 * 'path_data' is the link contents (the target string) in the address
 * space indicated by 'segflg'; 'link' is the path at which the symlink
 * is created, resolved relative to 'fd'.
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/* Copy the link contents into a kernel buffer if they came from userspace. */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int     update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int             len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Free the copyin buffer; a kernel-space 'path' aliases 'path_data'. */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
5951 
5952 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)5953 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5954 {
5955 	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5956 	           uap->link, UIO_USERSPACE);
5957 }
5958 
5959 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)5960 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5961     __unused int32_t *retval)
5962 {
5963 	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5964 	           uap->path2, UIO_USERSPACE);
5965 }
5966 
5967 /*
5968  * Delete a whiteout from the filesystem.
5969  * No longer supported.
5970  */
5971 int
undelete(__unused proc_t p,__unused struct undelete_args * uap,__unused int32_t * retval)5972 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
5973 {
5974 	return ENOTSUP;
5975 }
5976 
5977 /*
5978  * Delete a name from the filesystem.
5979  */
5980 /* ARGSUSED */
/*
 * Common implementation for unlink(2)/unlinkat(2)/delete(2) and the
 * kernel-internal unlink1().  Removes the object named by 'path_arg',
 * resolved relative to 'start_dvp' (if given) or 'fd'.  Handles both
 * the classic lookup-then-remove path and compound-remove filesystems
 * (redriving the lookup on EKEEPLOOKING / racing-ENOENT), and emits
 * fsevents, kauth fileop notifications, and MACF notifications.
 */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated scratch state: too large to keep on the kernel stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char  *path = NULL;
	char  *no_firmlink_path = NULL;
	int  len_path = 0;
	int  len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* Reset all per-attempt state; we may come back here after ENOENT races. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				if (error == ENOENT) {
					/* The name may have been recycled under us: redrive the lookup. */
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	/* Path strings are only built when someone will consume them. */
	if (need_event || has_listeners) {
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name  cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid.  this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6262 
6263 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6264 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6265     enum uio_seg segflg, int unlink_flags)
6266 {
6267 	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6268 	           unlink_flags);
6269 }
6270 
6271 /*
6272  * Delete a name from the filesystem using Carbon semantics.
6273  */
6274 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6275 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6276 {
6277 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6278 	           uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6279 }
6280 
6281 /*
6282  * Delete a name from the filesystem using POSIX semantics.
6283  */
6284 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6285 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6286 {
6287 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6288 	           uap->path, UIO_USERSPACE, 0);
6289 }
6290 
6291 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6292 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6293 {
6294 	if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6295 		return EINVAL;
6296 	}
6297 
6298 	if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6299 		int unlink_flags = 0;
6300 
6301 		if (uap->flag & AT_REMOVEDIR_DATALESS) {
6302 			unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6303 		}
6304 		return rmdirat_internal(vfs_context_current(), uap->fd,
6305 		           uap->path, UIO_USERSPACE, unlink_flags);
6306 	} else {
6307 		return unlinkat_internal(vfs_context_current(), uap->fd,
6308 		           NULLVP, uap->path, UIO_USERSPACE, 0);
6309 	}
6310 }
6311 
6312 /*
6313  * Reposition read/write file offset.
6314  */
6315 int
lseek(proc_t p,struct lseek_args * uap,off_t * retval)6316 lseek(proc_t p, struct lseek_args *uap, off_t *retval)
6317 {
6318 	struct fileproc *fp;
6319 	vnode_t vp;
6320 	struct vfs_context *ctx;
6321 	off_t offset = uap->offset, file_size;
6322 	int error;
6323 
6324 	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
6325 		if (error == ENOTSUP) {
6326 			return ESPIPE;
6327 		}
6328 		return error;
6329 	}
6330 	if (vnode_isfifo(vp)) {
6331 		file_drop(uap->fd);
6332 		return ESPIPE;
6333 	}
6334 
6335 
6336 	ctx = vfs_context_current();
6337 #if CONFIG_MACF
6338 	if (uap->whence == L_INCR && uap->offset == 0) {
6339 		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
6340 		    fp->fp_glob);
6341 	} else {
6342 		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
6343 		    fp->fp_glob);
6344 	}
6345 	if (error) {
6346 		file_drop(uap->fd);
6347 		return error;
6348 	}
6349 #endif
6350 	if ((error = vnode_getwithref(vp))) {
6351 		file_drop(uap->fd);
6352 		return error;
6353 	}
6354 
6355 	switch (uap->whence) {
6356 	case L_INCR:
6357 		offset += fp->fp_glob->fg_offset;
6358 		break;
6359 	case L_XTND:
6360 		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
6361 			break;
6362 		}
6363 		offset += file_size;
6364 		break;
6365 	case L_SET:
6366 		break;
6367 	case SEEK_HOLE:
6368 		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
6369 		break;
6370 	case SEEK_DATA:
6371 		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
6372 		break;
6373 	default:
6374 		error = EINVAL;
6375 	}
6376 	if (error == 0) {
6377 		if (uap->offset > 0 && offset < 0) {
6378 			/* Incremented/relative move past max size */
6379 			error = EOVERFLOW;
6380 		} else {
6381 			/*
6382 			 * Allow negative offsets on character devices, per
6383 			 * POSIX 1003.1-2001.  Most likely for writing disk
6384 			 * labels.
6385 			 */
6386 			if (offset < 0 && vp->v_type != VCHR) {
6387 				/* Decremented/relative move before start */
6388 				error = EINVAL;
6389 			} else {
6390 				/* Success */
6391 				fp->fp_glob->fg_offset = offset;
6392 				*retval = fp->fp_glob->fg_offset;
6393 			}
6394 		}
6395 	}
6396 
6397 	/*
6398 	 * An lseek can affect whether data is "available to read."  Use
6399 	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
6400 	 */
6401 	post_event_if_success(vp, error, NOTE_NONE);
6402 	(void)vnode_put(vp);
6403 	file_drop(uap->fd);
6404 	return error;
6405 }
6406 
6407 
6408 /*
6409  * Check access permissions.
6410  *
6411  * Returns:	0			Success
6412  *		vnode_authorize:???
6413  */
6414 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6415 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6416 {
6417 	kauth_action_t action;
6418 	int error;
6419 
6420 	/*
6421 	 * If just the regular access bits, convert them to something
6422 	 * that vnode_authorize will understand.
6423 	 */
6424 	if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6425 		action = 0;
6426 		if (uflags & R_OK) {
6427 			action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
6428 		}
6429 		if (uflags & W_OK) {
6430 			if (vnode_isdir(vp)) {
6431 				action |= KAUTH_VNODE_ADD_FILE |
6432 				    KAUTH_VNODE_ADD_SUBDIRECTORY;
6433 				/* might want delete rights here too */
6434 			} else {
6435 				action |= KAUTH_VNODE_WRITE_DATA;
6436 			}
6437 		}
6438 		if (uflags & X_OK) {
6439 			if (vnode_isdir(vp)) {
6440 				action |= KAUTH_VNODE_SEARCH;
6441 			} else {
6442 				action |= KAUTH_VNODE_EXECUTE;
6443 			}
6444 		}
6445 	} else {
6446 		/* take advantage of definition of uflags */
6447 		action = uflags >> 8;
6448 	}
6449 
6450 #if CONFIG_MACF
6451 	error = mac_vnode_check_access(ctx, vp, uflags);
6452 	if (error) {
6453 		return error;
6454 	}
6455 #endif /* MAC */
6456 
6457 	/* action == 0 means only check for existence */
6458 	if (action != 0) {
6459 		error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6460 	} else {
6461 		error = 0;
6462 	}
6463 
6464 	return error;
6465 }
6466 
6467 
6468 
6469 /*
6470  * access_extended: Check access permissions in bulk.
6471  *
6472  * Description:	uap->entries		Pointer to an array of accessx
6473  *                                      descriptor structs, plus one or
6474  *                                      more NULL terminated strings (see
6475  *                                      "Notes" section below).
6476  *		uap->size		Size of the area pointed to by
6477  *					uap->entries.
6478  *		uap->results		Pointer to the results array.
6479  *
6480  * Returns:	0			Success
6481  *		ENOMEM			Insufficient memory
6482  *		EINVAL			Invalid arguments
6483  *		namei:EFAULT		Bad address
6484  *		namei:ENAMETOOLONG	Filename too long
6485  *		namei:ENOENT		No such file or directory
6486  *		namei:ELOOP		Too many levels of symbolic links
6487  *		namei:EBADF		Bad file descriptor
6488  *		namei:ENOTDIR		Not a directory
6489  *		namei:???
6490  *		access1:
6491  *
6492  * Implicit returns:
6493  *		uap->results		Array contents modified
6494  *
6495  * Notes:	The uap->entries are structured as an arbitrary length array
6496  *		of accessx descriptors, followed by one or more NULL terminated
6497  *		strings
6498  *
6499  *			struct accessx_descriptor[0]
6500  *			...
6501  *			struct accessx_descriptor[n]
6502  *			char name_data[0];
6503  *
6504  *		We determine the entry count by walking the buffer containing
6505  *		the uap->entries argument descriptor.  For each descriptor we
6506  *		see, the valid values for the offset ad_name_offset will be
6507  *		in the byte range:
6508  *
6509  *			[ uap->entries + sizeof(struct accessx_descriptor) ]
6510  *						to
6511  *				[ uap->entries + uap->size - 2 ]
6512  *
6513  *		since we must have at least one string, and the string must
6514  *		be at least one character plus the NULL terminator in length.
6515  *
6516  * XXX:		Need to support the check-as uid argument
6517  */
6518 int
access_extended(__unused proc_t p,struct access_extended_args * uap,__unused int32_t * retval)6519 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
6520 {
6521 	struct accessx_descriptor *input = NULL;
6522 	errno_t *result = NULL;
6523 	errno_t error = 0;
6524 	int wantdelete = 0;
6525 	size_t desc_max, desc_actual = 0;
6526 	unsigned int i, j;
6527 	struct vfs_context context;
6528 	struct nameidata nd;
6529 	int niopts;
6530 	vnode_t vp = NULL;
6531 	vnode_t dvp = NULL;
6532 #define ACCESSX_MAX_DESCR_ON_STACK 10
6533 	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
6534 
6535 	context.vc_ucred = NULL;
6536 
6537 	/*
6538 	 * Validate parameters; if valid, copy the descriptor array and string
6539 	 * arguments into local memory.  Before proceeding, the following
6540 	 * conditions must have been met:
6541 	 *
6542 	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
6543 	 * o	There must be sufficient room in the request for at least one
6544 	 *	descriptor and a one yte NUL terminated string.
6545 	 * o	The allocation of local storage must not fail.
6546 	 */
6547 	if (uap->size > ACCESSX_MAX_TABLESIZE) {
6548 		return ENOMEM;
6549 	}
6550 	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
6551 		return EINVAL;
6552 	}
6553 	if (uap->size <= sizeof(stack_input)) {
6554 		input = stack_input;
6555 	} else {
6556 		input = kalloc_data(uap->size, Z_WAITOK);
6557 		if (input == NULL) {
6558 			error = ENOMEM;
6559 			goto out;
6560 		}
6561 	}
6562 	error = copyin(uap->entries, input, uap->size);
6563 	if (error) {
6564 		goto out;
6565 	}
6566 
6567 	AUDIT_ARG(opaque, input, uap->size);
6568 
6569 	/*
6570 	 * Force NUL termination of the copyin buffer to avoid nami() running
6571 	 * off the end.  If the caller passes us bogus data, they may get a
6572 	 * bogus result.
6573 	 */
6574 	((char *)input)[uap->size - 1] = 0;
6575 
6576 	/*
6577 	 * Access is defined as checking against the process' real identity,
6578 	 * even if operations are checking the effective identity.  This
6579 	 * requires that we use a local vfs context.
6580 	 */
6581 	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
6582 	context.vc_thread = current_thread();
6583 
6584 	/*
6585 	 * Find out how many entries we have, so we can allocate the result
6586 	 * array by walking the list and adjusting the count downward by the
6587 	 * earliest string offset we see.
6588 	 */
6589 	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
6590 	desc_actual = desc_max;
6591 	for (i = 0; i < desc_actual; i++) {
6592 		/*
6593 		 * Take the offset to the name string for this entry and
6594 		 * convert to an input array index, which would be one off
6595 		 * the end of the array if this entry was the lowest-addressed
6596 		 * name string.
6597 		 */
6598 		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
6599 
6600 		/*
6601 		 * An offset greater than the max allowable offset is an error.
6602 		 * It is also an error for any valid entry to point
6603 		 * to a location prior to the end of the current entry, if
6604 		 * it's not a reference to the string of the previous entry.
6605 		 */
6606 		if (j > desc_max || (j != 0 && j <= i)) {
6607 			error = EINVAL;
6608 			goto out;
6609 		}
6610 
6611 		/* Also do not let ad_name_offset point to something beyond the size of the input */
6612 		if (input[i].ad_name_offset >= uap->size) {
6613 			error = EINVAL;
6614 			goto out;
6615 		}
6616 
6617 		/*
6618 		 * An offset of 0 means use the previous descriptor's offset;
6619 		 * this is used to chain multiple requests for the same file
6620 		 * to avoid multiple lookups.
6621 		 */
6622 		if (j == 0) {
6623 			/* This is not valid for the first entry */
6624 			if (i == 0) {
6625 				error = EINVAL;
6626 				goto out;
6627 			}
6628 			continue;
6629 		}
6630 
6631 		/*
6632 		 * If the offset of the string for this descriptor is before
6633 		 * what we believe is the current actual last descriptor,
6634 		 * then we need to adjust our estimate downward; this permits
6635 		 * the string table following the last descriptor to be out
6636 		 * of order relative to the descriptor list.
6637 		 */
6638 		if (j < desc_actual) {
6639 			desc_actual = j;
6640 		}
6641 	}
6642 
6643 	/*
6644 	 * We limit the actual number of descriptors we are willing to process
6645 	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
6646 	 * requested does not exceed this limit,
6647 	 */
6648 	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
6649 		error = ENOMEM;
6650 		goto out;
6651 	}
6652 	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
6653 	if (result == NULL) {
6654 		error = ENOMEM;
6655 		goto out;
6656 	}
6657 
6658 	/*
6659 	 * Do the work by iterating over the descriptor entries we know to
6660 	 * at least appear to contain valid data.
6661 	 */
6662 	error = 0;
6663 	for (i = 0; i < desc_actual; i++) {
6664 		/*
6665 		 * If the ad_name_offset is 0, then we use the previous
6666 		 * results to make the check; otherwise, we are looking up
6667 		 * a new file name.
6668 		 */
6669 		if (input[i].ad_name_offset != 0) {
6670 			/* discard old vnodes */
6671 			if (vp) {
6672 				vnode_put(vp);
6673 				vp = NULL;
6674 			}
6675 			if (dvp) {
6676 				vnode_put(dvp);
6677 				dvp = NULL;
6678 			}
6679 
6680 			/*
6681 			 * Scan forward in the descriptor list to see if we
6682 			 * need the parent vnode.  We will need it if we are
6683 			 * deleting, since we must have rights  to remove
6684 			 * entries in the parent directory, as well as the
6685 			 * rights to delete the object itself.
6686 			 */
6687 			wantdelete = input[i].ad_flags & _DELETE_OK;
6688 			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
6689 				if (input[j].ad_flags & _DELETE_OK) {
6690 					wantdelete = 1;
6691 				}
6692 			}
6693 
6694 			niopts = FOLLOW | AUDITVNPATH1;
6695 
6696 			/* need parent for vnode_authorize for deletion test */
6697 			if (wantdelete) {
6698 				niopts |= WANTPARENT;
6699 			}
6700 
6701 			/* do the lookup */
6702 			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
6703 			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
6704 			    &context);
6705 			error = namei(&nd);
6706 			if (!error) {
6707 				vp = nd.ni_vp;
6708 				if (wantdelete) {
6709 					dvp = nd.ni_dvp;
6710 				}
6711 			}
6712 			nameidone(&nd);
6713 		}
6714 
6715 		/*
6716 		 * Handle lookup errors.
6717 		 */
6718 		switch (error) {
6719 		case ENOENT:
6720 		case EACCES:
6721 		case EPERM:
6722 		case ENOTDIR:
6723 			result[i] = error;
6724 			break;
6725 		case 0:
6726 			/* run this access check */
6727 			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
6728 			break;
6729 		default:
6730 			/* fatal lookup error */
6731 
6732 			goto out;
6733 		}
6734 	}
6735 
6736 	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
6737 
6738 	/* copy out results */
6739 	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
6740 
6741 out:
6742 	if (input && input != stack_input) {
6743 		kfree_data(input, uap->size);
6744 	}
6745 	if (result) {
6746 		kfree_data(result, desc_actual * sizeof(errno_t));
6747 	}
6748 	if (vp) {
6749 		vnode_put(vp);
6750 	}
6751 	if (dvp) {
6752 		vnode_put(dvp);
6753 	}
6754 	if (IS_VALID_CRED(context.vc_ucred)) {
6755 		kauth_cred_unref(&context.vc_ucred);
6756 	}
6757 	return error;
6758 }
6759 
6760 
6761 /*
6762  * Returns:	0			Success
6763  *		namei:EFAULT		Bad address
6764  *		namei:ENAMETOOLONG	Filename too long
6765  *		namei:ENOENT		No such file or directory
6766  *		namei:ELOOP		Too many levels of symbolic links
6767  *		namei:EBADF		Bad file descriptor
6768  *		namei:ENOTDIR		Not a directory
6769  *		namei:???
6770  *		access1:
6771  */
6772 static int
faccessat_internal(vfs_context_t ctx,int fd,user_addr_t path,int amode,int flag,enum uio_seg segflg)6773 faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
6774     int flag, enum uio_seg segflg)
6775 {
6776 	int error;
6777 	struct nameidata nd;
6778 	int niopts;
6779 	struct vfs_context context;
6780 #if NAMEDRSRCFORK
6781 	int is_namedstream = 0;
6782 #endif
6783 
6784 	/*
6785 	 * Unless the AT_EACCESS option is used, Access is defined as checking
6786 	 * against the process' real identity, even if operations are checking
6787 	 * the effective identity.  So we need to tweak the credential
6788 	 * in the context for that case.
6789 	 */
6790 	if (!(flag & AT_EACCESS)) {
6791 		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
6792 	} else {
6793 		context.vc_ucred = ctx->vc_ucred;
6794 	}
6795 	context.vc_thread = ctx->vc_thread;
6796 
6797 
6798 	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
6799 	/* need parent for vnode_authorize for deletion test */
6800 	if (amode & _DELETE_OK) {
6801 		niopts |= WANTPARENT;
6802 	}
6803 	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
6804 	    path, &context);
6805 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6806 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
6807 	}
6808 
6809 #if NAMEDRSRCFORK
6810 	/* access(F_OK) calls are allowed for resource forks. */
6811 	if (amode == F_OK) {
6812 		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6813 	}
6814 #endif
6815 	error = nameiat(&nd, fd);
6816 	if (error) {
6817 		goto out;
6818 	}
6819 
6820 #if NAMEDRSRCFORK
6821 	/* Grab reference on the shadow stream file vnode to
6822 	 * force an inactive on release which will mark it
6823 	 * for recycle.
6824 	 */
6825 	if (vnode_isnamedstream(nd.ni_vp) &&
6826 	    (nd.ni_vp->v_parent != NULLVP) &&
6827 	    vnode_isshadow(nd.ni_vp)) {
6828 		is_namedstream = 1;
6829 		vnode_ref(nd.ni_vp);
6830 	}
6831 #endif
6832 
6833 	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);
6834 
6835 #if NAMEDRSRCFORK
6836 	if (is_namedstream) {
6837 		vnode_rele(nd.ni_vp);
6838 	}
6839 #endif
6840 
6841 	vnode_put(nd.ni_vp);
6842 	if (amode & _DELETE_OK) {
6843 		vnode_put(nd.ni_dvp);
6844 	}
6845 	nameidone(&nd);
6846 
6847 out:
6848 	if (!(flag & AT_EACCESS)) {
6849 		kauth_cred_unref(&context.vc_ucred);
6850 	}
6851 	return error;
6852 }
6853 
6854 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6855 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6856 {
6857 	return faccessat_internal(vfs_context_current(), AT_FDCWD,
6858 	           uap->path, uap->flags, 0, UIO_USERSPACE);
6859 }
6860 
6861 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6862 faccessat(__unused proc_t p, struct faccessat_args *uap,
6863     __unused int32_t *retval)
6864 {
6865 	if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6866 		return EINVAL;
6867 	}
6868 
6869 	return faccessat_internal(vfs_context_current(), uap->fd,
6870 	           uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6871 }
6872 
6873 /*
6874  * Returns:	0			Success
6875  *		EFAULT
6876  *	copyout:EFAULT
6877  *	namei:???
6878  *	vn_stat:???
6879  */
6880 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6881 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6882     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6883     enum uio_seg segflg, int fd, int flag)
6884 {
6885 	struct nameidata nd;
6886 	int follow;
6887 	union {
6888 		struct stat sb;
6889 		struct stat64 sb64;
6890 	} source = {};
6891 	union {
6892 		struct user64_stat user64_sb;
6893 		struct user32_stat user32_sb;
6894 		struct user64_stat64 user64_sb64;
6895 		struct user32_stat64 user32_sb64;
6896 	} dest = {};
6897 	caddr_t sbp;
6898 	int error, my_size;
6899 	kauth_filesec_t fsec;
6900 	size_t xsecurity_bufsize;
6901 	void * statptr;
6902 	struct fileproc *fp = NULL;
6903 	int needsrealdev = 0;
6904 
6905 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6906 	NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6907 	    segflg, path, ctx);
6908 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6909 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
6910 	}
6911 
6912 #if NAMEDRSRCFORK
6913 	int is_namedstream = 0;
6914 	/* stat calls are allowed for resource forks. */
6915 	nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6916 #endif
6917 
6918 	if (flag & AT_FDONLY) {
6919 		vnode_t fvp;
6920 
6921 		error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6922 		if (error) {
6923 			return error;
6924 		}
6925 		if ((error = vnode_getwithref(fvp))) {
6926 			file_drop(fd);
6927 			return error;
6928 		}
6929 		nd.ni_vp = fvp;
6930 	} else {
6931 		error = nameiat(&nd, fd);
6932 		if (error) {
6933 			return error;
6934 		}
6935 	}
6936 	fsec = KAUTH_FILESEC_NONE;
6937 
6938 	statptr = (void *)&source;
6939 
6940 #if NAMEDRSRCFORK
6941 	/* Grab reference on the shadow stream file vnode to
6942 	 * force an inactive on release which will mark it
6943 	 * for recycle.
6944 	 */
6945 	if (vnode_isnamedstream(nd.ni_vp) &&
6946 	    (nd.ni_vp->v_parent != NULLVP) &&
6947 	    vnode_isshadow(nd.ni_vp)) {
6948 		is_namedstream = 1;
6949 		vnode_ref(nd.ni_vp);
6950 	}
6951 #endif
6952 
6953 	needsrealdev = flag & AT_REALDEV ? 1 : 0;
6954 	if (fp && (xsecurity == USER_ADDR_NULL)) {
6955 		/*
6956 		 * If the caller has the file open, and is not
6957 		 * requesting extended security information, we are
6958 		 * going to let them get the basic stat information.
6959 		 */
6960 		error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6961 		    fp->fp_glob->fg_cred);
6962 	} else {
6963 		error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6964 		    isstat64, needsrealdev, ctx);
6965 	}
6966 
6967 #if NAMEDRSRCFORK
6968 	if (is_namedstream) {
6969 		vnode_rele(nd.ni_vp);
6970 	}
6971 #endif
6972 	vnode_put(nd.ni_vp);
6973 	nameidone(&nd);
6974 	if (fp) {
6975 		file_drop(fd);
6976 		fp = NULL;
6977 	}
6978 
6979 	if (error) {
6980 		return error;
6981 	}
6982 	/* Zap spare fields */
6983 	if (isstat64 != 0) {
6984 		source.sb64.st_lspare = 0;
6985 		source.sb64.st_qspare[0] = 0LL;
6986 		source.sb64.st_qspare[1] = 0LL;
6987 		if (vfs_context_is64bit(ctx)) {
6988 			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6989 			my_size = sizeof(dest.user64_sb64);
6990 			sbp = (caddr_t)&dest.user64_sb64;
6991 		} else {
6992 			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6993 			my_size = sizeof(dest.user32_sb64);
6994 			sbp = (caddr_t)&dest.user32_sb64;
6995 		}
6996 		/*
6997 		 * Check if we raced (post lookup) against the last unlink of a file.
6998 		 */
6999 		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7000 			source.sb64.st_nlink = 1;
7001 		}
7002 	} else {
7003 		source.sb.st_lspare = 0;
7004 		source.sb.st_qspare[0] = 0LL;
7005 		source.sb.st_qspare[1] = 0LL;
7006 		if (vfs_context_is64bit(ctx)) {
7007 			munge_user64_stat(&source.sb, &dest.user64_sb);
7008 			my_size = sizeof(dest.user64_sb);
7009 			sbp = (caddr_t)&dest.user64_sb;
7010 		} else {
7011 			munge_user32_stat(&source.sb, &dest.user32_sb);
7012 			my_size = sizeof(dest.user32_sb);
7013 			sbp = (caddr_t)&dest.user32_sb;
7014 		}
7015 
7016 		/*
7017 		 * Check if we raced (post lookup) against the last unlink of a file.
7018 		 */
7019 		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7020 			source.sb.st_nlink = 1;
7021 		}
7022 	}
7023 	if ((error = copyout(sbp, ub, my_size)) != 0) {
7024 		goto out;
7025 	}
7026 
7027 	/* caller wants extended security information? */
7028 	if (xsecurity != USER_ADDR_NULL) {
7029 		/* did we get any? */
7030 		if (fsec == KAUTH_FILESEC_NONE) {
7031 			if (susize(xsecurity_size, 0) != 0) {
7032 				error = EFAULT;
7033 				goto out;
7034 			}
7035 		} else {
7036 			/* find the user buffer size */
7037 			xsecurity_bufsize = fusize(xsecurity_size);
7038 
7039 			/* copy out the actual data size */
7040 			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7041 				error = EFAULT;
7042 				goto out;
7043 			}
7044 
7045 			/* if the caller supplied enough room, copy out to it */
7046 			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7047 				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7048 			}
7049 		}
7050 	}
7051 out:
7052 	if (fsec != KAUTH_FILESEC_NONE) {
7053 		kauth_filesec_free(fsec);
7054 	}
7055 	return error;
7056 }
7057 
7058 /*
7059  * stat_extended: Get file status; with extended security (ACL).
7060  *
7061  * Parameters:    p                       (ignored)
7062  *                uap                     User argument descriptor (see below)
7063  *                retval                  (ignored)
7064  *
7065  * Indirect:      uap->path               Path of file to get status from
7066  *                uap->ub                 User buffer (holds file status info)
7067  *                uap->xsecurity          ACL to get (extended security)
7068  *                uap->xsecurity_size     Size of ACL
7069  *
7070  * Returns:        0                      Success
7071  *                !0                      errno value
7072  *
7073  */
7074 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7075 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7076     __unused int32_t *retval)
7077 {
7078 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7079 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7080 	           0);
7081 }
7082 
7083 /*
7084  * Returns:	0			Success
7085  *	fstatat_internal:???		[see fstatat_internal() in this file]
7086  */
7087 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7088 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7089 {
7090 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7091 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7092 }
7093 
7094 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7095 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7096 {
7097 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7098 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7099 }
7100 
7101 /*
7102  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7103  *
7104  * Parameters:    p                       (ignored)
7105  *                uap                     User argument descriptor (see below)
7106  *                retval                  (ignored)
7107  *
7108  * Indirect:      uap->path               Path of file to get status from
7109  *                uap->ub                 User buffer (holds file status info)
7110  *                uap->xsecurity          ACL to get (extended security)
7111  *                uap->xsecurity_size     Size of ACL
7112  *
7113  * Returns:        0                      Success
7114  *                !0                      errno value
7115  *
7116  */
7117 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7118 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7119 {
7120 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7121 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7122 	           0);
7123 }
7124 
7125 /*
7126  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7127  *
7128  * Parameters:    p                       (ignored)
7129  *                uap                     User argument descriptor (see below)
7130  *                retval                  (ignored)
7131  *
7132  * Indirect:      uap->path               Path of file to get status from
7133  *                uap->ub                 User buffer (holds file status info)
7134  *                uap->xsecurity          ACL to get (extended security)
7135  *                uap->xsecurity_size     Size of ACL
7136  *
7137  * Returns:        0                      Success
7138  *                !0                      errno value
7139  *
7140  */
7141 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7142 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7143 {
7144 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7145 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7146 	           AT_SYMLINK_NOFOLLOW);
7147 }
7148 
7149 /*
7150  * Get file status; this version does not follow links.
7151  */
7152 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7153 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7154 {
7155 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7156 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7157 }
7158 
7159 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7160 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7161 {
7162 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7163 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7164 }
7165 
7166 /*
7167  * lstat64_extended: Get file status; can handle large inode numbers; does not
7168  * follow links; with extended security (ACL).
7169  *
7170  * Parameters:    p                       (ignored)
7171  *                uap                     User argument descriptor (see below)
7172  *                retval                  (ignored)
7173  *
7174  * Indirect:      uap->path               Path of file to get status from
7175  *                uap->ub                 User buffer (holds file status info)
7176  *                uap->xsecurity          ACL to get (extended security)
7177  *                uap->xsecurity_size     Size of ACL
7178  *
7179  * Returns:        0                      Success
7180  *                !0                      errno value
7181  *
7182  */
7183 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7184 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7185 {
7186 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7187 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7188 	           AT_SYMLINK_NOFOLLOW);
7189 }
7190 
7191 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7192 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7193 {
7194 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7195 		return EINVAL;
7196 	}
7197 
7198 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7199 	           0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7200 }
7201 
7202 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7203 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7204     __unused int32_t *retval)
7205 {
7206 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7207 		return EINVAL;
7208 	}
7209 
7210 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7211 	           0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7212 }
7213 
7214 /*
7215  * Get configurable pathname variables.
7216  *
7217  * Returns:	0			Success
7218  *	namei:???
7219  *	vn_pathconf:???
7220  *
7221  * Notes:	Global implementation  constants are intended to be
7222  *		implemented in this function directly; all other constants
7223  *		are per-FS implementation, and therefore must be handled in
7224  *		each respective FS, instead.
7225  *
7226  * XXX We implement some things globally right now that should actually be
7227  * XXX per-FS; we will need to deal with this at some point.
7228  */
7229 /* ARGSUSED */
7230 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7231 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7232 {
7233 	int error;
7234 	struct nameidata nd;
7235 	vfs_context_t ctx = vfs_context_current();
7236 
7237 	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7238 	    UIO_USERSPACE, uap->path, ctx);
7239 	error = namei(&nd);
7240 	if (error) {
7241 		return error;
7242 	}
7243 
7244 	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7245 
7246 	vnode_put(nd.ni_vp);
7247 	nameidone(&nd);
7248 	return error;
7249 }
7250 
7251 /*
7252  * Return target name of a symbolic link.
7253  */
7254 /* ARGSUSED */
7255 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7256 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7257     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7258     int *retval)
7259 {
7260 	vnode_t vp;
7261 	uio_t auio;
7262 	int error;
7263 	struct nameidata nd;
7264 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
7265 	bool put_vnode;
7266 
7267 	if (bufsize > INT32_MAX) {
7268 		return EINVAL;
7269 	}
7270 
7271 	if (lnk_vp) {
7272 		vp = lnk_vp;
7273 		put_vnode = false;
7274 	} else {
7275 		NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7276 		    seg, path, ctx);
7277 
7278 		error = nameiat(&nd, fd);
7279 		if (error) {
7280 			return error;
7281 		}
7282 		vp = nd.ni_vp;
7283 		put_vnode = true;
7284 		nameidone(&nd);
7285 	}
7286 
7287 	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7288 	    &uio_buf[0], sizeof(uio_buf));
7289 	uio_addiov(auio, buf, bufsize);
7290 	if (vp->v_type != VLNK) {
7291 		error = EINVAL;
7292 	} else {
7293 #if CONFIG_MACF
7294 		error = mac_vnode_check_readlink(ctx, vp);
7295 #endif
7296 		if (error == 0) {
7297 			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7298 			    ctx);
7299 		}
7300 		if (error == 0) {
7301 			error = VNOP_READLINK(vp, auio, ctx);
7302 		}
7303 	}
7304 
7305 	if (put_vnode) {
7306 		vnode_put(vp);
7307 	}
7308 
7309 	*retval = (int)(bufsize - uio_resid(auio));
7310 	return error;
7311 }
7312 
7313 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7314 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7315 {
7316 	enum uio_seg procseg;
7317 	vnode_t vp;
7318 	int error;
7319 
7320 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7321 
7322 	AUDIT_ARG(fd, uap->fd);
7323 
7324 	if ((error = file_vnode(uap->fd, &vp))) {
7325 		return error;
7326 	}
7327 	if ((error = vnode_getwithref(vp))) {
7328 		file_drop(uap->fd);
7329 		return error;
7330 	}
7331 
7332 	error = readlinkat_internal(vfs_context_current(), -1,
7333 	    vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7334 	    uap->bufsize, procseg, retval);
7335 
7336 	vnode_put(vp);
7337 	file_drop(uap->fd);
7338 	return error;
7339 }
7340 
7341 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7342 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7343 {
7344 	enum uio_seg procseg;
7345 
7346 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7347 	return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7348 	           CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7349 	           uap->count, procseg, retval);
7350 }
7351 
7352 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7353 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7354 {
7355 	enum uio_seg procseg;
7356 
7357 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7358 	return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7359 	           CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7360 	           retval);
7361 }
7362 
/*
 * Change file flags, the deep inner layer.
 *
 * Authorizes the flag change described by 'va' (disregarding immutability,
 * so that immutable flags can be cleared) and then applies it through the
 * caller-supplied 'setattr' callback with argument 'arg'.
 *
 * Returns:	0			Success
 *		!0			errno value
 */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	error = (*setattr)(vp, arg, ctx);

#if CONFIG_MACF
	/* notify MAC policies only after a successful change */
	if (error == 0) {
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
	}
#endif

out:
	return error;
}
7404 
/*
 * Change file flags.
 *
 * Sets va_flags to 'flags' on 'vp' via chflags0()/vnode_setattr().  If the
 * filesystem does not report va_flags as supported after the setattr,
 * ENOTSUP is returned.
 *
 * NOTE: this will vnode_put() `vp'
 */
static int
chflags1(vnode_t vp, int flags, vfs_context_t ctx)
{
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_flags, flags);

	/*
	 * The cast adapts vnode_setattr (which takes struct vnode_attr *)
	 * to chflags0's generic void * callback signature; &va is passed
	 * both as the attr descriptor and as the callback argument.
	 */
	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
	vnode_put(vp);

	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
		error = ENOTSUP;
	}

	return error;
}
7428 
/*
 * Change flags of a file given a path name (chflags(2)).
 */
/* ARGSUSED */
int
chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* also look up the parent so its directory lease can be broken */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(fflags, uap->flags);
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

#if CONFIG_FILE_LEASES
	/* drop the parent's iocount as soon as the lease is broken */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	nameidone(&nd);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, ctx);

	return error;
}
7467 
/*
 * Change flags of a file given a file descriptor (fchflags(2)).
 */
/* ARGSUSED */
int
fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* break any lease on the parent directory before changing flags */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, vfs_context_current());

	file_drop(uap->fd);
	return error;
}
7501 
/*
 * Change security information on a filesystem object.
 *
 * Applies the mode/owner/ACL changes described by 'vap' to 'vp' after MAC
 * and kauth authorization checks.
 *
 * Returns:	0			Success
 *		EPERM			Operation not permitted
 *		vnode_authattr:???	[anything vnode_authattr can return]
 *		vnode_authorize:???	[anything vnode_authorize can return]
 *		vnode_setattr:???	[anything vnode_setattr can return]
 *
 * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
 *		translated to EPERM before being returned.
 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* one MAC check per attribute class the caller is changing */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* notify MAC policies only after the change was applied */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7581 
7582 
/*
 * Change mode of a file given a path name.
 *
 * Returns:	0			Success
 *		namei:???		[anything namei can return]
 *		chmod_vnode:???		[anything chmod_vnode can return]
 */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* also look up the parent so its directory lease can be broken */
	wantparent = WANTPARENT;
#endif

	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* refuse to traverse symlinks in any path component */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	/* drop the parent's iocount as soon as the lease is broken */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7622 
/*
 * Build a vnode_attr from the chmod_extended/fchmod_extended argument set.
 *
 * On success, *pxsecdst holds a kauth_filesec copied in from user space
 * (or NULL if no ACL was supplied); the caller must release it with
 * kauth_filesec_free() after the attributes have been applied.
 *
 * Returns:	0			Success
 *		!0			errno from kauth_copyinfilesec
 */
static int
chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
    gid_t gid, user_addr_t xsecurity)
{
	int error;

	VATTR_INIT(pva);

	if (mode != -1) {
		VATTR_SET(pva, va_mode, mode & ALLPERMS);
	} else {
		/* mode == -1 means "leave the mode alone" */
		pva->va_mode = 0;
	}

	if (uid != KAUTH_UID_NONE) {
		VATTR_SET(pva, va_uid, uid);
	}

	if (gid != KAUTH_GID_NONE) {
		VATTR_SET(pva, va_gid, gid);
	}

	*pxsecdst = NULL;
	switch (xsecurity) {
	case USER_ADDR_NULL:
		/* no ACL change requested */
		break;

	/* sentinel user address 1 requests deletion of the ACL */
	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
		VATTR_SET(pva, va_acl, NULL);
		break;

	default:
		if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
			return error;
		}

		/* point va_acl into the copied-in filesec; caller owns *pxsecdst */
		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
		pva->va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
		break;
	}

	return 0;
}
7667 
7668 /*
7669  * chmod_extended: Change the mode of a file given a path name; with extended
7670  * argument list (including extended security (ACL)).
7671  *
7672  * Parameters:	p			Process requesting the open
7673  *		uap			User argument descriptor (see below)
7674  *		retval			(ignored)
7675  *
7676  * Indirect:	uap->path		Path to object (same as 'chmod')
7677  *		uap->uid		UID to set
7678  *		uap->gid		GID to set
7679  *		uap->mode		File mode to set (same as 'chmod')
7680  *		uap->xsecurity		ACL to set (or delete)
7681  *
7682  * Returns:	0			Success
7683  *		!0			errno value
7684  *
7685  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
7686  *
7687  * XXX:		We should enummerate the possible errno values here, and where
7688  *		in the code they originated.
7689  */
7690 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7691 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7692 {
7693 	int error;
7694 	struct vnode_attr va;
7695 	kauth_filesec_t xsecdst = NULL;
7696 
7697 	AUDIT_ARG(owner, uap->uid, uap->gid);
7698 
7699 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7700 	    uap->gid, uap->xsecurity);
7701 
7702 	if (error) {
7703 		return error;
7704 	}
7705 
7706 	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7707 	    UIO_USERSPACE);
7708 
7709 	if (xsecdst != NULL) {
7710 		kauth_filesec_free(xsecdst);
7711 	}
7712 	return error;
7713 }
7714 
7715 /*
7716  * Returns:	0			Success
7717  *		chmodat:???		[anything chmodat can return]
7718  */
7719 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7720 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7721     int flag, enum uio_seg segflg)
7722 {
7723 	struct vnode_attr va;
7724 
7725 	VATTR_INIT(&va);
7726 	VATTR_SET(&va, va_mode, mode & ALLPERMS);
7727 
7728 	return chmodat(ctx, path, &va, fd, flag, segflg);
7729 }
7730 
7731 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7732 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7733 {
7734 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7735 	           AT_FDCWD, 0, UIO_USERSPACE);
7736 }
7737 
7738 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7739 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7740 {
7741 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7742 		return EINVAL;
7743 	}
7744 
7745 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7746 	           uap->fd, uap->flag, UIO_USERSPACE);
7747 }
7748 
7749 /*
7750  * Change mode of a file given a file descriptor.
7751  */
7752 static int
fchmod1(__unused proc_t p,int fd,struct vnode_attr * vap)7753 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
7754 {
7755 	vnode_t vp;
7756 	int error;
7757 
7758 	AUDIT_ARG(fd, fd);
7759 
7760 	if ((error = file_vnode(fd, &vp)) != 0) {
7761 		return error;
7762 	}
7763 	if ((error = vnode_getwithref(vp)) != 0) {
7764 		file_drop(fd);
7765 		return error;
7766 	}
7767 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7768 
7769 #if CONFIG_FILE_LEASES
7770 	vnode_breakdirlease(vp, true, O_WRONLY);
7771 #endif
7772 
7773 	error = chmod_vnode(vfs_context_current(), vp, vap);
7774 	(void)vnode_put(vp);
7775 	file_drop(fd);
7776 
7777 	return error;
7778 }
7779 
7780 /*
7781  * fchmod_extended: Change mode of a file given a file descriptor; with
7782  * extended argument list (including extended security (ACL)).
7783  *
7784  * Parameters:    p                       Process requesting to change file mode
7785  *                uap                     User argument descriptor (see below)
7786  *                retval                  (ignored)
7787  *
7788  * Indirect:      uap->mode               File mode to set (same as 'chmod')
7789  *                uap->uid                UID to set
7790  *                uap->gid                GID to set
7791  *                uap->xsecurity          ACL to set (or delete)
7792  *                uap->fd                 File descriptor of file to change mode
7793  *
7794  * Returns:        0                      Success
7795  *                !0                      errno value
7796  *
7797  */
7798 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)7799 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
7800 {
7801 	int error;
7802 	struct vnode_attr va;
7803 	kauth_filesec_t xsecdst = NULL;
7804 
7805 	AUDIT_ARG(owner, uap->uid, uap->gid);
7806 
7807 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7808 	    uap->gid, uap->xsecurity);
7809 
7810 	if (error) {
7811 		return error;
7812 	}
7813 
7814 	error = fchmod1(p, uap->fd, &va);
7815 
7816 	if (xsecdst != NULL) {
7817 		kauth_filesec_free(xsecdst);
7818 	}
7819 	return error;
7820 }
7821 
7822 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7823 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7824 {
7825 	struct vnode_attr va;
7826 
7827 	VATTR_INIT(&va);
7828 	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7829 
7830 	return fchmod1(p, uap->fd, &va);
7831 }
7832 
7833 
/*
 * Set ownership given a path name; core of chown()/lchown()/fchownat().
 *
 * uid/gid values equal to VNOVAL mean "leave unchanged".
 * AT_SYMLINK_NOFOLLOW applies to the final path component;
 * AT_SYMLINK_NOFOLLOW_ANY refuses symlinks in every component.
 */
/* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* also look up the parent so its directory lease can be broken */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(owner, uid, gid);

	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent, segflg,
	    path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* break the parent's lease only after authorization succeeded */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES) {
		error = EPERM;
	}

#if CONFIG_FILE_LEASES
	/* the parent's iocount is held until here when leases are built in */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(vp);
	return error;
}
7919 
7920 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)7921 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7922 {
7923 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7924 	           uap->uid, uap->gid, 0, UIO_USERSPACE);
7925 }
7926 
7927 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)7928 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7929 {
7930 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7931 	           uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7932 }
7933 
7934 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)7935 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7936 {
7937 	if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7938 		return EINVAL;
7939 	}
7940 
7941 	return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7942 	           uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7943 }
7944 
/*
 * Set ownership given a file descriptor (fchown(2)).
 *
 * uid/gid values equal to VNOVAL mean "leave unchanged".
 */
/* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	if (uap->uid != VNOVAL) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != VNOVAL) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* permissions failure is reported as EPERM, not EACCES */
		if (error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* break any lease on the parent directory before the setattr */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* notify MAC policies only after a successful change */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
	}
#endif

out:
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8022 
8023 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)8024 getutimes(user_addr_t usrtvp, struct timespec *tsp)
8025 {
8026 	int error;
8027 
8028 	if (usrtvp == USER_ADDR_NULL) {
8029 		struct timeval old_tv;
8030 		/* XXX Y2038 bug because of microtime argument */
8031 		microtime(&old_tv);
8032 		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
8033 		tsp[1] = tsp[0];
8034 	} else {
8035 		if (IS_64BIT_PROCESS(current_proc())) {
8036 			struct user64_timeval tv[2];
8037 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
8038 			if (error) {
8039 				return error;
8040 			}
8041 			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
8042 			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
8043 		} else {
8044 			struct user32_timeval tv[2];
8045 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
8046 			if (error) {
8047 				return error;
8048 			}
8049 			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
8050 			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
8051 		}
8052 	}
8053 	return 0;
8054 }
8055 
/*
 * Apply access/modification times ts[0]/ts[1] to 'vp'.
 *
 * 'nullflag' is non-zero when the caller passed a NULL times pointer
 * ("set to now"); in that case VA_UTIMES_NULL is set for the filesystem
 * and EACCES from the authorization layer is NOT remapped to EPERM.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* explicit-times permission failures report EPERM */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* notify MAC policies only after a successful change */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8112 
/*
 * Set the access and modification times of a file given its path name
 * (utimes(2)).
 */
/* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* also look up the parent so its directory lease can be broken */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	/* nullflag is set when the caller asked for "now" */
	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
#if CONFIG_FILE_LEASES
	/* parent iocount is held until here when leases are built in */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8165 
8166 /*
8167  * Set the access and modification times of a file.
8168  */
8169 /* ARGSUSED */
8170 int
futimes(__unused proc_t p,struct futimes_args * uap,__unused int32_t * retval)8171 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
8172 {
8173 	struct timespec ts[2];
8174 	vnode_t vp;
8175 	user_addr_t usrtvp;
8176 	int error;
8177 
8178 	AUDIT_ARG(fd, uap->fd);
8179 	usrtvp = uap->tptr;
8180 	if ((error = getutimes(usrtvp, ts)) != 0) {
8181 		return error;
8182 	}
8183 	if ((error = file_vnode(uap->fd, &vp)) != 0) {
8184 		return error;
8185 	}
8186 	if ((error = vnode_getwithref(vp))) {
8187 		file_drop(uap->fd);
8188 		return error;
8189 	}
8190 
8191 #if CONFIG_FILE_LEASES
8192 	vnode_breakdirlease(vp, true, O_WRONLY);
8193 #endif
8194 
8195 	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
8196 
8197 	vnode_put(vp);
8198 	file_drop(uap->fd);
8199 	return error;
8200 }
8201 
8202 static int
truncate_validate_common(proc_t p,off_t length)8203 truncate_validate_common(proc_t p, off_t length)
8204 {
8205 	rlim_t fsize_limit;
8206 
8207 	if (length < 0) {
8208 		return EINVAL;
8209 	}
8210 
8211 	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8212 	if ((rlim_t)length > fsize_limit) {
8213 		psignal(p, SIGXFSZ);
8214 		return EFBIG;
8215 	}
8216 
8217 	return 0;
8218 }
8219 
/*
 * Truncate (or extend) the file backing 'vp' to 'length' bytes.
 *
 * 'need_auth' is false when called from ftruncate(), where write access
 * was effectively authorized at open time; true for path-based truncate().
 * 'cred' is used for the MAC checks (NOCRED on the path-based route).
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open.  We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* notify MAC policies only after a successful size change */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8270 
8271 /*
8272  * Truncate a file given its path name.
8273  */
8274 /* ARGSUSED */
8275 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8276 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8277 {
8278 	vfs_context_t ctx = vfs_context_current();
8279 	vnode_t vp;
8280 	int error;
8281 	struct nameidata nd;
8282 
8283 	if ((error = truncate_validate_common(p, uap->length))) {
8284 		return error;
8285 	}
8286 
8287 	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8288 	    UIO_USERSPACE, uap->path, ctx);
8289 
8290 	if ((error = namei(&nd))) {
8291 		return error;
8292 	}
8293 
8294 	vp = nd.ni_vp;
8295 	nameidone(&nd);
8296 
8297 	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8298 	vnode_put(vp);
8299 
8300 	return error;
8301 }
8302 
/*
 * Truncate a file given a file descriptor (ftruncate(2)).
 *
 * Also handles POSIX shared-memory descriptors (DTYPE_PSXSHM) by routing
 * to pshm_truncate(); all other non-vnode descriptor types get EINVAL.
 */
/* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		/* POSIX shared memory has its own truncate path */
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* the descriptor must have been opened for writing */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* need_auth = false: write access was authorized at open time */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	vnode_put(vp);

out:
	file_drop(uap->fd);
	return error;
}
8357 
8358 
8359 /*
8360  * Sync an open file with synchronized I/O _file_ integrity completion
8361  */
8362 /* ARGSUSED */
8363 int
fsync(proc_t p,struct fsync_args * uap,__unused int32_t * retval)8364 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8365 {
8366 	__pthread_testcancel(1);
8367 	return fsync_common(p, uap, MNT_WAIT);
8368 }
8369 
8370 
8371 /*
8372  * Sync an open file with synchronized I/O _file_ integrity completion
8373  *
8374  * Notes:	This is a legacy support function that does not test for
8375  *		thread cancellation points.
8376  */
8377 /* ARGSUSED */
8378 int
fsync_nocancel(proc_t p,struct fsync_nocancel_args * uap,__unused int32_t * retval)8379 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8380 {
8381 	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
8382 }
8383 
8384 
8385 /*
8386  * Sync an open file with synchronized I/O _data_ integrity completion
8387  */
8388 /* ARGSUSED */
8389 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)8390 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8391 {
8392 	__pthread_testcancel(1);
8393 	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
8394 }
8395 
8396 
/*
 * fsync_common
 *
 * Common fsync code to support both synchronized I/O file integrity completion
 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
 *
 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
 * will only guarantee that the file data contents are retrievable.  If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
 * includes additional metadata unnecessary for retrieving the file data
 * contents, such as atime, mtime, ctime, etc., also be committed to stable
 * storage.
 *
 * Parameters:	p				The process
 *		uap->fd				The descriptor to synchronize
 *		flags				The data integrity flags
 *
 * Returns:	int				Success
 *	fp_getfvp:EBADF				Bad file descriptor
 *	fp_getfvp:ENOTSUP			fd does not refer to a vnode
 *	VNOP_FSYNC:???				unspecified
 *
 * Notes:	We use struct fsync_args because it is a short name, and all
 *		caller argument structures are otherwise identical.
 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve fd to fileproc + vnode; file reference dropped below. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	/* Take an iocount on the vnode for the duration of the sync. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8459 
/*
 * Duplicate files.  Source must be a file, target must be a file or
 * must not exist.
 *
 * XXX Copyfile authorisation checking is woefully inadequate, and will not
 *     perform inheritance correctly.
 */
/* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source; the iocount on fvp is held until 'out1'. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/*
	 * Look up the target for creation.  SAVESTART keeps the start
	 * directory (ni_startdir) referenced so it can be released at 'out'.
	 */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target may only be replaced with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Directories cannot be the source or the target. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* Sockets are not copyable (fdesc vnodes, tagged VT_FDESC, are). */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Must be able to read the source... */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	/* ...delete any existing target... */
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	/* ...and add the new entry to the target directory. */
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* Copying a file over its own parent directory makes no sense. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		error = -1; /* sentinel: converted to success on return */
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* fvp == tvp: POSIX treats this as a successful no-op. */
	if (error == -1) {
		return 0;
	}
	return error;
}
8574 
8575 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8576 
/*
 * Helper function for doing clones. The caller is expected to provide an
 * iocounted source vnode and release it.
 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted = 0;
	struct vnode_attr va;
	struct vnode_attr nva;
	uint32_t vnop_flags;

	/* Only regular files, symlinks, and (non-root) directories clone. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
	/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		/* Volume roots and mount points cannot be cloned. */
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Look up the destination for creation; it must not exist yet. */
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Clones cannot cross mount boundaries. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	/* May we add an entry to the destination directory? */
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * May we read the source?  If the caller has already authorized a
	 * data read (e.g. fclonefileat on an FREAD descriptor), skip
	 * re-checking READ_DATA but keep the remaining generic read bits.
	 */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACLs unless the CLONE_ACL
	 * flag is specified. By default, the clone file will inherit the target
	 * directory's ACLs unless the CLONE_ACL flag is specified then it
	 * will inherit the source file's ACLs instead.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	if (flags & CLONE_ACL) {
		VATTR_WANTED(&va, va_acl);
	}

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE; /* kauth_acl_free() needed at 'out' */
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE; /* vn_attribute_cleanup() needed at 'out' */
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

#if CONFIG_FILE_LEASES
	/* Break any directory lease on the destination's parent first. */
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int     update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&nva)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
		/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8810 
8811 /*
8812  * clone files or directories, target must not exist.
8813  */
8814 /* ARGSUSED */
8815 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8816 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8817     __unused int32_t *retval)
8818 {
8819 	vnode_t fvp;
8820 	struct nameidata fromnd;
8821 	int follow;
8822 	int error;
8823 	vfs_context_t ctx = vfs_context_current();
8824 
8825 	/* Check that the flags are valid. */
8826 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8827 		return EINVAL;
8828 	}
8829 
8830 	AUDIT_ARG(fd, uap->src_dirfd);
8831 
8832 	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8833 	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8834 	    UIO_USERSPACE, uap->src, ctx);
8835 	if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8836 		return error;
8837 	}
8838 
8839 	fvp = fromnd.ni_vp;
8840 	nameidone(&fromnd);
8841 
8842 	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8843 	    uap->flags, ctx);
8844 
8845 	vnode_put(fvp);
8846 	return error;
8847 }
8848 
8849 int
fclonefileat(__unused proc_t p,struct fclonefileat_args * uap,__unused int32_t * retval)8850 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
8851     __unused int32_t *retval)
8852 {
8853 	vnode_t fvp;
8854 	struct fileproc *fp;
8855 	int error;
8856 	vfs_context_t ctx = vfs_context_current();
8857 
8858 	/* Check that the flags are valid. */
8859 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8860 		return EINVAL;
8861 	}
8862 
8863 	AUDIT_ARG(fd, uap->src_fd);
8864 	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
8865 	if (error) {
8866 		return error;
8867 	}
8868 
8869 	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
8870 		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
8871 		error = EBADF;
8872 		goto out;
8873 	}
8874 
8875 	if ((error = vnode_getwithref(fvp))) {
8876 		goto out;
8877 	}
8878 
8879 	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
8880 
8881 	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
8882 	    uap->flags, ctx);
8883 
8884 	vnode_put(fvp);
8885 out:
8886 	file_drop(uap->src_fd);
8887 	return error;
8888 }
8889 
8890 static int
rename_submounts_callback(mount_t mp,void * arg)8891 rename_submounts_callback(mount_t mp, void *arg)
8892 {
8893 	int error = 0;
8894 	mount_t pmp = (mount_t)arg;
8895 	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8896 
8897 	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8898 		return 0;
8899 	}
8900 
8901 	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8902 		return 0;
8903 	}
8904 
8905 	if ((error = vfs_busy(mp, LK_NOWAIT))) {
8906 		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8907 		return -1;
8908 	}
8909 
8910 	size_t pathlen = MAXPATHLEN;
8911 	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8912 		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8913 	}
8914 
8915 	vfs_unbusy(mp);
8916 
8917 	return error;
8918 }
8919 
8920 /*
8921  * Rename files.  Source and destination must either both be directories,
8922  * or both not be directories.  If target is a directory, it must be empty.
8923  */
8924 /* ARGSUSED */
8925 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)8926 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8927     int tofd, user_addr_t to, int segflg, u_int uflags)
8928 {
8929 	vnode_t tvp, tdvp;
8930 	vnode_t fvp, fdvp;
8931 	vnode_t mnt_fvp;
8932 	struct nameidata *fromnd, *tond;
8933 	int error = 0;
8934 	int do_retry;
8935 	int retry_count;
8936 	int mntrename;
8937 	int need_event;
8938 	int need_kpath2;
8939 	int has_listeners;
8940 	const char *oname = NULL;
8941 	char *from_name = NULL, *to_name = NULL;
8942 	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8943 	int from_len = 0, to_len = 0;
8944 	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8945 	int holding_mntlock;
8946 	int vn_authorize_skipped;
8947 	mount_t locked_mp = NULL;
8948 	vnode_t oparent = NULLVP;
8949 #if CONFIG_FSE
8950 	fse_info from_finfo = {}, to_finfo;
8951 #endif
8952 	int from_truncated = 0, to_truncated = 0;
8953 	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8954 	int batched = 0;
8955 	struct vnode_attr *fvap, *tvap;
8956 	int continuing = 0;
8957 	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
8958 	int32_t nofollow_any = 0;
8959 	/* carving out a chunk for structs that are too big to be on stack. */
8960 	struct {
8961 		struct nameidata from_node, to_node;
8962 		struct vnode_attr fv_attr, tv_attr;
8963 	} * __rename_data;
8964 
8965 	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
8966 	fromnd = &__rename_data->from_node;
8967 	tond = &__rename_data->to_node;
8968 
8969 	holding_mntlock = 0;
8970 	do_retry = 0;
8971 	retry_count = 0;
8972 retry:
8973 	fvp = tvp = NULL;
8974 	fdvp = tdvp = NULL;
8975 	fvap = tvap = NULL;
8976 	mnt_fvp = NULLVP;
8977 	mntrename = FALSE;
8978 	vn_authorize_skipped = FALSE;
8979 
8980 	if (uflags & RENAME_NOFOLLOW_ANY) {
8981 		nofollow_any = NAMEI_NOFOLLOW_ANY;
8982 	}
8983 	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8984 	    segflg, from, ctx);
8985 	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8986 
8987 	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8988 	    segflg, to, ctx);
8989 	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8990 
8991 continue_lookup:
8992 	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8993 		if ((error = nameiat(fromnd, fromfd))) {
8994 			goto out1;
8995 		}
8996 		fdvp = fromnd->ni_dvp;
8997 		fvp  = fromnd->ni_vp;
8998 
8999 		if (fvp && fvp->v_type == VDIR) {
9000 			tond->ni_cnd.cn_flags |= WILLBEDIR;
9001 		}
9002 	}
9003 
9004 	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9005 		if ((error = nameiat(tond, tofd))) {
9006 			/*
9007 			 * Translate error code for rename("dir1", "dir2/.").
9008 			 */
9009 			if (error == EISDIR && fvp->v_type == VDIR) {
9010 				error = EINVAL;
9011 			}
9012 			goto out1;
9013 		}
9014 		tdvp = tond->ni_dvp;
9015 		tvp  = tond->ni_vp;
9016 	}
9017 
9018 #if DEVELOPMENT || DEBUG
9019 	/*
9020 	 * XXX VSWAP: Check for entitlements or special flag here
9021 	 * so we can restrict access appropriately.
9022 	 */
9023 #else /* DEVELOPMENT || DEBUG */
9024 
9025 	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
9026 		error = EPERM;
9027 		goto out1;
9028 	}
9029 
9030 	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
9031 		error = EPERM;
9032 		goto out1;
9033 	}
9034 #endif /* DEVELOPMENT || DEBUG */
9035 
9036 	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
9037 		error = ENOENT;
9038 		goto out1;
9039 	}
9040 
9041 	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
9042 		int32_t pval = 0;
9043 		int err = 0;
9044 
9045 		/*
9046 		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
9047 		 * has the same name as target iff the following conditions are met:
9048 		 * 1. the target file system is case insensitive
9049 		 * 2. source and target directories are the same
9050 		 * 3. source and target files are the same
9051 		 * 4. name only differs in case (determined by underlying filesystem)
9052 		 */
9053 		if (fvp != tvp || fdvp != tdvp) {
9054 			error = EEXIST;
9055 			goto out1;
9056 		}
9057 
9058 		/*
9059 		 * Assume that the target file system is case sensitive if
9060 		 * _PC_CASE_SENSITIVE selector isn't supported.
9061 		 */
9062 		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
9063 		if (err != 0 || pval != 0) {
9064 			error = EEXIST;
9065 			goto out1;
9066 		}
9067 	}
9068 
9069 	batched = vnode_compound_rename_available(fdvp);
9070 
9071 #if CONFIG_FSE
9072 	need_event = need_fsevent(FSE_RENAME, fdvp);
9073 	if (need_event) {
9074 		if (fvp) {
9075 			get_fse_info(fvp, &from_finfo, ctx);
9076 		} else {
9077 			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
9078 			if (error) {
9079 				goto out1;
9080 			}
9081 
9082 			fvap = &__rename_data->fv_attr;
9083 		}
9084 
9085 		if (tvp) {
9086 			get_fse_info(tvp, &to_finfo, ctx);
9087 		} else if (batched) {
9088 			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
9089 			if (error) {
9090 				goto out1;
9091 			}
9092 
9093 			tvap = &__rename_data->tv_attr;
9094 		}
9095 	}
9096 #else
9097 	need_event = 0;
9098 #endif /* CONFIG_FSE */
9099 
9100 	has_listeners = kauth_authorize_fileop_has_listeners();
9101 
9102 	need_kpath2 = 0;
9103 #if CONFIG_AUDIT
9104 	if (AUDIT_RECORD_EXISTS()) {
9105 		need_kpath2 = 1;
9106 	}
9107 #endif
9108 
9109 	if (need_event || has_listeners) {
9110 		if (from_name == NULL) {
9111 			GET_PATH(from_name);
9112 		}
9113 
9114 		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
9115 
9116 		if (from_name_no_firmlink == NULL) {
9117 			GET_PATH(from_name_no_firmlink);
9118 		}
9119 
9120 		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
9121 	}
9122 
9123 	if (need_event || need_kpath2 || has_listeners) {
9124 		if (to_name == NULL) {
9125 			GET_PATH(to_name);
9126 		}
9127 
9128 		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
9129 
9130 		if (to_name_no_firmlink == NULL) {
9131 			GET_PATH(to_name_no_firmlink);
9132 		}
9133 
9134 		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
9135 		if (to_name && need_kpath2) {
9136 			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9137 		}
9138 	}
9139 	if (!fvp) {
9140 		/*
9141 		 * Claim: this check will never reject a valid rename.
9142 		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9143 		 * Suppose fdvp and tdvp are not on the same mount.
9144 		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
9145 		 *      then you can't move it to within another dir on the same mountpoint.
9146 		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9147 		 *
9148 		 * If this check passes, then we are safe to pass these vnodes to the same FS.
9149 		 */
9150 		if (fdvp->v_mount != tdvp->v_mount) {
9151 			error = EXDEV;
9152 			goto out1;
9153 		}
9154 		goto skipped_lookup;
9155 	}
9156 
9157 	/*
9158 	 * If the source and destination are the same (i.e. they're
9159 	 * links to the same vnode) and the target file system is
9160 	 * case sensitive, then there is nothing to do.
9161 	 *
9162 	 * XXX Come back to this.
9163 	 */
9164 	if (fvp == tvp) {
9165 		int pathconf_val;
9166 
9167 		/*
9168 		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9169 		 * then assume that this file system is case sensitive.
9170 		 */
9171 		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9172 		    pathconf_val != 0) {
9173 			vn_authorize_skipped = TRUE;
9174 			goto out1;
9175 		}
9176 	}
9177 
9178 	/*
9179 	 * Allow the renaming of mount points.
9180 	 * - target must not exist
9181 	 * - target must reside in the same directory as source
9182 	 * - union mounts cannot be renamed
9183 	 * - the root fs, and tightly-linked system volumes, cannot be renamed
9184 	 *
9185 	 * XXX Handle this in VFS after a continued lookup (if we missed
9186 	 * in the cache to start off)
9187 	 *
9188 	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9189 	 * we'll skip past here.  The file system is responsible for
9190 	 * checking that @tvp is not a descendent of @fvp and vice versa
9191 	 * so it should always return EINVAL if either @tvp or @fvp is the
9192 	 * root of a volume.
9193 	 */
9194 	if ((fvp->v_flag & VROOT) &&
9195 	    (fvp->v_type == VDIR) &&
9196 	    (tvp == NULL) &&
9197 	    (fvp->v_mountedhere == NULL) &&
9198 	    (fdvp == tdvp) &&
9199 	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9200 	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9201 	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9202 		vnode_t coveredvp;
9203 
9204 		/* switch fvp to the covered vnode */
9205 		coveredvp = fvp->v_mount->mnt_vnodecovered;
9206 		if ((vnode_getwithref(coveredvp))) {
9207 			error = ENOENT;
9208 			goto out1;
9209 		}
9210 		/*
9211 		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9212 		 * later.
9213 		 */
9214 		mnt_fvp = fvp;
9215 
9216 		fvp = coveredvp;
9217 		mntrename = TRUE;
9218 	}
9219 	/*
9220 	 * Check for cross-device rename.
9221 	 */
9222 	if ((fvp->v_mount != tdvp->v_mount) ||
9223 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
9224 		error = EXDEV;
9225 		goto out1;
9226 	}
9227 
9228 	/*
9229 	 * If source is the same as the destination (that is the
9230 	 * same inode number) then there is nothing to do...
9231 	 * EXCEPT if the underlying file system supports case
9232 	 * insensitivity and is case preserving.  In this case
9233 	 * the file system needs to handle the special case of
9234 	 * getting the same vnode as target (fvp) and source (tvp).
9235 	 *
9236 	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9237 	 * and _PC_CASE_PRESERVING can have this exception, and they need to
9238 	 * handle the special case of getting the same vnode as target and
9239 	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
9240 	 * so not to cause locking problems. There is a single reference on tvp.
9241 	 *
9242 	 * NOTE - that fvp == tvp also occurs if they are hard linked and
9243 	 * that correct behaviour then is just to return success without doing
9244 	 * anything.
9245 	 *
9246 	 * XXX filesystem should take care of this itself, perhaps...
9247 	 */
9248 	if (fvp == tvp && fdvp == tdvp) {
9249 		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9250 		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
9251 		    fromnd->ni_cnd.cn_namelen)) {
9252 			vn_authorize_skipped = TRUE;
9253 			goto out1;
9254 		}
9255 	}
9256 
9257 	if (holding_mntlock && fvp->v_mount != locked_mp) {
9258 		/*
9259 		 * we're holding a reference and lock
9260 		 * on locked_mp, but it no longer matches
9261 		 * what we want to do... so drop our hold
9262 		 */
9263 		mount_unlock_renames(locked_mp);
9264 		mount_drop(locked_mp, 0);
9265 		holding_mntlock = 0;
9266 	}
9267 	if (tdvp != fdvp && fvp->v_type == VDIR) {
9268 		/*
9269 		 * serialize renames that re-shape
9270 		 * the tree... if holding_mntlock is
9271 		 * set, then we're ready to go...
9272 		 * otherwise we
9273 		 * first need to drop the iocounts
9274 		 * we picked up, second take the
9275 		 * lock to serialize the access,
9276 		 * then finally start the lookup
9277 		 * process over with the lock held
9278 		 */
9279 		if (!holding_mntlock) {
9280 			/*
9281 			 * need to grab a reference on
9282 			 * the mount point before we
9283 			 * drop all the iocounts... once
9284 			 * the iocounts are gone, the mount
9285 			 * could follow
9286 			 */
9287 			locked_mp = fvp->v_mount;
9288 			mount_ref(locked_mp, 0);
9289 
9290 			/*
9291 			 * nameidone has to happen before we vnode_put(tvp)
9292 			 * since it may need to release the fs_nodelock on the tvp
9293 			 */
9294 			nameidone(tond);
9295 
9296 			if (tvp) {
9297 				vnode_put(tvp);
9298 			}
9299 			vnode_put(tdvp);
9300 
9301 			/*
9302 			 * nameidone has to happen before we vnode_put(fdvp)
9303 			 * since it may need to release the fs_nodelock on the fvp
9304 			 */
9305 			nameidone(fromnd);
9306 
9307 			vnode_put(fvp);
9308 			vnode_put(fdvp);
9309 
9310 			if (mnt_fvp != NULLVP) {
9311 				vnode_put(mnt_fvp);
9312 			}
9313 
9314 			mount_lock_renames(locked_mp);
9315 			holding_mntlock = 1;
9316 
9317 			goto retry;
9318 		}
9319 	} else {
9320 		/*
9321 		 * when we dropped the iocounts to take
9322 		 * the lock, we allowed the identity of
9323 		 * the various vnodes to change... if they did,
9324 		 * we may no longer be dealing with a rename
9325 		 * that reshapes the tree... once we're holding
9326 		 * the iocounts, the vnodes can't change type
9327 		 * so we're free to drop the lock at this point
9328 		 * and continue on
9329 		 */
9330 		if (holding_mntlock) {
9331 			mount_unlock_renames(locked_mp);
9332 			mount_drop(locked_mp, 0);
9333 			holding_mntlock = 0;
9334 		}
9335 	}
9336 
9337 	if (!batched) {
9338 		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
9339 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9340 		    flags, NULL);
9341 		if (error) {
9342 			if (error == ENOENT) {
9343 				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9344 					/*
9345 					 * We encountered a race where after doing the namei,
9346 					 * tvp stops being valid. If so, simply re-drive the rename
9347 					 * call from the top.
9348 					 */
9349 					do_retry = 1;
9350 					retry_count += 1;
9351 				}
9352 			}
9353 			goto out1;
9354 		}
9355 	}
9356 
9357 	/* Release the 'mnt_fvp' now that it is no longer needed. */
9358 	if (mnt_fvp != NULLVP) {
9359 		vnode_put(mnt_fvp);
9360 		mnt_fvp = NULLVP;
9361 	}
9362 
9363 	// save these off so we can later verify that fvp is the same
9364 	oname   = fvp->v_name;
9365 	oparent = fvp->v_parent;
9366 
9367 skipped_lookup:
9368 #if CONFIG_FILE_LEASES
9369 	/* Lease break needed for source's parent dir? */
9370 	vnode_breakdirlease(fdvp, false, O_WRONLY);
9371 
9372 	/* Lease break needed for target's parent dir? */
9373 	vnode_breakdirlease(tdvp, false, O_WRONLY);
9374 #endif
9375 
9376 	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
9377 	    tdvp, &tvp, &tond->ni_cnd, tvap,
9378 	    flags, ctx);
9379 
9380 	if (holding_mntlock) {
9381 		/*
9382 		 * we can drop our serialization
9383 		 * lock now
9384 		 */
9385 		mount_unlock_renames(locked_mp);
9386 		mount_drop(locked_mp, 0);
9387 		holding_mntlock = 0;
9388 	}
9389 	if (error) {
9390 		if (error == EDATALESS) {
9391 			/*
9392 			 * If we've been here before, something has gone
9393 			 * horribly wrong and we should just get out lest
9394 			 * we spiral around the drain forever.
9395 			 */
9396 			if (flags & VFS_RENAME_DATALESS) {
9397 				error = EIO;
9398 				goto out1;
9399 			}
9400 
9401 			/*
9402 			 * The object we're renaming is dataless (or has a
9403 			 * dataless descendent) and requires materialization
9404 			 * before the rename occurs.  But we're holding the
9405 			 * mount point's rename lock, so it's not safe to
9406 			 * make the upcall.
9407 			 *
9408 			 * In this case, we release the lock, perform the
9409 			 * materialization, and start the whole thing over.
9410 			 */
9411 			error = vnode_materialize_dataless_file(fvp,
9412 			    NAMESPACE_HANDLER_RENAME_OP);
9413 
9414 			if (error == 0) {
9415 				/*
9416 				 * The next time around we need to tell the
				 * file system that the materialization has
9418 				 * been performed.
9419 				 */
9420 				flags |= VFS_RENAME_DATALESS;
9421 				do_retry = 1;
9422 			}
9423 			goto out1;
9424 		}
9425 		if (error == EKEEPLOOKING) {
9426 			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9427 				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9428 					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
9429 				}
9430 			}
9431 
9432 			fromnd->ni_vp = fvp;
9433 			tond->ni_vp = tvp;
9434 
9435 			goto continue_lookup;
9436 		}
9437 
9438 		/*
9439 		 * We may encounter a race in the VNOP where the destination didn't
9440 		 * exist when we did the namei, but it does by the time we go and
9441 		 * try to create the entry. In this case, we should re-drive this rename
9442 		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
9443 		 * but other filesystems susceptible to this race could return it, too.
9444 		 */
9445 		if (error == ERECYCLE) {
9446 			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
9447 				do_retry = 1;
9448 				retry_count += 1;
9449 			} else {
9450 				printf("rename retry limit due to ERECYCLE reached\n");
9451 				error = ENOENT;
9452 			}
9453 		}
9454 
9455 		/*
9456 		 * For compound VNOPs, the authorization callback may return
9457 		 * ENOENT in case of racing hardlink lookups hitting the name
9458 		 * cache, redrive the lookup.
9459 		 */
9460 		if (batched && error == ENOENT) {
9461 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9462 				do_retry = 1;
9463 				retry_count += 1;
9464 			}
9465 		}
9466 
9467 		goto out1;
9468 	}
9469 
9470 	/* call out to allow 3rd party notification of rename.
9471 	 * Ignore result of kauth_authorize_fileop call.
9472 	 */
9473 	kauth_authorize_fileop(vfs_context_ucred(ctx),
9474 	    KAUTH_FILEOP_RENAME,
9475 	    (uintptr_t)from_name, (uintptr_t)to_name);
9476 	if (flags & VFS_RENAME_SWAP) {
9477 		kauth_authorize_fileop(vfs_context_ucred(ctx),
9478 		    KAUTH_FILEOP_RENAME,
9479 		    (uintptr_t)to_name, (uintptr_t)from_name);
9480 	}
9481 
9482 #if CONFIG_FSE
9483 	if (from_name != NULL && to_name != NULL) {
9484 		if (from_truncated || to_truncated) {
9485 			// set it here since only the from_finfo gets reported up to user space
9486 			from_finfo.mode |= FSE_TRUNCATED_PATH;
9487 		}
9488 
9489 		if (tvap && tvp) {
9490 			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
9491 		}
9492 		if (fvap) {
9493 			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
9494 		}
9495 
9496 		if (tvp) {
9497 			add_fsevent(FSE_RENAME, ctx,
9498 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9499 			    FSE_ARG_FINFO, &from_finfo,
9500 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9501 			    FSE_ARG_FINFO, &to_finfo,
9502 			    FSE_ARG_DONE);
9503 			if (flags & VFS_RENAME_SWAP) {
9504 				/*
9505 				 * Strictly speaking, swap is the equivalent of
9506 				 * *three* renames.  FSEvents clients should only take
9507 				 * the events as a hint, so we only bother reporting
9508 				 * two.
9509 				 */
9510 				add_fsevent(FSE_RENAME, ctx,
9511 				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9512 				    FSE_ARG_FINFO, &to_finfo,
9513 				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9514 				    FSE_ARG_FINFO, &from_finfo,
9515 				    FSE_ARG_DONE);
9516 			}
9517 		} else {
9518 			add_fsevent(FSE_RENAME, ctx,
9519 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9520 			    FSE_ARG_FINFO, &from_finfo,
9521 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9522 			    FSE_ARG_DONE);
9523 		}
9524 	}
9525 #endif /* CONFIG_FSE */
9526 
9527 	/*
9528 	 * update filesystem's mount point data
9529 	 */
9530 	if (mntrename) {
9531 		char *cp, *pathend, *mpname;
9532 		char * tobuf;
9533 		struct mount *mp;
9534 		int maxlen;
9535 		size_t len = 0;
9536 
9537 		mp = fvp->v_mountedhere;
9538 
9539 		if (vfs_busy(mp, LK_NOWAIT)) {
9540 			error = EBUSY;
9541 			goto out1;
9542 		}
9543 		tobuf = zalloc(ZV_NAMEI);
9544 
9545 		if (UIO_SEG_IS_USER_SPACE(segflg)) {
9546 			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
9547 		} else {
9548 			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
9549 		}
9550 		if (!error) {
9551 			/* find current mount point prefix */
9552 			pathend = &mp->mnt_vfsstat.f_mntonname[0];
9553 			for (cp = pathend; *cp != '\0'; ++cp) {
9554 				if (*cp == '/') {
9555 					pathend = cp + 1;
9556 				}
9557 			}
9558 			/* find last component of target name */
9559 			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
9560 				if (*cp == '/') {
9561 					mpname = cp + 1;
9562 				}
9563 			}
9564 
9565 			/* Update f_mntonname of sub mounts */
9566 			vfs_iterate(0, rename_submounts_callback, (void *)mp);
9567 
9568 			/* append name to prefix */
9569 			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
9570 			bzero(pathend, maxlen);
9571 
9572 			strlcpy(pathend, mpname, maxlen);
9573 		}
9574 		zfree(ZV_NAMEI, tobuf);
9575 
9576 		vfs_unbusy(mp);
9577 
9578 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
9579 	}
9580 	/*
9581 	 * fix up name & parent pointers.  note that we first
9582 	 * check that fvp has the same name/parent pointers it
9583 	 * had before the rename call... this is a 'weak' check
9584 	 * at best...
9585 	 *
9586 	 * XXX oparent and oname may not be set in the compound vnop case
9587 	 */
9588 	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
9589 		int update_flags;
9590 
9591 		update_flags = VNODE_UPDATE_NAME;
9592 
9593 		if (fdvp != tdvp) {
9594 			update_flags |= VNODE_UPDATE_PARENT;
9595 		}
9596 
9597 		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
9598 	}
9599 out1:
9600 	/*
9601 	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
9602 	 * skipped earlier as no actual rename was performed.
9603 	 */
9604 	if (vn_authorize_skipped && error == 0) {
9605 		error = vn_authorize_renamex_with_paths(fdvp, fvp,
9606 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9607 		    flags, NULL);
9608 		if (error && error == ENOENT) {
9609 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9610 				do_retry = 1;
9611 				retry_count += 1;
9612 			}
9613 		}
9614 	}
9615 	if (to_name != NULL) {
9616 		RELEASE_PATH(to_name);
9617 		to_name = NULL;
9618 	}
9619 	if (to_name_no_firmlink != NULL) {
9620 		RELEASE_PATH(to_name_no_firmlink);
9621 		to_name_no_firmlink = NULL;
9622 	}
9623 	if (from_name != NULL) {
9624 		RELEASE_PATH(from_name);
9625 		from_name = NULL;
9626 	}
9627 	if (from_name_no_firmlink != NULL) {
9628 		RELEASE_PATH(from_name_no_firmlink);
9629 		from_name_no_firmlink = NULL;
9630 	}
9631 	if (holding_mntlock) {
9632 		mount_unlock_renames(locked_mp);
9633 		mount_drop(locked_mp, 0);
9634 		holding_mntlock = 0;
9635 	}
9636 	if (tdvp) {
9637 		/*
9638 		 * nameidone has to happen before we vnode_put(tdvp)
9639 		 * since it may need to release the fs_nodelock on the tdvp
9640 		 */
9641 		nameidone(tond);
9642 
9643 		if (tvp) {
9644 			vnode_put(tvp);
9645 		}
9646 		vnode_put(tdvp);
9647 	}
9648 	if (fdvp) {
9649 		/*
9650 		 * nameidone has to happen before we vnode_put(fdvp)
9651 		 * since it may need to release the fs_nodelock on the fdvp
9652 		 */
9653 		nameidone(fromnd);
9654 
9655 		if (fvp) {
9656 			vnode_put(fvp);
9657 		}
9658 		vnode_put(fdvp);
9659 	}
9660 	if (mnt_fvp != NULLVP) {
9661 		vnode_put(mnt_fvp);
9662 	}
9663 	/*
9664 	 * If things changed after we did the namei, then we will re-drive
9665 	 * this rename call from the top.
9666 	 */
9667 	if (do_retry) {
9668 		do_retry = 0;
9669 		goto retry;
9670 	}
9671 
9672 	kfree_type(typeof(*__rename_data), __rename_data);
9673 	return error;
9674 }
9675 
9676 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9677 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9678 {
9679 	return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9680 	           AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9681 }
9682 
9683 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9684 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9685 {
9686 	if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9687 		return EINVAL;
9688 	}
9689 
9690 	if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9691 		return EINVAL;
9692 	}
9693 
9694 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9695 	           uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9696 }
9697 
9698 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9699 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9700 {
9701 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9702 	           uap->tofd, uap->to, UIO_USERSPACE, 0);
9703 }
9704 
9705 /*
9706  * Make a directory file.
9707  *
9708  * Returns:	0			Success
9709  *		EEXIST
9710  *	namei:???
9711  *	vnode_authorize:???
9712  *	vn_create:???
9713  */
9714 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/*
	 * CREATE-intent lookup; LOCKPARENT keeps an iocount on the parent
	 * directory so we can create the new entry in it afterwards.
	 */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	/* Offer the filesystem a combined lookup+mkdir compound VNOP. */
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* mkdir of an existing leaf must fail with EEXIST. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop the CREATE lookup state before re-driving the lookup. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				/* Target exists: report EEXIST, not the auth error. */
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry in dvp may require breaking a directory lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		/*
		 * EKEEPLOOKING: the compound VNOP wants the lookup continued;
		 * re-enter nameiat() with the state preserved in nd.
		 */
		if (error == EKEEPLOOKING) {
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9830 
9831 /*
9832  * mkdir_extended: Create a directory; with extended security (ACL).
9833  *
9834  * Parameters:    p                       Process requesting to create the directory
9835  *                uap                     User argument descriptor (see below)
9836  *                retval                  (ignored)
9837  *
9838  * Indirect:      uap->path               Path of directory to create
9839  *                uap->mode               Access permissions to set
9840  *                uap->xsecurity          ACL to set
9841  *
9842  * Returns:        0                      Success
9843  *                !0                      Not success
9844  *
9845  */
9846 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9847 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9848 {
9849 	int ciferror;
9850 	kauth_filesec_t xsecdst;
9851 	struct vnode_attr va;
9852 
9853 	AUDIT_ARG(owner, uap->uid, uap->gid);
9854 
9855 	xsecdst = NULL;
9856 	if ((uap->xsecurity != USER_ADDR_NULL) &&
9857 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9858 		return ciferror;
9859 	}
9860 
9861 	VATTR_INIT(&va);
9862 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9863 	if (xsecdst != NULL) {
9864 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9865 		va.va_vaflags |= VA_FILESEC_ACL;
9866 	}
9867 
9868 	ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9869 	    UIO_USERSPACE);
9870 	if (xsecdst != NULL) {
9871 		kauth_filesec_free(xsecdst);
9872 	}
9873 	return ciferror;
9874 }
9875 
9876 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9877 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9878 {
9879 	struct vnode_attr va;
9880 
9881 	VATTR_INIT(&va);
9882 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9883 
9884 	return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9885 	           UIO_USERSPACE);
9886 }
9887 
9888 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9889 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9890 {
9891 	struct vnode_attr va;
9892 
9893 	VATTR_INIT(&va);
9894 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9895 
9896 	return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9897 	           UIO_USERSPACE);
9898 }
9899 
/*
 * Common implementation backing rmdir(2) and unlinkat(2) on directories.
 *
 * ctx           caller's VFS context
 * fd            directory fd the path is resolved relative to (or AT_FDCWD)
 * dirpath       path of the directory to remove, in 'segflg' address space
 * segflg        address space of dirpath (user or system)
 * unlink_flags  VNODE_REMOVE_* flags (e.g. VNODE_REMOVE_DATALESS_DIR)
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char     *path = NULL;
	char     *no_firmlink_path = NULL;
	int       len_path = 0;
	int       len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;
	int batched;

	int restart_flag;

	/* nameidata (plus optional vnode_attr) is too large for the stack. */
	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		/* DELETE-intent lookup; LOCKPARENT keeps an iocount on dvp. */
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Swap-backing vnodes may only be removed by the kernel. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete.  Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						/*
						 * Lookup/authorization race: redrive
						 * a bounded number of times.
						 */
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: the filesystem will do lookup+rmdir as one VNOP. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info  finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Compound case: ask the FS which attrs events need. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		/* Only build pathnames if someone (fsevents/kauth) will consume them. */
		if (need_event || has_listeners) {
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Removing an entry from dvp may require breaking a directory lease. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued with saved state. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * If this fails, we want to keep the original
			 * error.
			 */
			if (vn_remove(dvp, &vp, ndp,
			    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
				error = 0;
			}
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		if (restart_flag == 0) {
			/* Wake any other thread waiting on this restart channel. */
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		/* Brief sleep before restarting the appleDouble cleanup race. */
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10181 
10182 /*
10183  * Remove a directory file.
10184  */
10185 /* ARGSUSED */
10186 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10187 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10188 {
10189 	return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10190 	           CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10191 }
10192 
/*
 * Get direntry length padded to 8 byte alignment.
 * 'namlen' is the name length excluding the NUL terminator; the struct's
 * full MAXPATHLEN-sized name field is replaced by the actual name length.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 * Same idea as above for the legacy dirent layout with a
 * (__DARWIN_MAXNAMLEN + 1)-sized name field.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent (address of its last byte, per d_reclen) */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10204 
10205 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10206 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10207     int *numdirent, vfs_context_t ctxp)
10208 {
10209 	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
10210 	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10211 	    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10212 		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10213 	} else {
10214 		size_t bufsize;
10215 		void * bufptr;
10216 		uio_t auio;
10217 		struct direntry *entry64;
10218 		struct dirent *dep;
10219 		size_t bytesread;
10220 		int error;
10221 
10222 		/*
10223 		 * We're here because the underlying file system does not
10224 		 * support direnties or we mounted denying support so we must
10225 		 * fall back to dirents and convert them to direntries.
10226 		 *
10227 		 * Our kernel buffer needs to be smaller since re-packing will
10228 		 * expand each dirent.  The worse case (when the name length
10229 		 * is 3 or less) corresponds to a struct direntry size of 32
10230 		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10231 		 * (4-byte aligned).  So having a buffer that is 3/8 the size
10232 		 * will prevent us from reading more than we can pack.
10233 		 *
10234 		 * Since this buffer is wired memory, we will limit the
10235 		 * buffer size to a maximum of 32K. We would really like to
10236 		 * use 32K in the MIN(), but we use magic number 87371 to
10237 		 * prevent uio_resid() * 3 / 8 from overflowing.
10238 		 */
10239 		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10240 		bufptr = kalloc_data(bufsize, Z_WAITOK);
10241 		if (bufptr == NULL) {
10242 			return ENOMEM;
10243 		}
10244 
10245 		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10246 		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10247 		auio->uio_offset = uio->uio_offset;
10248 
10249 		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10250 
10251 		dep = (struct dirent *)bufptr;
10252 		bytesread = bufsize - uio_resid(auio);
10253 
10254 		entry64 = kalloc_type(struct direntry, Z_WAITOK);
10255 		/*
10256 		 * Convert all the entries and copy them out to user's buffer.
10257 		 */
10258 		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10259 			/* First check that the dirent struct up to d_name is within the buffer */
10260 			if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10261 			    /* Check that the length of the entire dirent is within the buffer */
10262 			    DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10263 			    /* Check that the actual length including the name doesn't exceed d_reclen */
10264 			    DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10265 				printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10266 				    vp->v_mount->mnt_vfsstat.f_mntonname,
10267 				    vp->v_name ? vp->v_name : "<unknown>");
10268 				error = EIO;
10269 				break;
10270 			}
10271 
10272 			size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
10273 
10274 			bzero(entry64, enbufsize);
10275 			/* Convert a dirent to a dirent64. */
10276 			entry64->d_ino = dep->d_ino;
10277 			entry64->d_seekoff = 0;
10278 			entry64->d_reclen = (uint16_t)enbufsize;
10279 			entry64->d_namlen = dep->d_namlen;
10280 			entry64->d_type = dep->d_type;
10281 			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10282 
10283 			/* Move to next entry. */
10284 			dep = (struct dirent *)((char *)dep + dep->d_reclen);
10285 
10286 			/* Copy entry64 to user's buffer. */
10287 			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10288 		}
10289 
10290 		/* Update the real offset using the offset we got from VNOP_READDIR. */
10291 		if (error == 0) {
10292 			uio->uio_offset = auio->uio_offset;
10293 		}
10294 		uio_free(auio);
10295 		kfree_data(bufptr, bufsize);
10296 		kfree_type(struct direntry, entry64);
10297 		return error;
10298 	}
10299 }
10300 
10301 #define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)
10302 
10303 /*
10304  * Read a block of directory entries in a file system independent format.
10305  */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Serialize offset updates on this open file.  If the vnode backing
	 * the fd changed before we got the offset lock (the union-mount
	 * traversal below can swap it), drop everything and start over.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The fd must be open for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Clamp the request to the maximum wired transfer size. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read from the file's current offset; report it back via *offset. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: drop down to the
	 * lower layer directory and retarget the fd at it.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				/* Swap the fd's vnode to the lower layer and restart. */
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10419 
10420 
10421 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10422 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10423 {
10424 	off_t offset;
10425 	ssize_t bytesread;
10426 	int error, eofflag;
10427 
10428 	AUDIT_ARG(fd, uap->fd);
10429 	error = getdirentries_common(uap->fd, uap->buf, uap->count,
10430 	    &bytesread, &offset, &eofflag, 0);
10431 
10432 	if (error == 0) {
10433 		if (proc_is64bit(p)) {
10434 			user64_long_t base = (user64_long_t)offset;
10435 			error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10436 		} else {
10437 			user32_long_t base = (user32_long_t)offset;
10438 			error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10439 		}
10440 		*retval = (int)bytesread;
10441 	}
10442 	return error;
10443 }
10444 
10445 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10446 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10447 {
10448 	off_t offset;
10449 	ssize_t bytesread;
10450 	int error, eofflag;
10451 	user_size_t bufsize;
10452 
10453 	AUDIT_ARG(fd, uap->fd);
10454 
10455 	/*
10456 	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10457 	 * then the kernel carves out the last 4 bytes to return extended
10458 	 * information to userspace (namely whether we reached EOF with this call).
10459 	 */
10460 	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10461 		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10462 	} else {
10463 		bufsize = uap->bufsize;
10464 	}
10465 
10466 	error = getdirentries_common(uap->fd, uap->buf, bufsize,
10467 	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10468 
10469 	if (error == 0) {
10470 		*retval = bytesread;
10471 		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10472 
10473 		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10474 			getdirentries64_flags_t flags = 0;
10475 			if (eofflag) {
10476 				flags |= GETDIRENTRIES64_EOF;
10477 			}
10478 			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10479 			    sizeof(flags));
10480 		}
10481 	}
10482 	return error;
10483 }
10484 
10485 
10486 /*
10487  * Set the mode mask for creation of filesystem nodes.
10488  * XXX implement xsecurity
10489  */
10490 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
10491 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10492 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10493 {
10494 	AUDIT_ARG(mask, newmask);
10495 	proc_fdlock(p);
10496 	*retval = p->p_fd.fd_cmask;
10497 	p->p_fd.fd_cmask = newmask & ALLPERMS;
10498 	proc_fdunlock(p);
10499 	return 0;
10500 }
10501 
10502 /*
10503  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10504  *
10505  * Parameters:    p                       Process requesting to set the umask
10506  *                uap                     User argument descriptor (see below)
10507  *                retval                  umask of the process (parameter p)
10508  *
10509  * Indirect:      uap->newmask            umask to set
10510  *                uap->xsecurity          ACL to set
10511  *
10512  * Returns:        0                      Success
10513  *                !0                      Not success
10514  *
10515  */
10516 int
umask_extended(proc_t p,struct umask_extended_args * uap,int32_t * retval)10517 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
10518 {
10519 	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
10520 }
10521 
10522 int
umask(proc_t p,struct umask_args * uap,int32_t * retval)10523 umask(proc_t p, struct umask_args *uap, int32_t *retval)
10524 {
10525 	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
10526 }
10527 
10528 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT                               \
10529 	"com.apple.private.vfs.revoke-mounted-device"
10530 
10531 /*
10532  * Void all references to file by ripping underlying filesystem
10533  * away from vnode.
10534  */
10535 /* ARGSUSED */
/*
 * revoke() system call: revoke all outstanding access to a character or
 * block device node via VNOP_REVOKE.  The caller must own the node or be
 * superuser.  Returns 0 or an errno value (ENOTSUP for non-device files,
 * EBUSY for a mounted-on block device).
 */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	/* Resolve the user path (following symlinks) to a vnode. */
	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Only device special files can be revoked. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* A block device with a file system mounted on it is busy. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Caller must be the owner of the node, or superuser. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only issue the revoke if the node is actually in use or aliased. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10588 
10589 
10590 /*
 *  HFS/HFS Plus SPECIFIC SYSTEM CALLS
10592  *  The following system calls are designed to support features
10593  *  which are specific to the HFS & HFS Plus volume formats
10594  */
10595 
10596 
10597 /*
10598  * Obtain attribute information on objects in a directory while enumerating
10599  * the directory.
10600  */
10601 /* ARGSUSED */
/*
 * getdirentriesattr() system call: enumerate a directory open on uap->fd
 * while returning the attributes named in uap->alist for each entry, via
 * VNOP_READDIRATTR.  On success *retval carries the EOF flag, and the
 * entry count, directory state, and starting offset are copied out to
 * uap->count, uap->newstate and uap->basep respectively.
 */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the requested count so it can be restored per union layer. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Serialize offset updates; if the vnode backing the fd changed
	 * before we got the offset lock, drop and retry the lookup.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else {                                                // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	/* NOTE(review): error is always 0 here (checked right after
	 * VNOP_READDIRATTR above), so this test appears to be dead code. */
	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* return error earlier, a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10765 
10766 /*
10767  * Exchange data between two files
10768  */
10769 
10770 /* ARGSUSED */
/*
 * exchangedata() system call: atomically exchange the data of two regular
 * files on the same volume via VNOP_EXCHANGE.  On success, the cached
 * names and parents of the two vnodes are swapped to keep the name cache
 * consistent, and an FSE_EXCHANGE fsevent / fileop notification is posted
 * when there are listeners.
 */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int   flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* Look up the first path. */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/* Look up the second path. */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* Both files must be readable and writable by the caller. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/* Only build the (expensive) paths if someone is listening. */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}
		/*
		 * The data swapped, so swap the cached names and parents of
		 * the two vnodes under the name cache lock to match.
		 */
		name_cache_lock();

		tmpname     = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp           = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10921 
10922 /*
10923  * Return (in MB) the amount of freespace on the given vnode's volume.
10924  */
10925 uint32_t freespace_mb(vnode_t vp);
10926 
10927 uint32_t
freespace_mb(vnode_t vp)10928 freespace_mb(vnode_t vp)
10929 {
10930 	vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10931 	return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10932 	       vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10933 }
10934 
10935 #if CONFIG_SEARCHFS
10936 
10937 /* ARGSUSED */
10938 
/*
 * searchfs() system call: copy in the user's fssearchblock (32- or 64-bit
 * flavor), validate the search parameters, and forward the search to the
 * file system via VNOP_SEARCHFS on the volume's root vnode.  Union mounts
 * are searched one layer at a time; EAGAIN is returned between layers so
 * user space calls back in to continue on the covered file system.
 */
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	size_t mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to go into local memory and it's not that big so we might as well put it all together. */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated */
	/* block.                                                                                             */
	/*                                                                                                    */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
	/*       assumes the size is still 556 bytes it will continue to work                                 */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		/* The referenced name (offset and offset+length) must lie within searchparams1. */
		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * All right, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (uint32_t)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (uint32_t)uap->scriptcode,
	    (uint32_t)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;        // search next layer down
		fserror = EAGAIN;
	}
#endif /* CONFIG_UNION_MOUNTS */

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 *  search state.  Everything was already put into the return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	/* Report the file system's own status (0, EAGAIN, ...) to the caller. */
	error = fserror;

freeandexit:

	kfree_data(searchparams1, mallocsize);

	return error;
} /* end of searchfs system call */
11221 
11222 #else /* CONFIG_SEARCHFS */
11223 
/* Stub when CONFIG_SEARCHFS is disabled: searchfs() is unsupported. */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11229 
11230 #endif /* CONFIG_SEARCHFS */
11231 
11232 
11233 #if CONFIG_DATALESS_FILES
11234 
11235 /*
11236  * === Namespace Resolver Up-call Mechanism ===
11237  *
11238  * When I/O is performed to a dataless file or directory (read, write,
11239  * lookup-in, etc.), the file system performs an upcall to the namespace
11240  * resolver (filecoordinationd) to materialize the object.
11241  *
11242  * We need multiple up-calls to be in flight at once, and we need these
11243  * up-calls to be interruptible, thus the following implementation:
11244  *
11245  * => The nspace_resolver_request represents the in-kernel request state.
11246  *    It contains a request ID, storage space for the errno code returned
11247  *    by filecoordinationd, and flags.
11248  *
11249  * => The request ID is simply a global monotonically incrementing 32-bit
11250  *    number.  Outstanding requests are stored in a hash table, and the
11251  *    hash function is extremely simple.
11252  *
11253  * => When an upcall is to be made to filecoordinationd, a request structure
11254  *    is allocated on the stack (it is small, and needs to live only during
11255  *    the duration of the call to resolve_nspace_item_ext()).  It is
11256  *    initialized and inserted into the table.  Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11258  *    can be inserted into the table (and thus limiting the number of
11259  *    outstanding requests issued to filecoordinationd); waiting for an
11260  *    available slot is interruptible.
11261  *
11262  * => Once the request has been inserted into the table, the up-call is made
11263  *    to filecoordinationd via a MiG-generated stub.  The up-call returns
11264  *    immediately and filecoordinationd processes the request asynchronously.
11265  *
 * => The caller now waits for the request to complete.  This is achieved by
11267  *    sleeping on the address of the request structure and waiting for
11268  *    filecoordinationd to mark the request structure as complete.  This
11269  *    is an interruptible sleep call; if interrupted, the request structure
11270  *    is removed from the table and EINTR is returned to the caller.  If
11271  *    this occurs, an advisory up-call is made to filecoordinationd with
11272  *    the request ID to indicate that the request can be aborted or
11273  *    de-prioritized at the discretion of filecoordinationd.
11274  *
11275  * => When filecoordinationd has completed the request, it signals completion
11276  *    by writing to the vfs.nspace.complete sysctl node.  Only a process
11277  *    decorated as a namespace resolver can write to this sysctl node.  The
11278  *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11279  *    The request ID is looked up in the table, and if the request is found,
11280  *    the error code is stored in the request structure and a wakeup()
11281  *    issued on the address of the request structure.  If the request is not
11282  *    found, we simply drop the completion notification, assuming that the
11283  *    caller was interrupted.
11284  *
11285  * => When the waiting thread wakes up, it extracts the error code from the
11286  *    request structure, removes the request from the table, and returns the
11287  *    error code to the calling function.  Fini!
11288  */
11289 
/*
 * A single outstanding materialization request.  Allocated on the
 * requesting thread's stack and linked into the request hash table
 * only for the duration of the up-call / wait cycle (see the big
 * comment above).
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* request-table linkage */
	vnode_t         r_vp;           /* vnode being materialized; requester holds a ref */
	uint32_t        r_req_id;       /* ID used to match resolver completions */
	int             r_resolver_error; /* errno reported back by the resolver */
	int             r_flags;        /* RRF_* flags below */
};
11297 
11298 #define RRF_COMPLETE    0x0001
11299 
/*
 * Return the next request ID to tag an up-call to filecoordinationd.
 * OSAddAtomic returns the value prior to the increment, so IDs start at
 * 0 and may eventually wrap; only uniqueness among concurrently
 * outstanding requests matters here.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11307 
11308 #define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
11309 #define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */
11310 
11311 static LIST_HEAD(nspace_resolver_requesthead,
11312     nspace_resolver_request) * nspace_resolver_request_hashtbl;
11313 static u_long nspace_resolver_request_hashmask;
11314 static u_int nspace_resolver_request_count;
11315 static bool nspace_resolver_request_wait_slot;
11316 static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
11317 static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
11318     &nspace_resolver_request_lck_grp);
11319 
11320 #define NSPACE_REQ_LOCK() \
11321 	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
11322 #define NSPACE_REQ_UNLOCK() \
11323 	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)
11324 
11325 #define NSPACE_RESOLVER_HASH(req_id)    \
11326 	(&nspace_resolver_request_hashtbl[(req_id) & \
11327 	 nspace_resolver_request_hashmask])
11328 
11329 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id)11330 nspace_resolver_req_lookup(uint32_t req_id)
11331 {
11332 	struct nspace_resolver_requesthead *bucket;
11333 	struct nspace_resolver_request *req;
11334 
11335 	bucket = NSPACE_RESOLVER_HASH(req_id);
11336 	LIST_FOREACH(req, bucket, r_hashlink) {
11337 		if (req->r_req_id == req_id) {
11338 			return req;
11339 		}
11340 	}
11341 
11342 	return NULL;
11343 }
11344 
/*
 * Insert a request into the hash table, applying backpressure: if
 * NSPACE_RESOLVER_MAX_OUTSTANDING requests are already outstanding,
 * sleep (interruptibly) until a slot frees up.
 *
 * Caller must hold NSPACE_REQ_LOCK(); msleep drops and re-acquires it,
 * hence the while loop re-checking the count after each wakeup.
 *
 * Returns 0 on success, or the non-zero msleep error (e.g. EINTR) if
 * the wait for a slot was interrupted.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	/* Wait for the number of outstanding requests to drop below the cap. */
	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	return 0;
}
11371 
/*
 * Remove a request from the hash table and, if another thread is
 * sleeping for a free slot (backpressure in nspace_resolver_req_add),
 * wake it up.
 *
 * Caller must hold NSPACE_REQ_LOCK().
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
#endif /* DIAGNOSTIC */
	LIST_REMOVE(req, r_hashlink);
	nspace_resolver_request_count--;

	if (nspace_resolver_request_wait_slot) {
		nspace_resolver_request_wait_slot = false;
		wakeup(&nspace_resolver_request_count);
	}
}
11389 
11390 static void
nspace_resolver_req_cancel(uint32_t req_id)11391 nspace_resolver_req_cancel(uint32_t req_id)
11392 {
11393 	kern_return_t kr;
11394 	mach_port_t mp;
11395 
11396 	// Failures here aren't fatal -- the cancellation message
11397 	// sent to the resolver is merely advisory.
11398 
11399 	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11400 	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11401 		return;
11402 	}
11403 
11404 	kr = send_nspace_resolve_cancel(mp, req_id);
11405 	if (kr != KERN_SUCCESS) {
11406 		os_log_error(OS_LOG_DEFAULT,
11407 		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11408 	}
11409 
11410 	ipc_port_release_send(mp);
11411 }
11412 
/*
 * Sleep until the resolver marks the given request complete.  The
 * request must already be in the table; on return it has been removed.
 *
 * If the (interruptible) sleep fails, the request is failed locally
 * with EINTR or ETIMEDOUT and an advisory cancel message is sent to
 * filecoordinationd after the lock has been dropped.
 *
 * Returns the request's resolver error (0 = materialized OK).
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted or timed out: fail the request locally. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	if (send_cancel_message) {
		/* Advisory only; resolver may abort the abandoned work. */
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11442 
/*
 * Mark a request as complete with the given resolver error and wake
 * the thread sleeping on it in nspace_resolver_req_wait().
 *
 * Caller must hold NSPACE_REQ_LOCK().
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags |= RRF_COMPLETE;
	wakeup(req);
}
11452 
/*
 * Handle a completion notification for request `req_id`, written by
 * the resolver via the vfs.nspace.complete sysctl.
 *
 * If no matching request is found, the notification is dropped on the
 * floor (the requester was most likely interrupted and already removed
 * its entry).  Otherwise, when the resolver reported success and the
 * caller supplied an original gencount, the vnode's recursive gencount
 * is re-sampled (with the mount rename lock held to stabilize the
 * check) and the request is failed with EBUSY if the directory changed
 * while it was being materialized.  Finally the request is marked
 * complete, waking the waiting thread.
 */
static void
nspace_resolver_req_completed(uint32_t req_id, int resolver_error, uint64_t orig_gencount)
{
	struct nspace_resolver_request *req;

	NSPACE_REQ_LOCK();

	// If we don't find the request corresponding to our req_id,
	// just drop the completion signal on the floor; it's likely
	// that the requester interrupted with a signal.

	req = nspace_resolver_req_lookup(req_id);
	if (req) {
		mount_t locked_mp = NULL;

		locked_mp = req->r_vp->v_mount;
		mount_ref(locked_mp, 0);
		mount_lock_renames(locked_mp);

		//
		// if the resolver isn't already returning an error and we have an
		// orig_gencount, then get an iocount on the request vnode and check
		// that the gencount on req->r_vp has not changed.
		//
		// note: a ref was taken on req->r_vp when the request was created
		// and that ref will be dropped by that thread when it wakes up.
		//
		if (resolver_error == 0 &&
		    orig_gencount != 0 &&
		    vnode_getwithref(req->r_vp) == 0) {
			struct vnode_attr va;
			uint64_t cur_gencount;

			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_recursive_gencount);

			if (vnode_getattr(req->r_vp, &va, vfs_context_kernel()) == 0) {
				cur_gencount = va.va_recursive_gencount;
			} else {
				// treat an unreadable gencount as "unknown";
				// the comparison below then skips the check
				cur_gencount = 0;
			}

			if (resolver_error == 0 && cur_gencount && orig_gencount && cur_gencount != orig_gencount) {
				printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n", orig_gencount, cur_gencount);

				// this error will be returned to the thread that initiated the
				// materialization of req->r_vp.
				resolver_error = EBUSY;

				// note: we explicitly do not return an error to the caller (i.e.
				// the thread that did the materialization) because they said they
				// don't want one.
			}

			vnode_put(req->r_vp);
		}

		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);

		nspace_resolver_req_mark_complete(req, resolver_error);
	}

	NSPACE_REQ_UNLOCK();

	return;
}
11520 
11521 static struct proc *nspace_resolver_proc;
11522 
11523 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)11524 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11525 {
11526 	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11527 	    p == nspace_resolver_proc) ? 1 : 0;
11528 	return 0;
11529 }
11530 
11531 static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
11532 
/*
 * Decorate (or un-decorate) a process as the dataless file resolver.
 *
 * Requires root plus the dataless-resolver entitlement (EPERM
 * otherwise), and only one resolver may be registered at a time
 * (EBUSY).  Clearing the decoration takes the same path as resolver
 * exit.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11572 
11573 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11574 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11575 {
11576 	if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11577 	    (p->p_vfs_iopolicy &
11578 	    P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11579 		*is_prevented = 1;
11580 	} else {
11581 		*is_prevented = 0;
11582 	}
11583 	return 0;
11584 }
11585 
/*
 * Set (or clear) process p's ability to materialize dataless files.
 *
 * The resolver process is always treated as prevented (see
 * nspace_materialization_get_proc_state), so for a resolver the only
 * accepted request is "prevent" (a no-op); enabling returns EBUSY.
 */
static int
nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
{
	if (p->p_lflag & P_LNSPACE_RESOLVER) {
		return is_prevented ? 0 : EBUSY;
	}

	if (is_prevented) {
		/* Atomically clear the materialize-dataless iopolicy bit. */
		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
	} else {
		/* Atomically set the materialize-dataless iopolicy bit. */
		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
	}
	return 0;
}
11600 
11601 static int
nspace_materialization_get_thread_state(int * is_prevented)11602 nspace_materialization_get_thread_state(int *is_prevented)
11603 {
11604 	uthread_t ut = current_uthread();
11605 
11606 	*is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11607 	return 0;
11608 }
11609 
11610 static int
nspace_materialization_set_thread_state(int is_prevented)11611 nspace_materialization_set_thread_state(int is_prevented)
11612 {
11613 	uthread_t ut = current_uthread();
11614 
11615 	if (is_prevented) {
11616 		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11617 	} else {
11618 		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11619 	}
11620 	return 0;
11621 }
11622 
11623 /* the vfs.nspace branch */
11624 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11625 
11626 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11627 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11628     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11629 {
11630 	struct proc *p = req->p;
11631 	int new_value, old_value, changed = 0;
11632 	int error;
11633 
11634 	error = nspace_resolver_get_proc_state(p, &old_value);
11635 	if (error) {
11636 		return error;
11637 	}
11638 
11639 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11640 	    &changed);
11641 	if (error == 0 && changed) {
11642 		error = nspace_resolver_set_proc_state(p, new_value);
11643 	}
11644 	return error;
11645 }
11646 
11647 /* decorate this process as the dataless file resolver */
11648 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
11649     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11650     0, 0, sysctl_nspace_resolver, "I", "");
11651 
11652 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11653 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11654     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11655 {
11656 	struct proc *p = req->p;
11657 	int new_value, old_value, changed = 0;
11658 	int error;
11659 
11660 	error = nspace_materialization_get_proc_state(p, &old_value);
11661 	if (error) {
11662 		return error;
11663 	}
11664 
11665 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11666 	    &changed);
11667 	if (error == 0 && changed) {
11668 		error = nspace_materialization_set_proc_state(p, new_value);
11669 	}
11670 	return error;
11671 }
11672 
11673 /* decorate this process as not wanting to materialize dataless files */
11674 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
11675     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11676     0, 0, sysctl_nspace_prevent_materialization, "I", "");
11677 
11678 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11679 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11680     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11681 {
11682 	int new_value, old_value, changed = 0;
11683 	int error;
11684 
11685 	error = nspace_materialization_get_thread_state(&old_value);
11686 	if (error) {
11687 		return error;
11688 	}
11689 
11690 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11691 	    &changed);
11692 	if (error == 0 && changed) {
11693 		error = nspace_materialization_set_thread_state(new_value);
11694 	}
11695 	return error;
11696 }
11697 
11698 /* decorate this thread as not wanting to materialize dataless files */
11699 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
11700     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11701     0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11702 
/*
 * Sysctl handler for vfs.nspace.complete.  The resolver reports a
 * finished request by writing a { req_id, errno } pair of uint32_t's,
 * optionally followed by a uint64_t gencount snapshot taken when the
 * request was issued.  Only the process decorated as the resolver may
 * write here (EPERM otherwise).
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	int error, is_resolver, changed = 0, gencount_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	/* First 8 bytes of the write: request ID and errno. */
	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	// get the gencount if it was passed
	// (second sysctl_io_opaque consumes the optional trailing bytes)
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &gencount_changed);
	if (error) {
		gencount = 0;
		// we ignore the error because the gencount was optional
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		nspace_resolver_req_completed(req_status[0],
		    (int)req_status[1], gencount);
	}
	return error;
}
11747 
11748 /* Resolver reports completed reqs here. */
11749 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
11750     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11751     0, 0, sysctl_nspace_complete, "-", "");
11752 
11753 #endif /* CONFIG_DATALESS_FILES */
11754 
11755 #if CONFIG_DATALESS_FILES
11756 #define __no_dataless_unused    /* nothing */
11757 #else
11758 #define __no_dataless_unused    __unused
11759 #endif
11760 
/*
 * Decide whether the given vfs context may trigger materialization of
 * a dataless object.
 *
 * Returns:
 *   0            materialization may proceed
 *   EDEADLK      materialization is prevented; fail the operation
 *   EJUSTRETURN  prevented, but the operation should proceed as if the
 *                object were not dataless (dataless-manipulation
 *                entitlement holders)
 *
 * With !CONFIG_DATALESS_FILES this is unconditionally EDEADLK.
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11817 
/*
 * One-time initialization of the resolver request hash table.
 * No-op when dataless files are not configured.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11827 
/*
 * Called when a process exits (or explicitly un-decorates itself via
 * the vfs.nspace.resolver sysctl).  If the process was the registered
 * resolver, fail every outstanding request with ETIMEDOUT and clear
 * the registration.  The entries are only marked complete, not
 * unlinked: each waiting thread removes its own request when it wakes
 * (see nspace_resolver_req_wait).
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11853 
/*
 * Materialize a dataless item for the given namespace operation.
 * Convenience wrapper around resolve_nspace_item_ext() with no
 * extension argument.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
11859 
11860 #define DATALESS_RESOLVER_ENTITLEMENT     \
11861 	"com.apple.private.vfs.dataless-resolver"
11862 #define DATALESS_MANIPULATION_ENTITLEMENT \
11863 	"com.apple.private.vfs.dataless-manipulation"
11864 
11865 #if CONFIG_DATALESS_FILES
11866 /*
11867  * Return TRUE if the vfs context is associated with the dataless
11868  * resolver.
11869  */
static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
{
	/* The resolver is identified solely by its task entitlement. */
	return IOTaskHasEntitlement(vfs_context_task(ctx),
	           DATALESS_RESOLVER_ENTITLEMENT);
}
11876 #endif /* CONFIG_DATALESS_FILES */
11877 
11878 /*
11879  * Return TRUE if the vfs context is associated with a process entitled
11880  * for dataless manipulation.
11881  *
11882  * XXX Arguably belongs in vfs_subr.c, but is here because of the
11883  * complication around CONFIG_DATALESS_FILES.
11884  */
boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	task_t task = vfs_context_task(ctx);
	/* The resolver entitlement implies manipulation rights too. */
	return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
	       IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
#else
	return false;
#endif /* CONFIG_DATALESS_FILES */
}
11896 
11897 #if CONFIG_DATALESS_FILES
/*
 * Emit a debug log recording that the current process tried to touch a
 * dataless vnode but its context is prevented from materializing.
 * DEVELOPMENT builds additionally include the vnode's path in the
 * message.
 */
static void
log_materialization_prevented(vnode_t vp, uint64_t op)
{
	char p_name[MAXCOMLEN + 1];
	char *vntype;
	proc_selfname(&p_name[0], sizeof(p_name));

	/* Human-readable vnode type for the log message. */
	if (vp->v_type == VREG) {
		vntype = "File";
	} else if (vp->v_type == VDIR) {
		vntype = "Dir";
	} else if (vp->v_type == VLNK) {
		vntype = "SymLink";
	} else {
		vntype = "Other";
	}

#if DEVELOPMENT
	char *path = NULL;
	int   len;

	path = get_pathbuff();
	len = MAXPATHLEN;
	if (path) {
		vn_getpath(vp, path, &len);
	}

	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
	    p_name, proc_selfpid(),
	    op, vntype, path ? path : "<unknown-path>");
	if (path) {
		release_pathbuff(path);
	}
#else
	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
	    p_name, proc_selfpid(),
	    op, vntype);
#endif
}
11939 #endif /* CONFIG_DATALESS_FILES */
11940 
11941 static int
vfs_materialize_item(struct vnode * vp __no_dataless_unused,uint64_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused)11942 vfs_materialize_item(
11943 	struct vnode *vp __no_dataless_unused,
11944 	uint64_t op __no_dataless_unused,
11945 	int64_t offset __no_dataless_unused,
11946 	int64_t size __no_dataless_unused,
11947 	char *lookup_name __no_dataless_unused,
11948 	size_t const namelen __no_dataless_unused)
11949 {
11950 #if CONFIG_DATALESS_FILES
11951 	struct nspace_resolver_request req;
11952 	kern_return_t kern_ret;
11953 	mach_port_t mach_port;
11954 	char *path = NULL;
11955 	vfs_context_t context;
11956 	int path_len;
11957 	int error;
11958 	audit_token_t atoken;
11959 
11960 	/*
11961 	 * If this is a snapshot event and the vnode is on a disk image just
11962 	 * pretend nothing happened since any change to the disk image will
11963 	 * cause the disk image itself to get backed up and this avoids multi-
11964 	 * way deadlocks between the snapshot handler and the ever popular
11965 	 * diskimages-helper process. The variable nspace_allow_virtual_devs
11966 	 * allows this behavior to be overridden (for use by the Mobile
11967 	 * TimeMachine testing infrastructure which uses disk images).
11968 	 */
11969 	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
11970 		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
11971 		return ENOTSUP;
11972 	}
11973 
11974 	context = vfs_context_current();
11975 
11976 	error = vfs_context_dataless_materialization_is_prevented(context);
11977 	if (error) {
11978 		log_materialization_prevented(vp, op);
11979 		return error;
11980 	}
11981 
11982 	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
11983 	    &mach_port);
11984 	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
11985 		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
11986 		/*
11987 		 * Treat this like being unable to access the backing store
11988 		 * server.
11989 		 */
11990 		return ETIMEDOUT;
11991 	}
11992 
11993 	int path_alloc_len = MAXPATHLEN;
11994 	do {
11995 		path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
11996 		if (path == NULL) {
11997 			return ENOMEM;
11998 		}
11999 
12000 		path_len = path_alloc_len;
12001 		error = vn_getpath(vp, path, &path_len);
12002 		if (error == 0) {
12003 			break;
12004 		} else if (error == ENOSPC) {
12005 			kfree_data(path, path_alloc_len);
12006 			path = NULL;
12007 		} else {
12008 			goto out_release_port;
12009 		}
12010 	} while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);
12011 
12012 	error = vfs_context_copy_audit_token(context, &atoken);
12013 	if (error) {
12014 		goto out_release_port;
12015 	}
12016 
12017 	req.r_req_id = next_nspace_req_id();
12018 	req.r_resolver_error = 0;
12019 	req.r_flags = 0;
12020 	req.r_vp = vp;
12021 
12022 	NSPACE_REQ_LOCK();
12023 	error = nspace_resolver_req_add(&req);
12024 	NSPACE_REQ_UNLOCK();
12025 	if (error) {
12026 		goto out_release_port;
12027 	}
12028 
12029 	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12030 	if (vp->v_type == VDIR) {
12031 		char *tmpname = NULL;
12032 
12033 		/*
12034 		 * If the caller provided a lookup_name *and* a name length,
12035 		 * then we assume the lookup_name is not NUL-terminated.
12036 		 * Allocate a temporary buffer in this case to provide
12037 		 * a NUL-terminated path name to the IPC call.
12038 		 */
12039 		if (lookup_name != NULL && namelen != 0) {
12040 			if (namelen >= PATH_MAX) {
12041 				error = EINVAL;
12042 				goto out_release_port;
12043 			}
12044 			tmpname = zalloc(ZV_NAMEI);
12045 			strlcpy(tmpname, lookup_name, namelen + 1);
12046 			lookup_name = tmpname;
12047 		} else if (lookup_name != NULL) {
12048 			/*
12049 			 * If the caller provided a lookup_name with a
12050 			 * zero name length, then we assume it's NUL-
12051 			 * terminated.  Verify it has a valid length.
12052 			 */
12053 			if (strlen(lookup_name) >= PATH_MAX) {
12054 				error = EINVAL;
12055 				goto out_release_port;
12056 			}
12057 		}
12058 
12059 		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12060 		    req.r_req_id, (uint32_t)(op & 0xffffffff),
12061 		    lookup_name == NULL ? "" : lookup_name, path, atoken);
12062 
12063 		if (tmpname != NULL) {
12064 			zfree(ZV_NAMEI, tmpname);
12065 
12066 			/*
12067 			 * Poison lookup_name rather than reference
12068 			 * freed memory.
12069 			 */
12070 			lookup_name = NULL;
12071 		}
12072 	} else {
12073 		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12074 		    req.r_req_id, (uint32_t)(op & 0xffffffff),
12075 		    offset, size, path, atoken);
12076 	}
12077 	if (kern_ret != KERN_SUCCESS) {
12078 		/*
12079 		 * Also treat this like being unable to access the backing
12080 		 * store server.
12081 		 */
12082 		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12083 		    kern_ret);
12084 		error = ETIMEDOUT;
12085 
12086 		NSPACE_REQ_LOCK();
12087 		nspace_resolver_req_remove(&req);
12088 		NSPACE_REQ_UNLOCK();
12089 		goto out_release_port;
12090 	}
12091 
12092 	/*
12093 	 * Give back the memory we allocated earlier while we wait; we
12094 	 * no longer need it.
12095 	 */
12096 	kfree_data(path, path_alloc_len);
12097 	path = NULL;
12098 
12099 	/*
12100 	 * Request has been submitted to the resolver. Now (interruptibly)
12101 	 * wait for completion. Upon requrn, the request will have been
12102 	 * removed from the lookup table.
12103 	 */
12104 	error = nspace_resolver_req_wait(&req);
12105 
12106 out_release_port:
12107 	if (path != NULL) {
12108 		kfree_data(path, path_alloc_len);
12109 		path = NULL;
12110 	}
12111 	ipc_port_release_send(mach_port);
12112 
12113 	return error;
12114 #else
12115 	return ENOTSUP;
12116 #endif /* CONFIG_DATALESS_FILES */
12117 }
12118 
12119 /*
12120  * vfs_materialize_file: Materialize a regular file.
12121  *
12122  * Inputs:
12123  * vp		The dataless file to be materialized.
12124  *
12125  * op		What kind of operation is being performed:
12126  *		-> NAMESPACE_HANDLER_READ_OP
12127  *		-> NAMESPACE_HANDLER_WRITE_OP
12128  *		-> NAMESPACE_HANDLER_LINK_CREATE
12129  *		-> NAMESPACE_HANDLER_DELETE_OP
12130  *		-> NAMESPACE_HANDLER_TRUNCATE_OP
12131  *		-> NAMESPACE_HANDLER_RENAME_OP
12132  *
12133  * offset	offset of I/O for READ or WRITE.  Ignored for
12134  *		other ops.
12135  *
12136  * size		size of I/O for READ or WRITE  Ignored for
12137  *		other ops.
12138  *
 * If offset or size are -1 for a READ or WRITE, then the resolver should
12140  * consider the range to be unknown.
12141  *
12142  * Upon successful return, the caller may proceed with the operation.
12143  * N.B. the file may still be "dataless" in this case.
12144  */
12145 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12146 vfs_materialize_file(
12147 	struct vnode *vp,
12148 	uint64_t op,
12149 	int64_t offset,
12150 	int64_t size)
12151 {
12152 	if (vp->v_type != VREG) {
12153 		return EFTYPE;
12154 	}
12155 	return vfs_materialize_item(vp, op, offset, size, NULL, 0);
12156 }
12157 
12158 /*
12159  * vfs_materialize_dir:
12160  *
12161  * Inputs:
12162  * vp		The dataless directory to be materialized.
12163  *
12164  * op		What kind of operation is being performed:
12165  *		-> NAMESPACE_HANDLER_READ_OP
12166  *		-> NAMESPACE_HANDLER_WRITE_OP
12167  *		-> NAMESPACE_HANDLER_DELETE_OP
12168  *		-> NAMESPACE_HANDLER_RENAME_OP
12169  *		-> NAMESPACE_HANDLER_LOOKUP_OP
12170  *
12171  * lookup_name	Name being looked up for a LOOKUP op.  Ignored for
12172  *		other ops.  May or may not be NUL-terminated; see below.
12173  *
12174  * namelen	If non-zero, then lookup_name is assumed to not be NUL-
12175  *		terminated and namelen is the number of valid bytes in
12176  *		lookup_name. If zero, then lookup_name is assumed to be
12177  *		NUL-terminated.
12178  *
12179  * Upon successful return, the caller may proceed with the operation.
12180  * N.B. the directory may still be "dataless" in this case.
12181  */
12182 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12183 vfs_materialize_dir(
12184 	struct vnode *vp,
12185 	uint64_t op,
12186 	char *lookup_name,
12187 	size_t namelen)
12188 {
12189 	if (vp->v_type != VDIR) {
12190 		return EFTYPE;
12191 	}
12192 	if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12193 		return EINVAL;
12194 	}
12195 	return vfs_materialize_item(vp, op, 0, 0, lookup_name, namelen);
12196 }
12197 
12198 int
resolve_nspace_item_ext(struct vnode * vp __no_dataless_unused,uint64_t op __no_dataless_unused,void * arg __unused)12199 resolve_nspace_item_ext(
12200 	struct vnode *vp __no_dataless_unused,
12201 	uint64_t op __no_dataless_unused,
12202 	void *arg __unused)
12203 {
12204 #if CONFIG_DATALESS_FILES
12205 	int error;
12206 	mach_port_t mp;
12207 	char *path = NULL;
12208 	int path_len;
12209 	kern_return_t kr;
12210 	struct nspace_resolver_request req;
12211 
12212 	// only allow namespace events on regular files, directories and symlinks.
12213 	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
12214 		return EFTYPE;
12215 	}
12216 
12217 	//
12218 	// if this is a snapshot event and the vnode is on a
12219 	// disk image just pretend nothing happened since any
12220 	// change to the disk image will cause the disk image
12221 	// itself to get backed up and this avoids multi-way
12222 	// deadlocks between the snapshot handler and the ever
12223 	// popular diskimages-helper process.  the variable
12224 	// nspace_allow_virtual_devs allows this behavior to
12225 	// be overridden (for use by the Mobile TimeMachine
12226 	// testing infrastructure which uses disk images)
12227 	//
12228 	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
12229 		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
12230 		return ENOTSUP;
12231 	}
12232 
12233 	error = vfs_context_dataless_materialization_is_prevented(
12234 		vfs_context_current());
12235 	if (error) {
12236 		log_materialization_prevented(vp, op);
12237 		return error;
12238 	}
12239 
12240 	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
12241 	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
12242 		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
12243 		// Treat this like being unable to access the backing
12244 		// store server.
12245 		return ETIMEDOUT;
12246 	}
12247 
12248 	int path_alloc_len = MAXPATHLEN;
12249 	do {
12250 		path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
12251 		if (path == NULL) {
12252 			return ENOMEM;
12253 		}
12254 
12255 		path_len = path_alloc_len;
12256 		error = vn_getpath(vp, path, &path_len);
12257 		if (error == 0) {
12258 			break;
12259 		} else if (error == ENOSPC) {
12260 			kfree_data(path, path_alloc_len);
12261 			path = NULL;
12262 		} else {
12263 			goto out_release_port;
12264 		}
12265 	} while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);
12266 
12267 	if (error == 0) {
12268 		int xxx_rdar44371223;   /* XXX Mig bug */
12269 		req.r_req_id = next_nspace_req_id();
12270 		req.r_resolver_error = 0;
12271 		req.r_flags = 0;
12272 
12273 		if ((error = vnode_ref(vp)) == 0) {     // take a ref so that the vnode doesn't go away
12274 			req.r_vp = vp;
12275 		} else {
12276 			goto out_release_port;
12277 		}
12278 
12279 		NSPACE_REQ_LOCK();
12280 		error = nspace_resolver_req_add(&req);
12281 		NSPACE_REQ_UNLOCK();
12282 		if (error) {
12283 			vnode_rele(req.r_vp);
12284 			goto out_release_port;
12285 		}
12286 
12287 		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12288 		kr = send_nspace_resolve_path(mp, req.r_req_id,
12289 		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
12290 		    path, &xxx_rdar44371223);
12291 		if (kr != KERN_SUCCESS) {
12292 			// Also treat this like being unable to access
12293 			// the backing store server.
12294 			os_log_error(OS_LOG_DEFAULT,
12295 			    "NSPACE resolve_path failure: %d", kr);
12296 			error = ETIMEDOUT;
12297 
12298 			NSPACE_REQ_LOCK();
12299 			nspace_resolver_req_remove(&req);
12300 			NSPACE_REQ_UNLOCK();
12301 			vnode_rele(req.r_vp);
12302 			goto out_release_port;
12303 		}
12304 
12305 		// Give back the memory we allocated earlier while
12306 		// we wait; we no longer need it.
12307 		kfree_data(path, path_alloc_len);
12308 		path = NULL;
12309 
12310 		// Request has been submitted to the resolver.
12311 		// Now (interruptibly) wait for completion.
12312 		// Upon requrn, the request will have been removed
12313 		// from the lookup table.
12314 		error = nspace_resolver_req_wait(&req);
12315 
12316 		vnode_rele(req.r_vp);
12317 	}
12318 
12319 out_release_port:
12320 	if (path != NULL) {
12321 		kfree_data(path, path_alloc_len);
12322 		path = NULL;
12323 	}
12324 	ipc_port_release_send(mp);
12325 
12326 	return error;
12327 #else
12328 	return ENOTSUP;
12329 #endif /* CONFIG_DATALESS_FILES */
12330 }
12331 
/*
 * Snapshot-event hook: currently a no-op that ignores all arguments and
 * always reports success.
 */
int
nspace_snapshot_event(__unused vnode_t vp, __unused  time_t ctime,
    __unused uint64_t op_type, __unused void *arg)
{
	return 0;
}
12338 
#if 0
/*
 * Currently-disabled helper: format a "/.vol/<fsid>/<fileid>" (volfs-style)
 * path for "vp" into "path".
 *
 * On entry *len is the capacity of "path"; on return it is snprintf's
 * result plus one (NOTE(review): that can exceed the capacity when the
 * output was truncated -- confirm callers before re-enabling).
 * Returns 0 on success, or -1 when the vnode's attributes could not be
 * read, in which case a placeholder path is written instead.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12361 
12362 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12363 fsctl_bogus_command_compat(unsigned long cmd)
12364 {
12365 	switch (cmd) {
12366 	case IOCBASECMD(FSIOC_SYNC_VOLUME):
12367 		return FSIOC_SYNC_VOLUME;
12368 	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12369 		return FSIOC_ROUTEFS_SETROUTEID;
12370 	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12371 		return FSIOC_SET_PACKAGE_EXTS;
12372 	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12373 		return FSIOC_SET_FSTYPENAME_OVERRIDE;
12374 	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12375 		return DISK_CONDITIONER_IOC_GET;
12376 	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12377 		return DISK_CONDITIONER_IOC_SET;
12378 	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12379 		return FSIOC_FIOSEEKHOLE;
12380 	case IOCBASECMD(FSIOC_FIOSEEKDATA):
12381 		return FSIOC_FIOSEEKDATA;
12382 	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12383 		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12384 	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12385 		return SPOTLIGHT_IOC_GET_LAST_MTIME;
12386 	}
12387 
12388 	return cmd;
12389 }
12390 
12391 static int
cas_bsdflags_setattr(vnode_t vp,void * arg,vfs_context_t ctx)12392 cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
12393 {
12394 	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
12395 }
12396 
/*
 * FSIOC_SYNC_VOLUME: sync the volume containing "vp".  "data" points to a
 * uint32_t of FSCTL_SYNC_* flags.  On all exits the iocount on "vp" has
 * been dropped, and *arg_vp is set to NULL so the caller knows not to
 * vnode_put() it again.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	/* Hold the vnode memory so it can be revisited after the iocount drops. */
	vnode_hold(vp);
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): "arg" holds MNT_* sync flags here, yet it is tested
	 * against FSCTL_SYNC_FULLSYNC, a userspace FSCTL_* flag.  Presumably
	 * the intent was to test *(uint32_t *)data -- confirm whether the
	 * flag values deliberately line up or this is a latent bug.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12461 
#if ROUTEFS
/*
 * FSIOC_ROUTEFS_SETROUTEID: copy a path in from user space and mount
 * routefs there.  Restricted to the superuser.
 */
static int __attribute__((noinline))
handle_routes(user_addr_t udata)
{
	char mountpath[MAXPATHLEN] = {0};
	size_t copied = 0;
	int error;

	/* Only the superuser may plant a routefs mount. */
	error = suser(kauth_cred_get(), &(current_proc()->p_acflag));
	if (error != 0) {
		return error;
	}

	error = copyinstr(udata, &mountpath[0], MAXPATHLEN, &copied);
	if (error != 0) {
		return error;
	}

	return routefs_kernel_mount(mountpath);
}
#endif
12482 
12483 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12484 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12485 {
12486 	struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12487 	struct vnode_attr va;
12488 	int error;
12489 
12490 	VATTR_INIT(&va);
12491 	VATTR_SET(&va, va_flags, cas->new_flags);
12492 
12493 	error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12494 
12495 #if CONFIG_FSE
12496 	if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12497 		add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12498 	}
12499 #endif
12500 
12501 	return error;
12502 }
12503 
12504 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12505 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12506 {
12507 	struct mount *mp = NULL;
12508 	errno_t rootauth = 0;
12509 
12510 	mp = vp->v_mount;
12511 
12512 	/*
12513 	 * query the underlying FS and see if it reports something
12514 	 * sane for this vnode. If volume is authenticated via
12515 	 * chunklist, leave that for the caller to determine.
12516 	 */
12517 	rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12518 
12519 	return rootauth;
12520 }
12521 
12522 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12523 	"com.apple.private.kernel.set-package-extensions"
12524 
/*
 * Make a filesystem-specific control call:
 *
 * Dispatches a handful of generic FSIOC_* commands in the kernel and
 * passes everything else down to the filesystem via VNOP_IOCTL().
 * The ioctl argument is marshalled in/out of a stack (or heap, for
 * large sizes) buffer exactly like ioctl(2).
 *
 * May drop the caller's iocount on the vnode (FSIOC_SYNC_VOLUME), in
 * which case *arg_vp is set to NULL so the caller knows not to
 * vnode_put() it again.
 */
/* ARGSUSED */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* fsctl is not supported on device nodes. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* Map legacy, size-stripped command words back to the full form. */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Small arguments live in stkbuf; larger ones get a heap buffer. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			/* Zero-length IOC_IN: the "pointer" itself is the argument. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* NB: may drop the iocount and NULL out *arg_vp. */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t    num_entries;
		uint32_t    max_width;

		/* Installing the package-extension table is entitlement-gated. */
		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct.  otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width   = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width   = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		/* Superuser only: override the reported filesystem type name. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if (vp->v_mount) {
			mount_lock(vp->v_mount);
			if (data[0] != 0) {
				int i;
				for (i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data string which has no
				 * NULL termination in its first MFSTYPENAMELEN bytes.
				 * This is bogus, let's avoid strlcpy-ing the read data and
				 * return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
				}
			} else {
				/* Empty string: clear any existing override. */
				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
				}
				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
				vp->v_mount->fstypename_override[0] = '\0';
			}
unlock:
			mount_unlock(vp->v_mount);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/* Report EBUSY when anyone else holds a use reference. */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
		break;

	default: {
		/* other, known commands shouldn't be passed down here */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
12775 
12776 /* ARGSUSED */
12777 int
fsctl(proc_t p,struct fsctl_args * uap,__unused int32_t * retval)12778 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
12779 {
12780 	int error;
12781 	struct nameidata nd;
12782 	uint32_t nameiflags;
12783 	vnode_t vp = NULL;
12784 	vfs_context_t ctx = vfs_context_current();
12785 
12786 	AUDIT_ARG(cmd, (int)uap->cmd);
12787 	AUDIT_ARG(value32, uap->options);
12788 	/* Get the vnode for the file we are getting info on:  */
12789 	nameiflags = 0;
12790 	//
12791 	// if we come through fsctl() then the file is by definition not open.
12792 	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
12793 	// lest the caller mistakenly thinks the only open is their own (but in
12794 	// reality it's someone elses).
12795 	//
12796 	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
12797 		return EINVAL;
12798 	}
12799 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
12800 		nameiflags |= FOLLOW;
12801 	}
12802 	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
12803 		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
12804 	}
12805 	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
12806 	    UIO_USERSPACE, uap->path, ctx);
12807 	if ((error = namei(&nd))) {
12808 		goto done;
12809 	}
12810 	vp = nd.ni_vp;
12811 	nameidone(&nd);
12812 
12813 #if CONFIG_MACF
12814 	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
12815 	if (error) {
12816 		goto done;
12817 	}
12818 #endif
12819 
12820 	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12821 
12822 done:
12823 	if (vp) {
12824 		vnode_put(vp);
12825 	}
12826 	return error;
12827 }
12828 /* ARGSUSED */
12829 int
ffsctl(proc_t p,struct ffsctl_args * uap,__unused int32_t * retval)12830 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
12831 {
12832 	int error;
12833 	vnode_t vp = NULL;
12834 	vfs_context_t ctx = vfs_context_current();
12835 	int fd = -1;
12836 
12837 	AUDIT_ARG(fd, uap->fd);
12838 	AUDIT_ARG(cmd, (int)uap->cmd);
12839 	AUDIT_ARG(value32, uap->options);
12840 
12841 	/* Get the vnode for the file we are getting info on:  */
12842 	if ((error = file_vnode(uap->fd, &vp))) {
12843 		return error;
12844 	}
12845 	fd = uap->fd;
12846 	if ((error = vnode_getwithref(vp))) {
12847 		file_drop(fd);
12848 		return error;
12849 	}
12850 
12851 #if CONFIG_MACF
12852 	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
12853 		file_drop(fd);
12854 		vnode_put(vp);
12855 		return error;
12856 	}
12857 #endif
12858 
12859 	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12860 
12861 	file_drop(fd);
12862 
12863 	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
12864 	if (vp) {
12865 		vnode_put(vp);
12866 	}
12867 
12868 	return error;
12869 }
12870 /* end of fsctl system call */
12871 
12872 #define FILESEC_ACCESS_ENTITLEMENT              \
12873 	"com.apple.private.vfs.filesec-access"
12874 
12875 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)12876 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
12877 {
12878 	if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
12879 		/*
12880 		 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
12881 		 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
12882 		 */
12883 		if ((!setting && vfs_context_issuser(ctx)) ||
12884 		    IOTaskHasEntitlement(vfs_context_task(ctx),
12885 		    FILESEC_ACCESS_ENTITLEMENT)) {
12886 			return 0;
12887 		}
12888 	}
12889 
12890 	return EPERM;
12891 }
12892 
/*
 *  Retrieve the data of an extended attribute (path-based).
 *
 *  With a user buffer, *retval is the number of bytes copied out; with
 *  no buffer (or the legacy size == -1 form, see below), *retval is the
 *  attribute's total size.
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attributes require the filesec entitlement (or root, for reads). */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned...  in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app.  we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* Clamp the request to keep the kernel wired allocation sane. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
12978 
12979 /*
12980  * Retrieve the data of an extended attribute.
12981  */
12982 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)12983 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
12984 {
12985 	vnode_t vp;
12986 	char attrname[XATTR_MAXNAMELEN + 1];
12987 	vfs_context_t ctx = vfs_context_current();
12988 	uio_t auio = NULL;
12989 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12990 	size_t attrsize = 0;
12991 	size_t namelen;
12992 	int error;
12993 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12994 
12995 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12996 		return EINVAL;
12997 	}
12998 
12999 	if ((error = file_vnode(uap->fd, &vp))) {
13000 		return error;
13001 	}
13002 	if ((error = vnode_getwithref(vp))) {
13003 		file_drop(uap->fd);
13004 		return error;
13005 	}
13006 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13007 	if (error != 0) {
13008 		goto out;
13009 	}
13010 	if (xattr_protected(attrname) &&
13011 	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13012 		goto out;
13013 	}
13014 	if (uap->value && uap->size > 0) {
13015 		if (uap->size > (size_t)XATTR_MAXSIZE) {
13016 			uap->size = XATTR_MAXSIZE;
13017 		}
13018 
13019 		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13020 		    &uio_buf[0], sizeof(uio_buf));
13021 		uio_addiov(auio, uap->value, uap->size);
13022 	}
13023 
13024 	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13025 out:
13026 	(void)vnode_put(vp);
13027 	file_drop(uap->fd);
13028 
13029 	if (auio) {
13030 		*retval = uap->size - uio_resid(auio);
13031 	} else {
13032 		*retval = (user_ssize_t)attrsize;
13033 	}
13034 	return error;
13035 }
13036 
/*
 * Heap-allocated context for setxattr(): the nameidata, attribute-name
 * buffer, and uio stack buffer are large, so they are kept off the
 * kernel stack.  (The previous "checkdirs iteration" comment was a
 * copy-paste error.)
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
};
13043 
/*
 * Set the data of an extended attribute (path-based).
 *
 * The working buffers live in a heap-allocated setxattr_ctx to keep the
 * large nameidata and name/uio buffers off the kernel stack.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* Protected attributes may only be written by entitled tasks. */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	/* A non-zero size requires a buffer to read it from. */
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Also resolve the parent so its directory lease can be broken. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13123 
13124 /*
13125  * Set the data of an extended attribute.
13126  */
13127 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13128 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13129 {
13130 	vnode_t vp;
13131 	char attrname[XATTR_MAXNAMELEN + 1];
13132 	vfs_context_t ctx = vfs_context_current();
13133 	uio_t auio = NULL;
13134 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13135 	size_t namelen;
13136 	int error;
13137 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13138 
13139 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13140 		return EINVAL;
13141 	}
13142 
13143 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13144 	if (error != 0) {
13145 		if (error == EPERM) {
13146 			/* if the string won't fit in attrname, copyinstr emits EPERM */
13147 			return ENAMETOOLONG;
13148 		}
13149 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13150 		return error;
13151 	}
13152 	if (xattr_protected(attrname) &&
13153 	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13154 		return error;
13155 	}
13156 	if (uap->size != 0 && uap->value == 0) {
13157 		return EINVAL;
13158 	}
13159 	if (uap->size > INT_MAX) {
13160 		return E2BIG;
13161 	}
13162 	if ((error = file_vnode(uap->fd, &vp))) {
13163 		return error;
13164 	}
13165 	if ((error = vnode_getwithref(vp))) {
13166 		file_drop(uap->fd);
13167 		return error;
13168 	}
13169 
13170 #if CONFIG_FILE_LEASES
13171 	vnode_breakdirlease(vp, true, O_WRONLY);
13172 #endif
13173 
13174 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13175 	    &uio_buf[0], sizeof(uio_buf));
13176 	uio_addiov(auio, uap->value, uap->size);
13177 
13178 	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13179 #if CONFIG_FSE
13180 	if (error == 0) {
13181 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
13182 		    FSE_ARG_VNODE, vp,
13183 		    FSE_ARG_DONE);
13184 	}
13185 #endif
13186 	vnode_put(vp);
13187 	file_drop(uap->fd);
13188 	*retval = 0;
13189 	return error;
13190 }
13191 
13192 /*
13193  * Remove an extended attribute.
13194  * XXX Code duplication here.
13195  */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are reserved for kernel use. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* Copy in the attribute name (bounded, NUL-terminated). */
	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* System-protected attribute names may not be removed from user space. */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Also resolve the parent so any lease on the directory can be broken. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	/* Done with the parent; only vp is needed from here on. */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	/* Notify fseventsd listeners that an xattr was removed. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
13246 
13247 /*
13248  * Remove an extended attribute.
13249  * XXX Code duplication here.
13250  */
13251 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)13252 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
13253 {
13254 	vnode_t vp;
13255 	char attrname[XATTR_MAXNAMELEN + 1];
13256 	size_t namelen;
13257 	int error;
13258 #if CONFIG_FSE
13259 	vfs_context_t ctx = vfs_context_current();
13260 #endif
13261 
13262 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13263 		return EINVAL;
13264 	}
13265 
13266 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13267 	if (error != 0) {
13268 		return error;
13269 	}
13270 	if (xattr_protected(attrname)) {
13271 		return EPERM;
13272 	}
13273 	if ((error = file_vnode(uap->fd, &vp))) {
13274 		return error;
13275 	}
13276 	if ((error = vnode_getwithref(vp))) {
13277 		file_drop(uap->fd);
13278 		return error;
13279 	}
13280 
13281 #if CONFIG_FILE_LEASES
13282 	vnode_breakdirlease(vp, true, O_WRONLY);
13283 #endif
13284 
13285 	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
13286 #if CONFIG_FSE
13287 	if (error == 0) {
13288 		add_fsevent(FSE_XATTR_REMOVED, ctx,
13289 		    FSE_ARG_VNODE, vp,
13290 		    FSE_ARG_DONE);
13291 	}
13292 #endif
13293 	vnode_put(vp);
13294 	file_drop(uap->fd);
13295 	*retval = 0;
13296 	return error;
13297 }
13298 
13299 /*
13300  * Retrieve the list of extended attribute names.
13301  * XXX Code duplication here.
13302  */
int
listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	u_int32_t nameiflags;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are reserved for kernel use. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);
	/*
	 * Only build a uio when the caller supplied a buffer; with no buffer
	 * (or a zero size) vn_listxattr just reports the space required.
	 */
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->namebuf, uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);

	vnode_put(vp);
	if (auio) {
		/* Bytes actually copied into the caller's buffer. */
		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
	} else {
		/* Sizing-only mode: report the space the name list needs. */
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13343 
13344 /*
13345  * Retrieve the list of extended attribute names.
13346  * XXX Code duplication here.
13347  */
13348 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)13349 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13350 {
13351 	vnode_t vp;
13352 	uio_t auio = NULL;
13353 	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13354 	size_t attrsize = 0;
13355 	int error;
13356 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13357 
13358 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13359 		return EINVAL;
13360 	}
13361 
13362 	if ((error = file_vnode(uap->fd, &vp))) {
13363 		return error;
13364 	}
13365 	if ((error = vnode_getwithref(vp))) {
13366 		file_drop(uap->fd);
13367 		return error;
13368 	}
13369 	if (uap->namebuf != 0 && uap->bufsize > 0) {
13370 		auio = uio_createwithbuffer(1, 0, spacetype,
13371 		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
13372 		uio_addiov(auio, uap->namebuf, uap->bufsize);
13373 	}
13374 
13375 	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13376 
13377 	vnode_put(vp);
13378 	file_drop(uap->fd);
13379 	if (auio) {
13380 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13381 	} else {
13382 		*retval = (user_ssize_t)attrsize;
13383 	}
13384 	return error;
13385 }
13386 
13387 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13388 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13389     vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13390 {
13391 	int error;
13392 	struct mount *mp = NULL;
13393 	vnode_t vp;
13394 	int length;
13395 	int bpflags;
13396 	/* maximum number of times to retry build_path */
13397 	unsigned int retries = 0x10;
13398 
13399 	if (bufsize > FSGETPATH_MAXBUFLEN) {
13400 		return EINVAL;
13401 	}
13402 
13403 	if (buf == NULL) {
13404 		return ENOMEM;
13405 	}
13406 
13407 retry:
13408 	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13409 		error = ENOTSUP;  /* unexpected failure */
13410 		return ENOTSUP;
13411 	}
13412 
13413 #if CONFIG_UNION_MOUNTS
13414 unionget:
13415 #endif /* CONFIG_UNION_MOUNTS */
13416 	if (objid == 2) {
13417 		struct vfs_attr vfsattr;
13418 		int use_vfs_root = TRUE;
13419 
13420 		VFSATTR_INIT(&vfsattr);
13421 		VFSATTR_WANTED(&vfsattr, f_capabilities);
13422 		if (!(options & FSOPT_ISREALFSID) &&
13423 		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13424 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13425 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13426 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13427 				use_vfs_root = FALSE;
13428 			}
13429 		}
13430 
13431 		if (use_vfs_root) {
13432 			error = VFS_ROOT(mp, &vp, ctx);
13433 		} else {
13434 			error = VFS_VGET(mp, objid, &vp, ctx);
13435 		}
13436 	} else {
13437 		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13438 	}
13439 
13440 #if CONFIG_UNION_MOUNTS
13441 	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13442 		/*
13443 		 * If the fileid isn't found and we're in a union
13444 		 * mount volume, then see if the fileid is in the
13445 		 * mounted-on volume.
13446 		 */
13447 		struct mount *tmp = mp;
13448 		mp = vnode_mount(tmp->mnt_vnodecovered);
13449 		vfs_unbusy(tmp);
13450 		if (vfs_busy(mp, LK_NOWAIT) == 0) {
13451 			goto unionget;
13452 		}
13453 	} else {
13454 		vfs_unbusy(mp);
13455 	}
13456 #else
13457 	vfs_unbusy(mp);
13458 #endif /* CONFIG_UNION_MOUNTS */
13459 
13460 	if (error) {
13461 		return error;
13462 	}
13463 
13464 #if CONFIG_MACF
13465 	error = mac_vnode_check_fsgetpath(ctx, vp);
13466 	if (error) {
13467 		vnode_put(vp);
13468 		return error;
13469 	}
13470 #endif
13471 
13472 	/* Obtain the absolute path to this vnode. */
13473 	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13474 	if (options & FSOPT_NOFIRMLINKPATH) {
13475 		bpflags |= BUILDPATH_NO_FIRMLINK;
13476 	}
13477 	bpflags |= BUILDPATH_CHECK_MOVED;
13478 	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13479 	vnode_put(vp);
13480 
13481 	if (error) {
13482 		/* there was a race building the path, try a few more times */
13483 		if (error == EAGAIN) {
13484 			--retries;
13485 			if (retries > 0) {
13486 				goto retry;
13487 			}
13488 
13489 			error = ENOENT;
13490 		}
13491 		goto out;
13492 	}
13493 
13494 	AUDIT_ARG(text, buf);
13495 
13496 	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13497 		unsigned long path_words[NUMPARMS];
13498 		size_t path_len = sizeof(path_words);
13499 
13500 		if ((size_t)length < path_len) {
13501 			memcpy((char *)path_words, buf, length);
13502 			memset((char *)path_words + length, 0, path_len - length);
13503 
13504 			path_len = length;
13505 		} else {
13506 			memcpy((char *)path_words, buf + (length - path_len), path_len);
13507 		}
13508 
13509 		kdebug_vfs_lookup(path_words, (int)path_len, vp,
13510 		    KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13511 	}
13512 
13513 	*pathlen = length; /* may be superseded by error */
13514 
13515 out:
13516 	return error;
13517 }
13518 
13519 /*
13520  * Obtain the full pathname of a file system object by id.
13521  */
13522 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)13523 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
13524     uint32_t options, user_ssize_t *retval)
13525 {
13526 	vfs_context_t ctx = vfs_context_current();
13527 	fsid_t fsid;
13528 	char *realpath;
13529 	int length;
13530 	int error;
13531 
13532 	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
13533 		return EINVAL;
13534 	}
13535 
13536 	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
13537 		return error;
13538 	}
13539 	AUDIT_ARG(value32, fsid.val[0]);
13540 	AUDIT_ARG(value64, objid);
13541 	/* Restrict output buffer size for now. */
13542 
13543 	if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
13544 		return EINVAL;
13545 	}
13546 	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
13547 	if (realpath == NULL) {
13548 		return ENOMEM;
13549 	}
13550 
13551 	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
13552 	    options, &length);
13553 
13554 	if (error) {
13555 		goto out;
13556 	}
13557 
13558 	error = copyout((caddr_t)realpath, buf, length);
13559 
13560 	*retval = (user_ssize_t)length; /* may be superseded by error */
13561 out:
13562 	kfree_data(realpath, bufsize);
13563 	return error;
13564 }
13565 
int
fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
{
	/* Legacy entry point: identical to fsgetpath_ext() with no options. */
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           0, retval);
}
13572 
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	/* Extended entry point: forwards the caller-supplied option bits. */
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           uap->options, retval);
}
13579 
13580 /*
13581  * Common routine to handle various flavors of statfs data heading out
13582  *	to user space.
13583  *
13584  * Returns:	0			Success
13585  *		EFAULT
13586  */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int             error;
	int             my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		/* Zero first so padding/unset fields never leak kernel memory. */
		bzero(&sfs, my_size);
		/* Mask down to the flags user space is allowed to see. */
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Legacy short flavor: omit the trailing reserved fields. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		/* Zero first so padding/unset fields never leak kernel memory. */
		bzero(&sfs, my_size);

		/* Mask down to the flags user space is allowed to see. */
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case.   We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int             shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large.  At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			/* Inflate the blocksize by the same shift so total size is preserved. */
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Legacy short flavor: omit the trailing reserved fields. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	if (sizep != NULL) {
		/* Report the full structure size even when a partial copy was done. */
		*sizep = my_size;
	}
	return error;
}
13708 
13709 /*
13710  * copy stat structure into user_stat structure.
13711  */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	/* Zero first so padding/unset fields never leak kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/*
	 * The user structure names its time fields differently depending on
	 * whether timespecs are exposed (_POSIX_C_SOURCE unset) or split
	 * time/nsec members are used; copy whichever set exists.
	 */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13748 
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	/* Zero first so padding/unset fields never leak kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/*
	 * 32-bit process ABI: time values are narrowed with explicit casts
	 * (values beyond the 32-bit range are truncated).
	 */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13785 
13786 /*
13787  * copy stat64 structure into user_stat64 structure.
13788  */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	/* Zero first so padding/unset fields never leak kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/*
	 * The stat64 flavor additionally carries the birth (creation) time;
	 * field naming depends on whether timespecs are exposed.
	 */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13829 
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	/* Zero first so padding/unset fields never leak kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/*
	 * 32-bit process ABI: time values (including birth time) are narrowed
	 * with explicit casts; values beyond the 32-bit range are truncated.
	 */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13870 
13871 /*
13872  * Purge buffer cache for simulating cold starts
13873  */
13874 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)13875 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
13876 {
13877 	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
13878 
13879 	return VNODE_RETURNED;
13880 }
13881 
13882 static int
vfs_purge_callback(mount_t mp,__unused void * arg)13883 vfs_purge_callback(mount_t mp, __unused void * arg)
13884 {
13885 	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
13886 
13887 	return VFS_RETURNED;
13888 }
13889 
13890 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)13891 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
13892 {
13893 	if (!kauth_cred_issuser(kauth_cred_get())) {
13894 		return EPERM;
13895 	}
13896 
13897 	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
13898 
13899 	return 0;
13900 }
13901 
13902 /*
13903  * gets the vnode associated with the (unnamed) snapshot directory
13904  * for a Filesystem. The snapshot directory vnode is returned with
13905  * an iocount on it.
13906  */
int
vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
{
	/*
	 * Delegate to the filesystem; on success *sdvpp holds the snapshot
	 * directory vnode with an iocount the caller must vnode_put().
	 */
	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
}
13912 
13913 /*
13914  * Get the snapshot vnode.
13915  *
13916  * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
13917  * needs nameidone() on ndp.
13918  *
13919  * If the snapshot vnode exists it is returned in ndp->ni_vp.
13920  *
13921  * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
13922  * not needed.
13923  */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Out-parameters start NULL so the error path can clean up uniformly. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* Take an iocount on the vnode behind dirfd. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* Snapshot operations are only meaningful on a volume root. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/' anywhere in the name; finding one is an error. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	/*
	 * On success the caller owns iocounts on *rvpp and *sdvpp and must
	 * call nameidone(); any failure below releases both here.
	 */
	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
14026 
14027 /*
14028  * create a filesystem snapshot (for supporting filesystems)
14029  *
14030  * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
14031  * We get to the (unnamed) snapshot directory vnode and create the vnode
14032  * for the snapshot in it.
14033  *
14034  * Restrictions:
14035  *
14036  *    a) Passed in name for snapshot cannot have slashes.
14037  *    b) name can't be "." or ".."
14038  *
14039  * Since this requires superuser privileges, vnode_authorize calls are not
14040  * made.
14041  */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is large; heap-allocate it to keep kernel stack use down. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* On success: iocounts held on rvp and snapdvp; ndp needs nameidone(). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* The lookup found an existing snapshot by that name. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* Superuser-only path: skip authorization and ACL inheritance. */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14088 
14089 /*
14090  * Delete a Filesystem snapshot
14091  *
14092  * get the vnode for the unnamed snapshot directory and the snapshot and
14093  * delete the snapshot.
14094  */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is large; heap-allocate it to keep kernel stack use down. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* On success: iocounts held on rvp, snapdvp and ndp->ni_vp. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14123 
/*
 * Revert a filesystem to a snapshot
 *
 * Marks the filesystem to revert to the given snapshot on next mount.
 */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* dirfd identifies the mount whose snapshot is being reverted */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* copy the user-supplied snapshot name into a kernel pathname buffer */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/*
	 * Package the snapshot name as a componentname for the filesystem.
	 * Note: name_len from copyinstr includes the terminating NUL.
	 */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/* re-resolve the snapshot so we can issue the ioctl on its vnode */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		/* drop the iocounts taken by vnode_get_snapshot() */
		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
14212 
/*
 * rename a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * rename the snapshot. This is a very specialised (and simple) case of
 * rename(2) (which has to deal with a lot more complications). It differs
 * slightly from rename(2) in that EEXIST is returned if the new name exists.
 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/*
	 * Look up the source snapshot with DELETE intent.  On success we hold
	 * iocounts on rvp, snapdvp and the snapshot vnode (fromnd->ni_vp).
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp  = fromnd->ni_vp;

	/* copy in the new snapshot name */
	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* scan for a '/'; stopping before the end means one was found */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* resolve the destination name relative to the snapshot directory */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	/* drop the iocounts taken by vnode_get_snapshot() */
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14315 
/*
 * Mount a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * mount the snapshot.
 */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/*
	 * Resolve the snapshot to be mounted.  On success we hold iocounts
	 * on rvp, snapdvp and the snapshot vnode (snapndp->ni_vp).
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp  = snapndp->ni_vp;
	/* bail out if the underlying mount has been force-unmounted */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* mounting a snapshot over the root of the root filesystem is not allowed */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/* hand the snapshot's mount and componentname to mount_common() */
	smnt_data.sm_mp  = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);

out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	/* drop the iocounts taken by vnode_get_snapshot() */
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
	return error;
}
14397 
/*
 * Root from a snapshot of the filesystem
 *
 * Marks the filesystem to root from the given snapshot on next boot.
 */
static int __attribute__((noinline))
snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* dirfd identifies the mount to root from */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* copy the user-supplied snapshot name into a kernel pathname buffer */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/*
	 * Package the snapshot name as a componentname for the filesystem.
	 * Note: name_len from copyinstr includes the terminating NUL.
	 */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	root_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	return error;
}
14458 
14459 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14460 vfs_context_can_snapshot(vfs_context_t ctx)
14461 {
14462 	static const char * const snapshot_entitlements[] = {
14463 		"com.apple.private.vfs.snapshot",
14464 		"com.apple.developer.vfs.snapshot",
14465 		"com.apple.private.apfs.arv.limited.snapshot",
14466 	};
14467 	static const size_t nentitlements =
14468 	    sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14469 	size_t i;
14470 
14471 	task_t task = vfs_context_task(ctx);
14472 	for (i = 0; i < nentitlements; i++) {
14473 		if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14474 			return TRUE;
14475 		}
14476 	}
14477 	return FALSE;
14478 }
14479 
/*
 * FS snapshot operations dispatcher
 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* caller must hold one of the snapshot entitlements */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which aren't block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Permit the operation if the caller is superuser, can write
		 * the backing device, or holds the user-snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* dispatch to the per-operation handler */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14571