xref: /xnu-8792.61.2/bsd/vfs/vfs_syscalls.c (revision 42e220869062b56f8d7d0726fd4c88954f87902c) !
1 /*
2  * Copyright (c) 1995-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Copyright (c) 1989, 1993
30  *	The Regents of the University of California.  All rights reserved.
31  * (c) UNIX System Laboratories, Inc.
32  * All or some portions of this file are derived from material licensed
33  * to the University of California by American Telephone and Telegraph
34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35  * the permission of UNIX System Laboratories, Inc.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vfs_syscalls.c	8.41 (Berkeley) 6/15/95
66  */
67 /*
68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69  * support for mandatory and extensible security protections.  This notice
70  * is included in support of clause 2.2 (b) of the Apple Public License,
71  * Version 2.0.
72  */
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112 
113 #include <vfs/vfs_disk_conditioner.h>
114 
115 #include <security/audit/audit.h>
116 #include <bsm/audit_kevents.h>
117 
118 #include <mach/mach_types.h>
119 #include <kern/kern_types.h>
120 #include <kern/kalloc.h>
121 #include <kern/task.h>
122 
123 #include <vm/vm_pageout.h>
124 #include <vm/vm_protos.h>
125 
126 #include <libkern/OSAtomic.h>
127 #include <os/atomic_private.h>
128 #include <pexpert/pexpert.h>
129 #include <IOKit/IOBSD.h>
130 
131 // deps for MIG call
132 #include <kern/host.h>
133 #include <kern/ipc_misc.h>
134 #include <mach/host_priv.h>
135 #include <mach/vfs_nspace.h>
136 #include <os/log.h>
137 
138 #include <nfs/nfs_conf.h>
139 
140 #if ROUTEFS
141 #include <miscfs/routefs/routefs.h>
142 #endif /* ROUTEFS */
143 
144 #if CONFIG_MACF
145 #include <security/mac.h>
146 #include <security/mac_framework.h>
147 #endif
148 
149 #if CONFIG_FSE
150 #define GET_PATH(x) \
151 	((x) = get_pathbuff())
152 #define RELEASE_PATH(x) \
153 	release_pathbuff(x)
154 #else
155 #define GET_PATH(x)     \
156 	((x) = zalloc(ZV_NAMEI))
157 #define RELEASE_PATH(x) \
158 	zfree(ZV_NAMEI, x)
159 #endif /* CONFIG_FSE */
160 
161 #ifndef HFS_GET_BOOT_INFO
162 #define HFS_GET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00004)
163 #endif
164 
165 #ifndef HFS_SET_BOOT_INFO
166 #define HFS_SET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00005)
167 #endif
168 
169 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
170 #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
171 #endif
172 
173 extern void disk_conditioner_unmount(mount_t mp);
174 
175 /* struct for checkdirs iteration */
176 struct cdirargs {
177 	vnode_t olddp;
178 	vnode_t newdp;
179 };
180 /* callback  for checkdirs iteration */
181 static int checkdirs_callback(proc_t p, void * arg);
182 
183 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
184 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
185 void enablequotas(struct mount *mp, vfs_context_t ctx);
186 static int getfsstat_callback(mount_t mp, void * arg);
187 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
188 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
189 static int sync_callback(mount_t, void *);
190 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
191     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
192     boolean_t partial_copy);
193 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
194 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
195     struct componentname *cnp, user_addr_t fsmountargs,
196     int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
197 void vfs_notify_mount(vnode_t pdvp);
198 
199 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
200 
201 struct fd_vn_data * fg_vn_data_alloc(void);
202 
203 /*
204  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
205  * Concurrent lookups (or lookups by ids) on hard links can cause the
206  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
207  * does) to return ENOENT as the path cannot be returned from the name cache
208  * alone. We have no option but to retry and hope to get one namei->reverse path
209  * generation done without an intervening lookup, lookup by id on the hard link
210  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
211  * which currently are the MAC hooks for rename, unlink and rmdir.
212  */
213 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
214 
215 /* Max retry limit for rename due to vnode recycling. */
216 #define MAX_RENAME_ERECYCLE_RETRIES 1024
217 
218 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
219     int unlink_flags);
220 
221 #ifdef CONFIG_IMGSRC_ACCESS
222 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
223 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
224 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
225 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
226 static void mount_end_update(mount_t mp);
227 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
228 #endif /* CONFIG_IMGSRC_ACCESS */
229 
230 //snapshot functions
231 #if CONFIG_MNT_ROOTSNAP
232 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
233 #else
234 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
235 #endif
236 
237 __private_extern__
238 int sync_internal(void);
239 
240 __private_extern__
241 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
242 
243 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
244 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
245 
246 /* vars for sync mutex */
247 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
248 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
249 
250 extern lck_rw_t rootvnode_rw_lock;
251 
252 /*
253  * incremented each time a mount or unmount operation occurs
254  * used to invalidate the cached value of the rootvp in the
255  * mount structure utilized by cache_lookup_path
256  */
257 uint32_t mount_generation = 0;
258 
259 /* counts number of mount and unmount operations */
260 unsigned int vfs_nummntops = 0;
261 
262 /* system-wide, per-boot unique mount ID */
263 static _Atomic uint64_t mount_unique_id = 1;
264 
265 extern const struct fileops vnops;
266 #if CONFIG_APPLEDOUBLE
267 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
268 #endif /* CONFIG_APPLEDOUBLE */
269 
270 /*
271  * Virtual File System System Calls
272  */
273 
274 /*
275  * Private in-kernel mounting spi (specific use-cases only)
276  */
277 boolean_t
vfs_iskernelmount(mount_t mp)278 vfs_iskernelmount(mount_t mp)
279 {
280 	return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
281 }
282 
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;    /* TRUE iff we acquired vp/pvp via namei() and must release them */
	int error;

	/*
	 * Prepare a lookup for the mount-on path.  WANTPARENT so that a
	 * successful namei() also returns the covered vnode's parent,
	 * which mount_common() needs.
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Strip any kernel-mount flags callers are not allowed to pass in. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied.  When the caller
	 * supplies vp (and pvp), it retains ownership of those iocounts.
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller-supplied vnode: fill in just enough of the
		 * componentname (path buffer and length) for mount_common();
		 * the buffer is borrowed from `path`, not allocated.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	/* Mark this as a kernel-initiated mount for mount_common(). */
	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Only release iocounts/lookup state that we ourselves acquired. */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
332 
333 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)334 vfs_mount_at_path(const char *fstype, const char *path,
335     vnode_t pvp, vnode_t vp, void *data, size_t datalen,
336     int mnt_flags, int flags)
337 {
338 	int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
339 	int error, km_flags = 0;
340 	vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
341 
342 	/*
343 	 * This call is currently restricted to specific use cases.
344 	 */
345 	if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
346 		return ENOTSUP;
347 	}
348 
349 #if !defined(XNU_TARGET_OS_OSX)
350 	if (strcmp(fstype, "lifs") == 0) {
351 		syscall_flags |= MNT_NOEXEC;
352 	}
353 #endif
354 
355 	if (flags & VFS_MOUNT_FLAG_NOAUTH) {
356 		km_flags |= KERNEL_MOUNT_NOAUTH;
357 	}
358 	if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
359 		km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
360 	}
361 
362 	error = kernel_mount(fstype, pvp, vp, path, data, datalen,
363 	    syscall_flags, km_flags, ctx);
364 	if (error) {
365 		printf("%s: mount on %s failed, error %d\n", __func__, path,
366 		    error);
367 	}
368 
369 	return error;
370 }
371 
372 int
vfs_mount_override_type_name(mount_t mp,const char * name)373 vfs_mount_override_type_name(mount_t mp, const char *name)
374 {
375 	if (mp == NULL || name == NULL) {
376 		return EINVAL;
377 	}
378 
379 	/* Override the FS type name. */
380 	mount_lock_spin(mp);
381 	strlcpy(mp->fstypename_override, name, sizeof(mp->fstypename_override));
382 	mp->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
383 	mount_unlock(mp);
384 
385 	return 0;
386 }
387 
388 /*
389  * Mount a file system.
390  */
391 /* ARGSUSED */
392 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)393 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
394 {
395 	struct __mac_mount_args muap;
396 
397 	muap.type = uap->type;
398 	muap.path = uap->path;
399 	muap.flags = uap->flags;
400 	muap.data = uap->data;
401 	muap.mac_p = USER_ADDR_NULL;
402 	return __mac_mount(p, &muap, retval);
403 }
404 
405 int
fmount(__unused proc_t p,struct fmount_args * uap,__unused int32_t * retval)406 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
407 {
408 	struct componentname    cn;
409 	vfs_context_t           ctx = vfs_context_current();
410 	size_t                  dummy = 0;
411 	int                     error;
412 	int                     flags = uap->flags;
413 	char                    fstypename[MFSNAMELEN];
414 	char                    *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
415 	vnode_t                 pvp;
416 	vnode_t                 vp;
417 
418 	AUDIT_ARG(fd, uap->fd);
419 	AUDIT_ARG(fflags, flags);
420 	/* fstypename will get audited by mount_common */
421 
422 	/* Sanity check the flags */
423 	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
424 		return ENOTSUP;
425 	}
426 
427 	if (flags & MNT_UNION) {
428 		return EPERM;
429 	}
430 
431 	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
432 	if (error) {
433 		return error;
434 	}
435 
436 	if ((error = file_vnode(uap->fd, &vp)) != 0) {
437 		return error;
438 	}
439 
440 	if ((error = vnode_getwithref(vp)) != 0) {
441 		file_drop(uap->fd);
442 		return error;
443 	}
444 
445 	pvp = vnode_getparent(vp);
446 	if (pvp == NULL) {
447 		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
448 			error = EBUSY;
449 		} else {
450 			error = EINVAL;
451 		}
452 		vnode_put(vp);
453 		file_drop(uap->fd);
454 		return error;
455 	}
456 
457 	memset(&cn, 0, sizeof(struct componentname));
458 	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
459 	cn.cn_pnlen = MAXPATHLEN;
460 
461 	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
462 		zfree(ZV_NAMEI, cn.cn_pnbuf);
463 		vnode_put(pvp);
464 		vnode_put(vp);
465 		file_drop(uap->fd);
466 		return error;
467 	}
468 
469 	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);
470 
471 	zfree(ZV_NAMEI, cn.cn_pnbuf);
472 	vnode_put(pvp);
473 	vnode_put(vp);
474 	file_drop(uap->fd);
475 
476 	return error;
477 }
478 
479 #define MAX_GRAFT_METADATA_SIZE             16384 /* bytes */
480 
481 /*
482  * Get the size of a graft file (a manifest or payload file).
483  * The vp should be an iocounted vnode.
484  */
485 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)486 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
487 {
488 	struct stat64 sb = {};
489 	int error;
490 
491 	*size = 0;
492 
493 	error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
494 	if (error) {
495 		return error;
496 	}
497 
498 	if (sb.st_size == 0) {
499 		error = ENODATA;
500 	} else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
501 		error = EFBIG;
502 	} else {
503 		*size = (size_t) sb.st_size;
504 	}
505 
506 	return error;
507 }
508 
509 /*
510  * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
511  * `size` must already be validated.
512  */
513 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)514 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
515 {
516 	return vn_rdwr(UIO_READ, graft_vp,
517 	           (caddr_t) buf, (int) size, /* offset */ 0,
518 	           UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
519 	           vfs_context_ucred(vctx), /* resid */ NULL,
520 	           vfs_context_proc(vctx));
521 }
522 
523 /*
524  * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
525  * and read it into `buf`.
526  */
527 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,size_t * size,void * buf)528 graft_secureboot_read_fd(int fd, vfs_context_t vctx, size_t *size, void *buf)
529 {
530 	vnode_t metadata_vp = NULLVP;
531 	int error;
532 
533 	// Convert this graft fd to a vnode.
534 	if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
535 		goto out;
536 	}
537 
538 	// Get (and validate) size information.
539 	if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
540 		goto out;
541 	}
542 
543 	// Read each file into the provided buffer - we must get the expected amount of bytes.
544 	if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
545 		goto out;
546 	}
547 
548 out:
549 	if (metadata_vp) {
550 		vnode_put(metadata_vp);
551 		metadata_vp = NULLVP;
552 	}
553 
554 	return error;
555 }
556 
557 /*
558  * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
559  * provided in `gfs`, saving the size of data read in `gfs`.
560  */
561 static int
graft_secureboot_read_metadata(secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)562 graft_secureboot_read_metadata(secure_boot_cryptex_args_t *sbc_args, vfs_context_t vctx,
563     fsioc_graft_fs_t *gfs)
564 {
565 	int error;
566 
567 	// Read the authentic manifest.
568 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
569 	    &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
570 		return error;
571 	}
572 
573 	// The user manifest is currently unused, but set its size.
574 	gfs->user_manifest_size = 0;
575 
576 	// Read the payload.
577 	if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
578 	    &gfs->payload_size, gfs->payload))) {
579 		return error;
580 	}
581 
582 	return 0;
583 }
584 
585 /*
586  * Call into the filesystem to verify and graft a cryptex.
587  */
588 static int
graft_secureboot_cryptex(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,vnode_t cryptex_vp,vnode_t mounton_vp)589 graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
590     vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
591 {
592 	fsioc_graft_fs_t gfs = {};
593 	uint64_t graft_dir_ino = 0;
594 	struct stat64 sb = {};
595 	int error;
596 
597 	// Pre-flight arguments.
598 	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
599 		// Make sure that this graft version matches what we support.
600 		return ENOTSUP;
601 	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
602 		// For this type, cryptex VP must live on same volume as the target of graft.
603 		return EXDEV;
604 	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
605 		// We cannot graft upon non-directories.
606 		return ENOTDIR;
607 	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
608 	    sbc_args->sbc_payload_fd < 0) {
609 		// We cannot graft without a manifest and payload.
610 		return EINVAL;
611 	}
612 
613 	if (mounton_vp) {
614 		// Get the mounton's inode number.
615 		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
616 		if (error) {
617 			return error;
618 		}
619 		graft_dir_ino = (uint64_t) sb.st_ino;
620 	}
621 
622 	// Create buffers (of our maximum-defined size) to store authentication info.
623 	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
624 	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
625 
626 	if (!gfs.authentic_manifest || !gfs.payload) {
627 		error = ENOMEM;
628 		goto out;
629 	}
630 
631 	// Read our fd's into our buffers.
632 	// (Note that this will set the buffer size fields in `gfs`.)
633 	error = graft_secureboot_read_metadata(sbc_args, vctx, &gfs);
634 	if (error) {
635 		goto out;
636 	}
637 
638 	gfs.graft_version = FSIOC_GRAFT_VERSION;
639 	gfs.graft_type = graft_type;
640 	gfs.graft_4cc = sbc_args->sbc_4cc;
641 	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
642 		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
643 	}
644 	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
645 		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
646 	}
647 	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
648 		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
649 	}
650 	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
651 		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
652 	}
653 	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
654 		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
655 	}
656 	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
657 		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
658 	}
659 	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)
660 
661 	// Call into the FS to perform the graft (and validation).
662 	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);
663 
664 out:
665 	if (gfs.authentic_manifest) {
666 		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
667 		gfs.authentic_manifest = NULL;
668 	}
669 	if (gfs.payload) {
670 		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
671 		gfs.payload = NULL;
672 	}
673 
674 	return error;
675 }
676 
677 #define GRAFTDMG_ENTITLEMENT  "com.apple.private.vfs.graftdmg"
678 
679 /*
680  * Graft a cryptex disk image (via FD) onto the appropriate mount-point
681  * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
682  */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	/* Grafting is gated on a private entitlement. */
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy the graft-argument union in from user space. */
	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(&nd);
		if (error) {
			/* namei() cleans up after itself on failure; nothing held yet. */
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		goto graftout;
	}

	/* Valid graft types are 1..GRAFTDMG_CRYPTEX_DOWNLEVEL inclusive. */
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_DOWNLEVEL) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
	}

graftout:
	/*
	 * Release whichever references were acquired.  nameidone() is only
	 * valid (and only needed) when the mount-dir lookup ran, which is
	 * exactly the ua_mountdir != USER_ADDR_NULL case.
	 */
	if (cryptex_vp) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(mounton_vp);
		mounton_vp = NULLVP;
	}
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
750 
751 /*
752  * Ungraft a cryptex disk image (via mount dir FD)
753  * { int ungraftdmg(const char *mountdir, uint64_t flags); }
754  */
755 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)756 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
757 {
758 	int error = 0;
759 	user_addr_t ua_mountdir = uap->mountdir;
760 	fsioc_ungraft_fs_t ugfs;
761 	vnode_t mounton_vp = NULLVP;
762 	struct nameidata nd = {};
763 	vfs_context_t ctx = vfs_context_current();
764 
765 	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
766 		return EPERM;
767 	}
768 
769 	if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
770 		return EINVAL;
771 	}
772 
773 	ugfs.ungraft_flags = 0;
774 
775 	// Acquire vnode for mount-on path
776 	NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
777 	    UIO_USERSPACE, ua_mountdir, ctx);
778 
779 	error = namei(&nd);
780 	if (error) {
781 		return error;
782 	}
783 	mounton_vp = nd.ni_vp;
784 
785 	// Call into the FS to perform the ungraft
786 	error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
787 
788 	vnode_put(mounton_vp);
789 	nameidone(&nd);
790 
791 	return error;
792 }
793 
794 
/*
 * Broadcast that a mount has occurred: raise a VQ_MOUNT vfs event
 * for interested listeners, then post a NOTE_WRITE knote on the
 * parent of the covered vnode so directory watchers see the change.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
801 
802 /*
803  * __mac_mount:
804  *	Mount a file system taking into account MAC label behavior.
805  *	See mount(2) man page for more information
806  *
807  * Parameters:    p                        Process requesting the mount
808  *                uap                      User argument descriptor (see below)
809  *                retval                   (ignored)
810  *
811  * Indirect:      uap->type                Filesystem type
812  *                uap->path                Path to mount
813  *                uap->data                Mount arguments
814  *                uap->mac_p               MAC info
815  *                uap->flags               Mount flags
816  *
817  *
818  * Returns:        0                       Success
819  *                !0                       Not success
820  */
821 boolean_t root_fs_upgrade_try = FALSE;
822 
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;             /* parent of the covered vnode (from WANTPARENT) */
	vnode_t vp = NULL;              /* vnode to be covered by the new mount */
	int need_nameidone = 0;         /* nonzero once namei() succeeds and needs nameidone() */
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;          /* MAC label string copied in from uap->mac_p, if any */
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered.  WANTPARENT so pvp is returned too;
	 * MNT_NOFOLLOW suppresses symlink traversal for the whole path.
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/*
	 * Mounting image source cannot be batched with other operations.
	 * NOTE(review): this is an exact-equality test (flags ==
	 * MNT_IMGSRC_BY_INDEX), so the imgsrc path is taken only when no
	 * other mount flag accompanies it — confirm that is intentional
	 * before changing it to a bit test.
	 */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space.  The user_mac
	 * struct layout differs between 32- and 64-bit processes, so the
	 * copyin is shaped accordingly before normalizing into `mac`.
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Bound the label: at least one char + NUL, at most the MAC max. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		/* NOTE(review): kalloc_data result is not NULL-checked before
		 * copyinstr; presumably Z_WAITOK cannot fail for this size —
		 * confirm. */
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	/* Union mounts are compiled out on this configuration. */
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Special-case mounting over "/": normally becomes an update mount. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mouting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:
	/* Unified cleanup: label buffer, both iocounts, and lookup state. */

#if CONFIG_MACF
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
983 
/*
 * common mount implementation (final stage of mounting)
 *
 * Arguments:
 *  fstypename	file system type (ie it's vfs name)
 *  pvp		parent of covered vnode
 *  vp		covered vnode
 *  cnp		component name (ie path) of covered vnode
 *  flags	generic mount flags
 *  fsmountargs	file system specific data
 *  labelstr	optional MAC label
 *  kernelmount	TRUE for mounts initiated from inside the kernel
 *  ctx		caller's context
 *
 * Returns:	0 on success, otherwise an errno.  On failure every
 *		reference, lock, and allocation taken here is unwound
 *		via the out1..out4 error tiers below.
 *
 * NOTE(review): callers must hold an iocount on vp (and pvp, if
 * non-NULL); this routine does not drop them — confirm against the
 * mount entry points above.
 */
static int
mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
    struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
    char *labelstr, vfs_context_t ctx)
{
#if !CONFIG_MACF
#pragma unused(labelstr)
#endif
	struct vnode *devvp = NULLVP;		/* device vnode from namei(); holds an iocount while set */
	struct vnode *device_vnode = NULLVP;	/* device actually opened/handed to VFS_MOUNT */
#if CONFIG_MACF
	struct vnode *rvp;
#endif
	struct mount *mp = NULL;
	struct vfstable *vfsp = (struct vfstable *)0;
	struct proc *p = vfs_context_proc(ctx);
	int error, flag = 0;
	bool flag_set = false;			/* true once 'flag' holds the pre-update mnt_flag snapshot */
	user_addr_t devpath = USER_ADDR_NULL;
	int ronly = 0;
	int mntalloc = 0;			/* nonzero once mp was zalloc'ed here (fresh mount) */
	boolean_t vfsp_ref = FALSE;		/* we bumped vfsp->vfc_refcount */
	boolean_t is_rwlock_locked = FALSE;	/* mnt_rwlock held exclusive */
	boolean_t did_rele = FALSE;		/* device usecount already dropped on the out4 path */
	boolean_t have_usecount = FALSE;	/* vnode_ref(vp) succeeded */
	boolean_t did_set_lmount = FALSE;	/* MNT_LMOUNT set in mnt_lflag; must be cleared on exit */
	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
	/* Check for mutually-exclusive flag bits */
	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
	int bitcount = 0;
	while (checkflags != 0) {
		/* Kernighan popcount: each iteration clears the lowest set bit */
		checkflags &= (checkflags - 1);
		bitcount++;
	}

	if (bitcount > 1) {
		//not allowed to request multiple mount-by-role flags
		error = EINVAL;
		goto out1;
	}
#endif

	/*
	 * Process an update for an existing mount
	 */
	if (flags & MNT_UPDATE) {
		if ((vp->v_flag & VROOT) == 0) {
			/* updates may only target the root vnode of a mounted fs */
			error = EINVAL;
			goto out1;
		}
		mp = vp->v_mount;

		/* if unmount or mount in progress, return error */
		mount_lock_spin(mp);
		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
			mount_unlock(mp);
			error = EBUSY;
			goto out1;
		}
		mp->mnt_lflag |= MNT_LMOUNT;
		did_set_lmount = TRUE;
		mount_unlock(mp);
		lck_rw_lock_exclusive(&mp->mnt_rwlock);
		is_rwlock_locked = TRUE;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * If content protection is enabled, update mounts are not
		 * allowed to turn it off.
		 */
		if ((mp->mnt_flag & MNT_CPROTECT) &&
		    ((flags & MNT_CPROTECT) == 0)) {
			error = EINVAL;
			goto out1;
		}

		/*
		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
		 * failure to return an error for this so we'll just silently
		 * add it if it is not passed in.
		 */
		if ((mp->mnt_flag & MNT_REMOVABLE) &&
		    ((flags & MNT_REMOVABLE) == 0)) {
			flags |= MNT_REMOVABLE;
		}

		/* Can't downgrade the backer of the root FS */
		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
			goto out1;
		}
#if CONFIG_MACF
		error = mac_mount_check_remount(ctx, mp);
		if (error != 0) {
			goto out1;
		}
#endif
		/*
		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
		 */
		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
			flags |= MNT_NOSUID | MNT_NODEV;
			if (mp->mnt_flag & MNT_NOEXEC) {
				flags |= MNT_NOEXEC;
			}
		}
		/* snapshot the pre-update flags; restored if the update fails */
		flag = mp->mnt_flag;
		flag_set = true;



		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);

		vfsp = mp->mnt_vtable;
		goto update;
	} // MNT_UPDATE

	/*
	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
	 */
	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
		flags |= MNT_NOSUID | MNT_NODEV;
		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
			flags |= MNT_NOEXEC;
		}
	}

	/* XXXAUDIT: Should we capture the type on the error path as well? */
	/* XXX cast-away const (audit_arg_text() does not modify its input) */
	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
	/* Find the requested filesystem type and pin it with a refcount */
	mount_list_lock();
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
			vfsp->vfc_refcount++;
			vfsp_ref = TRUE;
			break;
		}
	}
	mount_list_unlock();
	if (vfsp == NULL) {
		error = ENODEV;
		goto out1;
	}

	/*
	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
	 * except in ROSV configs and for the initial BaseSystem root.
	 */
	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
		error = EINVAL;  /* unsupported request */
		goto out1;
	}

	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
	if (error != 0) {
		goto out1;
	}

	/*
	 * Allocate and initialize the filesystem (mount_t)
	 */
	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	mntalloc = 1;

	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;

	/* mark mount-in-progress so concurrent mount/unmount attempts back off */
	mp->mnt_lflag |= MNT_LMOUNT;
	did_set_lmount = TRUE;

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);
	mount_lock_init(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	is_rwlock_locked = TRUE;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vtable = vfsp;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	do {
		size_t pathlen = MAXPATHLEN;

		/* fall back to the caller-supplied path if we can't rebuild one */
		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
		}
	} while (0);
	mp->mnt_vnodecovered = vp;
	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;
	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);

	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);

	if (kernelmount) {
		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
	}
	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
	}

	if (KERNEL_MOUNT_DEVFS & internal_flags) {
		// kernel mounted devfs
		mp->mnt_kern_flag |= MNTK_SYSTEM;
	}

	/* Fresh-mount and MNT_UPDATE paths converge here; mnt_rwlock is held */
update:

	/*
	 * Set the mount level flags.
	 */
	if (flags & MNT_RDONLY) {
		mp->mnt_flag |= MNT_RDONLY;
	} else if (mp->mnt_flag & MNT_RDONLY) {
		// disallow read/write upgrades of file systems that
		// had the TYPENAME_OVERRIDE feature set.
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			error = EPERM;
			goto out1;
		}
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	}
	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if SECURE_KERNEL
#if !CONFIG_MNT_SUID
	/*
	 * On release builds of iOS based platforms, always enforce NOSUID on
	 * all mounts. We do this here because we can catch update mounts as well as
	 * non-update mounts in this case.
	 */
	mp->mnt_flag |= (MNT_NOSUID);
#endif
#endif

	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if CONFIG_MACF
	if (flags & MNT_MULTILABEL) {
		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
			error = EINVAL;
			goto out1;
		}
		mp->mnt_flag |= MNT_MULTILABEL;
	}
#endif
	/*
	 * Process device path for local file systems if requested.
	 *
	 * Snapshot and mount-by-role mounts do not use this path; they are
	 * passing other opaque data in the device path field.
	 *
	 * Basesystemroot mounts pass a device path to be resolved here,
	 * but it's just a char * already inside the kernel, which
	 * kernel_mount() shoved into a user_addr_t to call us. So for such
	 * mounts we must skip copyin (both of the address and of the string
	 * (in NDINIT).
	 */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
		boolean_t do_copyin_devpath = true;
#if CONFIG_BASESYSTEMROOT
		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
			// We have been passed fsmountargs, which is typed as a user_addr_t,
			// but is actually a char ** pointing to a (kernelspace) string.
			// We manually unpack it with a series of casts and dereferences
			// that reverses what was done just above us on the stack in
			// imageboot_pivot_image().
			// After retrieving the path to the dev node (which we will NDINIT
			// in a moment), we pass NULL fsmountargs on to the filesystem.
			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
			char **devnamepp = (char **)fsmountargs;
			char *devnamep = *devnamepp;
			devpath = CAST_USER_ADDR_T(devnamep);
			do_copyin_devpath = false;
			fsmountargs = USER_ADDR_NULL;

			//Now that we have a mp, denote that this mount is for the basesystem.
			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
		}
#endif // CONFIG_BASESYSTEMROOT

		if (do_copyin_devpath) {
			if (vfs_context_is64bit(ctx)) {
				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
					goto out1;
				}
				fsmountargs += sizeof(devpath);
			} else {
				user32_addr_t tmp;
				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
					goto out1;
				}
				/* munge into LP64 addr */
				devpath = CAST_USER_ADDR_T(tmp);
				fsmountargs += sizeof(tmp);
			}
		}

		/* Lookup device and authorize access to it */
		if ((devpath)) {
			struct nameidata nd;

			enum uio_seg seg = UIO_USERSPACE;
#if CONFIG_BASESYSTEMROOT
			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
				/* devpath is a kernel string in this case; see unpack above */
				seg = UIO_SYSSPACE;
			}
#endif // CONFIG_BASESYSTEMROOT

			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
			if ((error = namei(&nd))) {
				goto out1;
			}

			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
			devvp = nd.ni_vp;

			nameidone(&nd);

			if (devvp->v_type != VBLK) {
				error = ENOTBLK;
				goto out2;
			}
			if (major(devvp->v_rdev) >= nblkdev) {
				error = ENXIO;
				goto out2;
			}
			/*
			 * If mount by non-root, then verify that user has necessary
			 * permissions on the device.
			 */
			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
				mode_t accessmode = KAUTH_VNODE_READ_DATA;

				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
					accessmode |= KAUTH_VNODE_WRITE_DATA;
				}
				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
					goto out2;
				}
			}
		}
		/* On first mount, preflight and open device */
		if (devpath && ((flags & MNT_UPDATE) == 0)) {
			if ((error = vnode_ref(devvp))) {
				goto out2;
			}
			/*
			 * Disallow multiple mounts of the same device.
			 * Disallow mounting of a device that is currently in use
			 * (except for root, which might share swap device for miniroot).
			 * Flush out any old buffers remaining from a previous use.
			 */
			if ((error = vfs_mountedon(devvp))) {
				goto out3;
			}

			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
				error = EBUSY;
				goto out3;
			}
			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
				error = ENOTBLK;
				goto out3;
			}
			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
				goto out3;
			}

			/* remember the open mode so failure paths close with matching flags */
			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
#if CONFIG_MACF
			error = mac_vnode_check_open(ctx,
			    devvp,
			    ronly ? FREAD : FREAD | FWRITE);
			if (error) {
				goto out3;
			}
#endif /* MAC */
			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
				goto out3;
			}

			mp->mnt_devvp = devvp;
			device_vnode = devvp;
		} else if ((mp->mnt_flag & MNT_RDONLY) &&
		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
		    (device_vnode = mp->mnt_devvp)) {
			dev_t dev;
			int maj;
			/*
			 * If upgrade to read-write by non-root, then verify
			 * that user has necessary permissions on the device.
			 */
			vnode_getalways(device_vnode);

			if (suser(vfs_context_ucred(ctx), NULL) &&
			    (error = vnode_authorize(device_vnode, NULL,
			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
			    ctx)) != 0) {
				vnode_put(device_vnode);
				goto out2;
			}

			/* Tell the device that we're upgrading */
			dev = (dev_t)device_vnode->v_rdev;
			maj = major(dev);

			if ((u_int)maj >= (u_int)nblkdev) {
				panic("Volume mounted on a device with invalid major number.");
			}

			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
			vnode_put(device_vnode);
			device_vnode = NULLVP;
			if (error != 0) {
				goto out2;
			}
		}
	} // localargs && !(snapshot | data | vm)

#if CONFIG_MACF
	if ((flags & MNT_UPDATE) == 0) {
		mac_mount_label_init(mp);
		mac_mount_label_associate(ctx, mp);
	}
	if (labelstr) {
		if ((flags & MNT_UPDATE) != 0) {
			error = mac_mount_check_label_update(ctx, mp);
			if (error != 0) {
				goto out3;
			}
		}
	}
#endif
	/*
	 * Mount the filesystem.  We already asserted that internal_flags
	 * cannot have more than one mount-by-role bit set.
	 */
	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
		    (caddr_t)fsmountargs, 0, ctx);
	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
#if CONFIG_ROSV_STARTUP
		/* mount-by-role: fsmountargs carries the origin mount, not user data */
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
		} else {
			/* Mark volume associated with system volume */
			mp->mnt_kern_flag |= MNTK_SYSTEM;

			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);
					// now set `device_vnode` to the devvp that was acquired.
					// this is needed in order to ensure vfs_init_io_attributes is invoked.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
#if CONFIG_MOUNT_VM
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
		} else {
			/* Mark volume associated with system volume and a swap mount */
			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);

					// now set `device_vnode` to the devvp that was acquired.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
#if CONFIG_MOUNT_PREBOOTRECOVERY
		struct mount *origin_mp = (struct mount*)fsmountargs;
		uint32_t mount_role = 0;
		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
			mount_role = VFS_PREBOOT_ROLE;
		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
			mount_role = VFS_RECOVERY_ROLE;
		}

		if (mount_role != 0) {
			fs_role_mount_args_t frma = {origin_mp, mount_role};
			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
			if (error) {
				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
			} else {
				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
				/* Mark volume associated with system volume */
				//mp->mnt_kern_flag |= MNTK_SYSTEM;
				/* Attempt to acquire the mnt_devvp and set it up */
				struct vnode *mp_devvp = NULL;
				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
					    0, &mp_devvp, vfs_context_kernel());
					if (!lerr) {
						mp->mnt_devvp = mp_devvp;
						//vnode_lookup took an iocount, need to drop it.
						vnode_put(mp_devvp);

						// now set `device_vnode` to the devvp that was acquired.
						// note that though the iocount above was dropped, the mount acquires
						// an implicit reference against the device.
						device_vnode = mp_devvp;
					}
				}
			}
		} else {
			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
			error = EINVAL;
		}
#else
		error = EINVAL;
#endif
	} else {
		/* the common case: hand off to the filesystem's mount entry point */
		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
	}

	if (flags & MNT_UPDATE) {
		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
			mp->mnt_flag &= ~MNT_RDONLY;
		}
		mp->mnt_flag &= ~
		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
		if (error) {
			mp->mnt_flag = flag;  /* restore flag value */
		}
		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;
		if (!error) {
			enablequotas(mp, ctx);
		}
		goto exit;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	if (error == 0) {
		struct vfs_attr vfsattr;
		if (device_vnode) {
			/*
			 *   cache the IO attributes for the underlying physical media...
			 *   an error return indicates the underlying driver doesn't
			 *   support all the queries necessary... however, reasonable
			 *   defaults will have been set, so no reason to bail or care
			 *
			 *   Need to do this before calling the MAC hook as it needs
			 *   information from this call.
			 */
			vfs_init_io_attributes(device_vnode, mp);
		}

#if CONFIG_MACF
		error = mac_mount_check_mount_late(ctx, mp);
		if (error != 0) {
			goto out4;
		}

		if (vfs_flags(mp) & MNT_MULTILABEL) {
			error = VFS_ROOT(mp, &rvp, ctx);
			if (error) {
				printf("%s() VFS_ROOT returned %d\n", __func__, error);
				goto out4;
			}
			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
			/*
			 * drop reference provided by VFS_ROOT
			 */
			vnode_put(rvp);

			if (error) {
				goto out4;
			}
		}
#endif  /* MAC */

		/* publish the mount on the covered vnode; VMOUNT was set by prepare_coveredvp */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vp->v_mountedhere = mp;
		vnode_unlock(vp);

		/*
		 * taking the name_cache_lock exclusively will
		 * insure that everyone is out of the fast path who
		 * might be trying to use a now stale copy of
		 * vp->v_mountedhere->mnt_realrootvp
		 * bumping mount_generation causes the cached values
		 * to be invalidated
		 */
		name_cache_lock();
		mount_generation++;
		name_cache_unlock();

		error = vnode_ref(vp);
		if (error != 0) {
			goto out4;
		}

		have_usecount = TRUE;

		error = checkdirs(vp, ctx);
		if (error != 0) {
			/* Unmount the filesystem as cdir/rdirs cannot be updated */
			goto out4;
		}
		/*
		 * there is no cleanup code here so I have made it void
		 * we need to revisit this
		 */
		(void)VFS_START(mp, 0, ctx);

		if (mount_list_add(mp) != 0) {
			/*
			 * The system is shutting down trying to umount
			 * everything, so fail with a plausible errno.
			 */
			error = EBUSY;
			goto out4;
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/* Check if this mounted file system supports EAs or named streams. */
		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
#if NAMEDSTREAMS
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
			}
#endif
			/* Check if this file system supports path from id lookups. */
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			} else if (mp->mnt_flag & MNT_DOVOLFS) {
				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			}

			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
			}
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
		}
		/* increment the operations count */
		OSAddAtomic(1, &vfs_nummntops);
		enablequotas(mp, ctx);

		if (device_vnode) {
			device_vnode->v_specflags |= SI_MOUNTEDON;
		}

		/* Now that mount is setup, notify the listeners */
		vfs_notify_mount(pvp);
		IOBSDMountChange(mp, kIOMountChangeMount);
	} else {
		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
		if (mp->mnt_vnodelist.tqh_first != NULL) {
			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
			    mp->mnt_vtable->vfc_name, error);
		}

		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
		mount_list_lock();
		mp->mnt_vtable->vfc_refcount--;
		mount_list_unlock();

		if (device_vnode) {
			vnode_rele(device_vnode);
			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/*
		 * if we get here, we have a mount structure that needs to be freed,
		 * but since the coveredvp hasn't yet been updated to point at it,
		 * no need to worry about other threads holding a crossref on this mp
		 * so it's ok to just free it
		 */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		/* mp is gone; don't touch mnt_lflag in the common exit path below */
		did_set_lmount = false;
	}
exit:
	/*
	 * drop I/O count on the device vp if there was one
	 */
	if (devpath && devvp) {
		vnode_put(devvp);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	return error;

/* Error condition exits */
out4:
	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);

	/*
	 * If the mount has been placed on the covered vp,
	 * it may have been discovered by now, so we have
	 * to treat this just like an unmount
	 */
	mount_lock_spin(mp);
	mp->mnt_lflag |= MNT_LDEAD;
	mount_unlock(mp);

	if (device_vnode != NULLVP) {
		vnode_rele(device_vnode);
		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		did_rele = TRUE;
	}

	vnode_lock_spin(vp);

	mp->mnt_crossref++;
	vp->v_mountedhere = (mount_t) 0;

	vnode_unlock(vp);

	if (have_usecount) {
		vnode_rele(vp);
	}
out3:
	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
		vnode_rele(devvp);
	}
out2:
	if (devpath && devvp) {
		vnode_put(devvp);
	}
out1:
	/* Release mnt_rwlock only when it was taken */
	if (is_rwlock_locked == TRUE) {
		if (flag_set) {
			mp->mnt_flag = flag;  /* restore mnt_flag value */
		}
		lck_rw_done(&mp->mnt_rwlock);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	if (mntalloc) {
		if (mp->mnt_crossref) {
			mount_dropcrossref(mp, vp, 0);
		} else {
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		}
	}
	if (vfsp_ref) {
		mount_list_lock();
		vfsp->vfc_refcount--;
		mount_list_unlock();
	}

	return error;
}
1871 
/*
 * Flush in-core data, check for competing mount attempts,
 * and set VMOUNT
 *
 * Arguments:
 *  vp			the vnode about to be covered by a mount
 *  ctx			caller's context
 *  cnp			component name of vp (MACF audit only)
 *  fsname		filesystem type name (MACF audit only)
 *  internal_flags	KERNEL_MOUNT_* bits; NOAUTH skips the ownership
 *			check, FMOUNT tightens the busy test
 *
 * Returns:	0 with VMOUNT set on vp (blocking competing mounts until
 *		the caller either completes the mount or clears it),
 *		otherwise an errno with vp unchanged.
 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_busy;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* flush dirty data for the covered vnode before it becomes unreachable */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* only directories can be mount points */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	vnode_lock_spin(vp);
	/*
	 * fmount refuses a vnode that is either mid-mount OR already
	 * covered; the regular path only refuses one that is both.
	 */
	is_busy = is_fmount ?
	    (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
	    (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
	if (is_busy) {
		vnode_unlock(vp);
		error = EBUSY;
		goto out;
	}
	/* claim the vnode: competing mount attempts now see it busy */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		/* MAC veto: release the claim taken above */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
1941 
1942 #if CONFIG_IMGSRC_ACCESS
1943 
/* Set DEBUG_IMGSRC to 1 to enable "imgsrc:" printf tracing below. */
#define DEBUG_IMGSRC 0

#if DEBUG_IMGSRC
#define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
#else
#define IMGSRC_DEBUG(args...) do { } while(0)
#endif
1951 
1952 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)1953 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1954 {
1955 	struct nameidata nd;
1956 	vnode_t vp, realdevvp;
1957 	mode_t accessmode;
1958 	int error;
1959 	enum uio_seg uio = UIO_USERSPACE;
1960 
1961 	if (ctx == vfs_context_kernel()) {
1962 		uio = UIO_SYSSPACE;
1963 	}
1964 
1965 	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1966 	if ((error = namei(&nd))) {
1967 		IMGSRC_DEBUG("namei() failed with %d\n", error);
1968 		return error;
1969 	}
1970 
1971 	vp = nd.ni_vp;
1972 
1973 	if (!vnode_isblk(vp)) {
1974 		IMGSRC_DEBUG("Not block device.\n");
1975 		error = ENOTBLK;
1976 		goto out;
1977 	}
1978 
1979 	realdevvp = mp->mnt_devvp;
1980 	if (realdevvp == NULLVP) {
1981 		IMGSRC_DEBUG("No device backs the mount.\n");
1982 		error = ENXIO;
1983 		goto out;
1984 	}
1985 
1986 	error = vnode_getwithref(realdevvp);
1987 	if (error != 0) {
1988 		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1989 		goto out;
1990 	}
1991 
1992 	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1993 		IMGSRC_DEBUG("Wrong dev_t.\n");
1994 		error = ENXIO;
1995 		goto out1;
1996 	}
1997 
1998 	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1999 
2000 	/*
2001 	 * If mount by non-root, then verify that user has necessary
2002 	 * permissions on the device.
2003 	 */
2004 	if (!vfs_context_issuser(ctx)) {
2005 		accessmode = KAUTH_VNODE_READ_DATA;
2006 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2007 			accessmode |= KAUTH_VNODE_WRITE_DATA;
2008 		}
2009 		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2010 			IMGSRC_DEBUG("Access denied.\n");
2011 			goto out1;
2012 		}
2013 	}
2014 
2015 	*devvpp = vp;
2016 
2017 out1:
2018 	vnode_put(realdevvp);
2019 
2020 out:
2021 	nameidone(&nd);
2022 
2023 	if (error) {
2024 		vnode_put(vp);
2025 	}
2026 
2027 	return error;
2028 }
2029 
2030 /*
2031  * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2032  * and call checkdirs()
2033  */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Swap the "mount in progress" marker for the real covered-vnode link */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Long-term usecount on the covered vnode for the life of the mount */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	/* On failure, drop the covered-vnode link set above */
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2079 
/*
 * Undo place_mount_and_checkdirs(): drop the usecount taken on the
 * covered vnode and clear both directions of the mount/vnode linkage.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2090 
2091 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2092 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2093 {
2094 	int error;
2095 
2096 	/* unmount in progress return error */
2097 	mount_lock_spin(mp);
2098 	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2099 		mount_unlock(mp);
2100 		return EBUSY;
2101 	}
2102 	mount_unlock(mp);
2103 	lck_rw_lock_exclusive(&mp->mnt_rwlock);
2104 
2105 	/*
2106 	 * We only allow the filesystem to be reloaded if it
2107 	 * is currently mounted read-only.
2108 	 */
2109 	if ((flags & MNT_RELOAD) &&
2110 	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2111 		error = ENOTSUP;
2112 		goto out;
2113 	}
2114 
2115 	/*
2116 	 * Only root, or the user that did the original mount is
2117 	 * permitted to update it.
2118 	 */
2119 	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2120 	    (!vfs_context_issuser(ctx))) {
2121 		error = EPERM;
2122 		goto out;
2123 	}
2124 #if CONFIG_MACF
2125 	error = mac_mount_check_remount(ctx, mp);
2126 	if (error != 0) {
2127 		goto out;
2128 	}
2129 #endif
2130 
2131 out:
2132 	if (error) {
2133 		lck_rw_done(&mp->mnt_rwlock);
2134 	}
2135 
2136 	return error;
2137 }
2138 
/* Release the mount rwlock taken by mount_begin_update(). */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2144 
2145 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2146 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2147 {
2148 	vnode_t vp;
2149 
2150 	if (height >= MAX_IMAGEBOOT_NESTING) {
2151 		return EINVAL;
2152 	}
2153 
2154 	vp = imgsrc_rootvnodes[height];
2155 	if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2156 		*rvpp = vp;
2157 		return 0;
2158 	} else {
2159 		return ENOENT;
2160 	}
2161 }
2162 
/*
 * Relocate the imageboot source filesystem (the mount backing the boot
 * disk image, recorded in imgsrc_rootvnodes[]) so that it is mounted on
 * 'vp' instead.  Root-only.  Arguments arrive either as a
 * mnt_imgsrc_args structure (by_index) or, for binary compatibility,
 * as a bare device path assuming one level of nesting.
 *
 * Cleanup is a strict reverse-order ladder (out3..out0); each label
 * undoes exactly the state established before the corresponding goto.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flags are currently defined for this operation */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* rvp comes back with an iocount held (dropped at out0 / success) */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once.  Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the validation; drop the iocount now */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Keep the old mount-on name so out3 can restore it */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2383 
2384 #endif /* CONFIG_IMGSRC_ACCESS */
2385 
/*
 * Turn on disk quotas for 'mp' (HFS only) if the per-type quota
 * trigger files exist under the mount point.  Errors are ignored so
 * quota setup never blocks a successful mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Look for the ".quotaops" trigger file for this quota type */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue;           /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Trigger file present: turn quotas on using the real quota file */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2419 
2420 
/*
 * Per-process worker for checkdirs(): if the process's current or root
 * directory is the just-covered vnode (cdrp->olddp), repoint it at the
 * new filesystem's root (cdrp->newdp), transferring usecounts.
 * Always returns PROC_RETURNED so iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* new_* start as "unused spare ref"; nulled out when consumed below */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;         /* ref consumed by fd_cdir */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;         /* ref consumed by fd_rdir */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2500 
2501 
2502 
2503 /*
2504  * Scan all active processes to see if any of them have a current
2505  * or root directory onto which the new filesystem has just been
2506  * mounted. If so, replace them with the new mount point.
2507  */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Sole usecount is the mount's own ref: no proc can be using olddp */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root itself was covered, swap it under the rw lock */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	/* Drop the iocount VFS_ROOT() returned */
	vnode_put(newdp);
	return 0;
}
2545 
2546 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT        \
2547 	"com.apple.private.vfs.role-account-unmount"
2548 
2549 /*
2550  * Unmount a file system.
2551  *
2552  * Note: unmount takes a path to the vnode mounted on as argument,
2553  * not special file (as before).
2554  */
2555 /* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
	/* Take a mount ref before dropping the vnode iocount */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2602 
/*
 * Unmount the filesystem identified by 'fsid'.  Returns ENOENT if no
 * such mount exists; otherwise forwards to safedounmount(), which
 * consumes the mount ref taken here.
 */
int
vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
{
	mount_t mp;

	/* NOTE(review): third arg presumably takes an iter ref, dropped below -- confirm */
	mp = mount_list_lookupby_fsid(fsid, 0, 1);
	if (mp == (mount_t)0) {
		return ENOENT;
	}
	mount_ref(mp, 0);
	mount_iterdrop(mp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, flags, ctx);
}
2617 
2618 /*
2619  * The mount struct comes with a mount ref which will be consumed.
2620  * Do the actual file system unmount, prevent some common foot shooting.
2621  */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	/* NOTE(review): MNT_LNOTRESP is an mnt_lflag bit elsewhere in this
	 * subsystem; testing it against mnt_kern_flag looks suspicious --
	 * confirm the intended flag/field pairing. */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* dounmount() takes over the mount ref (withref = 1) */
	return dounmount(mp, flags, 1, ctx);

out:
	/* Error path: drop the mount ref our caller handed us */
	mount_drop(mp, 0);
	return error;
}
2685 
2686 /*
2687  * Do the actual file system unmount.
2688  */
/*
 * Do the actual file system unmount.
 *
 * 'withref' != 0 means the caller passed in a mount ref, which is
 * dropped here.  On error the unmount-in-progress flags are cleared and
 * the mount remains usable; on success the mount is torn down and its
 * memory freed (directly, via crossref drop, or as the root fs case).
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx);  /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Keep the caller from hanging on unresponsive remote filesystems */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* Sync failed: abort and clear the in-progress flags */
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* Busy vnodes remain: abort and clear the in-progress flags */
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		/* Filesystem refused: reopen for iteration and clear flags */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT);

		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);     /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* Common exit: mount lock is held on both success and error paths */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the  vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			/* Notify watchers of the parent directory */
			if (pvp) {
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* Root fs has no covered vnode: free the mount here */
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2973 
2974 /*
2975  * Unmount any mounts in this filesystem.
2976  */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT: cannot block for memory while holding mount_list_lock */
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		/* 'm' grows inside the loop: a match extends the set being scanned */
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
3034 
/*
 * Drop one cross reference on mount mp, which is (or was) covering
 * vnode dp.  When the last crossref is dropped and mp is no longer
 * dp->v_mountedhere, the mount structure itself is destroyed and freed.
 *
 * If need_put is set, the caller's iocount on dp is released as well.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	/* Hold dp across the lock; vnode_drop_and_unlock() pairs with this. */
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/*
	 * Last crossref gone and the mount no longer covers dp:
	 * tear the mount down after releasing the vnode lock/refs.
	 */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3064 
3065 
3066 /*
3067  * Sync each mounted filesystem.
3068  */
#if DIAGNOSTIC
int syncprt = 0;        /* non-zero: sync() dumps buffer stats via vfs_bufstats() */
#endif

int print_vmpage_stat = 0;      /* non-zero: sync paths log dirty page counts */
3074 
3075 /*
3076  * sync_callback:	simple wrapper that calls VFS_SYNC() on volumes
3077  *			mounted read-write with the passed waitfor value.
3078  *
3079  * Parameters:	mp	mount-point descriptor per mounted file-system instance.
3080  *		arg	user argument (please see below)
3081  *
3082  * User argument is a pointer to 32 bit unsigned integer which describes the
3083  * type of waitfor value to set for calling VFS_SYNC().  If user argument is
3084  * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3085  * waitfor value.
3086  *
3087  * Returns:		VFS_RETURNED
3088  */
3089 static int
sync_callback(mount_t mp,void * arg)3090 sync_callback(mount_t mp, void *arg)
3091 {
3092 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3093 		int asyncflag = mp->mnt_flag & MNT_ASYNC;
3094 		unsigned waitfor = MNT_NOWAIT;
3095 
3096 		if (arg) {
3097 			waitfor = *(uint32_t*)arg;
3098 		}
3099 
3100 		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
3101 		if (waitfor != MNT_WAIT &&
3102 		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
3103 		    waitfor != MNT_NOWAIT &&
3104 		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3105 		    waitfor != MNT_DWAIT &&
3106 		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3107 			panic("Passed inappropriate waitfor %u to "
3108 			    "sync_callback()", waitfor);
3109 		}
3110 
3111 		mp->mnt_flag &= ~MNT_ASYNC;
3112 		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3113 		if (asyncflag) {
3114 			mp->mnt_flag |= MNT_ASYNC;
3115 		}
3116 	}
3117 
3118 	return VFS_RETURNED;
3119 }
3120 
3121 /* ARGSUSED */
3122 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)3123 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
3124 {
3125 	vfs_iterate(LK_NOWAIT, sync_callback, NULL);
3126 
3127 	if (print_vmpage_stat) {
3128 		vm_countdirtypages();
3129 	}
3130 
3131 #if DIAGNOSTIC
3132 	if (syncprt) {
3133 		vfs_bufstats();
3134 	}
3135 #endif /* DIAGNOSTIC */
3136 	return 0;
3137 }
3138 
/*
 * Media classes used by sync_internal_callback() to restrict a sync
 * pass to a subset of mounted volumes.  "Reliable" means local and
 * not a virtual device (see sync_internal_callback()).
 */
typedef enum {
	SYNC_ALL = 0,                   /* no filtering */
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* local, non-virtual devices only */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* virtual or non-local devices only */
} sync_type_t;
3144 
3145 static int
sync_internal_callback(mount_t mp,void * arg)3146 sync_internal_callback(mount_t mp, void *arg)
3147 {
3148 	if (arg) {
3149 		int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3150 		    (mp->mnt_flag & MNT_LOCAL);
3151 		sync_type_t sync_type = *((sync_type_t *)arg);
3152 
3153 		if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3154 			return VFS_RETURNED;
3155 		} else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3156 			return VFS_RETURNED;
3157 		}
3158 	}
3159 
3160 	(void)sync_callback(mp, NULL);
3161 
3162 	return VFS_RETURNED;
3163 }
3164 
int sync_thread_state = 0;      /* SYNC_THREAD_* bits; guarded by sync_mtx_lck */
int sync_timeout_seconds = 5;   /* bound on sync_internal()'s wait for the worker */

#define SYNC_THREAD_RUN       0x0001    /* a sync request is pending */
#define SYNC_THREAD_RUNNING   0x0002    /* a sync worker thread exists */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;        /* identity of the running sync worker */
#endif /* CONFIG_PHYS_WRITE_ACCT */
3174 
/*
 * Body of the worker thread started by sync_internal().  Loops while
 * new requests (SYNC_THREAD_RUN) keep being posted, each pass syncing
 * reliable (local, non-virtual) media first and unreliable media
 * second, then wakes any waiters and exits.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the request; more may be posted while we sync. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		/* Reliable media first, then unreliable media. */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3218 
/* When the "sync timed out" message was last logged; sync_internal()
 * uses this to rate-limit that message to once per 120 seconds. */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3220 
3221 /*
3222  * An in-kernel sync for power management to call.
3223  * This function always returns within sync_timeout seconds.
3224  */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* Post a request; spawn the worker if one isn't already running. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/*
	 * Wait (bounded by sync_timeout_seconds) for the worker's wakeup
	 * on &sync_thread_state.  PDROP releases sync_mtx_lck on return.
	 */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* Rate-limit the timeout message to once every 120 seconds. */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	/* Drop the extra thread reference from kernel_thread_start(). */
	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
3267 
3268 /*
3269  * Change filesystem quotas.
3270  */
3271 #if QUOTA
/*
 * quotactl: manipulate quotas on the filesystem containing uap->path.
 * The sub-command encoded in uap->cmd determines what (if anything) is
 * copied in from / out to uap->arg around the VFS_QUOTACTL() call.
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	/* Resolve the path just to find the target mount. */
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	mp = nd.ni_vp->v_mount;
	/* Keep a mount reference; the vnode itself is no longer needed. */
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit layout differs; munge into the kernel dqblk. */
			struct user_dqblk       my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	/* Issue the operation unless the copyin above already failed. */
	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Copy results back out and free the Q_QUOTAON path buffer. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk       my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3378 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support compiled out: the syscall is unsupported. */
	return EOPNOTSUPP;
}
3384 #endif /* QUOTA */
3385 
3386 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3387 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3388 {
3389 	int error;
3390 	vfs_context_t ctx = vfs_context_current();
3391 
3392 #if CONFIG_MACF
3393 	error = mac_mount_check_stat(ctx, mp);
3394 	if (error != 0) {
3395 		return error;
3396 	}
3397 #endif
3398 
3399 	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3400 	if (error != 0) {
3401 		return error;
3402 	}
3403 
3404 	return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3405 }
3406 
3407 /*
3408  * Get filesystem statistics.
3409  *
3410  * Returns:	0			Success
3411  *	namei:???
3412  *	vfs_update_vfsstat:???
3413  *	munge_statfs:EFAULT
3414  */
3415 /* ARGSUSED */
3416 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3417 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3418 {
3419 	int error;
3420 	struct mount *mp;
3421 	struct nameidata nd;
3422 	vfs_context_t ctx = vfs_context_current();
3423 	vnode_t vp;
3424 
3425 	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3426 	    UIO_USERSPACE, uap->path, ctx);
3427 	error = namei(&nd);
3428 	if (error != 0) {
3429 		return error;
3430 	}
3431 	vp = nd.ni_vp;
3432 	mp = vp->v_mount;
3433 	nameidone(&nd);
3434 
3435 	error = statfs_internal(p, mp, uap->buf);
3436 	vnode_put(vp);
3437 
3438 	return error;
3439 }
3440 
3441 /*
3442  * Get filesystem statistics.
3443  */
3444 /* ARGSUSED */
int
fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	struct mount *mp;

	AUDIT_ARG(fd, uap->fd);

	/*
	 * vp stays NULL if file_vnode() fails; the "out:" label uses that
	 * to decide whether file_drop() is owed.  If vnode_getwithref()
	 * fails we still owe file_drop() but no vnode_put().
	 */
	if ((error = file_vnode(uap->fd, &vp)) ||
	    (error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out_vnode;
	}

	error = statfs_internal(p, mp, uap->buf);

out_vnode:
	vnode_put(vp);

out:
	if (vp != NULL) {
		file_drop(uap->fd);
	}

	return error;
}
3479 
3480 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3481 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3482 {
3483 	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3484 
3485 	bzero(sfs, sizeof(*sfs));
3486 
3487 	sfs->f_bsize = vsfs->f_bsize;
3488 	sfs->f_iosize = (int32_t)vsfs->f_iosize;
3489 	sfs->f_blocks = vsfs->f_blocks;
3490 	sfs->f_bfree = vsfs->f_bfree;
3491 	sfs->f_bavail = vsfs->f_bavail;
3492 	sfs->f_files = vsfs->f_files;
3493 	sfs->f_ffree = vsfs->f_ffree;
3494 	sfs->f_fsid = vsfs->f_fsid;
3495 	sfs->f_owner = vsfs->f_owner;
3496 	sfs->f_type = mp->mnt_vtable->vfc_typenum;
3497 	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3498 	sfs->f_fssubtype = vsfs->f_fssubtype;
3499 	sfs->f_flags_ext = (mp->mnt_kern_flag & MNTK_SYSTEMDATA) ? MNT_EXT_ROOT_DATA_VOL : 0;
3500 	if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3501 		strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3502 	} else {
3503 		strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
3504 	}
3505 	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3506 	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3507 }
3508 
3509 /*
3510  * Get file system statistics in 64-bit mode
3511  */
3512 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3513 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3514 {
3515 	struct mount *mp;
3516 	int error;
3517 	struct nameidata *ndp;
3518 	struct statfs64 *sfsp;
3519 	vfs_context_t ctxp = vfs_context_current();
3520 	vnode_t vp;
3521 	struct {
3522 		struct nameidata nd;
3523 		struct statfs64 sfs;
3524 	} *__nameidata_statfs64;
3525 
3526 	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3527 	    Z_WAITOK);
3528 	ndp = &__nameidata_statfs64->nd;
3529 
3530 	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3531 	    UIO_USERSPACE, uap->path, ctxp);
3532 	error = namei(ndp);
3533 	if (error != 0) {
3534 		goto out;
3535 	}
3536 	vp = ndp->ni_vp;
3537 	mp = vp->v_mount;
3538 	nameidone(ndp);
3539 
3540 #if CONFIG_MACF
3541 	error = mac_mount_check_stat(ctxp, mp);
3542 	if (error != 0) {
3543 		vnode_put(vp);
3544 		goto out;
3545 	}
3546 #endif
3547 
3548 	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3549 	if (error != 0) {
3550 		vnode_put(vp);
3551 		goto out;
3552 	}
3553 
3554 	sfsp = &__nameidata_statfs64->sfs;
3555 	vfs_get_statfs64(mp, sfsp);
3556 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3557 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3558 		/* This process does not want to see a seperate data volume mountpoint */
3559 		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3560 	}
3561 	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3562 	vnode_put(vp);
3563 
3564 out:
3565 	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3566 
3567 	return error;
3568 }
3569 
3570 /*
3571  * Get file system statistics in 64-bit mode
3572  */
3573 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3574 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3575 {
3576 	struct vnode *vp;
3577 	struct mount *mp;
3578 	struct statfs64 sfs;
3579 	int error;
3580 
3581 	AUDIT_ARG(fd, uap->fd);
3582 
3583 	if ((error = file_vnode(uap->fd, &vp))) {
3584 		return error;
3585 	}
3586 
3587 	error = vnode_getwithref(vp);
3588 	if (error) {
3589 		file_drop(uap->fd);
3590 		return error;
3591 	}
3592 
3593 	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3594 
3595 	mp = vp->v_mount;
3596 	if (!mp) {
3597 		error = EBADF;
3598 		goto out;
3599 	}
3600 
3601 #if CONFIG_MACF
3602 	error = mac_mount_check_stat(vfs_context_current(), mp);
3603 	if (error != 0) {
3604 		goto out;
3605 	}
3606 #endif
3607 
3608 	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3609 		goto out;
3610 	}
3611 
3612 	vfs_get_statfs64(mp, &sfs);
3613 	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3614 	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3615 		/* This process does not want to see a seperate data volume mountpoint */
3616 		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3617 	}
3618 	error = copyout(&sfs, uap->buf, sizeof(sfs));
3619 
3620 out:
3621 	file_drop(uap->fd);
3622 	vnode_put(vp);
3623 
3624 	return error;
3625 }
3626 
/* Shared accumulator passed to the getfsstat*() vfs_iterate() callbacks. */
struct getfsstat_struct {
	user_addr_t     sfsp;           /* user buffer cursor for statfs output */
	user_addr_t     *mp;            /* optional array of user MAC label pointers */
	int             count;          /* mounts visited so far */
	int             maxcount;       /* user buffer capacity, in entries */
	int             flags;          /* caller's MNT_WAIT/MNT_NOWAIT/... flags */
	int             error;          /* first error encountered, if any */
};
3635 
3636 
/*
 * vfs_iterate() callback for getfsstat()/__mac_getfsstat(): copies one
 * mount's statfs data (and optionally its MAC label) to user space,
 * and counts every mount visited whether or not it was copied.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	/* Only copy out while there is still room in the user buffer. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* Skip this mount but keep iterating. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance the user cursor by the ABI-dependent entry size. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	/* Counted even when not copied, so callers learn the full total. */
	fstp->count++;
	return VFS_RETURNED;
}
3690 
3691 /*
3692  * Get statistics on all filesystems.
3693  */
3694 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3695 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3696 {
3697 	struct __mac_getfsstat_args muap;
3698 
3699 	muap.buf = uap->buf;
3700 	muap.bufsize = uap->bufsize;
3701 	muap.mac = USER_ADDR_NULL;
3702 	muap.macsize = 0;
3703 	muap.flags = uap->flags;
3704 
3705 	return __mac_getfsstat(p, &muap, retval);
3706 }
3707 
3708 /*
3709  * __mac_getfsstat: Get MAC-related file system statistics
3710  *
3711  * Parameters:    p                        (ignored)
3712  *                uap                      User argument descriptor (see below)
3713  *                retval                   Count of file system statistics (N stats)
3714  *
3715  * Indirect:      uap->bufsize             Buffer size
3716  *                uap->macsize             MAC info size
3717  *                uap->buf                 Buffer where information will be returned
3718  *                uap->mac                 MAC info
3719  *                uap->flags               File system flags
3720  *
3721  *
3722  * Returns:        0                       Success
3723  *                !0                       Not success
3724  *
3725  */
3726 int
__mac_getfsstat(__unused proc_t p,struct __mac_getfsstat_args * uap,int * retval)3727 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
3728 {
3729 	user_addr_t sfsp;
3730 	user_addr_t *mp;
3731 	size_t count, maxcount, bufsize, macsize;
3732 	struct getfsstat_struct fst;
3733 
3734 	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
3735 		return EINVAL;
3736 	}
3737 
3738 	bufsize = (size_t) uap->bufsize;
3739 	macsize = (size_t) uap->macsize;
3740 
3741 	if (IS_64BIT_PROCESS(p)) {
3742 		maxcount = bufsize / sizeof(struct user64_statfs);
3743 	} else {
3744 		maxcount = bufsize / sizeof(struct user32_statfs);
3745 	}
3746 	sfsp = uap->buf;
3747 	count = 0;
3748 
3749 	mp = NULL;
3750 
3751 #if CONFIG_MACF
3752 	if (uap->mac != USER_ADDR_NULL) {
3753 		u_int32_t *mp0;
3754 		int error;
3755 		unsigned int i;
3756 
3757 		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
3758 		if (count != maxcount) {
3759 			return EINVAL;
3760 		}
3761 
3762 		/* Copy in the array */
3763 		mp0 = kalloc_data(macsize, Z_WAITOK);
3764 		if (mp0 == NULL) {
3765 			return ENOMEM;
3766 		}
3767 
3768 		error = copyin(uap->mac, mp0, macsize);
3769 		if (error) {
3770 			kfree_data(mp0, macsize);
3771 			return error;
3772 		}
3773 
3774 		/* Normalize to an array of user_addr_t */
3775 		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
3776 		if (mp == NULL) {
3777 			kfree_data(mp0, macsize);
3778 			return ENOMEM;
3779 		}
3780 
3781 		for (i = 0; i < count; i++) {
3782 			if (IS_64BIT_PROCESS(p)) {
3783 				mp[i] = ((user_addr_t *)mp0)[i];
3784 			} else {
3785 				mp[i] = (user_addr_t)mp0[i];
3786 			}
3787 		}
3788 		kfree_data(mp0, macsize);
3789 	}
3790 #endif
3791 
3792 
3793 	fst.sfsp = sfsp;
3794 	fst.mp = mp;
3795 	fst.flags = uap->flags;
3796 	fst.count = 0;
3797 	fst.error = 0;
3798 	fst.maxcount = (int)maxcount;
3799 
3800 
3801 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);
3802 
3803 	if (mp) {
3804 		kfree_data(mp, count * sizeof(user_addr_t));
3805 	}
3806 
3807 	if (fst.error) {
3808 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3809 		return fst.error;
3810 	}
3811 
3812 	if (fst.sfsp && fst.count > fst.maxcount) {
3813 		*retval = fst.maxcount;
3814 	} else {
3815 		*retval = fst.count;
3816 	}
3817 	return 0;
3818 }
3819 
/*
 * vfs_iterate() callback for getfsstat64(): copies one mount's
 * statfs64 data to user space, and counts every mount visited whether
 * or not it was copied.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	/* Only copy out while there is still room in the user buffer. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		/* NOTE(review): sp is assigned but not read again below. */
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			/* Skip this mount but keep iterating. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += sizeof(sfs);
	}
	/* Counted even when not copied, so callers learn the full total. */
	fstp->count++;
	return VFS_RETURNED;
}
3864 
3865 /*
3866  * Get statistics on all file systems in 64 bit mode.
3867  */
3868 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3869 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3870 {
3871 	user_addr_t sfsp;
3872 	int count, maxcount;
3873 	struct getfsstat_struct fst;
3874 
3875 	maxcount = uap->bufsize / sizeof(struct statfs64);
3876 
3877 	sfsp = uap->buf;
3878 	count = 0;
3879 
3880 	fst.sfsp = sfsp;
3881 	fst.flags = uap->flags;
3882 	fst.count = 0;
3883 	fst.error = 0;
3884 	fst.maxcount = maxcount;
3885 
3886 	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3887 
3888 	if (fst.error) {
3889 		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3890 		return fst.error;
3891 	}
3892 
3893 	if (fst.sfsp && fst.count > fst.maxcount) {
3894 		*retval = fst.maxcount;
3895 	} else {
3896 		*retval = fst.count;
3897 	}
3898 
3899 	return 0;
3900 }
3901 
3902 /*
3903  * gets the associated vnode with the file descriptor passed.
3904  * as input
3905  *
3906  * INPUT
3907  * ctx - vfs context of caller
3908  * fd - file descriptor for which vnode is required.
3909  * vpp - Pointer to pointer to vnode to be returned.
3910  *
3911  * The vnode is returned with an iocount so any vnode obtained
3912  * by this call needs a vnode_put
3913  *
3914  */
3915 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3916 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3917 {
3918 	int error;
3919 	vnode_t vp;
3920 	struct fileproc *fp;
3921 	proc_t p = vfs_context_proc(ctx);
3922 
3923 	*vpp =  NULLVP;
3924 
3925 	error = fp_getfvp(p, fd, &fp, &vp);
3926 	if (error) {
3927 		return error;
3928 	}
3929 
3930 	error = vnode_getwithref(vp);
3931 	if (error) {
3932 		(void)fp_drop(p, fd, fp, 0);
3933 		return error;
3934 	}
3935 
3936 	(void)fp_drop(p, fd, fp, 0);
3937 	*vpp = vp;
3938 	return error;
3939 }
3940 
3941 /*
3942  * Wrapper function around namei to start lookup from a directory
3943  * specified by a file descriptor ni_dirfd.
3944  *
3945  * In addition to all the errors returned by namei, this call can
3946  * return ENOTDIR if the file descriptor does not refer to a directory.
3947  * and EBADF if the file descriptor is not valid.
3948  */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only divert the lookup when a real fd was supplied, the lookup
	 * is not a continuation, and the caller didn't already set a dvp.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			/* Relative path: resolve it against dirfd's vnode. */
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Hand namei the starting dvp via the USEDVP protocol,
			 * clearing the flag again once the lookup completes. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path or AT_FDCWD: plain namei from the cwd/root. */
	return namei(ndp);
}
3992 
3993 /*
3994  * Change current working directory to a given file descriptor.
3995  */
3996 /* ARGSUSED */
/*
 * Change the current working directory to the directory vnode open at
 * uap->fd.  With per_thread set, the calling thread's cwd override
 * (uu_cdir) is updated instead of the process cwd; an fd of -1 in
 * per-thread mode clears that override.
 */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding.  The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* The new cwd must be a directory. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* Caller needs search permission on the directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If filesystems are mounted on this directory, descend through
	 * them and use the root of the topmost mount as the new cwd.
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the transient iocount for a long-lived usecount. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Swap in the new per-process cwd under the fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous cwd, if there was one. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(uap->fd);

	return error;
}
4109 
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	/* Per-process chdir to the directory open at uap->fd. */
	return common_fchdir(p, uap, 0);
}
4115 
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	/* Per-thread chdir; uap is treated as a struct fchdir_args —
	 * the cast assumes the two args structs share layout. */
	return common_fchdir(p, (void *)uap, 1);
}
4121 
4122 
4123 /*
4124  * Change current working directory (".").
4125  *
4126  * Returns:	0			Success
4127  *	change_dir:ENOTDIR
4128  *	change_dir:???
4129  *	vnode_ref:ENOENT		No such file or directory
4130  */
4131 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* change_dir() returns ndp->ni_vp with an iocount on success. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Convert to a long-lived usecount before dropping the iocount. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		/* Install as the calling thread's cwd override. */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Swap in the new per-process cwd under the fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous cwd, if there was one. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4177 
4178 
4179 /*
4180  * Change current working directory (".").
4181  *
4182  * Returns:	0			Success
4183  *	chdir_internal:ENOTDIR
4184  *	chdir_internal:ENOENT		No such file or directory
4185  *	chdir_internal:???
4186  */
4187 /* ARGSUSED */
static int
common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
{
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/* Set up a symlink-following lookup of the user path; the actual
	 * permission checks and directory switch happen in chdir_internal. */
	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	return chdir_internal(p, ctx, &nd, per_thread);
}
4199 
4200 
4201 /*
4202  * chdir
4203  *
4204  * Change current working directory (".") for the entire process
4205  *
4206  * Parameters:  p       Process requesting the call
4207  *              uap     User argument descriptor (see below)
4208  *              retval  (ignored)
4209  *
4210  * Indirect parameters:	uap->path	Directory path
4211  *
4212  * Returns:	0			Success
4213  *              common_chdir: ENOTDIR
4214  *              common_chdir: ENOENT	No such file or directory
4215  *              common_chdir: ???
4216  *
4217  */
int
chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	/* chdir(2): process-wide cwd change (per_thread == 0). */
	return common_chdir(p, (void *)uap, 0);
}
4223 
4224 /*
4225  * __pthread_chdir
4226  *
4227  * Change current working directory (".") for a single thread
4228  *
4229  * Parameters:  p       Process requesting the call
4230  *              uap     User argument descriptor (see below)
4231  *              retval  (ignored)
4232  *
4233  * Indirect parameters:	uap->path	Directory path
4234  *
4235  * Returns:	0			Success
4236  *              common_chdir: ENOTDIR
4237  *		common_chdir: ENOENT	No such file or directory
4238  *		common_chdir: ???
4239  *
4240  */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/* Per-thread cwd change (per_thread == 1); args are layout-compatible
	 * with struct chdir_args, hence the cast. */
	return common_chdir(p, (void *)uap, 1);
}
4246 
4247 
4248 /*
4249  * Change notion of root (``/'') directory.
4250  */
4251 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* chroot(2) is restricted to the superuser. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* On success we hold an iocount on nd.ni_vp (a searchable directory). */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Convert the iocount into a long-term usecount for fd_rdir. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Drop the usecount on the previous root, if there was one. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4309 
4310 #define PATHSTATICBUFLEN 256
4311 #define PIVOT_ROOT_ENTITLEMENT              \
4312        "com.apple.private.vfs.pivot-root"
4313 
4314 #if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Static buffers cover the common case; heap (ZV_NAMEI) buffers are
	 * allocated only when a path exceeds PATHSTATICBUFLEN. */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the new root's path; retry with a MAXPATHLEN heap buffer
	 * if the static one is too small. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same two-step copyin for the path where the old root is re-mounted. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Point incoming/outgoing at whichever buffer holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	/* Common cleanup: drop the lookup iocount and free heap path buffers. */
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4406 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root is only implemented on macOS (XNU_TARGET_OS_OSX);
	 * elsewhere it behaves as an unimplemented system call. */
	return nosys(p, NULL, retval);
}
4412 #endif /* XNU_TARGET_OS_OSX */
4413 
4414 /*
4415  * Common routine for chroot and chdir.
4416  *
4417  * Returns:	0			Success
4418  *		ENOTDIR			Not a directory
4419  *		namei:???		[anything namei can return]
4420  *		vnode_authorize:???	[anything vnode_authorize can return]
4421  */
4422 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4423 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4424 {
4425 	vnode_t vp;
4426 	int error;
4427 
4428 	if ((error = namei(ndp))) {
4429 		return error;
4430 	}
4431 	nameidone(ndp);
4432 	vp = ndp->ni_vp;
4433 
4434 	if (vp->v_type != VDIR) {
4435 		vnode_put(vp);
4436 		return ENOTDIR;
4437 	}
4438 
4439 #if CONFIG_MACF
4440 	error = mac_vnode_check_chdir(ctx, vp);
4441 	if (error) {
4442 		vnode_put(vp);
4443 		return error;
4444 	}
4445 #endif
4446 
4447 	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4448 	if (error) {
4449 		vnode_put(vp);
4450 		return error;
4451 	}
4452 
4453 	return error;
4454 }
4455 
/*
 * Allocate the per-fd vnode data (for directories) associated with the file glob.
 */
4459 struct fd_vn_data *
fg_vn_data_alloc(void)4460 fg_vn_data_alloc(void)
4461 {
4462 	struct fd_vn_data *fvdata;
4463 
4464 	/* Allocate per fd vnode data */
4465 	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4466 	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4467 	return fvdata;
4468 }
4469 
4470 /*
4471  * Free the vnode data (for directories) associated with the file glob.
4472  */
4473 void
fg_vn_data_free(void * fgvndata)4474 fg_vn_data_free(void *fgvndata)
4475 {
4476 	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4477 
4478 	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4479 	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4480 	kfree_type(struct fd_vn_data, fvdata);
4481 }
4482 
4483 /*
4484  * Check permissions, allocate an open file structure,
4485  * and call the device open routine if any.
4486  *
4487  * Returns:	0			Success
4488  *		EINVAL
4489  *		EINTR
4490  *	falloc:ENFILE
4491  *	falloc:EMFILE
4492  *	falloc:ENOMEM
4493  *	vn_open_auth:???
4494  *	dupfdopen:???
4495  *	VNOP_ADVLOCK:???
4496  *	vnode_setsize:???
4497  *
4498  * XXX Need to implement uid, gid
4499  */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	/* Convert open(2) flags to kernel F* flags.  The encryption bits are
	 * cleared here so only vn_open_auth (which gets &flags) can set them. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a descriptor slot and fileproc before touching the namespace. */
	if ((error = falloc_withinit(p, &fp, &indx, ctx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* If an authorizing fd was supplied, resolve it to a vnode that
	 * vn_open_auth can validate the open against. */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/* ENODEV/ENXIO with uu_dupfd set signals that fdesc_open ran:
		 * complete the open by duplicating that descriptor instead. */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the opened vnode into the fileglob. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* O_EXLOCK/O_SHLOCK: take a flock-style advisory lock at open time. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		/* Remember we hold the lock so 'bad:' (and close) can undo it. */
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

	/* Drop the iocount from vn_open_auth.  NOTE(review): vp is still used
	 * below; this relies on the fileglob keeping the vnode referenced —
	 * confirm vn_open_auth leaves a usecount behind. */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

#if CONFIG_SECLUDED_MEMORY
	/* Policy: decide whether this file's pages may live in the secluded
	 * pool, based on writability and the file's path/name. */
	if (secluded_for_filecache &&
	    FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE &&
	    vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer  eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == 1) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == 2) {
			size_t len = strlen(vp->v_name);
			if (!strncmp(vp->v_name, "dyld", len) ||
			    !strncmp(vp->v_name, "launchd", len) ||
			    !strncmp(vp->v_name, "Camera", len) ||
			    !strncmp(vp->v_name, "SpringBoard", len) ||
			    !strncmp(vp->v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(vp->v_name, "mediaserverd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(vp->v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Error teardown: undo the advisory lock (if taken), close the vnode
	 * with the fileglob's credential, drop the iocount, free the slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4798 
4799 /*
4800  * While most of the *at syscall handlers can call nameiat() which
4801  * is a wrapper around namei, the use of namei and initialisation
4802  * of nameidata are far removed and in different functions  - namei
4803  * gets called in vn_open_auth for open1. So we'll just do here what
4804  * nameiat() does.
4805  */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
    int dirfd, int authfd)
{
	/*
	 * Emulate nameiat(): when dirfd is not AT_FDCWD and the path is
	 * relative, seed the lookup with the directory vnode backing dirfd
	 * (via ndp->ni_dvp and the USEDVP flag).
	 */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			/* dirfd must refer to a directory. */
			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
			    retval, authfd);
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, AT_FDCWD, or the caller already supplied a dvp. */
	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
}
4849 
4850 /*
4851  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4852  *
4853  * Parameters:	p			Process requesting the open
4854  *		uap			User argument descriptor (see below)
4855  *		retval			Pointer to an area to receive the
 *					return value from the system call
4857  *
4858  * Indirect:	uap->path		Path to open (same as 'open')
 *		uap->flags		Flags to open (same as 'open')
4860  *		uap->uid		UID to set, if creating
4861  *		uap->gid		GID to set, if creating
4862  *		uap->mode		File mode, if creating (same as 'open')
4863  *		uap->xsecurity		ACL to set, if creating
4864  *
4865  * Returns:	0			Success
4866  *		!0			errno value
4867  *
4868  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
4869  *
 * XXX:		We should enumerate the possible errno values here, and where
4871  *		in the code they originated.
4872  */
4873 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)4874 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4875 {
4876 	int ciferror;
4877 	kauth_filesec_t xsecdst;
4878 	struct vnode_attr va;
4879 	struct nameidata nd;
4880 	int cmode;
4881 
4882 	AUDIT_ARG(owner, uap->uid, uap->gid);
4883 
4884 	xsecdst = NULL;
4885 	if ((uap->xsecurity != USER_ADDR_NULL) &&
4886 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
4887 		return ciferror;
4888 	}
4889 
4890 	VATTR_INIT(&va);
4891 	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4892 	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4893 	if (uap->uid != KAUTH_UID_NONE) {
4894 		VATTR_SET(&va, va_uid, uap->uid);
4895 	}
4896 	if (uap->gid != KAUTH_GID_NONE) {
4897 		VATTR_SET(&va, va_gid, uap->gid);
4898 	}
4899 	if (xsecdst != NULL) {
4900 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4901 		va.va_vaflags |= VA_FILESEC_ACL;
4902 	}
4903 
4904 	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4905 	    uap->path, vfs_context_current());
4906 
4907 	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
4908 	    NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
4909 	if (xsecdst != NULL) {
4910 		kauth_filesec_free(xsecdst);
4911 	}
4912 
4913 	return ciferror;
4914 }
4915 
4916 /*
4917  * Go through the data-protected atomically controlled open (2)
4918  *
4919  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4920  */
static int
openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
    int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
{
	/*
	 * Follow the same path as normal open(2)
	 * Look up the item if it exists, and acquire the vnode.
	 */
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;
	int error;
	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;

	VATTR_INIT(&va);
	/* Mask off all but regular access permissions */
	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
	    path, ctx);

	/*
	 * Initialize the extra fields in vnode_attr to pass down our
	 * extra fields.
	 * 1. target cprotect class.
	 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
	 */
	if (flags & O_CREAT) {
		/* lower level kernel code validates that the class is valid before applying it. */
		if (class != PROTECTION_CLASS_DEFAULT) {
			/*
			 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
			 * file behave the same as open (2)
			 */
			VATTR_SET(&va, va_dataprotect_class, class);
		}
	}

	if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
		if (flags & (O_RDWR | O_WRONLY)) {
			/*
			 * Not allowed to write raw encrypted bytes or when opening authenticated.
			 */
			return EINVAL;
		}
		/*
		 * NOTE(review): VATTR_SET assigns rather than ORs, so if more
		 * than one of these dpflags is set only the last VA_DP_* value
		 * survives — confirm that is the intended behavior.
		 */
		if (dpflags & O_DP_GETRAWENCRYPTED) {
			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
		}
		if (dpflags & O_DP_GETRAWUNENCRYPTED) {
			VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
		}
		if (dpflags & O_DP_AUTHENTICATE) {
			VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
		}
	}

	error = open1at(vfs_context_current(), &nd, flags, &va,
	    NULL, NULL, retval, fd, authfd);

	return error;
}
4983 
4984 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)4985 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
4986 {
4987 	if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
4988 		return EINVAL;
4989 	}
4990 
4991 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
4992 	           uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
4993 }
4994 
4995 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)4996 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
4997 {
4998 	if (uap->dpflags & O_DP_AUTHENTICATE) {
4999 		return EINVAL;
5000 	}
5001 
5002 	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5003 	           uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5004 }
5005 
static int
openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
    int fd, enum uio_seg segflg, int *retval)
{
	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
	/*
	 * vnode_attr and nameidata are bundled into a single heap allocation
	 * rather than placed on the stack.  NOTE(review): presumably to keep
	 * the kernel stack frame small — confirm.
	 */
	struct {
		struct vnode_attr va;
		struct nameidata nd;
	} *__open_data;
	struct vnode_attr *vap;
	struct nameidata *ndp;
	int cmode;
	int error;

	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
	vap = &__open_data->va;
	ndp = &__open_data->nd;

	VATTR_INIT(vap);
	/* Mask off all but regular access permissions */
	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);

	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
	    segflg, path, ctx);

	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);

	kfree_type(typeof(*__open_data), __open_data);

	return error;
}
5038 
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	/* open(2) is a pthread cancellation point; test before doing work. */
	__pthread_testcancel(1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
5045 
int
open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
    int32_t *retval)
{
	/* Plain open(2): relative lookups start at the process cwd (AT_FDCWD). */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
}
5053 
int
openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
    int32_t *retval)
{
	/* openat(2): relative lookups start at the directory open on uap->fd. */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	           uap->mode, uap->fd, UIO_USERSPACE, retval);
}
5061 
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	/* openat(2) is a pthread cancellation point; test before doing work. */
	__pthread_testcancel(1);
	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
}
5068 
5069 #define OPEN_BY_ID_ENTITLEMENT  "com.apple.private.vfs.open-by-id"
5070 
5071 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5072 vfs_context_can_open_by_id(vfs_context_t ctx)
5073 {
5074 	if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5075 		return TRUE;
5076 	}
5077 
5078 	return IOTaskHasEntitlement(vfs_context_task(ctx),
5079 	           OPEN_BY_ID_ENTITLEMENT);
5080 }
5081 
5082 /*
5083  * openbyid_np: open a file given a file system id and a file system object id
5084  *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
5085  *	file systems that don't support object ids it is a node id (uint64_t).
5086  *
5087  * Parameters:	p			Process requesting the open
5088  *		uap			User argument descriptor (see below)
5089  *		retval			Pointer to an area to receive the
 *					return value from the system call
5091  *
5092  * Indirect:	uap->path		Path to open (same as 'open')
5093  *
5094  *		uap->fsid		id of target file system
5095  *		uap->objid		id of target file system object
5096  *		uap->flags		Flags to open (same as 'open')
5097  *
5098  * Returns:	0			Success
5099  *		!0			errno value
5100  *
5101  *
 * XXX:		We should enumerate the possible errno values here, and where
5103  *		in the code they originated.
5104  */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Restricted to platform binaries and suitably entitled tasks. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/* Resolve a path from (fsid, objid), growing the buffer by
	 * MAXPATHLEN each time fsgetpath reports it was too small. */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	buf[pathlen] = 0;

	/* Open the resolved path; it lives in kernel memory (UIO_SYSSPACE). */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5161 
5162 
5163 /*
5164  * Create a special file.
5165  */
5166 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5167     int fd);
5168 
/*
 * mknodat_internal: common worker for mknod()/mknodat().
 *
 * Creates a device special file at 'upath' relative to directory fd 'fd'
 * (or AT_FDCWD).  FIFOs are diverted to mkfifo1(); only VCHR/VBLK are
 * created here, and doing so requires superuser privileges.
 *
 * Returns:	0 on success, errno otherwise (EEXIST, EINVAL, EPERM, ...).
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes is restricted to the superuser. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The lookup found an existing object: never clobber it. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character/block devices are valid here (FIFOs handled above). */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	/* Authorize adding an entry to the parent directory. */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int     update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5271 
5272 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5273 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5274 {
5275 	struct vnode_attr va;
5276 
5277 	VATTR_INIT(&va);
5278 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5279 	VATTR_SET(&va, va_rdev, uap->dev);
5280 
5281 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5282 }
5283 
5284 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5285 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5286 {
5287 	struct vnode_attr va;
5288 
5289 	VATTR_INIT(&va);
5290 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5291 	VATTR_SET(&va, va_rdev, uap->dev);
5292 
5293 	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5294 }
5295 
5296 /*
5297  * Create a named pipe.
5298  *
5299  * Returns:	0			Success
5300  *		EEXIST
5301  *	namei:???
5302  *	vnode_authorize:???
5303  *	vn_create:???
5304  */
/*
 * mkfifo1: common worker for mkfifo()/mkfifoat()/mkfifo_extended() and for
 * mknod() with S_IFIFO.  Creates a named pipe at 'upath' relative to 'fd',
 * using the attributes the caller prepared in 'vap' (va_type is forced to
 * VFIFO here).
 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5347 
5348 
5349 /*
5350  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5351  *
5352  * Parameters:	p			Process requesting the open
5353  *		uap			User argument descriptor (see below)
5354  *		retval			(Ignored)
5355  *
5356  * Indirect:	uap->path		Path to fifo (same as 'mkfifo')
5357  *		uap->uid		UID to set
5358  *		uap->gid		GID to set
5359  *		uap->mode		File mode to set (same as 'mkfifo')
5360  *		uap->xsecurity		ACL to set, if creating
5361  *
5362  * Returns:	0			Success
5363  *		!0			errno value
5364  *
5365  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
5366  *
 * XXX:		We should enumerate the possible errno values here, and where
5368  *		in the code they originated.
5369  */
int
mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied ACL, if any; the in-kernel copy is
	 * in host byte order and must be freed before returning. */
	xsecdst = KAUTH_FILESEC_NONE;
	if (uap->xsecurity != USER_ADDR_NULL) {
		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return ciferror;
		}
	}

	/* Build creation attributes: mode masked by the process umask, plus
	 * any explicitly requested owner, group and ACL. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != KAUTH_FILESEC_NONE) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);

	if (xsecdst != KAUTH_FILESEC_NONE) {
		kauth_filesec_free(xsecdst);
	}
	return ciferror;
}
5406 
5407 /* ARGSUSED */
5408 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5409 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5410 {
5411 	struct vnode_attr va;
5412 
5413 	VATTR_INIT(&va);
5414 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5415 
5416 	return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5417 }
5418 
5419 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5420 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5421 {
5422 	struct vnode_attr va;
5423 
5424 	VATTR_INIT(&va);
5425 	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5426 
5427 	return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5428 }
5429 
5430 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5431 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5432 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5433 
/*
 * safe_getpath_new: best-effort path construction (used by fsevents/audit).
 *
 * Builds the path of 'dvp' (optionally with 'leafname' appended) into
 * 'path', a buffer of '_len' bytes.  On failure or truncation,
 * *truncated_path is set and the nearest obtainable ancestor path, the
 * mount point, or "/" is returned instead, so the caller always gets some
 * usable string.  'firmlink' selects whether firmlinks are followed.
 *
 * Returns the length of the string in 'path' including the trailing NUL.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* overwrite the NUL with '/' and append the leaf;
			 * 'len' already counted the NUL, so it becomes the
			 * index one past the new '/' */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';   // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* the directory path fit but there is no room left to
		 * append a leaf name; report it as truncated */
		*truncated_path = 1;
	} else if (ret != 0) {
		/* Could not get a path for dvp at all: walk up v_parent
		 * links until some ancestor yields a path, else fall back
		 * to the mount point or "/". */
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				/* NOTE(review): 'len' is not updated here, so the
				 * returned length may not correspond to the
				 * mount-point string just copied — confirm callers
				 * tolerate this in the fallback case. */
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5501 
/*
 * safe_getpath: firmlink-following variant of safe_getpath_new().
 */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int follow_firmlinks = 1;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, follow_firmlinks);
}
5507 
/*
 * safe_getpath_no_firmlink: variant of safe_getpath_new() that resolves
 * the path without following firmlinks.
 */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int follow_firmlinks = 0;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, follow_firmlinks);
}
5513 
5514 /*
5515  * Make a hard file link.
5516  *
5517  * Returns:	0			Success
5518  *		EPERM
5519  *		EEXIST
5520  *		EXDEV
5521  *	namei:???
5522  *	vnode_authorize:???
5523  *	VNOP_LINK:???
5524  */
5525 /* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char  *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	/* done with the first lookup; 'nd' is reused below for the target */
	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Only pay the cost of building path strings if someone (fsevents,
	 * kauth fileop listeners, or audit) will actually consume them. */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int  len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on pvp in this case
			if (pvp && pvp != dvp) {
				error = vnode_get(pvp);
				if (error) {
					pvp = NULLVP;
					error = 0;
				}
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5743 
5744 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5745 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5746 {
5747 	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5748 	           AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5749 }
5750 
5751 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5752 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5753 {
5754 	if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5755 		return EINVAL;
5756 	}
5757 
5758 	return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5759 	           uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5760 }
5761 
5762 /*
5763  * Make a symbolic link.
5764  *
5765  * We could add support for ACLs here too...
5766  */
5767 /* ARGSUSED */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/* Copy the link contents in from user space when needed; for
	 * kernel-space callers 'path_data' is used in place (and must not
	 * be freed at 'out'). */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* New symlinks get VLNK type and umask-filtered permissions. */
	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int     update_flags = 0;

		/* check if a new vnode was created, else try to get one
		 * (some filesystems don't return the vnode from VNOP_SYMLINK) */
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int             len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* free the link-contents buffer only if copyinstr allocated one */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
5931 
5932 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)5933 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5934 {
5935 	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5936 	           uap->link, UIO_USERSPACE);
5937 }
5938 
5939 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)5940 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5941     __unused int32_t *retval)
5942 {
5943 	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5944 	           uap->path2, UIO_USERSPACE);
5945 }
5946 
5947 /*
5948  * Delete a whiteout from the filesystem.
5949  * No longer supported.
5950  */
5951 int
undelete(__unused proc_t p,__unused struct undelete_args * uap,__unused int32_t * retval)5952 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
5953 {
5954 	return ENOTSUP;
5955 }
5956 
5957 /*
5958  * Delete a name from the filesystem.
5959  */
5960 /* ARGSUSED */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocate the bulky per-call state (nameidata, and the
	 * vnode_attr/fse_info used for fsevents) rather than keeping it
	 * on the kernel stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char  *path = NULL;
	char  *no_firmlink_path = NULL;
	int  len_path = 0;
	int  len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* Reset per-attempt state; we come back here to redrive the whole
	 * lookup after an ENOENT race (see MAX_AUTHORIZE_ENOENT_RETRIES). */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		/* 'batched' filesystems do authorization+removal in one
		 * compound VNOP instead of separate steps. */
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				if (error == ENOENT) {
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	/* Build path strings only if fsevents or fileop listeners need them. */
	if (need_event || has_listeners) {
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			/* Compound-VNOP filesystem asked us to continue the
			 * lookup where it left off. */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name  cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid.  this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6242 
6243 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6244 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6245     enum uio_seg segflg, int unlink_flags)
6246 {
6247 	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6248 	           unlink_flags);
6249 }
6250 
6251 /*
6252  * Delete a name from the filesystem using Carbon semantics.
6253  */
6254 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6255 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6256 {
6257 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6258 	           uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6259 }
6260 
6261 /*
6262  * Delete a name from the filesystem using POSIX semantics.
6263  */
6264 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6265 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6266 {
6267 	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6268 	           uap->path, UIO_USERSPACE, 0);
6269 }
6270 
6271 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6272 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6273 {
6274 	if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6275 		return EINVAL;
6276 	}
6277 
6278 	if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6279 		int unlink_flags = 0;
6280 
6281 		if (uap->flag & AT_REMOVEDIR_DATALESS) {
6282 			unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6283 		}
6284 		return rmdirat_internal(vfs_context_current(), uap->fd,
6285 		           uap->path, UIO_USERSPACE, unlink_flags);
6286 	} else {
6287 		return unlinkat_internal(vfs_context_current(), uap->fd,
6288 		           NULLVP, uap->path, UIO_USERSPACE, 0);
6289 	}
6290 }
6291 
/*
 * Reposition read/write file offset.
 *
 * lseek() system call: compute the new offset according to uap->whence,
 * validate it, and store it in the fileglob.  SEEK_HOLE/SEEK_DATA are
 * delegated to the filesystem via VNOP_IOCTL().
 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/* ENOTSUP here presumably means a non-vnode fd (e.g. pipe/socket);
		 * lseek reports that as ESPIPE — TODO confirm against fp_getfvp() */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	/* seeking a fifo is meaningless */
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/*
	 * lseek(fd, 0, SEEK_CUR) only reads the current offset, so it is
	 * MAC-checked as a "get offset" rather than a "change offset".
	 */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	switch (uap->whence) {
	case L_INCR:
		/* relative to the current fileglob offset */
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		/* relative to end of file */
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		/* filesystem resolves the next hole/data region in-place via 'offset' */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001.  Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6386 
6387 
6388 /*
6389  * Check access permissions.
6390  *
6391  * Returns:	0			Success
6392  *		vnode_authorize:???
6393  */
6394 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6395 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6396 {
6397 	kauth_action_t action;
6398 	int error;
6399 
6400 	/*
6401 	 * If just the regular access bits, convert them to something
6402 	 * that vnode_authorize will understand.
6403 	 */
6404 	if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6405 		action = 0;
6406 		if (uflags & R_OK) {
6407 			action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
6408 		}
6409 		if (uflags & W_OK) {
6410 			if (vnode_isdir(vp)) {
6411 				action |= KAUTH_VNODE_ADD_FILE |
6412 				    KAUTH_VNODE_ADD_SUBDIRECTORY;
6413 				/* might want delete rights here too */
6414 			} else {
6415 				action |= KAUTH_VNODE_WRITE_DATA;
6416 			}
6417 		}
6418 		if (uflags & X_OK) {
6419 			if (vnode_isdir(vp)) {
6420 				action |= KAUTH_VNODE_SEARCH;
6421 			} else {
6422 				action |= KAUTH_VNODE_EXECUTE;
6423 			}
6424 		}
6425 	} else {
6426 		/* take advantage of definition of uflags */
6427 		action = uflags >> 8;
6428 	}
6429 
6430 #if CONFIG_MACF
6431 	error = mac_vnode_check_access(ctx, vp, uflags);
6432 	if (error) {
6433 		return error;
6434 	}
6435 #endif /* MAC */
6436 
6437 	/* action == 0 means only check for existence */
6438 	if (action != 0) {
6439 		error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6440 	} else {
6441 		error = 0;
6442 	}
6443 
6444 	return error;
6445 }
6446 
6447 
6448 
6449 /*
6450  * access_extended: Check access permissions in bulk.
6451  *
6452  * Description:	uap->entries		Pointer to an array of accessx
6453  *                                      descriptor structs, plus one or
6454  *                                      more NULL terminated strings (see
6455  *                                      "Notes" section below).
6456  *		uap->size		Size of the area pointed to by
6457  *					uap->entries.
6458  *		uap->results		Pointer to the results array.
6459  *
6460  * Returns:	0			Success
6461  *		ENOMEM			Insufficient memory
6462  *		EINVAL			Invalid arguments
6463  *		namei:EFAULT		Bad address
6464  *		namei:ENAMETOOLONG	Filename too long
6465  *		namei:ENOENT		No such file or directory
6466  *		namei:ELOOP		Too many levels of symbolic links
6467  *		namei:EBADF		Bad file descriptor
6468  *		namei:ENOTDIR		Not a directory
6469  *		namei:???
6470  *		access1:
6471  *
6472  * Implicit returns:
6473  *		uap->results		Array contents modified
6474  *
6475  * Notes:	The uap->entries are structured as an arbitrary length array
6476  *		of accessx descriptors, followed by one or more NULL terminated
6477  *		strings
6478  *
6479  *			struct accessx_descriptor[0]
6480  *			...
6481  *			struct accessx_descriptor[n]
6482  *			char name_data[0];
6483  *
6484  *		We determine the entry count by walking the buffer containing
6485  *		the uap->entries argument descriptor.  For each descriptor we
6486  *		see, the valid values for the offset ad_name_offset will be
6487  *		in the byte range:
6488  *
6489  *			[ uap->entries + sizeof(struct accessx_descriptor) ]
6490  *						to
6491  *				[ uap->entries + uap->size - 2 ]
6492  *
6493  *		since we must have at least one string, and the string must
6494  *		be at least one character plus the NULL terminator in length.
6495  *
6496  * XXX:		Need to support the check-as uid argument
6497  */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* initialized so the cleanup path can tell whether a cred was taken */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o	There must be sufficient room in the request for at least one
	 *	descriptor and a one byte NUL terminated string.
	 * o	The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* small requests are served out of the on-stack buffer */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested exceeds this limit, the entire request is refused.
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  Per-entry failures are recorded in
		 * the result array and processing continues; anything else
		 * fails the whole request.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6739 
6740 
/*
 * Common implementation of access(2) and faccessat(2).
 *
 * Returns:	0			Success
 *		namei:EFAULT		Bad address
 *		namei:ENAMETOOLONG	Filename too long
 *		namei:ENOENT		No such file or directory
 *		namei:ELOOP		Too many levels of symbolic links
 *		namei:EBADF		Bad file descriptor
 *		namei:ENOTDIR		Not a directory
 *		namei:???
 *		access1:
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	/* AT_SYMLINK_NOFOLLOW{,_ANY} suppress following a trailing symlink */
	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	/* ni_dvp is only held when WANTPARENT was requested above */
	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* drop the real-identity cred taken when AT_EACCESS was not set */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6833 
6834 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6835 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6836 {
6837 	return faccessat_internal(vfs_context_current(), AT_FDCWD,
6838 	           uap->path, uap->flags, 0, UIO_USERSPACE);
6839 }
6840 
6841 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6842 faccessat(__unused proc_t p, struct faccessat_args *uap,
6843     __unused int32_t *retval)
6844 {
6845 	if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6846 		return EINVAL;
6847 	}
6848 
6849 	return faccessat_internal(vfs_context_current(), uap->fd,
6850 	           uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6851 }
6852 
6853 /*
6854  * Returns:	0			Success
6855  *		EFAULT
6856  *	copyout:EFAULT
6857  *	namei:???
6858  *	vn_stat:???
6859  */
6860 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6861 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6862     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6863     enum uio_seg segflg, int fd, int flag)
6864 {
6865 	struct nameidata nd;
6866 	int follow;
6867 	union {
6868 		struct stat sb;
6869 		struct stat64 sb64;
6870 	} source = {};
6871 	union {
6872 		struct user64_stat user64_sb;
6873 		struct user32_stat user32_sb;
6874 		struct user64_stat64 user64_sb64;
6875 		struct user32_stat64 user32_sb64;
6876 	} dest = {};
6877 	caddr_t sbp;
6878 	int error, my_size;
6879 	kauth_filesec_t fsec;
6880 	size_t xsecurity_bufsize;
6881 	void * statptr;
6882 	struct fileproc *fp = NULL;
6883 	int needsrealdev = 0;
6884 
6885 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6886 	NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6887 	    segflg, path, ctx);
6888 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6889 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
6890 	}
6891 
6892 #if NAMEDRSRCFORK
6893 	int is_namedstream = 0;
6894 	/* stat calls are allowed for resource forks. */
6895 	nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6896 #endif
6897 
6898 	if (flag & AT_FDONLY) {
6899 		vnode_t fvp;
6900 
6901 		error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6902 		if (error) {
6903 			return error;
6904 		}
6905 		if ((error = vnode_getwithref(fvp))) {
6906 			file_drop(fd);
6907 			return error;
6908 		}
6909 		nd.ni_vp = fvp;
6910 	} else {
6911 		error = nameiat(&nd, fd);
6912 		if (error) {
6913 			return error;
6914 		}
6915 	}
6916 	fsec = KAUTH_FILESEC_NONE;
6917 
6918 	statptr = (void *)&source;
6919 
6920 #if NAMEDRSRCFORK
6921 	/* Grab reference on the shadow stream file vnode to
6922 	 * force an inactive on release which will mark it
6923 	 * for recycle.
6924 	 */
6925 	if (vnode_isnamedstream(nd.ni_vp) &&
6926 	    (nd.ni_vp->v_parent != NULLVP) &&
6927 	    vnode_isshadow(nd.ni_vp)) {
6928 		is_namedstream = 1;
6929 		vnode_ref(nd.ni_vp);
6930 	}
6931 #endif
6932 
6933 	needsrealdev = flag & AT_REALDEV ? 1 : 0;
6934 	if (fp && (xsecurity == USER_ADDR_NULL)) {
6935 		/*
6936 		 * If the caller has the file open, and is not
6937 		 * requesting extended security information, we are
6938 		 * going to let them get the basic stat information.
6939 		 */
6940 		error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6941 		    fp->fp_glob->fg_cred);
6942 	} else {
6943 		error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6944 		    isstat64, needsrealdev, ctx);
6945 	}
6946 
6947 #if NAMEDRSRCFORK
6948 	if (is_namedstream) {
6949 		vnode_rele(nd.ni_vp);
6950 	}
6951 #endif
6952 	vnode_put(nd.ni_vp);
6953 	nameidone(&nd);
6954 	if (fp) {
6955 		file_drop(fd);
6956 		fp = NULL;
6957 	}
6958 
6959 	if (error) {
6960 		return error;
6961 	}
6962 	/* Zap spare fields */
6963 	if (isstat64 != 0) {
6964 		source.sb64.st_lspare = 0;
6965 		source.sb64.st_qspare[0] = 0LL;
6966 		source.sb64.st_qspare[1] = 0LL;
6967 		if (vfs_context_is64bit(ctx)) {
6968 			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6969 			my_size = sizeof(dest.user64_sb64);
6970 			sbp = (caddr_t)&dest.user64_sb64;
6971 		} else {
6972 			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6973 			my_size = sizeof(dest.user32_sb64);
6974 			sbp = (caddr_t)&dest.user32_sb64;
6975 		}
6976 		/*
6977 		 * Check if we raced (post lookup) against the last unlink of a file.
6978 		 */
6979 		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
6980 			source.sb64.st_nlink = 1;
6981 		}
6982 	} else {
6983 		source.sb.st_lspare = 0;
6984 		source.sb.st_qspare[0] = 0LL;
6985 		source.sb.st_qspare[1] = 0LL;
6986 		if (vfs_context_is64bit(ctx)) {
6987 			munge_user64_stat(&source.sb, &dest.user64_sb);
6988 			my_size = sizeof(dest.user64_sb);
6989 			sbp = (caddr_t)&dest.user64_sb;
6990 		} else {
6991 			munge_user32_stat(&source.sb, &dest.user32_sb);
6992 			my_size = sizeof(dest.user32_sb);
6993 			sbp = (caddr_t)&dest.user32_sb;
6994 		}
6995 
6996 		/*
6997 		 * Check if we raced (post lookup) against the last unlink of a file.
6998 		 */
6999 		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7000 			source.sb.st_nlink = 1;
7001 		}
7002 	}
7003 	if ((error = copyout(sbp, ub, my_size)) != 0) {
7004 		goto out;
7005 	}
7006 
7007 	/* caller wants extended security information? */
7008 	if (xsecurity != USER_ADDR_NULL) {
7009 		/* did we get any? */
7010 		if (fsec == KAUTH_FILESEC_NONE) {
7011 			if (susize(xsecurity_size, 0) != 0) {
7012 				error = EFAULT;
7013 				goto out;
7014 			}
7015 		} else {
7016 			/* find the user buffer size */
7017 			xsecurity_bufsize = fusize(xsecurity_size);
7018 
7019 			/* copy out the actual data size */
7020 			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7021 				error = EFAULT;
7022 				goto out;
7023 			}
7024 
7025 			/* if the caller supplied enough room, copy out to it */
7026 			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7027 				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7028 			}
7029 		}
7030 	}
7031 out:
7032 	if (fsec != KAUTH_FILESEC_NONE) {
7033 		kauth_filesec_free(fsec);
7034 	}
7035 	return error;
7036 }
7037 
7038 /*
7039  * stat_extended: Get file status; with extended security (ACL).
7040  *
7041  * Parameters:    p                       (ignored)
7042  *                uap                     User argument descriptor (see below)
7043  *                retval                  (ignored)
7044  *
7045  * Indirect:      uap->path               Path of file to get status from
7046  *                uap->ub                 User buffer (holds file status info)
7047  *                uap->xsecurity          ACL to get (extended security)
7048  *                uap->xsecurity_size     Size of ACL
7049  *
7050  * Returns:        0                      Success
7051  *                !0                      errno value
7052  *
7053  */
7054 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7055 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7056     __unused int32_t *retval)
7057 {
7058 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7059 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7060 	           0);
7061 }
7062 
7063 /*
7064  * Returns:	0			Success
7065  *	fstatat_internal:???		[see fstatat_internal() in this file]
7066  */
7067 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7068 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7069 {
7070 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7071 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7072 }
7073 
7074 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7075 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7076 {
7077 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7078 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7079 }
7080 
7081 /*
7082  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7083  *
7084  * Parameters:    p                       (ignored)
7085  *                uap                     User argument descriptor (see below)
7086  *                retval                  (ignored)
7087  *
7088  * Indirect:      uap->path               Path of file to get status from
7089  *                uap->ub                 User buffer (holds file status info)
7090  *                uap->xsecurity          ACL to get (extended security)
7091  *                uap->xsecurity_size     Size of ACL
7092  *
7093  * Returns:        0                      Success
7094  *                !0                      errno value
7095  *
7096  */
7097 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7098 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7099 {
7100 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7101 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7102 	           0);
7103 }
7104 
7105 /*
7106  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7107  *
7108  * Parameters:    p                       (ignored)
7109  *                uap                     User argument descriptor (see below)
7110  *                retval                  (ignored)
7111  *
7112  * Indirect:      uap->path               Path of file to get status from
7113  *                uap->ub                 User buffer (holds file status info)
7114  *                uap->xsecurity          ACL to get (extended security)
7115  *                uap->xsecurity_size     Size of ACL
7116  *
7117  * Returns:        0                      Success
7118  *                !0                      errno value
7119  *
7120  */
7121 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7122 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7123 {
7124 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7125 	           uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7126 	           AT_SYMLINK_NOFOLLOW);
7127 }
7128 
7129 /*
7130  * Get file status; this version does not follow links.
7131  */
7132 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7133 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7134 {
7135 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7136 	           0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7137 }
7138 
7139 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7140 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7141 {
7142 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7143 	           0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7144 }
7145 
7146 /*
7147  * lstat64_extended: Get file status; can handle large inode numbers; does not
7148  * follow links; with extended security (ACL).
7149  *
7150  * Parameters:    p                       (ignored)
7151  *                uap                     User argument descriptor (see below)
7152  *                retval                  (ignored)
7153  *
7154  * Indirect:      uap->path               Path of file to get status from
7155  *                uap->ub                 User buffer (holds file status info)
7156  *                uap->xsecurity          ACL to get (extended security)
7157  *                uap->xsecurity_size     Size of ACL
7158  *
7159  * Returns:        0                      Success
7160  *                !0                      errno value
7161  *
7162  */
7163 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7164 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7165 {
7166 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7167 	           uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7168 	           AT_SYMLINK_NOFOLLOW);
7169 }
7170 
7171 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7172 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7173 {
7174 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7175 		return EINVAL;
7176 	}
7177 
7178 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7179 	           0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7180 }
7181 
7182 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7183 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7184     __unused int32_t *retval)
7185 {
7186 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7187 		return EINVAL;
7188 	}
7189 
7190 	return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7191 	           0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7192 }
7193 
/*
 * Get configurable pathname variables.
 *
 * Returns:	0			Success
 *	namei:???
 *	vn_pathconf:???
 *
 * Notes:	Global implementation constants are intended to be
 *		implemented in this function directly; all other constants
 *		are per-FS implementation, and therefore must be handled in
 *		each respective FS, instead.
 *
 * XXX We implement some things globally right now that should actually be
 * XXX per-FS; we will need to deal with this at some point.
 */
/* ARGSUSED */
int
pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
{
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/* resolve the path (following symlinks); on success ni_vp holds a
	 * reference that is released with vnode_put() below */
	NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);

	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7230 
7231 /*
7232  * Return target name of a symbolic link.
7233  */
7234 /* ARGSUSED */
7235 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7236 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7237     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7238     int *retval)
7239 {
7240 	vnode_t vp;
7241 	uio_t auio;
7242 	int error;
7243 	struct nameidata nd;
7244 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
7245 	bool put_vnode;
7246 
7247 	if (bufsize > INT32_MAX) {
7248 		return EINVAL;
7249 	}
7250 
7251 	if (lnk_vp) {
7252 		vp = lnk_vp;
7253 		put_vnode = false;
7254 	} else {
7255 		NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7256 		    seg, path, ctx);
7257 
7258 		error = nameiat(&nd, fd);
7259 		if (error) {
7260 			return error;
7261 		}
7262 		vp = nd.ni_vp;
7263 		put_vnode = true;
7264 		nameidone(&nd);
7265 	}
7266 
7267 	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7268 	    &uio_buf[0], sizeof(uio_buf));
7269 	uio_addiov(auio, buf, bufsize);
7270 	if (vp->v_type != VLNK) {
7271 		error = EINVAL;
7272 	} else {
7273 #if CONFIG_MACF
7274 		error = mac_vnode_check_readlink(ctx, vp);
7275 #endif
7276 		if (error == 0) {
7277 			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7278 			    ctx);
7279 		}
7280 		if (error == 0) {
7281 			error = VNOP_READLINK(vp, auio, ctx);
7282 		}
7283 	}
7284 
7285 	if (put_vnode) {
7286 		vnode_put(vp);
7287 	}
7288 
7289 	*retval = (int)(bufsize - uio_resid(auio));
7290 	return error;
7291 }
7292 
7293 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7294 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7295 {
7296 	enum uio_seg procseg;
7297 	vnode_t vp;
7298 	int error;
7299 
7300 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7301 
7302 	AUDIT_ARG(fd, uap->fd);
7303 
7304 	if ((error = file_vnode(uap->fd, &vp))) {
7305 		return error;
7306 	}
7307 	if ((error = vnode_getwithref(vp))) {
7308 		file_drop(uap->fd);
7309 		return error;
7310 	}
7311 
7312 	error = readlinkat_internal(vfs_context_current(), -1,
7313 	    vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7314 	    uap->bufsize, procseg, retval);
7315 
7316 	vnode_put(vp);
7317 	file_drop(uap->fd);
7318 	return error;
7319 }
7320 
7321 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7322 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7323 {
7324 	enum uio_seg procseg;
7325 
7326 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7327 	return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7328 	           CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7329 	           uap->count, procseg, retval);
7330 }
7331 
7332 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7333 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7334 {
7335 	enum uio_seg procseg;
7336 
7337 	procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7338 	return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7339 	           CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7340 	           retval);
7341 }
7342 
7343 /*
7344  * Change file flags, the deep inner layer.
7345  */
7346 static int
chflags0(vnode_t vp,struct vnode_attr * va,int (* setattr)(vnode_t,void *,vfs_context_t),void * arg,vfs_context_t ctx)7347 chflags0(vnode_t vp, struct vnode_attr *va,
7348     int (*setattr)(vnode_t, void *, vfs_context_t),
7349     void *arg, vfs_context_t ctx)
7350 {
7351 	kauth_action_t action = 0;
7352 	int error;
7353 
7354 #if CONFIG_MACF
7355 	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
7356 	if (error) {
7357 		goto out;
7358 	}
7359 #endif
7360 
7361 	/* request authorisation, disregard immutability */
7362 	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
7363 		goto out;
7364 	}
7365 	/*
7366 	 * Request that the auth layer disregard those file flags it's allowed to when
7367 	 * authorizing this operation; we need to do this in order to be able to
7368 	 * clear immutable flags.
7369 	 */
7370 	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
7371 		goto out;
7372 	}
7373 	error = (*setattr)(vp, arg, ctx);
7374 
7375 #if CONFIG_MACF
7376 	if (error == 0) {
7377 		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
7378 	}
7379 #endif
7380 
7381 out:
7382 	return error;
7383 }
7384 
7385 /*
7386  * Change file flags.
7387  *
7388  * NOTE: this will vnode_put() `vp'
7389  */
7390 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)7391 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
7392 {
7393 	struct vnode_attr va;
7394 	int error;
7395 
7396 	VATTR_INIT(&va);
7397 	VATTR_SET(&va, va_flags, flags);
7398 
7399 	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
7400 	vnode_put(vp);
7401 
7402 	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
7403 		error = ENOTSUP;
7404 	}
7405 
7406 	return error;
7407 }
7408 
7409 /*
7410  * Change flags of a file given a path name.
7411  */
7412 /* ARGSUSED */
7413 int
chflags(__unused proc_t p,struct chflags_args * uap,__unused int32_t * retval)7414 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
7415 {
7416 	vnode_t vp;
7417 	vfs_context_t ctx = vfs_context_current();
7418 	int error;
7419 	struct nameidata nd;
7420 	uint32_t wantparent = 0;
7421 
7422 #if CONFIG_FILE_LEASES
7423 	wantparent = WANTPARENT;
7424 #endif
7425 
7426 	AUDIT_ARG(fflags, uap->flags);
7427 	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
7428 	    UIO_USERSPACE, uap->path, ctx);
7429 	error = namei(&nd);
7430 	if (error) {
7431 		return error;
7432 	}
7433 	vp = nd.ni_vp;
7434 
7435 #if CONFIG_FILE_LEASES
7436 	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
7437 	vnode_put(nd.ni_dvp);
7438 #endif
7439 
7440 	nameidone(&nd);
7441 
7442 	/* we don't vnode_put() here because chflags1 does internally */
7443 	error = chflags1(vp, uap->flags, ctx);
7444 
7445 	return error;
7446 }
7447 
7448 /*
7449  * Change flags of a file given a file descriptor.
7450  */
7451 /* ARGSUSED */
7452 int
fchflags(__unused proc_t p,struct fchflags_args * uap,__unused int32_t * retval)7453 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
7454 {
7455 	vnode_t vp;
7456 	int error;
7457 
7458 	AUDIT_ARG(fd, uap->fd);
7459 	AUDIT_ARG(fflags, uap->flags);
7460 	if ((error = file_vnode(uap->fd, &vp))) {
7461 		return error;
7462 	}
7463 
7464 	if ((error = vnode_getwithref(vp))) {
7465 		file_drop(uap->fd);
7466 		return error;
7467 	}
7468 
7469 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7470 
7471 #if CONFIG_FILE_LEASES
7472 	vnode_breakdirlease(vp, true, O_WRONLY);
7473 #endif
7474 
7475 	/* we don't vnode_put() here because chflags1 does internally */
7476 	error = chflags1(vp, uap->flags, vfs_context_current());
7477 
7478 	file_drop(uap->fd);
7479 	return error;
7480 }
7481 
7482 /*
7483  * Change security information on a filesystem object.
7484  *
7485  * Returns:	0			Success
7486  *		EPERM			Operation not permitted
7487  *		vnode_authattr:???	[anything vnode_authattr can return]
7488  *		vnode_authorize:???	[anything vnode_authorize can return]
7489  *		vnode_setattr:???	[anything vnode_setattr can return]
7490  *
7491  * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
7492  *		translated to EPERM before being returned.
7493  */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/*
	 * MAC preflight: mode, ownership and ACL changes are each checked
	 * independently, before any authorization or attribute change.
	 */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		/* -1 means "not being changed" to the MAC layer */
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* permission failures on attribute changes report EPERM, not EACCES */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* post-change MAC notifications, one per attribute class actually set */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7561 
7562 
7563 /*
7564  * Change mode of a file given a path name.
7565  *
7566  * Returns:	0			Success
7567  *		namei:???		[anything namei can return]
7568  *		chmod_vnode:???		[anything chmod_vnode can return]
7569  */
7570 static int
chmodat(vfs_context_t ctx,user_addr_t path,struct vnode_attr * vap,int fd,int flag,enum uio_seg segflg)7571 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
7572     int fd, int flag, enum uio_seg segflg)
7573 {
7574 	struct nameidata nd;
7575 	int follow, error;
7576 	uint32_t wantparent = 0;
7577 
7578 #if CONFIG_FILE_LEASES
7579 	wantparent = WANTPARENT;
7580 #endif
7581 
7582 	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7583 	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
7584 	    segflg, path, ctx);
7585 	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7586 		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
7587 	}
7588 	if ((error = nameiat(&nd, fd))) {
7589 		return error;
7590 	}
7591 
7592 #if CONFIG_FILE_LEASES
7593 	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
7594 	vnode_put(nd.ni_dvp);
7595 #endif
7596 
7597 	error = chmod_vnode(ctx, nd.ni_vp, vap);
7598 	vnode_put(nd.ni_vp);
7599 	nameidone(&nd);
7600 	return error;
7601 }
7602 
7603 static int
chmod_extended_init(struct vnode_attr * pva,kauth_filesec_t * pxsecdst,int mode,uid_t uid,gid_t gid,user_addr_t xsecurity)7604 chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
7605     gid_t gid, user_addr_t xsecurity)
7606 {
7607 	int error;
7608 
7609 	VATTR_INIT(pva);
7610 
7611 	if (mode != -1) {
7612 		VATTR_SET(pva, va_mode, mode & ALLPERMS);
7613 	} else {
7614 		pva->va_mode = 0;
7615 	}
7616 
7617 	if (uid != KAUTH_UID_NONE) {
7618 		VATTR_SET(pva, va_uid, uid);
7619 	}
7620 
7621 	if (gid != KAUTH_GID_NONE) {
7622 		VATTR_SET(pva, va_gid, gid);
7623 	}
7624 
7625 	*pxsecdst = NULL;
7626 	switch (xsecurity) {
7627 	case USER_ADDR_NULL:
7628 		break;
7629 
7630 	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
7631 		VATTR_SET(pva, va_acl, NULL);
7632 		break;
7633 
7634 	default:
7635 		if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
7636 			return error;
7637 		}
7638 
7639 		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
7640 		pva->va_vaflags |= VA_FILESEC_ACL;
7641 		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
7642 		break;
7643 	}
7644 
7645 	return 0;
7646 }
7647 
7648 /*
7649  * chmod_extended: Change the mode of a file given a path name; with extended
7650  * argument list (including extended security (ACL)).
7651  *
7652  * Parameters:	p			Process requesting the open
7653  *		uap			User argument descriptor (see below)
7654  *		retval			(ignored)
7655  *
7656  * Indirect:	uap->path		Path to object (same as 'chmod')
7657  *		uap->uid		UID to set
7658  *		uap->gid		GID to set
7659  *		uap->mode		File mode to set (same as 'chmod')
7660  *		uap->xsecurity		ACL to set (or delete)
7661  *
7662  * Returns:	0			Success
7663  *		!0			errno value
7664  *
7665  * Notes:	The kauth_filesec_t in 'va', if any, is in host byte order.
7666  *
 * XXX:		We should enumerate the possible errno values here, and where
7668  *		in the code they originated.
7669  */
7670 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7671 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7672 {
7673 	int error;
7674 	struct vnode_attr va;
7675 	kauth_filesec_t xsecdst = NULL;
7676 
7677 	AUDIT_ARG(owner, uap->uid, uap->gid);
7678 
7679 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7680 	    uap->gid, uap->xsecurity);
7681 
7682 	if (error) {
7683 		return error;
7684 	}
7685 
7686 	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7687 	    UIO_USERSPACE);
7688 
7689 	if (xsecdst != NULL) {
7690 		kauth_filesec_free(xsecdst);
7691 	}
7692 	return error;
7693 }
7694 
7695 /*
7696  * Returns:	0			Success
7697  *		chmodat:???		[anything chmodat can return]
7698  */
7699 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7700 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7701     int flag, enum uio_seg segflg)
7702 {
7703 	struct vnode_attr va;
7704 
7705 	VATTR_INIT(&va);
7706 	VATTR_SET(&va, va_mode, mode & ALLPERMS);
7707 
7708 	return chmodat(ctx, path, &va, fd, flag, segflg);
7709 }
7710 
7711 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7712 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7713 {
7714 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7715 	           AT_FDCWD, 0, UIO_USERSPACE);
7716 }
7717 
7718 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7719 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7720 {
7721 	if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7722 		return EINVAL;
7723 	}
7724 
7725 	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7726 	           uap->fd, uap->flag, UIO_USERSPACE);
7727 }
7728 
7729 /*
7730  * Change mode of a file given a file descriptor.
7731  */
7732 static int
fchmod1(__unused proc_t p,int fd,struct vnode_attr * vap)7733 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
7734 {
7735 	vnode_t vp;
7736 	int error;
7737 
7738 	AUDIT_ARG(fd, fd);
7739 
7740 	if ((error = file_vnode(fd, &vp)) != 0) {
7741 		return error;
7742 	}
7743 	if ((error = vnode_getwithref(vp)) != 0) {
7744 		file_drop(fd);
7745 		return error;
7746 	}
7747 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7748 
7749 #if CONFIG_FILE_LEASES
7750 	vnode_breakdirlease(vp, true, O_WRONLY);
7751 #endif
7752 
7753 	error = chmod_vnode(vfs_context_current(), vp, vap);
7754 	(void)vnode_put(vp);
7755 	file_drop(fd);
7756 
7757 	return error;
7758 }
7759 
7760 /*
7761  * fchmod_extended: Change mode of a file given a file descriptor; with
7762  * extended argument list (including extended security (ACL)).
7763  *
7764  * Parameters:    p                       Process requesting to change file mode
7765  *                uap                     User argument descriptor (see below)
7766  *                retval                  (ignored)
7767  *
7768  * Indirect:      uap->mode               File mode to set (same as 'chmod')
7769  *                uap->uid                UID to set
7770  *                uap->gid                GID to set
7771  *                uap->xsecurity          ACL to set (or delete)
7772  *                uap->fd                 File descriptor of file to change mode
7773  *
7774  * Returns:        0                      Success
7775  *                !0                      errno value
7776  *
7777  */
7778 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)7779 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
7780 {
7781 	int error;
7782 	struct vnode_attr va;
7783 	kauth_filesec_t xsecdst = NULL;
7784 
7785 	AUDIT_ARG(owner, uap->uid, uap->gid);
7786 
7787 	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7788 	    uap->gid, uap->xsecurity);
7789 
7790 	if (error) {
7791 		return error;
7792 	}
7793 
7794 	error = fchmod1(p, uap->fd, &va);
7795 
7796 	if (xsecdst != NULL) {
7797 		kauth_filesec_free(xsecdst);
7798 	}
7799 	return error;
7800 }
7801 
7802 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7803 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7804 {
7805 	struct vnode_attr va;
7806 
7807 	VATTR_INIT(&va);
7808 	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7809 
7810 	return fchmod1(p, uap->fd, &va);
7811 }
7812 
7813 
7814 /*
7815  * Set ownership given a path name.
7816  */
7817 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* also look up the parent so its directory lease can be broken */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(owner, uid, gid);

	/* AT_SYMLINK_NOFOLLOW{,_ANY} select lchown-style behaviour */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent, segflg,
	    path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	/* VNOVAL means "leave this id unchanged" */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* break the parent directory's lease only after authorization passed */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES) {
		error = EPERM;
	}

#if CONFIG_FILE_LEASES
	/* drop the parent reference obtained via WANTPARENT */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(vp);
	return error;
}
7899 
7900 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)7901 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7902 {
7903 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7904 	           uap->uid, uap->gid, 0, UIO_USERSPACE);
7905 }
7906 
7907 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)7908 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7909 {
7910 	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7911 	           uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7912 }
7913 
7914 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)7915 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7916 {
7917 	if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7918 		return EINVAL;
7919 	}
7920 
7921 	return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7922 	           uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7923 }
7924 
7925 /*
7926  * Set ownership given a file descriptor.
7927  */
7928 /* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* VNOVAL means "leave this id unchanged" */
	VATTR_INIT(&va);
	if (uap->uid != VNOVAL) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != VNOVAL) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* permission failures on attribute changes report EPERM, not EACCES */
		if (error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* break the parent directory's lease only after authorization passed */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
	}
#endif

out:
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8002 
8003 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)8004 getutimes(user_addr_t usrtvp, struct timespec *tsp)
8005 {
8006 	int error;
8007 
8008 	if (usrtvp == USER_ADDR_NULL) {
8009 		struct timeval old_tv;
8010 		/* XXX Y2038 bug because of microtime argument */
8011 		microtime(&old_tv);
8012 		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
8013 		tsp[1] = tsp[0];
8014 	} else {
8015 		if (IS_64BIT_PROCESS(current_proc())) {
8016 			struct user64_timeval tv[2];
8017 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
8018 			if (error) {
8019 				return error;
8020 			}
8021 			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
8022 			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
8023 		} else {
8024 			struct user32_timeval tv[2];
8025 			error = copyin(usrtvp, (void *)tv, sizeof(tv));
8026 			if (error) {
8027 				return error;
8028 			}
8029 			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
8030 			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
8031 		}
8032 	}
8033 	return 0;
8034 }
8035 
8036 static int
setutimes(vfs_context_t ctx,vnode_t vp,const struct timespec * ts,int nullflag)8037 setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
8038     int nullflag)
8039 {
8040 	int error;
8041 	struct vnode_attr va;
8042 	kauth_action_t action;
8043 
8044 	AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8045 
8046 	VATTR_INIT(&va);
8047 	VATTR_SET(&va, va_access_time, ts[0]);
8048 	VATTR_SET(&va, va_modify_time, ts[1]);
8049 	if (nullflag) {
8050 		va.va_vaflags |= VA_UTIMES_NULL;
8051 	}
8052 
8053 #if NAMEDSTREAMS
8054 	/* utimes calls are not allowed for resource forks. */
8055 	if (vp->v_flag & VISNAMEDSTREAM) {
8056 		error = EPERM;
8057 		goto out;
8058 	}
8059 #endif
8060 
8061 #if CONFIG_MACF
8062 	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
8063 	if (error) {
8064 		goto out;
8065 	}
8066 #endif
8067 	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
8068 		if (!nullflag && error == EACCES) {
8069 			error = EPERM;
8070 		}
8071 		goto out;
8072 	}
8073 
8074 	/* since we may not need to auth anything, check here */
8075 	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
8076 		if (!nullflag && error == EACCES) {
8077 			error = EPERM;
8078 		}
8079 		goto out;
8080 	}
8081 	error = vnode_setattr(vp, &va, ctx);
8082 
8083 #if CONFIG_MACF
8084 	if (error == 0) {
8085 		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
8086 	}
8087 #endif
8088 
8089 out:
8090 	return error;
8091 }
8092 
8093 /*
8094  * Set the access and modification times of a file.
8095  */
8096 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* also look up the parent so its directory lease can be broken */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	/* nullflag: a NULL tptr relaxes the authorization requirement */
	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* both success and failure paths must release the lookup references */
#if CONFIG_FILE_LEASES
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8145 
8146 /*
8147  * Set the access and modification times of a file.
8148  */
8149 /* ARGSUSED */
8150 int
futimes(__unused proc_t p,struct futimes_args * uap,__unused int32_t * retval)8151 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
8152 {
8153 	struct timespec ts[2];
8154 	vnode_t vp;
8155 	user_addr_t usrtvp;
8156 	int error;
8157 
8158 	AUDIT_ARG(fd, uap->fd);
8159 	usrtvp = uap->tptr;
8160 	if ((error = getutimes(usrtvp, ts)) != 0) {
8161 		return error;
8162 	}
8163 	if ((error = file_vnode(uap->fd, &vp)) != 0) {
8164 		return error;
8165 	}
8166 	if ((error = vnode_getwithref(vp))) {
8167 		file_drop(uap->fd);
8168 		return error;
8169 	}
8170 
8171 #if CONFIG_FILE_LEASES
8172 	vnode_breakdirlease(vp, true, O_WRONLY);
8173 #endif
8174 
8175 	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
8176 
8177 	vnode_put(vp);
8178 	file_drop(uap->fd);
8179 	return error;
8180 }
8181 
/*
 * Shared validation for truncate(2)/ftruncate(2): reject negative lengths
 * and enforce the process's RLIMIT_FSIZE (raising SIGXFSZ on violation).
 */
static int
truncate_validate_common(proc_t p, off_t length)
{
	rlim_t fsize_limit;

	if (length < 0) {
		return EINVAL;
	}

	/* exceeding RLIMIT_FSIZE delivers SIGXFSZ and fails with EFBIG */
	fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
	if ((rlim_t)length > fsize_limit) {
		psignal(p, SIGXFSZ);
		return EFBIG;
	}

	return 0;
}
8199 
/*
 * Common truncation: set va_data_size on 'vp' after MAC and (optionally)
 * kauth checks.  'need_auth' is false on the ftruncate(2) path, which was
 * already effectively authorized at open time.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open.  We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8250 
8251 /*
8252  * Truncate a file given its path name.
8253  */
8254 /* ARGSUSED */
8255 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8256 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8257 {
8258 	vfs_context_t ctx = vfs_context_current();
8259 	vnode_t vp;
8260 	int error;
8261 	struct nameidata nd;
8262 
8263 	if ((error = truncate_validate_common(p, uap->length))) {
8264 		return error;
8265 	}
8266 
8267 	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8268 	    UIO_USERSPACE, uap->path, ctx);
8269 
8270 	if ((error = namei(&nd))) {
8271 		return error;
8272 	}
8273 
8274 	vp = nd.ni_vp;
8275 	nameidone(&nd);
8276 
8277 	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8278 	vnode_put(vp);
8279 
8280 	return error;
8281 }
8282 
8283 /*
8284  * Truncate a file given a file descriptor.
8285  */
8286 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	/* POSIX shared memory objects are truncated via the pshm layer */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* the descriptor must have been opened for writing */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* need_auth == false: open(2) already authorized write access */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	vnode_put(vp);

out:
	file_drop(uap->fd);
	return error;
}
8337 
8338 
8339 /*
8340  * Sync an open file with synchronized I/O _file_ integrity completion
8341  */
8342 /* ARGSUSED */
8343 int
fsync(proc_t p,struct fsync_args * uap,__unused int32_t * retval)8344 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8345 {
8346 	__pthread_testcancel(1);
8347 	return fsync_common(p, uap, MNT_WAIT);
8348 }
8349 
8350 
8351 /*
8352  * Sync an open file with synchronized I/O _file_ integrity completion
8353  *
8354  * Notes:	This is a legacy support function that does not test for
8355  *		thread cancellation points.
8356  */
8357 /* ARGSUSED */
8358 int
fsync_nocancel(proc_t p,struct fsync_nocancel_args * uap,__unused int32_t * retval)8359 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8360 {
8361 	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
8362 }
8363 
8364 
8365 /*
8366  * Sync an open file with synchronized I/O _data_ integrity completion
8367  */
8368 /* ARGSUSED */
8369 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)8370 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8371 {
8372 	__pthread_testcancel(1);
8373 	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
8374 }
8375 
8376 
8377 /*
8378  * fsync_common
8379  *
8380  * Common fsync code to support both synchronized I/O file integrity completion
8381  * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8382  *
8383  * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8384  * will only guarantee that the file data contents are retrievable.  If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8386  * includes additional metadata unnecessary for retrieving the file data
8387  * contents, such as atime, mtime, ctime, etc., also be committed to stable
8388  * storage.
8389  *
8390  * Parameters:	p				The process
8391  *		uap->fd				The descriptor to synchronize
8392  *		flags				The data integrity flags
8393  *
8394  * Returns:	int				Success
8395  *	fp_getfvp:EBADF				Bad file descriptor
8396  *	fp_getfvp:ENOTSUP			fd does not refer to a vnode
8397  *	VNOP_FSYNC:???				unspecified
8398  *
8399  * Notes:	We use struct fsync_args because it is a short name, and all
8400  *		caller argument structures are otherwise identical.
8401  */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve the descriptor to a vnode; holds a file reference. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	/* Take an iocount on the vnode; drop the fd reference on failure. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* Ask the file system to commit per the integrity flags. */
	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		/* best effort: a flush failure does not fail the fsync */
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8439 
8440 /*
8441  * Duplicate files.  Source must be a file, target must be a file or
8442  * must not exist.
8443  *
8444  * XXX Copyfile authorisation checking is woefully inadequate, and will not
8445  *     perform inheritance correctly.
8446  */
8447 /* ARGSUSED */
8448 int
copyfile(__unused proc_t p,struct copyfile_args * uap,__unused int32_t * retval)8449 copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
8450 {
8451 	vnode_t tvp, fvp, tdvp, sdvp;
8452 	struct nameidata fromnd, tond;
8453 	int error;
8454 	vfs_context_t ctx = vfs_context_current();
8455 
8456 	/* Check that the flags are valid. */
8457 	if (uap->flags & ~CPF_MASK) {
8458 		return EINVAL;
8459 	}
8460 
8461 	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
8462 	    UIO_USERSPACE, uap->from, ctx);
8463 	if ((error = namei(&fromnd))) {
8464 		return error;
8465 	}
8466 	fvp = fromnd.ni_vp;
8467 
8468 	NDINIT(&tond, CREATE, OP_LINK,
8469 	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8470 	    UIO_USERSPACE, uap->to, ctx);
8471 	if ((error = namei(&tond))) {
8472 		goto out1;
8473 	}
8474 	tdvp = tond.ni_dvp;
8475 	tvp = tond.ni_vp;
8476 
8477 	if (tvp != NULL) {
8478 		if (!(uap->flags & CPF_OVERWRITE)) {
8479 			error = EEXIST;
8480 			goto out;
8481 		}
8482 	}
8483 
8484 	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
8485 		error = EISDIR;
8486 		goto out;
8487 	}
8488 
8489 	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
8490 		error = EOPNOTSUPP;
8491 		goto out;
8492 	}
8493 
8494 #if CONFIG_MACF
8495 	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
8496 		goto out;
8497 	}
8498 #endif /* CONFIG_MACF */
8499 
8500 	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
8501 		goto out;
8502 	}
8503 	if (tvp) {
8504 		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
8505 			goto out;
8506 		}
8507 	}
8508 	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
8509 		goto out;
8510 	}
8511 
8512 	if (fvp == tdvp) {
8513 		error = EINVAL;
8514 	}
8515 	/*
8516 	 * If source is the same as the destination (that is the
8517 	 * same inode number) then there is nothing to do.
8518 	 * (fixed to have POSIX semantics - CSM 3/2/98)
8519 	 */
8520 	if (fvp == tvp) {
8521 		error = -1;
8522 	}
8523 
8524 #if CONFIG_FILE_LEASES
8525 	vnode_breakdirlease(tdvp, false, O_WRONLY);
8526 #endif
8527 
8528 	if (!error) {
8529 		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
8530 	}
8531 out:
8532 	sdvp = tond.ni_startdir;
8533 	/*
8534 	 * nameidone has to happen before we vnode_put(tdvp)
8535 	 * since it may need to release the fs_nodelock on the tdvp
8536 	 */
8537 	nameidone(&tond);
8538 
8539 	if (tvp) {
8540 		vnode_put(tvp);
8541 	}
8542 	vnode_put(tdvp);
8543 	vnode_put(sdvp);
8544 out1:
8545 	vnode_put(fvp);
8546 
8547 	nameidone(&fromnd);
8548 
8549 	if (error == -1) {
8550 		return 0;
8551 	}
8552 	return error;
8553 }
8554 
8555 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8556 
8557 /*
8558  * Helper function for doing clones. The caller is expected to provide an
8559  * iocounted source vnode and release it.
8560  */
8561 static int
clonefile_internal(vnode_t fvp,boolean_t data_read_authorised,int dst_dirfd,user_addr_t dst,uint32_t flags,vfs_context_t ctx)8562 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
8563     user_addr_t dst, uint32_t flags, vfs_context_t ctx)
8564 {
8565 	vnode_t tvp, tdvp;
8566 	struct nameidata tond;
8567 	int error;
8568 	int follow;
8569 	boolean_t free_src_acl;
8570 	boolean_t attr_cleanup;
8571 	enum vtype v_type;
8572 	kauth_action_t action;
8573 	struct componentname *cnp;
8574 	uint32_t defaulted = 0;
8575 	struct vnode_attr va;
8576 	struct vnode_attr nva;
8577 	uint32_t vnop_flags;
8578 
8579 	v_type = vnode_vtype(fvp);
8580 	switch (v_type) {
8581 	case VLNK:
8582 	/* FALLTHRU */
8583 	case VREG:
8584 		action = KAUTH_VNODE_ADD_FILE;
8585 		break;
8586 	case VDIR:
8587 		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
8588 		    fvp->v_mountedhere) {
8589 			return EINVAL;
8590 		}
8591 		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
8592 		break;
8593 	default:
8594 		return EINVAL;
8595 	}
8596 
8597 	AUDIT_ARG(fd2, dst_dirfd);
8598 	AUDIT_ARG(value32, flags);
8599 
8600 	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8601 	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
8602 	    UIO_USERSPACE, dst, ctx);
8603 	if ((error = nameiat(&tond, dst_dirfd))) {
8604 		return error;
8605 	}
8606 	cnp = &tond.ni_cnd;
8607 	tdvp = tond.ni_dvp;
8608 	tvp = tond.ni_vp;
8609 
8610 	free_src_acl = FALSE;
8611 	attr_cleanup = FALSE;
8612 
8613 	if (tvp != NULL) {
8614 		error = EEXIST;
8615 		goto out;
8616 	}
8617 
8618 	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
8619 		error = EXDEV;
8620 		goto out;
8621 	}
8622 
8623 #if CONFIG_MACF
8624 	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
8625 		goto out;
8626 	}
8627 #endif
8628 	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
8629 		goto out;
8630 	}
8631 
8632 	action = KAUTH_VNODE_GENERIC_READ_BITS;
8633 	if (data_read_authorised) {
8634 		action &= ~KAUTH_VNODE_READ_DATA;
8635 	}
8636 	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
8637 		goto out;
8638 	}
8639 
8640 	/*
8641 	 * certain attributes may need to be changed from the source, we ask for
8642 	 * those here with the exception of source file's ACLs unless the CLONE_ACL
8643 	 * flag is specified. By default, the clone file will inherit the target
8644 	 * directory's ACLs unless the the CLONE_ACL flag is specified then it
8645 	 * will inherit the source file's ACLs instead.
8646 	 */
8647 	VATTR_INIT(&va);
8648 	VATTR_WANTED(&va, va_uid);
8649 	VATTR_WANTED(&va, va_gid);
8650 	VATTR_WANTED(&va, va_mode);
8651 	VATTR_WANTED(&va, va_flags);
8652 	if (flags & CLONE_ACL) {
8653 		VATTR_WANTED(&va, va_acl);
8654 	}
8655 
8656 	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
8657 		goto out;
8658 	}
8659 
8660 	VATTR_INIT(&nva);
8661 	VATTR_SET(&nva, va_type, v_type);
8662 	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
8663 		VATTR_SET(&nva, va_acl, va.va_acl);
8664 		free_src_acl = TRUE;
8665 	}
8666 
8667 	/* Handle ACL inheritance, initialize vap. */
8668 	if (v_type == VLNK) {
8669 		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
8670 	} else {
8671 		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
8672 		if (error) {
8673 			goto out;
8674 		}
8675 		attr_cleanup = TRUE;
8676 	}
8677 
8678 	vnop_flags = VNODE_CLONEFILE_DEFAULT;
8679 	/*
8680 	 * We've got initial values for all security parameters,
8681 	 * If we are superuser, then we can change owners to be the
8682 	 * same as the source. Both superuser and the owner have default
8683 	 * WRITE_SECURITY privileges so all other fields can be taken
8684 	 * from source as well.
8685 	 */
8686 	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
8687 		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
8688 			VATTR_SET(&nva, va_uid, va.va_uid);
8689 		}
8690 		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
8691 			VATTR_SET(&nva, va_gid, va.va_gid);
8692 		}
8693 	} else {
8694 		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
8695 	}
8696 
8697 	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
8698 		VATTR_SET(&nva, va_mode, va.va_mode);
8699 	}
8700 	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
8701 		VATTR_SET(&nva, va_flags,
8702 		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
8703 		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
8704 	}
8705 
8706 #if CONFIG_FILE_LEASES
8707 	vnode_breakdirlease(tdvp, false, O_WRONLY);
8708 #endif
8709 
8710 	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);
8711 
8712 	if (!error && tvp) {
8713 		int     update_flags = 0;
8714 #if CONFIG_FSE
8715 		int fsevent;
8716 #endif /* CONFIG_FSE */
8717 
8718 		/*
8719 		 * If some of the requested attributes weren't handled by the
8720 		 * VNOP, use our fallback code.
8721 		 */
8722 		if (!VATTR_ALL_SUPPORTED(&nva)) {
8723 			(void)vnode_setattr_fallback(tvp, &nva, ctx);
8724 		}
8725 
8726 #if CONFIG_MACF
8727 		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
8728 		    VNODE_LABEL_CREATE, ctx);
8729 #endif
8730 
8731 		// Make sure the name & parent pointers are hooked up
8732 		if (tvp->v_name == NULL) {
8733 			update_flags |= VNODE_UPDATE_NAME;
8734 		}
8735 		if (tvp->v_parent == NULLVP) {
8736 			update_flags |= VNODE_UPDATE_PARENT;
8737 		}
8738 
8739 		if (update_flags) {
8740 			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
8741 			    cnp->cn_namelen, cnp->cn_hash, update_flags);
8742 		}
8743 
8744 #if CONFIG_FSE
8745 		switch (vnode_vtype(tvp)) {
8746 		case VLNK:
8747 		/* FALLTHRU */
8748 		case VREG:
8749 			fsevent = FSE_CREATE_FILE;
8750 			break;
8751 		case VDIR:
8752 			fsevent = FSE_CREATE_DIR;
8753 			break;
8754 		default:
8755 			goto out;
8756 		}
8757 
8758 		if (need_fsevent(fsevent, tvp)) {
8759 			/*
8760 			 * The following is a sequence of three explicit events.
8761 			 * A pair of FSE_CLONE events representing the source and destination
8762 			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
8763 			 * fseventsd may coalesce the destination clone and create events
8764 			 * into a single event resulting in the following sequence for a client
8765 			 * FSE_CLONE (src)
8766 			 * FSE_CLONE | FSE_CREATE (dst)
8767 			 */
8768 			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
8769 			    FSE_ARG_DONE);
8770 			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
8771 			    FSE_ARG_DONE);
8772 		}
8773 #endif /* CONFIG_FSE */
8774 	}
8775 
8776 out:
8777 	if (attr_cleanup) {
8778 		vn_attribute_cleanup(&nva, defaulted);
8779 	}
8780 	if (free_src_acl && va.va_acl) {
8781 		kauth_acl_free(va.va_acl);
8782 	}
8783 	nameidone(&tond);
8784 	if (tvp) {
8785 		vnode_put(tvp);
8786 	}
8787 	vnode_put(tdvp);
8788 	return error;
8789 }
8790 
8791 /*
8792  * clone files or directories, target must not exist.
8793  */
8794 /* ARGSUSED */
8795 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8796 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8797     __unused int32_t *retval)
8798 {
8799 	vnode_t fvp;
8800 	struct nameidata fromnd;
8801 	int follow;
8802 	int error;
8803 	vfs_context_t ctx = vfs_context_current();
8804 
8805 	/* Check that the flags are valid. */
8806 	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8807 		return EINVAL;
8808 	}
8809 
8810 	AUDIT_ARG(fd, uap->src_dirfd);
8811 
8812 	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8813 	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8814 	    UIO_USERSPACE, uap->src, ctx);
8815 	if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8816 		return error;
8817 	}
8818 
8819 	fvp = fromnd.ni_vp;
8820 	nameidone(&fromnd);
8821 
8822 	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8823 	    uap->flags, ctx);
8824 
8825 	vnode_put(fvp);
8826 	return error;
8827 }
8828 
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	/* Resolve the source descriptor to a vnode (holds an fd reference). */
	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
	if (error) {
		return error;
	}

	/* The source descriptor must be open for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if ((error = vnode_getwithref(fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	/* TRUE: FREAD means a data-read on the source is already authorised. */
	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
8869 
8870 static int
rename_submounts_callback(mount_t mp,void * arg)8871 rename_submounts_callback(mount_t mp, void *arg)
8872 {
8873 	int error = 0;
8874 	mount_t pmp = (mount_t)arg;
8875 	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8876 
8877 	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8878 		return 0;
8879 	}
8880 
8881 	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8882 		return 0;
8883 	}
8884 
8885 	if ((error = vfs_busy(mp, LK_NOWAIT))) {
8886 		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8887 		return -1;
8888 	}
8889 
8890 	size_t pathlen = MAXPATHLEN;
8891 	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8892 		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8893 	}
8894 
8895 	vfs_unbusy(mp);
8896 
8897 	return error;
8898 }
8899 
8900 /*
8901  * Rename files.  Source and destination must either both be directories,
8902  * or both not be directories.  If target is a directory, it must be empty.
8903  */
8904 /* ARGSUSED */
8905 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)8906 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8907     int tofd, user_addr_t to, int segflg, u_int uflags)
8908 {
8909 	vnode_t tvp, tdvp;
8910 	vnode_t fvp, fdvp;
8911 	vnode_t mnt_fvp;
8912 	struct nameidata *fromnd, *tond;
8913 	int error = 0;
8914 	int do_retry;
8915 	int retry_count;
8916 	int mntrename;
8917 	int need_event;
8918 	int need_kpath2;
8919 	int has_listeners;
8920 	const char *oname = NULL;
8921 	char *from_name = NULL, *to_name = NULL;
8922 	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8923 	int from_len = 0, to_len = 0;
8924 	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8925 	int holding_mntlock;
8926 	int vn_authorize_skipped;
8927 	mount_t locked_mp = NULL;
8928 	vnode_t oparent = NULLVP;
8929 #if CONFIG_FSE
8930 	fse_info from_finfo = {}, to_finfo;
8931 #endif
8932 	int from_truncated = 0, to_truncated = 0;
8933 	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8934 	int batched = 0;
8935 	struct vnode_attr *fvap, *tvap;
8936 	int continuing = 0;
8937 	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
8938 	int32_t nofollow_any = 0;
8939 	/* carving out a chunk for structs that are too big to be on stack. */
8940 	struct {
8941 		struct nameidata from_node, to_node;
8942 		struct vnode_attr fv_attr, tv_attr;
8943 	} * __rename_data;
8944 
8945 	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
8946 	fromnd = &__rename_data->from_node;
8947 	tond = &__rename_data->to_node;
8948 
8949 	holding_mntlock = 0;
8950 	do_retry = 0;
8951 	retry_count = 0;
8952 retry:
8953 	fvp = tvp = NULL;
8954 	fdvp = tdvp = NULL;
8955 	fvap = tvap = NULL;
8956 	mnt_fvp = NULLVP;
8957 	mntrename = FALSE;
8958 	vn_authorize_skipped = FALSE;
8959 
8960 	if (uflags & RENAME_NOFOLLOW_ANY) {
8961 		nofollow_any = NAMEI_NOFOLLOW_ANY;
8962 	}
8963 	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8964 	    segflg, from, ctx);
8965 	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8966 
8967 	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8968 	    segflg, to, ctx);
8969 	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8970 
8971 continue_lookup:
8972 	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8973 		if ((error = nameiat(fromnd, fromfd))) {
8974 			goto out1;
8975 		}
8976 		fdvp = fromnd->ni_dvp;
8977 		fvp  = fromnd->ni_vp;
8978 
8979 		if (fvp && fvp->v_type == VDIR) {
8980 			tond->ni_cnd.cn_flags |= WILLBEDIR;
8981 		}
8982 	}
8983 
8984 	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8985 		if ((error = nameiat(tond, tofd))) {
8986 			/*
8987 			 * Translate error code for rename("dir1", "dir2/.").
8988 			 */
8989 			if (error == EISDIR && fvp->v_type == VDIR) {
8990 				error = EINVAL;
8991 			}
8992 			goto out1;
8993 		}
8994 		tdvp = tond->ni_dvp;
8995 		tvp  = tond->ni_vp;
8996 	}
8997 
8998 #if DEVELOPMENT || DEBUG
8999 	/*
9000 	 * XXX VSWAP: Check for entitlements or special flag here
9001 	 * so we can restrict access appropriately.
9002 	 */
9003 #else /* DEVELOPMENT || DEBUG */
9004 
9005 	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
9006 		error = EPERM;
9007 		goto out1;
9008 	}
9009 
9010 	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
9011 		error = EPERM;
9012 		goto out1;
9013 	}
9014 #endif /* DEVELOPMENT || DEBUG */
9015 
9016 	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
9017 		error = ENOENT;
9018 		goto out1;
9019 	}
9020 
9021 	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
9022 		int32_t pval = 0;
9023 		int err = 0;
9024 
9025 		/*
9026 		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
9027 		 * has the same name as target iff the following conditions are met:
9028 		 * 1. the target file system is case insensitive
9029 		 * 2. source and target directories are the same
9030 		 * 3. source and target files are the same
9031 		 * 4. name only differs in case (determined by underlying filesystem)
9032 		 */
9033 		if (fvp != tvp || fdvp != tdvp) {
9034 			error = EEXIST;
9035 			goto out1;
9036 		}
9037 
9038 		/*
9039 		 * Assume that the target file system is case sensitive if
9040 		 * _PC_CASE_SENSITIVE selector isn't supported.
9041 		 */
9042 		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
9043 		if (err != 0 || pval != 0) {
9044 			error = EEXIST;
9045 			goto out1;
9046 		}
9047 	}
9048 
9049 	batched = vnode_compound_rename_available(fdvp);
9050 
9051 #if CONFIG_FSE
9052 	need_event = need_fsevent(FSE_RENAME, fdvp);
9053 	if (need_event) {
9054 		if (fvp) {
9055 			get_fse_info(fvp, &from_finfo, ctx);
9056 		} else {
9057 			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
9058 			if (error) {
9059 				goto out1;
9060 			}
9061 
9062 			fvap = &__rename_data->fv_attr;
9063 		}
9064 
9065 		if (tvp) {
9066 			get_fse_info(tvp, &to_finfo, ctx);
9067 		} else if (batched) {
9068 			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
9069 			if (error) {
9070 				goto out1;
9071 			}
9072 
9073 			tvap = &__rename_data->tv_attr;
9074 		}
9075 	}
9076 #else
9077 	need_event = 0;
9078 #endif /* CONFIG_FSE */
9079 
9080 	has_listeners = kauth_authorize_fileop_has_listeners();
9081 
9082 	need_kpath2 = 0;
9083 #if CONFIG_AUDIT
9084 	if (AUDIT_RECORD_EXISTS()) {
9085 		need_kpath2 = 1;
9086 	}
9087 #endif
9088 
9089 	if (need_event || has_listeners) {
9090 		if (from_name == NULL) {
9091 			GET_PATH(from_name);
9092 		}
9093 
9094 		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
9095 
9096 		if (from_name_no_firmlink == NULL) {
9097 			GET_PATH(from_name_no_firmlink);
9098 		}
9099 
9100 		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
9101 	}
9102 
9103 	if (need_event || need_kpath2 || has_listeners) {
9104 		if (to_name == NULL) {
9105 			GET_PATH(to_name);
9106 		}
9107 
9108 		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
9109 
9110 		if (to_name_no_firmlink == NULL) {
9111 			GET_PATH(to_name_no_firmlink);
9112 		}
9113 
9114 		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
9115 		if (to_name && need_kpath2) {
9116 			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9117 		}
9118 	}
9119 	if (!fvp) {
9120 		/*
9121 		 * Claim: this check will never reject a valid rename.
9122 		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9123 		 * Suppose fdvp and tdvp are not on the same mount.
9124 		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
9125 		 *      then you can't move it to within another dir on the same mountpoint.
9126 		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9127 		 *
9128 		 * If this check passes, then we are safe to pass these vnodes to the same FS.
9129 		 */
9130 		if (fdvp->v_mount != tdvp->v_mount) {
9131 			error = EXDEV;
9132 			goto out1;
9133 		}
9134 		goto skipped_lookup;
9135 	}
9136 
9137 	/*
9138 	 * If the source and destination are the same (i.e. they're
9139 	 * links to the same vnode) and the target file system is
9140 	 * case sensitive, then there is nothing to do.
9141 	 *
9142 	 * XXX Come back to this.
9143 	 */
9144 	if (fvp == tvp) {
9145 		int pathconf_val;
9146 
9147 		/*
9148 		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9149 		 * then assume that this file system is case sensitive.
9150 		 */
9151 		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9152 		    pathconf_val != 0) {
9153 			vn_authorize_skipped = TRUE;
9154 			goto out1;
9155 		}
9156 	}
9157 
9158 	/*
9159 	 * Allow the renaming of mount points.
9160 	 * - target must not exist
9161 	 * - target must reside in the same directory as source
9162 	 * - union mounts cannot be renamed
9163 	 * - the root fs, and tightly-linked system volumes, cannot be renamed
9164 	 *
9165 	 * XXX Handle this in VFS after a continued lookup (if we missed
9166 	 * in the cache to start off)
9167 	 *
9168 	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9169 	 * we'll skip past here.  The file system is responsible for
9170 	 * checking that @tvp is not a descendent of @fvp and vice versa
9171 	 * so it should always return EINVAL if either @tvp or @fvp is the
9172 	 * root of a volume.
9173 	 */
9174 	if ((fvp->v_flag & VROOT) &&
9175 	    (fvp->v_type == VDIR) &&
9176 	    (tvp == NULL) &&
9177 	    (fvp->v_mountedhere == NULL) &&
9178 	    (fdvp == tdvp) &&
9179 	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9180 	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9181 	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9182 		vnode_t coveredvp;
9183 
9184 		/* switch fvp to the covered vnode */
9185 		coveredvp = fvp->v_mount->mnt_vnodecovered;
9186 		if ((vnode_getwithref(coveredvp))) {
9187 			error = ENOENT;
9188 			goto out1;
9189 		}
9190 		/*
9191 		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9192 		 * later.
9193 		 */
9194 		mnt_fvp = fvp;
9195 
9196 		fvp = coveredvp;
9197 		mntrename = TRUE;
9198 	}
9199 	/*
9200 	 * Check for cross-device rename.
9201 	 */
9202 	if ((fvp->v_mount != tdvp->v_mount) ||
9203 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
9204 		error = EXDEV;
9205 		goto out1;
9206 	}
9207 
9208 	/*
9209 	 * If source is the same as the destination (that is the
9210 	 * same inode number) then there is nothing to do...
9211 	 * EXCEPT if the underlying file system supports case
9212 	 * insensitivity and is case preserving.  In this case
9213 	 * the file system needs to handle the special case of
9214 	 * getting the same vnode as target (fvp) and source (tvp).
9215 	 *
9216 	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9217 	 * and _PC_CASE_PRESERVING can have this exception, and they need to
9218 	 * handle the special case of getting the same vnode as target and
9219 	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
9220 	 * so not to cause locking problems. There is a single reference on tvp.
9221 	 *
9222 	 * NOTE - that fvp == tvp also occurs if they are hard linked and
9223 	 * that correct behaviour then is just to return success without doing
9224 	 * anything.
9225 	 *
9226 	 * XXX filesystem should take care of this itself, perhaps...
9227 	 */
9228 	if (fvp == tvp && fdvp == tdvp) {
9229 		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9230 		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
9231 		    fromnd->ni_cnd.cn_namelen)) {
9232 			vn_authorize_skipped = TRUE;
9233 			goto out1;
9234 		}
9235 	}
9236 
9237 	if (holding_mntlock && fvp->v_mount != locked_mp) {
9238 		/*
9239 		 * we're holding a reference and lock
9240 		 * on locked_mp, but it no longer matches
9241 		 * what we want to do... so drop our hold
9242 		 */
9243 		mount_unlock_renames(locked_mp);
9244 		mount_drop(locked_mp, 0);
9245 		holding_mntlock = 0;
9246 	}
9247 	if (tdvp != fdvp && fvp->v_type == VDIR) {
9248 		/*
9249 		 * serialize renames that re-shape
9250 		 * the tree... if holding_mntlock is
9251 		 * set, then we're ready to go...
9252 		 * otherwise we
9253 		 * first need to drop the iocounts
9254 		 * we picked up, second take the
9255 		 * lock to serialize the access,
9256 		 * then finally start the lookup
9257 		 * process over with the lock held
9258 		 */
9259 		if (!holding_mntlock) {
9260 			/*
9261 			 * need to grab a reference on
9262 			 * the mount point before we
9263 			 * drop all the iocounts... once
9264 			 * the iocounts are gone, the mount
9265 			 * could follow
9266 			 */
9267 			locked_mp = fvp->v_mount;
9268 			mount_ref(locked_mp, 0);
9269 
9270 			/*
9271 			 * nameidone has to happen before we vnode_put(tvp)
9272 			 * since it may need to release the fs_nodelock on the tvp
9273 			 */
9274 			nameidone(tond);
9275 
9276 			if (tvp) {
9277 				vnode_put(tvp);
9278 			}
9279 			vnode_put(tdvp);
9280 
9281 			/*
9282 			 * nameidone has to happen before we vnode_put(fdvp)
9283 			 * since it may need to release the fs_nodelock on the fvp
9284 			 */
9285 			nameidone(fromnd);
9286 
9287 			vnode_put(fvp);
9288 			vnode_put(fdvp);
9289 
9290 			if (mnt_fvp != NULLVP) {
9291 				vnode_put(mnt_fvp);
9292 			}
9293 
9294 			mount_lock_renames(locked_mp);
9295 			holding_mntlock = 1;
9296 
9297 			goto retry;
9298 		}
9299 	} else {
9300 		/*
9301 		 * when we dropped the iocounts to take
9302 		 * the lock, we allowed the identity of
9303 		 * the various vnodes to change... if they did,
9304 		 * we may no longer be dealing with a rename
9305 		 * that reshapes the tree... once we're holding
9306 		 * the iocounts, the vnodes can't change type
9307 		 * so we're free to drop the lock at this point
9308 		 * and continue on
9309 		 */
9310 		if (holding_mntlock) {
9311 			mount_unlock_renames(locked_mp);
9312 			mount_drop(locked_mp, 0);
9313 			holding_mntlock = 0;
9314 		}
9315 	}
9316 
9317 	if (!batched) {
9318 		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
9319 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9320 		    flags, NULL);
9321 		if (error) {
9322 			if (error == ENOENT) {
9323 				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9324 					/*
9325 					 * We encountered a race where after doing the namei,
9326 					 * tvp stops being valid. If so, simply re-drive the rename
9327 					 * call from the top.
9328 					 */
9329 					do_retry = 1;
9330 					retry_count += 1;
9331 				}
9332 			}
9333 			goto out1;
9334 		}
9335 	}
9336 
9337 	/* Release the 'mnt_fvp' now that it is no longer needed. */
9338 	if (mnt_fvp != NULLVP) {
9339 		vnode_put(mnt_fvp);
9340 		mnt_fvp = NULLVP;
9341 	}
9342 
9343 	// save these off so we can later verify that fvp is the same
9344 	oname   = fvp->v_name;
9345 	oparent = fvp->v_parent;
9346 
9347 skipped_lookup:
9348 #if CONFIG_FILE_LEASES
9349 	/* Lease break needed for source's parent dir? */
9350 	vnode_breakdirlease(fdvp, false, O_WRONLY);
9351 
9352 	/* Lease break needed for target's parent dir? */
9353 	vnode_breakdirlease(tdvp, false, O_WRONLY);
9354 #endif
9355 
9356 	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
9357 	    tdvp, &tvp, &tond->ni_cnd, tvap,
9358 	    flags, ctx);
9359 
9360 	if (holding_mntlock) {
9361 		/*
9362 		 * we can drop our serialization
9363 		 * lock now
9364 		 */
9365 		mount_unlock_renames(locked_mp);
9366 		mount_drop(locked_mp, 0);
9367 		holding_mntlock = 0;
9368 	}
9369 	if (error) {
9370 		if (error == EDATALESS) {
9371 			/*
9372 			 * If we've been here before, something has gone
9373 			 * horribly wrong and we should just get out lest
9374 			 * we spiral around the drain forever.
9375 			 */
9376 			if (flags & VFS_RENAME_DATALESS) {
9377 				error = EIO;
9378 				goto out1;
9379 			}
9380 
9381 			/*
9382 			 * The object we're renaming is dataless (or has a
9383 			 * dataless descendent) and requires materialization
9384 			 * before the rename occurs.  But we're holding the
9385 			 * mount point's rename lock, so it's not safe to
9386 			 * make the upcall.
9387 			 *
9388 			 * In this case, we release the lock, perform the
9389 			 * materialization, and start the whole thing over.
9390 			 */
9391 			error = vnode_materialize_dataless_file(fvp,
9392 			    NAMESPACE_HANDLER_RENAME_OP);
9393 
9394 			if (error == 0) {
9395 				/*
9396 				 * The next time around we need to tell the
9397 				 * file system that the materializtaion has
9398 				 * been performed.
9399 				 */
9400 				flags |= VFS_RENAME_DATALESS;
9401 				do_retry = 1;
9402 			}
9403 			goto out1;
9404 		}
9405 		if (error == EKEEPLOOKING) {
9406 			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9407 				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9408 					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
9409 				}
9410 			}
9411 
9412 			fromnd->ni_vp = fvp;
9413 			tond->ni_vp = tvp;
9414 
9415 			goto continue_lookup;
9416 		}
9417 
9418 		/*
9419 		 * We may encounter a race in the VNOP where the destination didn't
9420 		 * exist when we did the namei, but it does by the time we go and
9421 		 * try to create the entry. In this case, we should re-drive this rename
9422 		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
9423 		 * but other filesystems susceptible to this race could return it, too.
9424 		 */
9425 		if (error == ERECYCLE) {
9426 			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
9427 				do_retry = 1;
9428 				retry_count += 1;
9429 			} else {
9430 				printf("rename retry limit due to ERECYCLE reached\n");
9431 				error = ENOENT;
9432 			}
9433 		}
9434 
9435 		/*
9436 		 * For compound VNOPs, the authorization callback may return
9437 		 * ENOENT in case of racing hardlink lookups hitting the name
9438 		 * cache, redrive the lookup.
9439 		 */
9440 		if (batched && error == ENOENT) {
9441 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9442 				do_retry = 1;
9443 				retry_count += 1;
9444 			}
9445 		}
9446 
9447 		goto out1;
9448 	}
9449 
9450 	/* call out to allow 3rd party notification of rename.
9451 	 * Ignore result of kauth_authorize_fileop call.
9452 	 */
9453 	kauth_authorize_fileop(vfs_context_ucred(ctx),
9454 	    KAUTH_FILEOP_RENAME,
9455 	    (uintptr_t)from_name, (uintptr_t)to_name);
9456 	if (flags & VFS_RENAME_SWAP) {
9457 		kauth_authorize_fileop(vfs_context_ucred(ctx),
9458 		    KAUTH_FILEOP_RENAME,
9459 		    (uintptr_t)to_name, (uintptr_t)from_name);
9460 	}
9461 
9462 #if CONFIG_FSE
9463 	if (from_name != NULL && to_name != NULL) {
9464 		if (from_truncated || to_truncated) {
9465 			// set it here since only the from_finfo gets reported up to user space
9466 			from_finfo.mode |= FSE_TRUNCATED_PATH;
9467 		}
9468 
9469 		if (tvap && tvp) {
9470 			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
9471 		}
9472 		if (fvap) {
9473 			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
9474 		}
9475 
9476 		if (tvp) {
9477 			add_fsevent(FSE_RENAME, ctx,
9478 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9479 			    FSE_ARG_FINFO, &from_finfo,
9480 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9481 			    FSE_ARG_FINFO, &to_finfo,
9482 			    FSE_ARG_DONE);
9483 			if (flags & VFS_RENAME_SWAP) {
9484 				/*
9485 				 * Strictly speaking, swap is the equivalent of
9486 				 * *three* renames.  FSEvents clients should only take
9487 				 * the events as a hint, so we only bother reporting
9488 				 * two.
9489 				 */
9490 				add_fsevent(FSE_RENAME, ctx,
9491 				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9492 				    FSE_ARG_FINFO, &to_finfo,
9493 				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9494 				    FSE_ARG_FINFO, &from_finfo,
9495 				    FSE_ARG_DONE);
9496 			}
9497 		} else {
9498 			add_fsevent(FSE_RENAME, ctx,
9499 			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9500 			    FSE_ARG_FINFO, &from_finfo,
9501 			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9502 			    FSE_ARG_DONE);
9503 		}
9504 	}
9505 #endif /* CONFIG_FSE */
9506 
9507 	/*
9508 	 * update filesystem's mount point data
9509 	 */
9510 	if (mntrename) {
9511 		char *cp, *pathend, *mpname;
9512 		char * tobuf;
9513 		struct mount *mp;
9514 		int maxlen;
9515 		size_t len = 0;
9516 
9517 		mp = fvp->v_mountedhere;
9518 
9519 		if (vfs_busy(mp, LK_NOWAIT)) {
9520 			error = EBUSY;
9521 			goto out1;
9522 		}
9523 		tobuf = zalloc(ZV_NAMEI);
9524 
9525 		if (UIO_SEG_IS_USER_SPACE(segflg)) {
9526 			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
9527 		} else {
9528 			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
9529 		}
9530 		if (!error) {
9531 			/* find current mount point prefix */
9532 			pathend = &mp->mnt_vfsstat.f_mntonname[0];
9533 			for (cp = pathend; *cp != '\0'; ++cp) {
9534 				if (*cp == '/') {
9535 					pathend = cp + 1;
9536 				}
9537 			}
9538 			/* find last component of target name */
9539 			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
9540 				if (*cp == '/') {
9541 					mpname = cp + 1;
9542 				}
9543 			}
9544 
9545 			/* Update f_mntonname of sub mounts */
9546 			vfs_iterate(0, rename_submounts_callback, (void *)mp);
9547 
9548 			/* append name to prefix */
9549 			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
9550 			bzero(pathend, maxlen);
9551 
9552 			strlcpy(pathend, mpname, maxlen);
9553 		}
9554 		zfree(ZV_NAMEI, tobuf);
9555 
9556 		vfs_unbusy(mp);
9557 
9558 		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
9559 	}
9560 	/*
9561 	 * fix up name & parent pointers.  note that we first
9562 	 * check that fvp has the same name/parent pointers it
9563 	 * had before the rename call... this is a 'weak' check
9564 	 * at best...
9565 	 *
9566 	 * XXX oparent and oname may not be set in the compound vnop case
9567 	 */
9568 	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
9569 		int update_flags;
9570 
9571 		update_flags = VNODE_UPDATE_NAME;
9572 
9573 		if (fdvp != tdvp) {
9574 			update_flags |= VNODE_UPDATE_PARENT;
9575 		}
9576 
9577 		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
9578 	}
9579 out1:
9580 	/*
9581 	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
9582 	 * skipped earlier as no actual rename was performed.
9583 	 */
9584 	if (vn_authorize_skipped && error == 0) {
9585 		error = vn_authorize_renamex_with_paths(fdvp, fvp,
9586 		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9587 		    flags, NULL);
9588 		if (error && error == ENOENT) {
9589 			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9590 				do_retry = 1;
9591 				retry_count += 1;
9592 			}
9593 		}
9594 	}
9595 	if (to_name != NULL) {
9596 		RELEASE_PATH(to_name);
9597 		to_name = NULL;
9598 	}
9599 	if (to_name_no_firmlink != NULL) {
9600 		RELEASE_PATH(to_name_no_firmlink);
9601 		to_name_no_firmlink = NULL;
9602 	}
9603 	if (from_name != NULL) {
9604 		RELEASE_PATH(from_name);
9605 		from_name = NULL;
9606 	}
9607 	if (from_name_no_firmlink != NULL) {
9608 		RELEASE_PATH(from_name_no_firmlink);
9609 		from_name_no_firmlink = NULL;
9610 	}
9611 	if (holding_mntlock) {
9612 		mount_unlock_renames(locked_mp);
9613 		mount_drop(locked_mp, 0);
9614 		holding_mntlock = 0;
9615 	}
9616 	if (tdvp) {
9617 		/*
9618 		 * nameidone has to happen before we vnode_put(tdvp)
9619 		 * since it may need to release the fs_nodelock on the tdvp
9620 		 */
9621 		nameidone(tond);
9622 
9623 		if (tvp) {
9624 			vnode_put(tvp);
9625 		}
9626 		vnode_put(tdvp);
9627 	}
9628 	if (fdvp) {
9629 		/*
9630 		 * nameidone has to happen before we vnode_put(fdvp)
9631 		 * since it may need to release the fs_nodelock on the fdvp
9632 		 */
9633 		nameidone(fromnd);
9634 
9635 		if (fvp) {
9636 			vnode_put(fvp);
9637 		}
9638 		vnode_put(fdvp);
9639 	}
9640 	if (mnt_fvp != NULLVP) {
9641 		vnode_put(mnt_fvp);
9642 	}
9643 	/*
9644 	 * If things changed after we did the namei, then we will re-drive
9645 	 * this rename call from the top.
9646 	 */
9647 	if (do_retry) {
9648 		do_retry = 0;
9649 		goto retry;
9650 	}
9651 
9652 	kfree_type(typeof(*__rename_data), __rename_data);
9653 	return error;
9654 }
9655 
9656 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9657 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9658 {
9659 	return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9660 	           AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9661 }
9662 
9663 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9664 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9665 {
9666 	if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9667 		return EINVAL;
9668 	}
9669 
9670 	if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9671 		return EINVAL;
9672 	}
9673 
9674 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9675 	           uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9676 }
9677 
9678 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9679 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9680 {
9681 	return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9682 	           uap->tofd, uap->to, UIO_USERSPACE, 0);
9683 }
9684 
/*
 * Make a directory file.
 *
 * Looks up 'path' (relative to 'fd' when the path is relative, per the
 * *at() convention) and creates a directory there with the attributes
 * in 'vap'.  Uses the compound-mkdir VNOP when the file system offers
 * it, in which case authorization is left to the file system.
 *
 * Returns:	0			Success
 *		EEXIST			An object already exists at 'path'
 *	namei:???
 *	vnode_authorize:???
 *	vn_create:???
 */
/* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/* LOCKPARENT: we need the parent vnode with an iocount to create in it. */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	/* Ask namei to use the compound mkdir VNOP if the FS supports it. */
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* Lookup found an existing object: mkdir must fail with EEXIST. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/*
			 * Release the lookup state before re-driving the
			 * lookup; nameidone must precede vnode_put(dvp).
			 */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				/* Target doesn't exist: keep the original EACCES/EPERM. */
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry writes the parent directory; break its lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* Compound VNOP asked us to continue the lookup. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9810 
9811 /*
9812  * mkdir_extended: Create a directory; with extended security (ACL).
9813  *
9814  * Parameters:    p                       Process requesting to create the directory
9815  *                uap                     User argument descriptor (see below)
9816  *                retval                  (ignored)
9817  *
9818  * Indirect:      uap->path               Path of directory to create
9819  *                uap->mode               Access permissions to set
9820  *                uap->xsecurity          ACL to set
9821  *
9822  * Returns:        0                      Success
9823  *                !0                      Not success
9824  *
9825  */
9826 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9827 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9828 {
9829 	int ciferror;
9830 	kauth_filesec_t xsecdst;
9831 	struct vnode_attr va;
9832 
9833 	AUDIT_ARG(owner, uap->uid, uap->gid);
9834 
9835 	xsecdst = NULL;
9836 	if ((uap->xsecurity != USER_ADDR_NULL) &&
9837 	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9838 		return ciferror;
9839 	}
9840 
9841 	VATTR_INIT(&va);
9842 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9843 	if (xsecdst != NULL) {
9844 		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9845 		va.va_vaflags |= VA_FILESEC_ACL;
9846 	}
9847 
9848 	ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9849 	    UIO_USERSPACE);
9850 	if (xsecdst != NULL) {
9851 		kauth_filesec_free(xsecdst);
9852 	}
9853 	return ciferror;
9854 }
9855 
9856 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9857 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9858 {
9859 	struct vnode_attr va;
9860 
9861 	VATTR_INIT(&va);
9862 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9863 
9864 	return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9865 	           UIO_USERSPACE);
9866 }
9867 
9868 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9869 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9870 {
9871 	struct vnode_attr va;
9872 
9873 	VATTR_INIT(&va);
9874 	VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9875 
9876 	return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9877 	           UIO_USERSPACE);
9878 }
9879 
/*
 * Common backend for rmdir(2)-style directory removal.
 *
 * Looks up 'dirpath' (relative to 'fd' for relative paths) and removes
 * the directory via vn_rmdir().  'unlink_flags' may carry
 * VNODE_REMOVE_DATALESS_DIR, which allows a non-empty dataless directory
 * to be retried with VNOP_REMOVE().  The outer loop restarts the whole
 * operation when orphaned AppleDouble files were cleaned out of the
 * directory or when a compound-VNOP authorization race returns ENOENT.
 *
 * Returns 0 on success, else an errno from lookup, authorization, or
 * the file system.
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated to keep the large nameidata off the kernel stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char     *path = NULL;
	char     *no_firmlink_path = NULL;
	int       len_path = 0;
	int       len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;
	int batched;

	int restart_flag;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Only the kernel context may remove a swap-backing vnode. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete.  Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				/* Non-compound path: authorize here in VFS. */
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: only possible when the FS does compound rmdir. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info  finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Compound case: have the VNOP fill the attributes. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			/* Capture the paths before the entry disappears. */
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Removing an entry writes the parent directory; break its lease. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP asked us to continue the lookup. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * If this fails, we want to keep the original
			 * error.
			 */
			if (vn_remove(dvp, &vp, ndp,
			    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
				error = 0;
			}
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		if (restart_flag == 0) {
			/* 'vp' is used only as a wait channel address here. */
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		/* Brief sleep before restarting; 'vp' is only the channel. */
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10161 
10162 /*
10163  * Remove a directory file.
10164  */
10165 /* ARGSUSED */
10166 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10167 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10168 {
10169 	return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10170 	           CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10171 }
10172 
/*
 * Size of a struct direntry holding a name of 'namlen' bytes, padded to
 * 8 byte alignment (direntry embeds a MAXPATHLEN-sized name field).
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Size of a struct dirent holding a name of 'namelen' bytes (plus NUL),
 * padded to 4 byte alignment.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Address of the last byte of this dirent, per its d_reclen. */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10184 
10185 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10186 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10187     int *numdirent, vfs_context_t ctxp)
10188 {
10189 	/* Check if fs natively supports VNODE_READDIR_EXTENDED */
10190 	if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10191 	    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10192 		return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10193 	} else {
10194 		size_t bufsize;
10195 		void * bufptr;
10196 		uio_t auio;
10197 		struct direntry *entry64;
10198 		struct dirent *dep;
10199 		size_t bytesread;
10200 		int error;
10201 
10202 		/*
10203 		 * We're here because the underlying file system does not
10204 		 * support direnties or we mounted denying support so we must
10205 		 * fall back to dirents and convert them to direntries.
10206 		 *
10207 		 * Our kernel buffer needs to be smaller since re-packing will
10208 		 * expand each dirent.  The worse case (when the name length
10209 		 * is 3 or less) corresponds to a struct direntry size of 32
10210 		 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10211 		 * (4-byte aligned).  So having a buffer that is 3/8 the size
10212 		 * will prevent us from reading more than we can pack.
10213 		 *
10214 		 * Since this buffer is wired memory, we will limit the
10215 		 * buffer size to a maximum of 32K. We would really like to
10216 		 * use 32K in the MIN(), but we use magic number 87371 to
10217 		 * prevent uio_resid() * 3 / 8 from overflowing.
10218 		 */
10219 		bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10220 		bufptr = kalloc_data(bufsize, Z_WAITOK);
10221 		if (bufptr == NULL) {
10222 			return ENOMEM;
10223 		}
10224 
10225 		auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10226 		uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10227 		auio->uio_offset = uio->uio_offset;
10228 
10229 		error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10230 
10231 		dep = (struct dirent *)bufptr;
10232 		bytesread = bufsize - uio_resid(auio);
10233 
10234 		entry64 = kalloc_type(struct direntry, Z_WAITOK);
10235 		/*
10236 		 * Convert all the entries and copy them out to user's buffer.
10237 		 */
10238 		while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10239 			/* First check that the dirent struct up to d_name is within the buffer */
10240 			if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10241 			    /* Check that the length of the entire dirent is within the buffer */
10242 			    DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10243 			    /* Check that the actual length including the name doesn't exceed d_reclen */
10244 			    DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10245 				printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10246 				    vp->v_mount->mnt_vfsstat.f_mntonname,
10247 				    vp->v_name ? vp->v_name : "<unknown>");
10248 				error = EIO;
10249 				break;
10250 			}
10251 
10252 			size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
10253 
10254 			bzero(entry64, enbufsize);
10255 			/* Convert a dirent to a dirent64. */
10256 			entry64->d_ino = dep->d_ino;
10257 			entry64->d_seekoff = 0;
10258 			entry64->d_reclen = (uint16_t)enbufsize;
10259 			entry64->d_namlen = dep->d_namlen;
10260 			entry64->d_type = dep->d_type;
10261 			bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10262 
10263 			/* Move to next entry. */
10264 			dep = (struct dirent *)((char *)dep + dep->d_reclen);
10265 
10266 			/* Copy entry64 to user's buffer. */
10267 			error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10268 		}
10269 
10270 		/* Update the real offset using the offset we got from VNOP_READDIR. */
10271 		if (error == 0) {
10272 			uio->uio_offset = auio->uio_offset;
10273 		}
10274 		uio_free(auio);
10275 		kfree_data(bufptr, bufsize);
10276 		kfree_type(struct direntry, entry64);
10277 		return error;
10278 	}
10279 }
10280 
/* Cap on the caller-supplied buffer size for a single getdirentries call. */
#define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)

/*
 * Read a block of directory entries in a file system independent format.
 *
 * Shared backend for getdirentries()/getdirentries64().  Reads from the
 * directory open on 'fd' at its current file offset, advancing the
 * offset under the fileglob's offset lock.  'flags' may include
 * VNODE_READDIR_EXTENDED to request struct direntry output via
 * vnode_readdir64().  On success, '*bytesread' is the number of bytes
 * produced, '*offset' (if non-NULL) is the offset the read started at,
 * and '*eofflag' reflects end-of-directory.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Take the offset lock, then re-check that the fd still refers to
	 * the same vnode; if it changed underneath us, retry from scratch.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The descriptor must be open for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Clamp, rather than reject, oversized requests. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: drop down to the
	 * lower directory and continue reading from it.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				/* Swap the lower vnode into the file descriptor. */
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10399 
10400 
10401 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10402 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10403 {
10404 	off_t offset;
10405 	ssize_t bytesread;
10406 	int error, eofflag;
10407 
10408 	AUDIT_ARG(fd, uap->fd);
10409 	error = getdirentries_common(uap->fd, uap->buf, uap->count,
10410 	    &bytesread, &offset, &eofflag, 0);
10411 
10412 	if (error == 0) {
10413 		if (proc_is64bit(p)) {
10414 			user64_long_t base = (user64_long_t)offset;
10415 			error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10416 		} else {
10417 			user32_long_t base = (user32_long_t)offset;
10418 			error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10419 		}
10420 		*retval = (int)bytesread;
10421 	}
10422 	return error;
10423 }
10424 
10425 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10426 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10427 {
10428 	off_t offset;
10429 	ssize_t bytesread;
10430 	int error, eofflag;
10431 	user_size_t bufsize;
10432 
10433 	AUDIT_ARG(fd, uap->fd);
10434 
10435 	/*
10436 	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10437 	 * then the kernel carves out the last 4 bytes to return extended
10438 	 * information to userspace (namely whether we reached EOF with this call).
10439 	 */
10440 	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10441 		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10442 	} else {
10443 		bufsize = uap->bufsize;
10444 	}
10445 
10446 	error = getdirentries_common(uap->fd, uap->buf, bufsize,
10447 	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10448 
10449 	if (error == 0) {
10450 		*retval = bytesread;
10451 		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10452 
10453 		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10454 			getdirentries64_flags_t flags = 0;
10455 			if (eofflag) {
10456 				flags |= GETDIRENTRIES64_EOF;
10457 			}
10458 			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10459 			    sizeof(flags));
10460 		}
10461 	}
10462 	return error;
10463 }
10464 
10465 
10466 /*
10467  * Set the mode mask for creation of filesystem nodes.
10468  * XXX implement xsecurity
10469  */
10470 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
10471 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10472 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10473 {
10474 	AUDIT_ARG(mask, newmask);
10475 	proc_fdlock(p);
10476 	*retval = p->p_fd.fd_cmask;
10477 	p->p_fd.fd_cmask = newmask & ALLPERMS;
10478 	proc_fdunlock(p);
10479 	return 0;
10480 }
10481 
10482 /*
10483  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10484  *
10485  * Parameters:    p                       Process requesting to set the umask
10486  *                uap                     User argument descriptor (see below)
10487  *                retval                  umask of the process (parameter p)
10488  *
10489  * Indirect:      uap->newmask            umask to set
10490  *                uap->xsecurity          ACL to set
10491  *
10492  * Returns:        0                      Success
10493  *                !0                      Not success
10494  *
10495  */
int
umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
{
	/*
	 * uap->xsecurity is currently not consumed (xsecurity handling is
	 * unimplemented in umask1(); see the XXX note above), so
	 * KAUTH_FILESEC_NONE is passed down.
	 */
	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
}
10501 
int
umask(proc_t p, struct umask_args *uap, int32_t *retval)
{
	/* Plain umask(2): leave any existing extended security (ACL) alone. */
	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
}
10507 
10508 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT                               \
10509 	"com.apple.private.vfs.revoke-mounted-device"
10510 
10511 /*
10512  * Void all references to file by ripping underlying filesystem
10513  * away from vnode.
10514  */
10515 /* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	/* Resolve the path; holds an iocount on the vnode on success. */
	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* revoke(2) is only supported on character and block special files */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to revoke a block device that currently has a mount on it. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* The caller must own the node or be superuser. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only bother revoking if somebody actually has the node open/aliased. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10568 
10569 
10570 /*
 *  HFS/HFS Plus SPECIFIC SYSTEM CALLS
10572  *  The following system calls are designed to support features
10573  *  which are specific to the HFS & HFS Plus volume formats
10574  */
10575 
10576 
10577 /*
10578  * Obtain attribute information on objects in a directory while enumerating
10579  * the directory.
10580  */
10581 /* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* remember the caller's count so it can be reset when descending a union layer */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Serialize against other offset users, then re-check that the fd
	 * still refers to the vnode we resolved; if the fd's vnode was
	 * swapped underneath us (e.g. by a concurrent union-mount descent
	 * like the one below), drop everything and retry.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* fd must have been opened for reading */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else {                                                // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					/* swap the fd over to the lower layer and restart there */
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* copy results back to the caller: new count, dir state, and base offset */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* return error earlier, a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10745 
10746 /*
10747  * Exchange data between two files
10748  */
10749 
10750 /* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int   flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* look up the first file */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/* look up the second file */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* both files must be readable and writable by the caller */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Only pay the cost of resolving the full paths if somebody will
	 * actually consume them (fsevents or fileop listeners).
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}

		/*
		 * The objects' contents were swapped, so swap their cached
		 * names and parents as well to keep the name cache coherent.
		 */
		name_cache_lock();

		tmpname     = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp           = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10901 
10902 /*
10903  * Return (in MB) the amount of freespace on the given vnode's volume.
10904  */
10905 uint32_t freespace_mb(vnode_t vp);
10906 
10907 uint32_t
freespace_mb(vnode_t vp)10908 freespace_mb(vnode_t vp)
10909 {
10910 	vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10911 	return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10912 	       vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10913 }
10914 
10915 #if CONFIG_SEARCHFS
10916 
10917 /* ARGSUSED */
10918 
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	size_t mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to go into local memory and it's not that big so we might as well put it all together.  */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block.                                                                                             */
	/*												      */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
	/*       assumes the size is still 556 bytes it will continue to work				      */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	/* size is small and bounded by the checks above */
	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 *
	 * NOTE(review): only searchparams1 is validated below; confirm whether
	 * searchparams2 can also carry an ATTR_CMN_NAME attrreference_t.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		/* the referenced string must lie entirely within searchparams1 */
		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * All right, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (uint32_t)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (uint32_t)uap->scriptcode,
	    (uint32_t)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;        // search next layer down
		fserror = EAGAIN;
	}
#endif /* CONFIG_UNION_MOUNTS */

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 *  search state.  Everything was already put into the return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	error = fserror;

freeandexit:

	kfree_data(searchparams1, mallocsize);

	return error;
} /* end of searchfs system call */
11201 
11202 #else /* CONFIG_SEARCHFS */
11203 
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	/* searchfs(2) is unavailable when the kernel is built without CONFIG_SEARCHFS */
	return ENOTSUP;
}
11209 
11210 #endif /* CONFIG_SEARCHFS */
11211 
11212 
11213 #if CONFIG_DATALESS_FILES
11214 
11215 /*
11216  * === Namespace Resolver Up-call Mechanism ===
11217  *
11218  * When I/O is performed to a dataless file or directory (read, write,
11219  * lookup-in, etc.), the file system performs an upcall to the namespace
11220  * resolver (filecoordinationd) to materialize the object.
11221  *
11222  * We need multiple up-calls to be in flight at once, and we need these
11223  * up-calls to be interruptible, thus the following implementation:
11224  *
11225  * => The nspace_resolver_request represents the in-kernel request state.
11226  *    It contains a request ID, storage space for the errno code returned
11227  *    by filecoordinationd, and flags.
11228  *
11229  * => The request ID is simply a global monotonically incrementing 32-bit
11230  *    number.  Outstanding requests are stored in a hash table, and the
11231  *    hash function is extremely simple.
11232  *
11233  * => When an upcall is to be made to filecoordinationd, a request structure
11234  *    is allocated on the stack (it is small, and needs to live only during
11235  *    the duration of the call to resolve_nspace_item_ext()).  It is
11236  *    initialized and inserted into the table.  Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11238  *    can be inserted into the table (and thus limiting the number of
11239  *    outstanding requests issued to filecoordinationd); waiting for an
11240  *    available slot is interruptible.
11241  *
11242  * => Once the request has been inserted into the table, the up-call is made
11243  *    to filecoordinationd via a MiG-generated stub.  The up-call returns
11244  *    immediately and filecoordinationd processes the request asynchronously.
11245  *
 * => The caller now waits for the request to complete.  This is achieved by
11247  *    sleeping on the address of the request structure and waiting for
11248  *    filecoordinationd to mark the request structure as complete.  This
11249  *    is an interruptible sleep call; if interrupted, the request structure
11250  *    is removed from the table and EINTR is returned to the caller.  If
11251  *    this occurs, an advisory up-call is made to filecoordinationd with
11252  *    the request ID to indicate that the request can be aborted or
11253  *    de-prioritized at the discretion of filecoordinationd.
11254  *
11255  * => When filecoordinationd has completed the request, it signals completion
11256  *    by writing to the vfs.nspace.complete sysctl node.  Only a process
11257  *    decorated as a namespace resolver can write to this sysctl node.  The
11258  *    value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11259  *    The request ID is looked up in the table, and if the request is found,
11260  *    the error code is stored in the request structure and a wakeup()
11261  *    issued on the address of the request structure.  If the request is not
11262  *    found, we simply drop the completion notification, assuming that the
11263  *    caller was interrupted.
11264  *
11265  * => When the waiting thread wakes up, it extracts the error code from the
11266  *    request structure, removes the request from the table, and returns the
11267  *    error code to the calling function.  Fini!
11268  */
11269 
/*
 * In-kernel state for one outstanding up-call to the namespace resolver
 * (see the design overview above).  Lives on the caller's stack for the
 * duration of the request and is linked into the request hash table.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash-bucket linkage */
	vnode_t         r_vp;                   /* vnode the request is for */
	uint32_t        r_req_id;               /* unique request ID (hash key) */
	int             r_resolver_error;       /* errno reported back by the resolver */
	int             r_flags;                /* RRF_* flags below */
};

#define RRF_COMPLETE    0x0001  /* resolver has completed this request */
11279 
/*
 * Return the next namespace-resolver request ID.  OSAddAtomic returns
 * the counter's value prior to the addition, so IDs start at 0 and may
 * eventually wrap.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11287 
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/*
 * Table of outstanding requests, keyed by request ID.  All of the
 * state below is protected by nspace_resolver_request_hash_mutex
 * (NSPACE_REQ_LOCK / NSPACE_REQ_UNLOCK).
 */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* number of requests currently in the table (capped at MAX_OUTSTANDING) */
static u_int nspace_resolver_request_count;
/* set when a thread is sleeping for a free request slot */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket (hashmask is a power-of-2 - 1). */
#define NSPACE_RESOLVER_HASH(req_id)    \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	 nspace_resolver_request_hashmask])
11308 
11309 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id)11310 nspace_resolver_req_lookup(uint32_t req_id)
11311 {
11312 	struct nspace_resolver_requesthead *bucket;
11313 	struct nspace_resolver_request *req;
11314 
11315 	bucket = NSPACE_RESOLVER_HASH(req_id);
11316 	LIST_FOREACH(req, bucket, r_hashlink) {
11317 		if (req->r_req_id == req_id) {
11318 			return req;
11319 		}
11320 	}
11321 
11322 	return NULL;
11323 }
11324 
/*
 * Insert req into the request lookup table, taking one of the
 * NSPACE_RESOLVER_MAX_OUTSTANDING slots.
 *
 * Caller must hold NSPACE_REQ_LOCK.  If the table is full, this sleeps
 * interruptibly (msleep drops and re-takes the mutex) until a slot is
 * freed by nspace_resolver_req_remove().  Returns 0 on success or the
 * non-zero msleep error (e.g. EINTR) if the sleep was interrupted.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		/* Tell the next remover that someone needs a wakeup. */
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	return 0;
}
11351 
11352 static void
nspace_resolver_req_remove(struct nspace_resolver_request * req)11353 nspace_resolver_req_remove(struct nspace_resolver_request *req)
11354 {
11355 	struct nspace_resolver_requesthead *bucket;
11356 
11357 	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
11358 #if DIAGNOSTIC
11359 	assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
11360 #endif /* DIAGNOSTIC */
11361 	LIST_REMOVE(req, r_hashlink);
11362 	nspace_resolver_request_count--;
11363 
11364 	if (nspace_resolver_request_wait_slot) {
11365 		nspace_resolver_request_wait_slot = false;
11366 		wakeup(&nspace_resolver_request_count);
11367 	}
11368 }
11369 
11370 static void
nspace_resolver_req_cancel(uint32_t req_id)11371 nspace_resolver_req_cancel(uint32_t req_id)
11372 {
11373 	kern_return_t kr;
11374 	mach_port_t mp;
11375 
11376 	// Failures here aren't fatal -- the cancellation message
11377 	// sent to the resolver is merely advisory.
11378 
11379 	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11380 	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11381 		return;
11382 	}
11383 
11384 	kr = send_nspace_resolve_cancel(mp, req_id);
11385 	if (kr != KERN_SUCCESS) {
11386 		os_log_error(OS_LOG_DEFAULT,
11387 		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11388 	}
11389 
11390 	ipc_port_release_send(mp);
11391 }
11392 
/*
 * Wait (interruptibly) for the resolver to complete req, remove it
 * from the lookup table, and return the resolver's errno.
 *
 * If the sleep fails, r_resolver_error is forced to EINTR (signal) or
 * ETIMEDOUT (any other failure) and an advisory cancellation is sent
 * to the resolver after the lock is dropped.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		/* ERESTART just means "keep waiting". */
		if (error && error != ERESTART) {
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	/* Advisory only; the cancel path ignores its own failures. */
	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11422 
/*
 * Record the resolver's result in req, mark it complete, and wake the
 * thread sleeping on it in nspace_resolver_req_wait().
 * Caller must hold NSPACE_REQ_LOCK.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags |= RRF_COMPLETE;
	wakeup(req);
}
11432 
/*
 * Handle a completion notification (req_id, errno, optional original
 * gencount) written by the resolver via the vfs.nspace.complete
 * sysctl.
 *
 * If the resolver reported success and the caller supplied an
 * orig_gencount, the vnode's current recursive gencount is compared
 * against it while holding the mount rename lock; a mismatch turns
 * the completion into EBUSY for the waiting thread.
 */
static void
nspace_resolver_req_completed(uint32_t req_id, int resolver_error, uint64_t orig_gencount)
{
	struct nspace_resolver_request *req;

	NSPACE_REQ_LOCK();

	// If we don't find the request corresponding to our req_id,
	// just drop the completion signal on the floor; it's likely
	// that the requester interrupted with a signal.

	req = nspace_resolver_req_lookup(req_id);
	if (req) {
		mount_t locked_mp = NULL;

		locked_mp = req->r_vp->v_mount;
		mount_ref(locked_mp, 0);
		mount_lock_renames(locked_mp);

		//
		// if the resolver isn't already returning an error and we have an
		// orig_gencount, then get an iocount on the request vnode and check
		// that the gencount on req->r_vp has not changed.
		//
		// note: a ref was taken on req->r_vp when the request was created
		// and that ref will be dropped by that thread when it wakes up.
		//
		if (resolver_error == 0 &&
		    orig_gencount != 0 &&
		    vnode_getwithref(req->r_vp) == 0) {
			struct vnode_attr va;
			uint64_t cur_gencount;

			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_recursive_gencount);

			// treat an unreadable gencount as 0, which skips the check below
			if (vnode_getattr(req->r_vp, &va, vfs_context_kernel()) == 0) {
				cur_gencount = va.va_recursive_gencount;
			} else {
				cur_gencount = 0;
			}

			if (resolver_error == 0 && cur_gencount && orig_gencount && cur_gencount != orig_gencount) {
				printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n", orig_gencount, cur_gencount);

				// this error will be returned to the thread that initiated the
				// materialization of req->r_vp.
				resolver_error = EBUSY;

				// note: we explicitly do not return an error to the caller (i.e.
				// the thread that did the materialization) because they said they
				// don't want one.
			}

			vnode_put(req->r_vp);
		}

		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);

		nspace_resolver_req_mark_complete(req, resolver_error);
	}

	NSPACE_REQ_UNLOCK();

	return;
}
11500 
/* The process currently registered as the dataless-file resolver, if any. */
static struct proc *nspace_resolver_proc;
11502 
11503 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)11504 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11505 {
11506 	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11507 	    p == nspace_resolver_proc) ? 1 : 0;
11508 	return 0;
11509 }
11510 
/* Entitlement check; defined below with the other dataless helpers. */
static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);

/*
 * Register (is_resolver != 0) or un-register this process as the
 * system-wide dataless file resolver.  Requires uid 0 plus the
 * dataless-resolver entitlement.  Only one resolver may be registered
 * at a time; a second registration fails with EBUSY.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11552 
11553 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11554 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11555 {
11556 	if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11557 	    (p->p_vfs_iopolicy &
11558 	    P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11559 		*is_prevented = 1;
11560 	} else {
11561 		*is_prevented = 0;
11562 	}
11563 	return 0;
11564 }
11565 
/*
 * Set (is_prevented != 0) or clear this process's "do not materialize
 * dataless files" iopolicy bit.
 */
static int
nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
{
	/*
	 * The resolver is always treated as materialization-prevented;
	 * asking to enable materialization for it fails with EBUSY.
	 */
	if (p->p_lflag & P_LNSPACE_RESOLVER) {
		return is_prevented ? 0 : EBUSY;
	}

	if (is_prevented) {
		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
	} else {
		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
	}
	return 0;
}
11580 
11581 static int
nspace_materialization_get_thread_state(int * is_prevented)11582 nspace_materialization_get_thread_state(int *is_prevented)
11583 {
11584 	uthread_t ut = current_uthread();
11585 
11586 	*is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11587 	return 0;
11588 }
11589 
11590 static int
nspace_materialization_set_thread_state(int is_prevented)11591 nspace_materialization_set_thread_state(int is_prevented)
11592 {
11593 	uthread_t ut = current_uthread();
11594 
11595 	if (is_prevented) {
11596 		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11597 	} else {
11598 		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11599 	}
11600 	return 0;
11601 }
11602 
11603 /* the vfs.nspace branch */
11604 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11605 
11606 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11607 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11608     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11609 {
11610 	struct proc *p = req->p;
11611 	int new_value, old_value, changed = 0;
11612 	int error;
11613 
11614 	error = nspace_resolver_get_proc_state(p, &old_value);
11615 	if (error) {
11616 		return error;
11617 	}
11618 
11619 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11620 	    &changed);
11621 	if (error == 0 && changed) {
11622 		error = nspace_resolver_set_proc_state(p, new_value);
11623 	}
11624 	return error;
11625 }
11626 
11627 /* decorate this process as the dataless file resolver */
11628 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
11629     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11630     0, 0, sysctl_nspace_resolver, "I", "");
11631 
11632 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11633 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11634     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11635 {
11636 	struct proc *p = req->p;
11637 	int new_value, old_value, changed = 0;
11638 	int error;
11639 
11640 	error = nspace_materialization_get_proc_state(p, &old_value);
11641 	if (error) {
11642 		return error;
11643 	}
11644 
11645 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11646 	    &changed);
11647 	if (error == 0 && changed) {
11648 		error = nspace_materialization_set_proc_state(p, new_value);
11649 	}
11650 	return error;
11651 }
11652 
11653 /* decorate this process as not wanting to materialize dataless files */
11654 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
11655     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11656     0, 0, sysctl_nspace_prevent_materialization, "I", "");
11657 
11658 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11659 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11660     __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11661 {
11662 	int new_value, old_value, changed = 0;
11663 	int error;
11664 
11665 	error = nspace_materialization_get_thread_state(&old_value);
11666 	if (error) {
11667 		return error;
11668 	}
11669 
11670 	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11671 	    &changed);
11672 	if (error == 0 && changed) {
11673 		error = nspace_materialization_set_thread_state(new_value);
11674 	}
11675 	return error;
11676 }
11677 
11678 /* decorate this thread as not wanting to materialize dataless files */
11679 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
11680     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11681     0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11682 
/*
 * vfs.nspace.complete handler.  Only the registered resolver may use
 * it.  The written payload is a { req_id, errno } pair of uint32_t's,
 * optionally followed by a uint64_t gencount consumed by a second
 * sysctl_io_opaque() call; a missing gencount is treated as 0.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	int error, is_resolver, changed = 0, gencount_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	// get the gencount if it was passed
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &gencount_changed);
	if (error) {
		gencount = 0;
		// we ignore the error because the gencount was optional
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		nspace_resolver_req_completed(req_status[0],
		    (int)req_status[1], gencount);
	}
	return error;
}
11727 
11728 /* Resolver reports completed reqs here. */
11729 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
11730     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11731     0, 0, sysctl_nspace_complete, "-", "");
11732 
11733 #endif /* CONFIG_DATALESS_FILES */
11734 
11735 #if CONFIG_DATALESS_FILES
11736 #define __no_dataless_unused    /* nothing */
11737 #else
11738 #define __no_dataless_unused    __unused
11739 #endif
11740 
/*
 * Decide whether the given context may trigger dataless-file
 * materialization.  Checks are ordered: kernel context, then the
 * dataless-manipulation entitlement, then per-thread decorations,
 * then the process iopolicy.
 *
 * Returns:
 *   0           materialization may proceed
 *   EJUSTRETURN the caller is a dataless manipulator; proceed as if
 *               the object were not dataless
 *   EDEADLK     materialization is prevented (the default)
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11797 
/*
 * Allocate the namespace-resolver request lookup hash table.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11807 
/*
 * Called when process p exits (and when the resolver voluntarily
 * un-registers via the vfs.nspace.resolver sysctl).  If p is the
 * registered resolver, fail every outstanding request with ETIMEDOUT
 * -- the waiting threads unlink their own requests when they wake --
 * and clear the registration.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11833 
/*
 * Materialize the given vnode for operation op; convenience wrapper
 * around resolve_nspace_item_ext() with no extension argument.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
11839 
11840 #define DATALESS_RESOLVER_ENTITLEMENT     \
11841 	"com.apple.private.vfs.dataless-resolver"
11842 #define DATALESS_MANIPULATION_ENTITLEMENT \
11843 	"com.apple.private.vfs.dataless-manipulation"
11844 
11845 #if CONFIG_DATALESS_FILES
11846 /*
11847  * Return TRUE if the vfs context is associated with the dataless
11848  * resolver.
11849  */
11850 static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)11851 vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
11852 {
11853 	return IOTaskHasEntitlement(vfs_context_task(ctx),
11854 	           DATALESS_RESOLVER_ENTITLEMENT);
11855 }
11856 #endif /* CONFIG_DATALESS_FILES */
11857 
11858 /*
11859  * Return TRUE if the vfs context is associated with a process entitled
11860  * for dataless manipulation.
11861  *
11862  * XXX Arguably belongs in vfs_subr.c, but is here because of the
11863  * complication around CONFIG_DATALESS_FILES.
11864  */
11865 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)11866 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
11867 {
11868 #if CONFIG_DATALESS_FILES
11869 	task_t task = vfs_context_task(ctx);
11870 	return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
11871 	       IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
11872 #else
11873 	return false;
11874 #endif /* CONFIG_DATALESS_FILES */
11875 }
11876 
11877 #if CONFIG_DATALESS_FILES
/*
 * Emit a debug log recording that the current process was denied
 * dataless materialization of vnode vp for operation op.  On
 * DEVELOPMENT kernels the vnode's path is included in the message.
 */
static void
log_materialization_prevented(vnode_t vp, uint64_t op)
{
	char p_name[MAXCOMLEN + 1];
	char *vntype;
	proc_selfname(&p_name[0], sizeof(p_name));

	/* Map the vnode type to a short human-readable tag. */
	if (vp->v_type == VREG) {
		vntype = "File";
	} else if (vp->v_type == VDIR) {
		vntype = "Dir";
	} else if (vp->v_type == VLNK) {
		vntype = "SymLink";
	} else {
		vntype = "Other";
	}

#if DEVELOPMENT
	char *path = NULL;
	int   len;

	path = get_pathbuff();
	len = MAXPATHLEN;
	if (path) {
		vn_getpath(vp, path, &len);
	}

	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
	    p_name, proc_selfpid(),
	    op, vntype, path ? path : "<unknown-path>");
	if (path) {
		release_pathbuff(path);
	}
#else
	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
	    p_name, proc_selfpid(),
	    op, vntype);
#endif
}
11919 #endif /* CONFIG_DATALESS_FILES */
11920 
11921 static int
vfs_materialize_item(struct vnode * vp __no_dataless_unused,uint64_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused)11922 vfs_materialize_item(
11923 	struct vnode *vp __no_dataless_unused,
11924 	uint64_t op __no_dataless_unused,
11925 	int64_t offset __no_dataless_unused,
11926 	int64_t size __no_dataless_unused,
11927 	char *lookup_name __no_dataless_unused,
11928 	size_t const namelen __no_dataless_unused)
11929 {
11930 #if CONFIG_DATALESS_FILES
11931 	struct nspace_resolver_request req;
11932 	kern_return_t kern_ret;
11933 	mach_port_t mach_port;
11934 	char *path = NULL;
11935 	vfs_context_t context;
11936 	int path_len;
11937 	int error;
11938 	audit_token_t atoken;
11939 
11940 	/*
11941 	 * If this is a snapshot event and the vnode is on a disk image just
11942 	 * pretend nothing happened since any change to the disk image will
11943 	 * cause the disk image itself to get backed up and this avoids multi-
11944 	 * way deadlocks between the snapshot handler and the ever popular
11945 	 * diskimages-helper process. The variable nspace_allow_virtual_devs
11946 	 * allows this behavior to be overridden (for use by the Mobile
11947 	 * TimeMachine testing infrastructure which uses disk images).
11948 	 */
11949 	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
11950 		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
11951 		return ENOTSUP;
11952 	}
11953 
11954 	context = vfs_context_current();
11955 
11956 	error = vfs_context_dataless_materialization_is_prevented(context);
11957 	if (error) {
11958 		log_materialization_prevented(vp, op);
11959 		return error;
11960 	}
11961 
11962 	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
11963 	    &mach_port);
11964 	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
11965 		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
11966 		/*
11967 		 * Treat this like being unable to access the backing store
11968 		 * server.
11969 		 */
11970 		return ETIMEDOUT;
11971 	}
11972 
11973 	path = zalloc(ZV_NAMEI);
11974 	path_len = MAXPATHLEN;
11975 
11976 	error = vn_getpath(vp, path, &path_len);
11977 	if (error) {
11978 		goto out_release_port;
11979 	}
11980 
11981 	error = vfs_context_copy_audit_token(context, &atoken);
11982 	if (error) {
11983 		goto out_release_port;
11984 	}
11985 
11986 	req.r_req_id = next_nspace_req_id();
11987 	req.r_resolver_error = 0;
11988 	req.r_flags = 0;
11989 	req.r_vp = vp;
11990 
11991 	NSPACE_REQ_LOCK();
11992 	error = nspace_resolver_req_add(&req);
11993 	NSPACE_REQ_UNLOCK();
11994 	if (error) {
11995 		goto out_release_port;
11996 	}
11997 
11998 	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
11999 	if (vp->v_type == VDIR) {
12000 		char *tmpname = NULL;
12001 
12002 		/*
12003 		 * If the caller provided a lookup_name *and* a name length,
12004 		 * then we assume the lookup_name is not NUL-terminated.
12005 		 * Allocate a temporary buffer in this case to provide
12006 		 * a NUL-terminated path name to the IPC call.
12007 		 */
12008 		if (lookup_name != NULL && namelen != 0) {
12009 			if (namelen >= PATH_MAX) {
12010 				error = EINVAL;
12011 				goto out_release_port;
12012 			}
12013 			tmpname = zalloc(ZV_NAMEI);
12014 			strlcpy(tmpname, lookup_name, namelen + 1);
12015 			lookup_name = tmpname;
12016 		} else if (lookup_name != NULL) {
12017 			/*
12018 			 * If the caller provided a lookup_name with a
12019 			 * zero name length, then we assume it's NUL-
12020 			 * terminated.  Verify it has a valid length.
12021 			 */
12022 			if (strlen(lookup_name) >= PATH_MAX) {
12023 				error = EINVAL;
12024 				goto out_release_port;
12025 			}
12026 		}
12027 
12028 		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12029 		    req.r_req_id, (uint32_t)(op & 0xffffffff),
12030 		    lookup_name == NULL ? "" : lookup_name, path, atoken);
12031 
12032 		if (tmpname != NULL) {
12033 			zfree(ZV_NAMEI, tmpname);
12034 
12035 			/*
12036 			 * Poison lookup_name rather than reference
12037 			 * freed memory.
12038 			 */
12039 			lookup_name = NULL;
12040 		}
12041 	} else {
12042 		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12043 		    req.r_req_id, (uint32_t)(op & 0xffffffff),
12044 		    offset, size, path, atoken);
12045 	}
12046 	if (kern_ret != KERN_SUCCESS) {
12047 		/*
12048 		 * Also treat this like being unable to access the backing
12049 		 * store server.
12050 		 */
12051 		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12052 		    kern_ret);
12053 		error = ETIMEDOUT;
12054 
12055 		NSPACE_REQ_LOCK();
12056 		nspace_resolver_req_remove(&req);
12057 		NSPACE_REQ_UNLOCK();
12058 		goto out_release_port;
12059 	}
12060 
12061 	/*
12062 	 * Give back the memory we allocated earlier while we wait; we
12063 	 * no longer need it.
12064 	 */
12065 	zfree(ZV_NAMEI, path);
12066 	path = NULL;
12067 
12068 	/*
12069 	 * Request has been submitted to the resolver. Now (interruptibly)
12070 	 * wait for completion. Upon requrn, the request will have been
12071 	 * removed from the lookup table.
12072 	 */
12073 	error = nspace_resolver_req_wait(&req);
12074 
12075 out_release_port:
12076 	if (path != NULL) {
12077 		zfree(ZV_NAMEI, path);
12078 	}
12079 	ipc_port_release_send(mach_port);
12080 
12081 	return error;
12082 #else
12083 	return ENOTSUP;
12084 #endif /* CONFIG_DATALESS_FILES */
12085 }
12086 
12087 /*
12088  * vfs_materialize_file: Materialize a regular file.
12089  *
12090  * Inputs:
12091  * vp		The dataless file to be materialized.
12092  *
12093  * op		What kind of operation is being performed:
12094  *		-> NAMESPACE_HANDLER_READ_OP
12095  *		-> NAMESPACE_HANDLER_WRITE_OP
12096  *		-> NAMESPACE_HANDLER_LINK_CREATE
12097  *		-> NAMESPACE_HANDLER_DELETE_OP
12098  *		-> NAMESPACE_HANDLER_TRUNCATE_OP
12099  *		-> NAMESPACE_HANDLER_RENAME_OP
12100  *
12101  * offset	offset of I/O for READ or WRITE.  Ignored for
12102  *		other ops.
12103  *
12104  * size		size of I/O for READ or WRITE  Ignored for
12105  *		other ops.
12106  *
 * If offset or size are -1 for a READ or WRITE, then the resolver should
12108  * consider the range to be unknown.
12109  *
12110  * Upon successful return, the caller may proceed with the operation.
12111  * N.B. the file may still be "dataless" in this case.
12112  */
12113 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12114 vfs_materialize_file(
12115 	struct vnode *vp,
12116 	uint64_t op,
12117 	int64_t offset,
12118 	int64_t size)
12119 {
12120 	if (vp->v_type != VREG) {
12121 		return EFTYPE;
12122 	}
12123 	return vfs_materialize_item(vp, op, offset, size, NULL, 0);
12124 }
12125 
12126 /*
12127  * vfs_materialize_dir:
12128  *
12129  * Inputs:
12130  * vp		The dataless directory to be materialized.
12131  *
12132  * op		What kind of operation is being performed:
12133  *		-> NAMESPACE_HANDLER_READ_OP
12134  *		-> NAMESPACE_HANDLER_WRITE_OP
12135  *		-> NAMESPACE_HANDLER_DELETE_OP
12136  *		-> NAMESPACE_HANDLER_RENAME_OP
12137  *		-> NAMESPACE_HANDLER_LOOKUP_OP
12138  *
12139  * lookup_name	Name being looked up for a LOOKUP op.  Ignored for
12140  *		other ops.  May or may not be NUL-terminated; see below.
12141  *
12142  * namelen	If non-zero, then lookup_name is assumed to not be NUL-
12143  *		terminated and namelen is the number of valid bytes in
12144  *		lookup_name. If zero, then lookup_name is assumed to be
12145  *		NUL-terminated.
12146  *
12147  * Upon successful return, the caller may proceed with the operation.
12148  * N.B. the directory may still be "dataless" in this case.
12149  */
12150 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12151 vfs_materialize_dir(
12152 	struct vnode *vp,
12153 	uint64_t op,
12154 	char *lookup_name,
12155 	size_t namelen)
12156 {
12157 	if (vp->v_type != VDIR) {
12158 		return EFTYPE;
12159 	}
12160 	if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12161 		return EINVAL;
12162 	}
12163 	return vfs_materialize_item(vp, op, 0, 0, lookup_name, namelen);
12164 }
12165 
/*
 * Ask the dataless-file resolver (filecoordinationd) to materialize vp.
 * Sends the vnode's path and the namespace-handler op over MIG, then
 * blocks (interruptibly) until the resolver replies or the wait is
 * interrupted.
 *
 * Returns 0 on success, EFTYPE for unsupported vnode types, ENOTSUP for
 * snapshot events, ETIMEDOUT when the resolver port cannot be reached,
 * or the error reported by the resolver / materialization policy.
 */
int
resolve_nspace_item_ext(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	void *arg __unused)
{
#if CONFIG_DATALESS_FILES
	int error;
	mach_port_t mp;
	char *path = NULL;
	int path_len;
	kern_return_t kr;
	struct nspace_resolver_request req;

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return EFTYPE;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process.  the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	// NOTE: snapshot events are currently rejected outright here.
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	// fail fast if the current context forbids materialization.
	error = vfs_context_dataless_materialization_is_prevented(
		vfs_context_current());
	if (error) {
		log_materialization_prevented(vp, op);
		return error;
	}

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		// Treat this like being unable to access the backing
		// store server.
		return ETIMEDOUT;
	}

	path = zalloc(ZV_NAMEI);
	path_len = MAXPATHLEN;

	error = vn_getpath(vp, path, &path_len);
	if (error == 0) {
		int xxx_rdar44371223;   /* XXX Mig bug */
		req.r_req_id = next_nspace_req_id();
		req.r_resolver_error = 0;
		req.r_flags = 0;

		if ((error = vnode_ref(vp)) == 0) {     // take a ref so that the vnode doesn't go away
			req.r_vp = vp;
		} else {
			goto out_release_port;
		}

		// publish the request so the resolver's reply can find it.
		NSPACE_REQ_LOCK();
		error = nspace_resolver_req_add(&req);
		NSPACE_REQ_UNLOCK();
		if (error) {
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
		kr = send_nspace_resolve_path(mp, req.r_req_id,
		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
		    path, &xxx_rdar44371223);
		if (kr != KERN_SUCCESS) {
			// Also treat this like being unable to access
			// the backing store server.
			os_log_error(OS_LOG_DEFAULT,
			    "NSPACE resolve_path failure: %d", kr);
			error = ETIMEDOUT;

			NSPACE_REQ_LOCK();
			nspace_resolver_req_remove(&req);
			NSPACE_REQ_UNLOCK();
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		// Give back the memory we allocated earlier while
		// we wait; we no longer need it.
		zfree(ZV_NAMEI, path);
		path = NULL;

		// Request has been submitted to the resolver.
		// Now (interruptibly) wait for completion.
		// Upon return, the request will have been removed
		// from the lookup table.
		error = nspace_resolver_req_wait(&req);

		vnode_rele(req.r_vp);
	}

out_release_port:
	if (path != NULL) {
		zfree(ZV_NAMEI, path);
	}
	ipc_port_release_send(mp);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
12283 
12284 int
nspace_snapshot_event(__unused vnode_t vp,__unused time_t ctime,__unused uint64_t op_type,__unused void * arg)12285 nspace_snapshot_event(__unused vnode_t vp, __unused  time_t ctime,
12286     __unused uint64_t op_type, __unused void *arg)
12287 {
12288 	return 0;
12289 }
12290 
#if 0
/*
 * NOTE(review): compiled out (#if 0) — retained for reference only.
 * Formats a "/.vol/<fsid>/<fileid>" style path for vp into 'path'.
 * On success returns 0; if vnode_getattr() fails, writes a placeholder
 * path and returns -1.  In both cases *len is set to the formatted
 * length including the NUL terminator (hence the snprintf() + 1).
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	/* Ask the FS for the fsid/fileid pair that names the volfs path. */
	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12313 
12314 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12315 fsctl_bogus_command_compat(unsigned long cmd)
12316 {
12317 	switch (cmd) {
12318 	case IOCBASECMD(FSIOC_SYNC_VOLUME):
12319 		return FSIOC_SYNC_VOLUME;
12320 	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12321 		return FSIOC_ROUTEFS_SETROUTEID;
12322 	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12323 		return FSIOC_SET_PACKAGE_EXTS;
12324 	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12325 		return FSIOC_SET_FSTYPENAME_OVERRIDE;
12326 	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12327 		return DISK_CONDITIONER_IOC_GET;
12328 	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12329 		return DISK_CONDITIONER_IOC_SET;
12330 	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12331 		return FSIOC_FIOSEEKHOLE;
12332 	case IOCBASECMD(FSIOC_FIOSEEKDATA):
12333 		return FSIOC_FIOSEEKDATA;
12334 	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12335 		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12336 	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12337 		return SPOTLIGHT_IOC_GET_LAST_MTIME;
12338 	}
12339 
12340 	return cmd;
12341 }
12342 
12343 static int
cas_bsdflags_setattr(vnode_t vp,void * arg,vfs_context_t ctx)12344 cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
12345 {
12346 	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
12347 }
12348 
12349 static int __attribute__((noinline))
handle_sync_volume(vnode_t vp,vnode_t * arg_vp,caddr_t data,vfs_context_t ctx)12350 handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
12351 {
12352 	struct vfs_attr vfa;
12353 	mount_t mp = vp->v_mount;
12354 	unsigned arg;
12355 	int error;
12356 
12357 	/* record vid of vp so we can drop it below. */
12358 	uint32_t vvid = vp->v_id;
12359 
12360 	/*
12361 	 * Then grab mount_iterref so that we can release the vnode.
12362 	 * Without this, a thread may call vnode_iterate_prepare then
12363 	 * get into a deadlock because we've never released the root vp
12364 	 */
12365 	error = mount_iterref(mp, 0);
12366 	if (error) {
12367 		return error;
12368 	}
12369 	vnode_hold(vp);
12370 	vnode_put(vp);
12371 
12372 	arg = MNT_NOWAIT;
12373 	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
12374 		arg = MNT_WAIT;
12375 	}
12376 
12377 	/*
12378 	 * If the filessytem supports multiple filesytems in a
12379 	 * partition (For eg APFS volumes in a container, it knows
12380 	 * that the waitfor argument to VFS_SYNC are flags.
12381 	 */
12382 	VFSATTR_INIT(&vfa);
12383 	VFSATTR_WANTED(&vfa, f_capabilities);
12384 	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
12385 	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
12386 	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
12387 	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
12388 		arg |= MNT_VOLUME;
12389 	}
12390 
12391 	/* issue the sync for this volume */
12392 	(void)sync_callback(mp, &arg);
12393 
12394 	/*
12395 	 * Then release the mount_iterref once we're done syncing; it's not
12396 	 * needed for the VNOP_IOCTL below
12397 	 */
12398 	mount_iterdrop(mp);
12399 
12400 	if (arg & FSCTL_SYNC_FULLSYNC) {
12401 		/* re-obtain vnode iocount on the root vp, if possible */
12402 		error = vnode_getwithvid(vp, vvid);
12403 		if (error == 0) {
12404 			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
12405 			vnode_put(vp);
12406 		}
12407 	}
12408 	vnode_drop(vp);
12409 	/* mark the argument VP as having been released */
12410 	*arg_vp = NULL;
12411 	return error;
12412 }
12413 
#if ROUTEFS
/*
 * FSIOC_ROUTEFS_SETROUTEID handler: copy in a mount-point path from
 * user space and plumb routefs there.  Superuser only.
 */
static int __attribute__((noinline))
handle_routes(user_addr_t udata)
{
	char mountpath[MAXPATHLEN];
	size_t copied = 0;
	int err;

	err = suser(kauth_cred_get(), &(current_proc()->p_acflag));
	if (err != 0) {
		return err;
	}

	bzero(mountpath, MAXPATHLEN);
	err = copyinstr(udata, &mountpath[0], MAXPATHLEN, &copied);
	if (err != 0) {
		return err;
	}

	return routefs_kernel_mount(mountpath);
}
#endif
12434 
12435 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12436 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12437 {
12438 	struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12439 	struct vnode_attr va;
12440 	int error;
12441 
12442 	VATTR_INIT(&va);
12443 	VATTR_SET(&va, va_flags, cas->new_flags);
12444 
12445 	error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12446 
12447 #if CONFIG_FSE
12448 	if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12449 		add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12450 	}
12451 #endif
12452 
12453 	return error;
12454 }
12455 
12456 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12457 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12458 {
12459 	struct mount *mp = NULL;
12460 	errno_t rootauth = 0;
12461 
12462 	mp = vp->v_mount;
12463 
12464 	/*
12465 	 * query the underlying FS and see if it reports something
12466 	 * sane for this vnode. If volume is authenticated via
12467 	 * chunklist, leave that for the caller to determine.
12468 	 */
12469 	rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12470 
12471 	return rootauth;
12472 }
12473 
12474 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12475 	"com.apple.private.kernel.set-package-extensions"
12476 
12477 /*
12478  * Make a filesystem-specific control call:
12479  */
12480 /* ARGSUSED */
/*
 * Guts of the fsctl()/ffsctl() system calls: marshal the ioctl-style
 * argument into/out of the kernel, handle the generic FSIOC_* commands
 * inline or via the handle_* helpers, and pass everything else to the
 * filesystem through VNOP_IOCTL().
 *
 * On entry *arg_vp carries an iocount held by the caller.  A handler
 * may drop that iocount (FSIOC_SYNC_VOLUME does); in that case *arg_vp
 * is set to NULL so the caller knows not to vnode_put() it again.
 */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* fsctl is not for devices; those take ioctl(2). */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* Map legacy IOCBASECMD()-stripped command values back to real ones. */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Small arguments live in stkbuf; larger ones are heap-allocated. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	/*
	 * Marshal the user argument: copy it in for IOC_IN commands with a
	 * size; for size-less IOC_IN and for IOC_VOID, stash the raw
	 * argument word itself in the buffer.  IOC_OUT buffers are
	 * pre-zeroed so the user always gets deterministic bytes back.
	 */
	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* NOTE: may drop the caller's iocount and NULL out *arg_vp. */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t    num_entries;
		uint32_t    max_width;

		/* Entitled callers only: this mutates global kernel state. */
		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct.  otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width   = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width   = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if (vp->v_mount) {
			mount_lock(vp->v_mount);
			if (data[0] != 0) {
				int i;
				/* Require a NUL within the first MFSTYPENAMELEN bytes. */
				for (i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data string which has no
				 * NULL termination in its first MFSTYPENAMELEN bytes.
				 * This is bogus, let's avoid strlcpy-ing the read data and
				 * return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
				}
			} else {
				/* An empty string clears any existing override. */
				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
				}
				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
				vp->v_mount->fstypename_override[0] = '\0';
			}
unlock:
			mount_unlock(vp->v_mount);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/* EBUSY if anyone besides the caller has the file in use. */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
		break;

	default: {
		/* other, known commands shouldn't be passed down here */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
12727 
12728 /* ARGSUSED */
12729 int
fsctl(proc_t p,struct fsctl_args * uap,__unused int32_t * retval)12730 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
12731 {
12732 	int error;
12733 	struct nameidata nd;
12734 	uint32_t nameiflags;
12735 	vnode_t vp = NULL;
12736 	vfs_context_t ctx = vfs_context_current();
12737 
12738 	AUDIT_ARG(cmd, (int)uap->cmd);
12739 	AUDIT_ARG(value32, uap->options);
12740 	/* Get the vnode for the file we are getting info on:  */
12741 	nameiflags = 0;
12742 	//
12743 	// if we come through fsctl() then the file is by definition not open.
12744 	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
12745 	// lest the caller mistakenly thinks the only open is their own (but in
12746 	// reality it's someone elses).
12747 	//
12748 	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
12749 		return EINVAL;
12750 	}
12751 	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
12752 		nameiflags |= FOLLOW;
12753 	}
12754 	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
12755 		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
12756 	}
12757 	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
12758 	    UIO_USERSPACE, uap->path, ctx);
12759 	if ((error = namei(&nd))) {
12760 		goto done;
12761 	}
12762 	vp = nd.ni_vp;
12763 	nameidone(&nd);
12764 
12765 #if CONFIG_MACF
12766 	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
12767 	if (error) {
12768 		goto done;
12769 	}
12770 #endif
12771 
12772 	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12773 
12774 done:
12775 	if (vp) {
12776 		vnode_put(vp);
12777 	}
12778 	return error;
12779 }
12780 /* ARGSUSED */
12781 int
ffsctl(proc_t p,struct ffsctl_args * uap,__unused int32_t * retval)12782 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
12783 {
12784 	int error;
12785 	vnode_t vp = NULL;
12786 	vfs_context_t ctx = vfs_context_current();
12787 	int fd = -1;
12788 
12789 	AUDIT_ARG(fd, uap->fd);
12790 	AUDIT_ARG(cmd, (int)uap->cmd);
12791 	AUDIT_ARG(value32, uap->options);
12792 
12793 	/* Get the vnode for the file we are getting info on:  */
12794 	if ((error = file_vnode(uap->fd, &vp))) {
12795 		return error;
12796 	}
12797 	fd = uap->fd;
12798 	if ((error = vnode_getwithref(vp))) {
12799 		file_drop(fd);
12800 		return error;
12801 	}
12802 
12803 #if CONFIG_MACF
12804 	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
12805 		file_drop(fd);
12806 		vnode_put(vp);
12807 		return error;
12808 	}
12809 #endif
12810 
12811 	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12812 
12813 	file_drop(fd);
12814 
12815 	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
12816 	if (vp) {
12817 		vnode_put(vp);
12818 	}
12819 
12820 	return error;
12821 }
12822 /* end of fsctl system call */
12823 
12824 #define FILESEC_ACCESS_ENTITLEMENT              \
12825 	"com.apple.private.vfs.filesec-access"
12826 
12827 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)12828 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
12829 {
12830 	if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
12831 		/*
12832 		 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
12833 		 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
12834 		 */
12835 		if ((!setting && vfs_context_issuser(ctx)) ||
12836 		    IOTaskHasEntitlement(vfs_context_task(ctx),
12837 		    FILESEC_ACCESS_ENTITLEMENT)) {
12838 			return 0;
12839 		}
12840 	}
12841 
12842 	return EPERM;
12843 }
12844 
12845 /*
12846  *  Retrieve the data of an extended attribute.
12847  */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* These options are reserved for kernel-internal interfaces. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attributes are readable only by root or entitled tasks. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned...  in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app.  we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* Clamp the size to bound the filesystem's wired allocation. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	/* With auio == NULL, vn_getxattr() only reports the attribute size. */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* Bytes transferred, or the attribute's size when no buffer was given. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
12930 
12931 /*
12932  * Retrieve the data of an extended attribute.
12933  */
12934 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)12935 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
12936 {
12937 	vnode_t vp;
12938 	char attrname[XATTR_MAXNAMELEN + 1];
12939 	vfs_context_t ctx = vfs_context_current();
12940 	uio_t auio = NULL;
12941 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12942 	size_t attrsize = 0;
12943 	size_t namelen;
12944 	int error;
12945 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12946 
12947 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12948 		return EINVAL;
12949 	}
12950 
12951 	if ((error = file_vnode(uap->fd, &vp))) {
12952 		return error;
12953 	}
12954 	if ((error = vnode_getwithref(vp))) {
12955 		file_drop(uap->fd);
12956 		return error;
12957 	}
12958 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12959 	if (error != 0) {
12960 		goto out;
12961 	}
12962 	if (xattr_protected(attrname) &&
12963 	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
12964 		goto out;
12965 	}
12966 	if (uap->value && uap->size > 0) {
12967 		if (uap->size > (size_t)XATTR_MAXSIZE) {
12968 			uap->size = XATTR_MAXSIZE;
12969 		}
12970 
12971 		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
12972 		    &uio_buf[0], sizeof(uio_buf));
12973 		uio_addiov(auio, uap->value, uap->size);
12974 	}
12975 
12976 	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
12977 out:
12978 	(void)vnode_put(vp);
12979 	file_drop(uap->fd);
12980 
12981 	if (auio) {
12982 		*retval = uap->size - uio_resid(auio);
12983 	} else {
12984 		*retval = (user_ssize_t)attrsize;
12985 	}
12986 	return error;
12987 }
12988 
/*
 * Heap-allocated scratch state for setxattr(): keeps the large
 * nameidata, the attribute name, and the uio buffer off the kernel
 * stack for the duration of the call.
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
};
12995 
12996 /*
12997  * Set the data of an extended attribute.
12998  */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	/* These options are reserved for kernel-internal interfaces. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* Heap-allocate the bulky lookup state to keep it off the stack. */
	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* Protected attributes are writable only by entitled tasks. */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Also grab the parent so any directory lease can be broken below. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	/* Break the parent's lease, then drop the parent's iocount. */
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13075 
13076 /*
13077  * Set the data of an extended attribute.
13078  */
13079 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13080 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13081 {
13082 	vnode_t vp;
13083 	char attrname[XATTR_MAXNAMELEN + 1];
13084 	vfs_context_t ctx = vfs_context_current();
13085 	uio_t auio = NULL;
13086 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13087 	size_t namelen;
13088 	int error;
13089 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13090 
13091 	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13092 		return EINVAL;
13093 	}
13094 
13095 	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13096 	if (error != 0) {
13097 		if (error == EPERM) {
13098 			/* if the string won't fit in attrname, copyinstr emits EPERM */
13099 			return ENAMETOOLONG;
13100 		}
13101 		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13102 		return error;
13103 	}
13104 	if (xattr_protected(attrname) &&
13105 	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13106 		return error;
13107 	}
13108 	if (uap->size != 0 && uap->value == 0) {
13109 		return EINVAL;
13110 	}
13111 	if (uap->size > INT_MAX) {
13112 		return E2BIG;
13113 	}
13114 	if ((error = file_vnode(uap->fd, &vp))) {
13115 		return error;
13116 	}
13117 	if ((error = vnode_getwithref(vp))) {
13118 		file_drop(uap->fd);
13119 		return error;
13120 	}
13121 
13122 #if CONFIG_FILE_LEASES
13123 	vnode_breakdirlease(vp, true, O_WRONLY);
13124 #endif
13125 
13126 	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13127 	    &uio_buf[0], sizeof(uio_buf));
13128 	uio_addiov(auio, uap->value, uap->size);
13129 
13130 	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13131 #if CONFIG_FSE
13132 	if (error == 0) {
13133 		add_fsevent(FSE_XATTR_MODIFIED, ctx,
13134 		    FSE_ARG_VNODE, vp,
13135 		    FSE_ARG_DONE);
13136 	}
13137 #endif
13138 	vnode_put(vp);
13139 	file_drop(uap->fd);
13140 	*retval = 0;
13141 	return error;
13142 }
13143 
13144 /*
13145  * Remove an extended attribute.
13146  * XXX Code duplication here.
13147  */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* These options are reserved for kernel-internal interfaces. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* System-protected attributes can never be removed through here. */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Also grab the parent so any directory lease can be broken below. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
#if CONFIG_FILE_LEASES
	/* Break the parent's lease, then drop the parent's iocount. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
13198 
13199 /*
13200  * Remove an extended attribute.
13201  * XXX Code duplication here.
13202  */
int
fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	size_t namelen;
	int error;
#if CONFIG_FSE
	vfs_context_t ctx = vfs_context_current();
#endif

	/*
	 * XATTR_NOFOLLOW makes no sense for an fd-based call (there is no
	 * path to traverse); the other two options are not valid here either.
	 */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* Copy in the attribute name (NUL-terminated, bounded). */
	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* System-reserved attributes may not be removed through this path. */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	/* Translate fd to vnode; file_drop() must balance this on all paths. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	/* Take an iocount on the vnode before operating on it. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any parent-directory lease before modifying the file. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
#if CONFIG_FSE
	/* Notify fsevents listeners of the successful removal. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
13250 
13251 /*
13252  * Retrieve the list of extended attribute names.
13253  * XXX Code duplication here.
13254  */
13255 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)13256 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
13257 {
13258 	vnode_t vp;
13259 	struct nameidata nd;
13260 	vfs_context_t ctx = vfs_context_current();
13261 	uio_t auio = NULL;
13262 	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13263 	size_t attrsize = 0;
13264 	u_int32_t nameiflags;
13265 	int error;
13266 	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13267 
13268 	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13269 		return EINVAL;
13270 	}
13271 
13272 	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13273 	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
13274 	if ((error = namei(&nd))) {
13275 		return error;
13276 	}
13277 	vp = nd.ni_vp;
13278 	nameidone(&nd);
13279 	if (uap->namebuf != 0 && uap->bufsize > 0) {
13280 		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
13281 		    &uio_buf[0], sizeof(uio_buf));
13282 		uio_addiov(auio, uap->namebuf, uap->bufsize);
13283 	}
13284 
13285 	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
13286 
13287 	vnode_put(vp);
13288 	if (auio) {
13289 		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13290 	} else {
13291 		*retval = (user_ssize_t)attrsize;
13292 	}
13293 	return error;
13294 }
13295 
13296 /*
13297  * Retrieve the list of extended attribute names.
13298  * XXX Code duplication here.
13299  */
int
flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	uio_t auio = NULL;
	/* Address width of the caller, needed to build the user-space uio. */
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	int error;
	/* Stack backing for a single-iovec uio. */
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* XATTR_NOFOLLOW is meaningless for an fd (no path traversal). */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* Translate fd to vnode; file_drop() must balance this on all paths. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	/* Take an iocount on the vnode before operating on it. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	/* With no (or empty) buffer, only the required size is computed. */
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(1, 0, spacetype,
		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->namebuf, uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());

	vnode_put(vp);
	file_drop(uap->fd);
	/* Report bytes written when a buffer was given, else total size. */
	if (auio) {
		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13338 
13339 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13340 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13341     vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13342 {
13343 	int error;
13344 	struct mount *mp = NULL;
13345 	vnode_t vp;
13346 	int length;
13347 	int bpflags;
13348 	/* maximum number of times to retry build_path */
13349 	unsigned int retries = 0x10;
13350 
13351 	if (bufsize > PAGE_SIZE) {
13352 		return EINVAL;
13353 	}
13354 
13355 	if (buf == NULL) {
13356 		return ENOMEM;
13357 	}
13358 
13359 retry:
13360 	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13361 		error = ENOTSUP;  /* unexpected failure */
13362 		return ENOTSUP;
13363 	}
13364 
13365 #if CONFIG_UNION_MOUNTS
13366 unionget:
13367 #endif /* CONFIG_UNION_MOUNTS */
13368 	if (objid == 2) {
13369 		struct vfs_attr vfsattr;
13370 		int use_vfs_root = TRUE;
13371 
13372 		VFSATTR_INIT(&vfsattr);
13373 		VFSATTR_WANTED(&vfsattr, f_capabilities);
13374 		if (!(options & FSOPT_ISREALFSID) &&
13375 		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13376 		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13377 			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13378 			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13379 				use_vfs_root = FALSE;
13380 			}
13381 		}
13382 
13383 		if (use_vfs_root) {
13384 			error = VFS_ROOT(mp, &vp, ctx);
13385 		} else {
13386 			error = VFS_VGET(mp, objid, &vp, ctx);
13387 		}
13388 	} else {
13389 		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13390 	}
13391 
13392 #if CONFIG_UNION_MOUNTS
13393 	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13394 		/*
13395 		 * If the fileid isn't found and we're in a union
13396 		 * mount volume, then see if the fileid is in the
13397 		 * mounted-on volume.
13398 		 */
13399 		struct mount *tmp = mp;
13400 		mp = vnode_mount(tmp->mnt_vnodecovered);
13401 		vfs_unbusy(tmp);
13402 		if (vfs_busy(mp, LK_NOWAIT) == 0) {
13403 			goto unionget;
13404 		}
13405 	} else {
13406 		vfs_unbusy(mp);
13407 	}
13408 #else
13409 	vfs_unbusy(mp);
13410 #endif /* CONFIG_UNION_MOUNTS */
13411 
13412 	if (error) {
13413 		return error;
13414 	}
13415 
13416 #if CONFIG_MACF
13417 	error = mac_vnode_check_fsgetpath(ctx, vp);
13418 	if (error) {
13419 		vnode_put(vp);
13420 		return error;
13421 	}
13422 #endif
13423 
13424 	/* Obtain the absolute path to this vnode. */
13425 	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13426 	if (options & FSOPT_NOFIRMLINKPATH) {
13427 		bpflags |= BUILDPATH_NO_FIRMLINK;
13428 	}
13429 	bpflags |= BUILDPATH_CHECK_MOVED;
13430 	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13431 	vnode_put(vp);
13432 
13433 	if (error) {
13434 		/* there was a race building the path, try a few more times */
13435 		if (error == EAGAIN) {
13436 			--retries;
13437 			if (retries > 0) {
13438 				goto retry;
13439 			}
13440 
13441 			error = ENOENT;
13442 		}
13443 		goto out;
13444 	}
13445 
13446 	AUDIT_ARG(text, buf);
13447 
13448 	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13449 		unsigned long path_words[NUMPARMS];
13450 		size_t path_len = sizeof(path_words);
13451 
13452 		if ((size_t)length < path_len) {
13453 			memcpy((char *)path_words, buf, length);
13454 			memset((char *)path_words + length, 0, path_len - length);
13455 
13456 			path_len = length;
13457 		} else {
13458 			memcpy((char *)path_words, buf + (length - path_len), path_len);
13459 		}
13460 
13461 		kdebug_vfs_lookup(path_words, (int)path_len, vp,
13462 		    KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13463 	}
13464 
13465 	*pathlen = length; /* may be superseded by error */
13466 
13467 out:
13468 	return error;
13469 }
13470 
13471 /*
13472  * Obtain the full pathname of a file system object by id.
13473  */
13474 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)13475 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
13476     uint32_t options, user_ssize_t *retval)
13477 {
13478 	vfs_context_t ctx = vfs_context_current();
13479 	fsid_t fsid;
13480 	char *realpath;
13481 	int length;
13482 	int error;
13483 
13484 	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
13485 		return EINVAL;
13486 	}
13487 
13488 	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
13489 		return error;
13490 	}
13491 	AUDIT_ARG(value32, fsid.val[0]);
13492 	AUDIT_ARG(value64, objid);
13493 	/* Restrict output buffer size for now. */
13494 
13495 	if (bufsize > PAGE_SIZE || bufsize <= 0) {
13496 		return EINVAL;
13497 	}
13498 	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
13499 	if (realpath == NULL) {
13500 		return ENOMEM;
13501 	}
13502 
13503 	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
13504 	    options, &length);
13505 
13506 	if (error) {
13507 		goto out;
13508 	}
13509 
13510 	error = copyout((caddr_t)realpath, buf, length);
13511 
13512 	*retval = (user_ssize_t)length; /* may be superseded by error */
13513 out:
13514 	kfree_data(realpath, bufsize);
13515 	return error;
13516 }
13517 
13518 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)13519 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
13520 {
13521 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13522 	           0, retval);
13523 }
13524 
13525 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)13526 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
13527 {
13528 	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13529 	           uap->options, retval);
13530 }
13531 
13532 /*
13533  * Common routine to handle various flavors of statfs data heading out
13534  *	to user space.
13535  *
13536  * Returns:	0			Success
13537  *		EFAULT
13538  */
/*
 * Copy the kernel's vfsstatfs for mount mp out to user space at bufp in
 * either the 64-bit or 32-bit user statfs layout.  When partial_copy is
 * set, the trailing reserved fields are omitted from the copyout.  If
 * sizep is non-NULL it receives the full (un-truncated) structure size.
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int             error;
	int             my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* Some mounts present an overridden fs type name. */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* Trim the trailing reserved fields from the copyout. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case.   We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not be setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int             shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large.  At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* Some mounts present an overridden fs type name. */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* Trim the trailing reserved fields from the copyout. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	/* Report the full structure size even for a partial copy. */
	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
13660 
13661 /*
13662  * copy stat structure into user_stat structure.
13663  */
13664 void
munge_user64_stat(struct stat * sbp,struct user64_stat * usbp)13665 munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
13666 {
13667 	bzero(usbp, sizeof(*usbp));
13668 
13669 	usbp->st_dev = sbp->st_dev;
13670 	usbp->st_ino = sbp->st_ino;
13671 	usbp->st_mode = sbp->st_mode;
13672 	usbp->st_nlink = sbp->st_nlink;
13673 	usbp->st_uid = sbp->st_uid;
13674 	usbp->st_gid = sbp->st_gid;
13675 	usbp->st_rdev = sbp->st_rdev;
13676 #ifndef _POSIX_C_SOURCE
13677 	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
13678 	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
13679 	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
13680 	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
13681 	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
13682 	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
13683 #else
13684 	usbp->st_atime = sbp->st_atime;
13685 	usbp->st_atimensec = sbp->st_atimensec;
13686 	usbp->st_mtime = sbp->st_mtime;
13687 	usbp->st_mtimensec = sbp->st_mtimensec;
13688 	usbp->st_ctime = sbp->st_ctime;
13689 	usbp->st_ctimensec = sbp->st_ctimensec;
13690 #endif
13691 	usbp->st_size = sbp->st_size;
13692 	usbp->st_blocks = sbp->st_blocks;
13693 	usbp->st_blksize = sbp->st_blksize;
13694 	usbp->st_flags = sbp->st_flags;
13695 	usbp->st_gen = sbp->st_gen;
13696 	usbp->st_lspare = sbp->st_lspare;
13697 	usbp->st_qspare[0] = sbp->st_qspare[0];
13698 	usbp->st_qspare[1] = sbp->st_qspare[1];
13699 }
13700 
13701 void
munge_user32_stat(struct stat * sbp,struct user32_stat * usbp)13702 munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
13703 {
13704 	bzero(usbp, sizeof(*usbp));
13705 
13706 	usbp->st_dev = sbp->st_dev;
13707 	usbp->st_ino = sbp->st_ino;
13708 	usbp->st_mode = sbp->st_mode;
13709 	usbp->st_nlink = sbp->st_nlink;
13710 	usbp->st_uid = sbp->st_uid;
13711 	usbp->st_gid = sbp->st_gid;
13712 	usbp->st_rdev = sbp->st_rdev;
13713 #ifndef _POSIX_C_SOURCE
13714 	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
13715 	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
13716 	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
13717 	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
13718 	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
13719 	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
13720 #else
13721 	usbp->st_atime = sbp->st_atime;
13722 	usbp->st_atimensec = sbp->st_atimensec;
13723 	usbp->st_mtime = sbp->st_mtime;
13724 	usbp->st_mtimensec = sbp->st_mtimensec;
13725 	usbp->st_ctime = sbp->st_ctime;
13726 	usbp->st_ctimensec = sbp->st_ctimensec;
13727 #endif
13728 	usbp->st_size = sbp->st_size;
13729 	usbp->st_blocks = sbp->st_blocks;
13730 	usbp->st_blksize = sbp->st_blksize;
13731 	usbp->st_flags = sbp->st_flags;
13732 	usbp->st_gen = sbp->st_gen;
13733 	usbp->st_lspare = sbp->st_lspare;
13734 	usbp->st_qspare[0] = sbp->st_qspare[0];
13735 	usbp->st_qspare[1] = sbp->st_qspare[1];
13736 }
13737 
13738 /*
13739  * copy stat64 structure into user_stat64 structure.
13740  */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	/* Zero the destination first so padding reaches user space clean. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/*
	 * Timestamps (stat64 adds birthtime); field names depend on the
	 * POSIX namespace setting.
	 */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13781 
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	/* Zero the destination first so padding reaches user space clean. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/*
	 * Timestamps (stat64 adds birthtime), narrowed to 32-bit user types;
	 * field names depend on the POSIX namespace setting.
	 */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13822 
13823 /*
13824  * Purge buffer cache for simulating cold starts
13825  */
13826 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)13827 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
13828 {
13829 	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
13830 
13831 	return VNODE_RETURNED;
13832 }
13833 
13834 static int
vfs_purge_callback(mount_t mp,__unused void * arg)13835 vfs_purge_callback(mount_t mp, __unused void * arg)
13836 {
13837 	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
13838 
13839 	return VFS_RETURNED;
13840 }
13841 
13842 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)13843 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
13844 {
13845 	if (!kauth_cred_issuser(kauth_cred_get())) {
13846 		return EPERM;
13847 	}
13848 
13849 	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
13850 
13851 	return 0;
13852 }
13853 
13854 /*
13855  * gets the vnode associated with the (unnamed) snapshot directory
13856  * for a Filesystem. The snapshot directory vnode is returned with
13857  * an iocount on it.
13858  */
13859 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)13860 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
13861 {
13862 	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
13863 }
13864 
13865 /*
13866  * Get the snapshot vnode.
13867  *
 * If successful, the call returns with an iocount on *rvpp, *sdvpp and
13869  * needs nameidone() on ndp.
13870  *
13871  * If the snapshot vnode exists it is returned in ndp->ni_vp.
13872  *
13873  * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
13874  * not needed.
13875  */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Initialize outputs so the error path can clean up uniformly. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* dirfd must resolve to a vnode; returns it with an iocount. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* Snapshot operations are only valid on a volume root. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Get the snapshot directory vnode (iocount held on success). */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; stopping early means one was found. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC policy check matching the intended operation. */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On error, drop any iocounts taken above and clear the outputs. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
13978 
13979 /*
13980  * create a filesystem snapshot (for supporting filesystems)
13981  *
13982  * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
13983  * We get to the (unnamed) snapshot directory vnode and create the vnode
13984  * for the snapshot in it.
13985  *
13986  * Restrictions:
13987  *
13988  *    a) Passed in name for snapshot cannot have slashes.
13989  *    b) name can't be "." or ".."
13990  *
13991  * Since this requires superuser privileges, vnode_authorize calls are not
13992  * made.
13993  */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* struct nameidata is too large for the kernel stack. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/*
	 * Resolve the snapshot directory; on success we hold iocounts on
	 * rvp and snapdvp and owe a nameidone() on ndp.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* The snapshot name already exists. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		/* Create the snapshot as a mode-0 regular file. */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* No vnode_authorize: caller privilege was already verified. */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	/* Release the lookup state and the iocounts taken above. */
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14040 
14041 /*
14042  * Delete a Filesystem snapshot
14043  *
14044  * get the vnode for the unnamed snapshot directory and the snapshot and
14045  * delete the snapshot.
14046  */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* struct nameidata is too large for the kernel stack. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/*
	 * Resolve the snapshot; on success ndp->ni_vp holds it with an
	 * iocount, and rvp/snapdvp also carry iocounts we must drop.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	/* Release the snapshot vnode, lookup state, and remaining iocounts. */
	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14075 
14076 /*
14077  * Revert a filesystem to a snapshot
14078  *
14079  * Marks the filesystem to revert to the given snapshot on next mount.
14080  */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* dirfd identifies the volume whose mount we operate on. */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Hand the snapshot name to the filesystem as a componentname. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/*
		 * Fallback: resolve the snapshot vnode itself and issue the
		 * revert ioctl directly on it.
		 */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		/* Drop the lookup state and all iocounts from the fallback. */
		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
14164 
/*
 * rename a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * rename the snapshot. This is a very specialised (and simple) case of
 * rename(2) (which has to deal with a lot more complications). It differs
 * slightly from rename(2) in that EEXIST is returned if the new name exists.
 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/* Look up the source snapshot with DELETE/OP_UNLINK intent. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp  = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for '/'; the loop ends early only when one is found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Resolve the destination name relative to the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14267 
/*
 * Mount a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * mount the snapshot.
 */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/* Resolve the snapshot vnode (and its parent snapshot directory). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp  = snapndp->ni_vp;
	/* Bail out if the underlying mount has been torn down. */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* Refuse to cover the root of the root filesystem. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/* Hand the snapshot's mount and componentname to mount_common(). */
	smnt_data.sm_mp  = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);

out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
	return error;
}
14349 
/*
 * Root from a snapshot of the filesystem
 *
 * Marks the filesystem to root from the given snapshot on next boot.
 */
static int __attribute__((noinline))
snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* Take an iocount on the vnode backing dirfd; it identifies the mount. */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Package the snapshot name as a componentname for the FS. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len; /* copyinstr length includes the NUL */
	root_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	return error;
}
14410 
14411 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14412 vfs_context_can_snapshot(vfs_context_t ctx)
14413 {
14414 	static const char * const snapshot_entitlements[] = {
14415 		"com.apple.private.vfs.snapshot",
14416 		"com.apple.developer.vfs.snapshot",
14417 		"com.apple.private.apfs.arv.limited.snapshot",
14418 	};
14419 	static const size_t nentitlements =
14420 	    sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14421 	size_t i;
14422 
14423 	task_t task = vfs_context_task(ctx);
14424 	for (i = 0; i < nentitlements; i++) {
14425 		if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14426 			return TRUE;
14427 		}
14428 	}
14429 	return FALSE;
14430 }
14431 
/*
 * FS snapshot operations dispatcher
 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All callers need a snapshot entitlement regardless of op. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which aren't block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Allowed when any of: superuser, writable device, or the
		 * user-snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-op helper. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14523